Commit 382c958e authored by Mingming Cao's avatar Mingming Cao Committed by Linus Torvalds

[PATCH] ext3 block reservations

rbtree implementation and other changes From: Stephen Tweedie <sct@redhat.com>
contributions From: Badari Pulavarty <pbadari@us.ibm.com> and probably me.

This is the ext3 block reservation patch.  It improves the layout of ext3
files by establishing, for each inode, reserved areas of the disk in which
only that file can allocate blocks.  Those reserved areas are managed in an
rbtree, via the in-core inode.

It's a bit like ext2 preallocation only stronger in that it can span
already-allocated blocks, including the per-blockgroup inode tables and
bitmaps.

The patch fixes ext3's worst performance problem: disastrous layout when
multiple files are being concurrently grown.

It increases the size of the inode by rather a lot.  A todo item is to
dynamically allocate the `struct reserve_window_node', so we don't need to
carry this storage for inodes which aren't opened for writing.

The feature is enabled by mounting with the "reservation" mount option. 
Reservations default to "off".
Signed-off-by: default avatarAndrew Morton <akpm@osdl.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@osdl.org>
parent 9a995d44
......@@ -96,9 +96,185 @@ read_block_bitmap(struct super_block *sb, unsigned int block_group)
error_out:
return bh;
}
/*
* The reservation window structure operations
* --------------------------------------------
* Operations include:
* dump, find, add, remove, is_empty, find_next_reservable_window, etc.
*
* We use sorted double linked list for the per-filesystem reservation
* window list. (like in vm_region).
*
* Initially, we keep those small operations in the abstract functions,
* so later if we need a better searching tree than double linked-list,
* we could easily switch to that without changing too much
* code.
*/
#if 0
static void __rsv_window_dump(struct rb_root *root, int verbose,
const char *fn)
{
struct rb_node *n;
struct reserve_window_node *rsv, *prev;
int bad;
restart:
n = rb_first(root);
bad = 0;
prev = NULL;
printk("Block Allocation Reservation Windows Map (%s):\n", fn);
while (n) {
rsv = list_entry(n, struct reserve_window_node, rsv_node);
if (verbose)
printk("reservation window 0x%p "
"start: %d, end: %d\n",
rsv, rsv->rsv_start, rsv->rsv_end);
if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) {
printk("Bad reservation %p (start >= end)\n",
rsv);
bad = 1;
}
if (prev && prev->rsv_end >= rsv->rsv_start) {
printk("Bad reservation %p (prev->end >= start)\n",
rsv);
bad = 1;
}
if (bad) {
if (!verbose) {
printk("Restarting reservation walk in verbose mode\n");
verbose = 1;
goto restart;
}
}
n = rb_next(n);
prev = rsv;
}
printk("Window map complete.\n");
if (bad)
BUG();
}
#define rsv_window_dump(root, verbose) \
__rsv_window_dump((root), (verbose), __FUNCTION__)
#else
#define rsv_window_dump(root, verbose) do {} while (0)
#endif
static int
goal_in_my_reservation(struct reserve_window *rsv, int goal,
unsigned int group, struct super_block * sb)
{
unsigned long group_first_block, group_last_block;
group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
group * EXT3_BLOCKS_PER_GROUP(sb);
group_last_block = group_first_block + EXT3_BLOCKS_PER_GROUP(sb) - 1;
if ((rsv->_rsv_start > group_last_block) ||
(rsv->_rsv_end < group_first_block))
return 0;
if ((goal >= 0) && ((goal + group_first_block < rsv->_rsv_start)
|| (goal + group_first_block > rsv->_rsv_end)))
return 0;
return 1;
}
/*
* Find the reserved window which includes the goal, or the previous one
* if the goal is not in any window.
* Returns NULL if there are no windows or if all windows start after the goal.
*/
static struct reserve_window_node *search_reserve_window(struct rb_root *root,
unsigned long goal)
{
struct rb_node *n = root->rb_node;
struct reserve_window_node *rsv;
if (!n)
return NULL;
while (n)
{
rsv = rb_entry(n, struct reserve_window_node, rsv_node);
if (goal < rsv->rsv_start)
n = n->rb_left;
else if (goal > rsv->rsv_end)
n = n->rb_right;
else
return rsv;
}
/*
* We've fallen off the end of the tree: the goal wasn't inside
* any particular node. OK, the previous node must be to one
* side of the interval containing the goal. If it's the RHS,
* we need to back up one.
*/
if (rsv->rsv_start > goal) {
n = rb_prev(&rsv->rsv_node);
rsv = rb_entry(n, struct reserve_window_node, rsv_node);
}
return rsv;
}
void rsv_window_add(struct super_block *sb,
struct reserve_window_node *rsv)
{
struct rb_root *root = &EXT3_SB(sb)->s_rsv_window_root;
struct rb_node *node = &rsv->rsv_node;
unsigned int start = rsv->rsv_start;
struct rb_node ** p = &root->rb_node;
struct rb_node * parent = NULL;
struct reserve_window_node *this;
while (*p)
{
parent = *p;
this = rb_entry(parent, struct reserve_window_node, rsv_node);
if (start < this->rsv_start)
p = &(*p)->rb_left;
else if (start > this->rsv_end)
p = &(*p)->rb_right;
else
BUG();
}
rb_link_node(node, parent, p);
rb_insert_color(node, root);
}
static void rsv_window_remove(struct super_block *sb,
struct reserve_window_node *rsv)
{
rsv->rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
rsv->rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
atomic_set(&rsv->rsv_alloc_hit, 0);
rb_erase(&rsv->rsv_node, &EXT3_SB(sb)->s_rsv_window_root);
}
static inline int rsv_is_empty(struct reserve_window *rsv)
{
/* a valid reservation end block could not be 0 */
return (rsv->_rsv_end == EXT3_RESERVE_WINDOW_NOT_ALLOCATED);
}
void ext3_discard_reservation(struct inode *inode)
{
struct ext3_inode_info *ei = EXT3_I(inode);
struct reserve_window_node *rsv = &ei->i_rsv_window;
spinlock_t *rsv_lock = &EXT3_SB(inode->i_sb)->s_rsv_window_lock;
if (!rsv_is_empty(&rsv->rsv_window)) {
spin_lock(rsv_lock);
rsv_window_remove(inode->i_sb, rsv);
spin_unlock(rsv_lock);
}
}
/* Free given blocks, update quota and i_blocks field */
void ext3_free_blocks (handle_t *handle, struct inode * inode,
void ext3_free_blocks(handle_t *handle, struct inode *inode,
unsigned long block, unsigned long count)
{
struct buffer_head *bitmap_bh = NULL;
......@@ -296,7 +472,7 @@ void ext3_free_blocks (handle_t *handle, struct inode * inode,
* data-writes at some point, and disable it for metadata allocations or
* sync-data inodes.
*/
static inline int ext3_test_allocatable(int nr, struct buffer_head *bh)
static int ext3_test_allocatable(int nr, struct buffer_head *bh)
{
int ret;
struct journal_head *jh = bh2jh(bh);
......@@ -313,6 +489,33 @@ static inline int ext3_test_allocatable(int nr, struct buffer_head *bh)
return ret;
}
static int
bitmap_search_next_usable_block(int start, struct buffer_head *bh,
int maxblocks)
{
int next;
struct journal_head *jh = bh2jh(bh);
/*
* The bitmap search --- search forward alternately through the actual
* bitmap and the last-committed copy until we find a bit free in
* both
*/
while (start < maxblocks) {
next = ext3_find_next_zero_bit(bh->b_data, maxblocks, start);
if (next >= maxblocks)
return -1;
if (ext3_test_allocatable(next, bh))
return next;
jbd_lock_bh_state(bh);
if (jh->b_committed_data)
start = ext3_find_next_zero_bit(jh->b_committed_data,
maxblocks, next);
jbd_unlock_bh_state(bh);
}
return -1;
}
/*
* Find an allocatable block in a bitmap. We honour both the bitmap and
* its last-committed copy (if that exists), and perform the "most
......@@ -325,7 +528,6 @@ find_next_usable_block(int start, struct buffer_head *bh, int maxblocks)
{
int here, next;
char *p, *r;
struct journal_head *jh = bh2jh(bh);
if (start > 0) {
/*
......@@ -337,6 +539,8 @@ find_next_usable_block(int start, struct buffer_head *bh, int maxblocks)
* next 64-bit boundary is simple..
*/
int end_goal = (start + 63) & ~63;
if (end_goal > maxblocks)
end_goal = maxblocks;
here = ext3_find_next_zero_bit(bh->b_data, end_goal, start);
if (here < end_goal && ext3_test_allocatable(here, bh))
return here;
......@@ -351,7 +555,7 @@ find_next_usable_block(int start, struct buffer_head *bh, int maxblocks)
r = memscan(p, 0, (maxblocks - here + 7) >> 3);
next = (r - ((char *)bh->b_data)) << 3;
if (next < maxblocks && ext3_test_allocatable(next, bh))
if (next < maxblocks && next >= start && ext3_test_allocatable(next, bh))
return next;
/*
......@@ -359,19 +563,8 @@ find_next_usable_block(int start, struct buffer_head *bh, int maxblocks)
* bitmap and the last-committed copy until we find a bit free in
* both
*/
while (here < maxblocks) {
next = ext3_find_next_zero_bit(bh->b_data, maxblocks, here);
if (next >= maxblocks)
return -1;
if (ext3_test_allocatable(next, bh))
return next;
jbd_lock_bh_state(bh);
if (jh->b_committed_data)
here = ext3_find_next_zero_bit(jh->b_committed_data,
maxblocks, next);
jbd_unlock_bh_state(bh);
}
return -1;
here = bitmap_search_next_usable_block(here, bh, maxblocks);
return here;
}
/*
......@@ -407,62 +600,484 @@ claim_block(spinlock_t *lock, int block, struct buffer_head *bh)
*/
static int
ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
struct buffer_head *bitmap_bh, int goal, int *errp)
struct buffer_head *bitmap_bh, int goal, struct reserve_window *my_rsv)
{
int i;
int fatal;
int credits = 0;
*errp = 0;
/*
* Make sure we use undo access for the bitmap, because it is critical
* that we do the frozen_data COW on bitmap buffers in all cases even
* if the buffer is in BJ_Forget state in the committing transaction.
*/
BUFFER_TRACE(bitmap_bh, "get undo access for new block");
fatal = ext3_journal_get_undo_access(handle, bitmap_bh, &credits);
if (fatal) {
*errp = fatal;
goto fail;
int group_first_block, start, end;
/* we do allocation within the reservation window if we have a window */
if (my_rsv) {
group_first_block =
le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
group * EXT3_BLOCKS_PER_GROUP(sb);
if (my_rsv->_rsv_start >= group_first_block)
start = my_rsv->_rsv_start - group_first_block;
else
/* reservation window cross group boundary */
start = 0;
end = my_rsv->_rsv_end - group_first_block + 1;
if (end > EXT3_BLOCKS_PER_GROUP(sb))
/* reservation window crosses group boundary */
end = EXT3_BLOCKS_PER_GROUP(sb);
if ((start <= goal) && (goal < end))
start = goal;
else
goal = -1;
} else {
if (goal > 0)
start = goal;
else
start = 0;
end = EXT3_BLOCKS_PER_GROUP(sb);
}
BUG_ON(start > EXT3_BLOCKS_PER_GROUP(sb));
repeat:
if (goal < 0 || !ext3_test_allocatable(goal, bitmap_bh)) {
goal = find_next_usable_block(goal, bitmap_bh,
EXT3_BLOCKS_PER_GROUP(sb));
goal = find_next_usable_block(start, bitmap_bh, end);
if (goal < 0)
goto fail_access;
for (i = 0; i < 7 && goal > 0 &&
ext3_test_allocatable(goal - 1, bitmap_bh);
i++, goal--);
if (!my_rsv) {
int i;
for (i = 0; i < 7 && goal > start &&
ext3_test_allocatable(goal - 1,
bitmap_bh);
i++, goal--)
;
}
}
start = goal;
if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group), goal, bitmap_bh)) {
/*
* The block was allocated by another thread, or it was
* allocated and then freed by another thread
*/
start++;
goal++;
if (goal >= EXT3_BLOCKS_PER_GROUP(sb))
if (start >= end)
goto fail_access;
goto repeat;
}
return goal;
fail_access:
return -1;
}
BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for bitmap block");
fatal = ext3_journal_dirty_metadata(handle, bitmap_bh);
/**
* find_next_reservable_window():
* find a reservable space within the given range.
* It does not allocate the reservation window for now:
* alloc_new_reservation() will do the work later.
*
* @search_head: the head of the searching list;
* This is not necessarily the list head of the whole filesystem
*
* We have both head and start_block to assist the search
* for the reservable space. The list starts from head,
* but we will shift to the place where start_block is,
* then start from there, when looking for a reservable space.
*
* @size: the target new reservation window size
*
* @group_first_block: the first block we consider to start
* the real search from
*
* @last_block:
* the maximum block number that our goal reservable space
* could start from. This is normally the last block in this
* group. The search will end when we found the start of next
* possible reservable space is out of this boundary.
* This could handle the cross boundary reservation window
* request.
*
* basically we search from the given range, rather than the whole
* reservation double linked list, (start_block, last_block)
* to find a free region that is of my size and has not
* been reserved.
*
* on succeed, it returns the reservation window to be appended to.
* failed, return NULL.
*/
static struct reserve_window_node *find_next_reservable_window(
struct reserve_window_node *search_head,
unsigned long size, int *start_block,
int last_block)
{
struct rb_node *next;
struct reserve_window_node *rsv, *prev;
int cur;
/* TODO: make the start of the reservation window byte-aligned */
/* cur = *start_block & ~7;*/
cur = *start_block;
rsv = search_head;
if (!rsv)
return NULL;
while (1) {
if (cur <= rsv->rsv_end)
cur = rsv->rsv_end + 1;
/* TODO?
* in the case we could not find a reservable space
* that is what is expected, during the re-search, we could
* remember what's the largest reservable space we could have
* and return that one.
*
* For now it will fail if we could not find the reservable
* space with expected-size (or more)...
*/
if (cur > last_block)
return NULL; /* fail */
prev = rsv;
next = rb_next(&rsv->rsv_node);
rsv = list_entry(next, struct reserve_window_node, rsv_node);
/*
* Reached the last reservation, we can just append to the
* previous one.
*/
if (!next)
break;
if (cur + size <= rsv->rsv_start) {
/*
* Found a reserveable space big enough. We could
* have a reservation across the group boundary here
*/
break;
}
}
/*
* we come here either :
* when we reach the end of the whole list,
* and there is empty reservable space after last entry in the list.
* append it to the end of the list.
*
* or we found one reservable space in the middle of the list,
* return the reservation window that we could append to.
* succeed.
*/
*start_block = cur;
return prev;
}
/**
* alloc_new_reservation()--allocate a new reservation window
*
* To make a new reservation, we search part of the filesystem
* reservation list (the list that inside the group). We try to
* allocate a new reservation window near the allocation goal,
* or the beginning of the group, if there is no goal.
*
* We first find a reservable space after the goal, then from
* there, we check the bitmap for the first free block after
* it. If there is no free block until the end of group, then the
* whole group is full, we failed. Otherwise, check if the free
* block is inside the expected reservable space, if so, we
* succeed.
* If the first free block is outside the reservable space, then
* start from the first free block, we search for next available
* space, and go on.
*
* on succeed, a new reservation will be found and inserted into the list
* It contains at least one free block, and it does not overlap with other
* reservation windows.
*
* failed: we failed to find a reservation window in this group
*
* @rsv: the reservation
*
* @goal: The goal (group-relative). It is where the search for a
* free reservable space should start from.
* if we have a goal(goal >0 ), then start from there,
* no goal(goal = -1), we start from the first block
* of the group.
*
* @sb: the super block
* @group: the group we are trying to allocate in
* @bitmap_bh: the block group block bitmap
*/
static int alloc_new_reservation(struct reserve_window_node *my_rsv,
int goal, struct super_block *sb,
unsigned int group, struct buffer_head *bitmap_bh)
{
struct reserve_window_node *search_head;
int group_first_block, group_end_block, start_block;
int first_free_block;
int reservable_space_start;
struct reserve_window_node *prev_rsv;
struct rb_root *fs_rsv_root = &EXT3_SB(sb)->s_rsv_window_root;
unsigned long size;
group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
group * EXT3_BLOCKS_PER_GROUP(sb);
group_end_block = group_first_block + EXT3_BLOCKS_PER_GROUP(sb) - 1;
if (goal < 0)
start_block = group_first_block;
else
start_block = goal + group_first_block;
size = atomic_read(&my_rsv->rsv_goal_size);
if (!rsv_is_empty(&my_rsv->rsv_window)) {
/*
* if the old reservation is cross group boundary
* and if the goal is inside the old reservation window,
* we will come here when we just failed to allocate from
* the first part of the window. We still have another part
* that belongs to the next group. In this case, there is no
* point to discard our window and try to allocate a new one
* in this group(which will fail). we should
* keep the reservation window, just simply move on.
*
* Maybe we could shift the start block of the reservation
* window to the first block of next group.
*/
if ((my_rsv->rsv_start <= group_end_block) &&
(my_rsv->rsv_end > group_end_block) &&
(start_block >= my_rsv->rsv_start))
return -1;
if ((atomic_read(&my_rsv->rsv_alloc_hit) >
(my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) {
/*
* if we previously allocation hit ration is greater than half
* we double the size of reservation window next time
* otherwise keep the same
*/
size = size * 2;
if (size > EXT3_MAX_RESERVE_BLOCKS)
size = EXT3_MAX_RESERVE_BLOCKS;
atomic_set(&my_rsv->rsv_goal_size, size);
}
}
/*
* shift the search start to the window near the goal block
*/
search_head = search_reserve_window(fs_rsv_root, start_block);
/*
* find_next_reservable_window() simply finds a reservable window
* inside the given range(start_block, group_end_block).
*
* To make sure the reservation window has a free bit inside it, we
* need to check the bitmap after we found a reservable window.
*/
retry:
prev_rsv = find_next_reservable_window(search_head, size,
&start_block, group_end_block);
if (prev_rsv == NULL)
goto failed;
reservable_space_start = start_block;
/*
* On success, find_next_reservable_window() returns the
* reservation window where there is a reservable space after it.
* Before we reserve this reservable space, we need
* to make sure there is at least a free block inside this region.
*
* searching the first free bit on the block bitmap and copy of
* last committed bitmap alternatively, until we found a allocatable
* block. Search start from the start block of the reservable space
* we just found.
*/
first_free_block = bitmap_search_next_usable_block(
reservable_space_start - group_first_block,
bitmap_bh, group_end_block - group_first_block + 1);
if (first_free_block < 0) {
/*
* no free block left on the bitmap, no point
* to reserve the space. return failed.
*/
goto failed;
}
start_block = first_free_block + group_first_block;
/*
* check if the first free block is within the
* free space we just found
*/
if ((start_block >= reservable_space_start) &&
(start_block < reservable_space_start + size))
goto found_rsv_window;
/*
* if the first free bit we found is out of the reservable space
* this means there is no free block on the reservable space
* we should continue search for next reservable space,
* start from where the free block is,
* we also shift the list head to where we stopped last time
*/
search_head = prev_rsv;
goto retry;
found_rsv_window:
/*
* great! the reservable space contains some free blocks.
* if the search returns that we should add the new
* window just next to where the old window, we don't
* need to remove the old window first then add it to the
* same place, just update the new start and new end.
*/
if (my_rsv != prev_rsv) {
if (!rsv_is_empty(&my_rsv->rsv_window))
rsv_window_remove(sb, my_rsv);
}
my_rsv->rsv_start = reservable_space_start;
my_rsv->rsv_end = my_rsv->rsv_start + size - 1;
atomic_set(&my_rsv->rsv_alloc_hit, 0);
if (my_rsv != prev_rsv) {
rsv_window_add(sb, my_rsv);
}
return 0; /* succeed */
failed:
/*
* failed to find a new reservation window in the current
* group, remove the current(stale) reservation window
* if there is any
*/
if (!rsv_is_empty(&my_rsv->rsv_window))
rsv_window_remove(sb, my_rsv);
return -1; /* failed */
}
/*
* This is the main function used to allocate a new block and its reservation
* window.
*
* Each time when a new block allocation is need, first try to allocate from
* its own reservation. If it does not have a reservation window, instead of
* looking for a free bit on bitmap first, then look up the reservation list to
* see if it is inside somebody else's reservation window, we try to allocate a
* reservation window for it starting from the goal first. Then do the block
* allocation within the reservation window.
*
* This will avoid keeping on searching the reservation list again and
* again when someboday is looking for a free block (without
* reservation), and there are lots of free blocks, but they are all
* being reserved.
*
* We use a sorted double linked list for the per-filesystem reservation list.
* The insert, remove and find a free space(non-reserved) operations for the
* sorted double linked list should be fast.
*
*/
static int
ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
unsigned int group, struct buffer_head *bitmap_bh,
int goal, struct reserve_window_node * my_rsv,
int *errp)
{
spinlock_t *rsv_lock;
unsigned long group_first_block;
int ret = 0;
int fatal;
int credits = 0;
*errp = 0;
/*
* Make sure we use undo access for the bitmap, because it is critical
* that we do the frozen_data COW on bitmap buffers in all cases even
* if the buffer is in BJ_Forget state in the committing transaction.
*/
BUFFER_TRACE(bitmap_bh, "get undo access for new block");
fatal = ext3_journal_get_undo_access(handle, bitmap_bh, &credits);
if (fatal) {
*errp = fatal;
goto fail;
return -1;
}
/*
* we don't deal with reservation when
* filesystem is mounted without reservation
* or the file is not a regular file
* or last attempt to allocate a block with reservation turned on failed
*/
if (my_rsv == NULL ) {
ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, goal, NULL);
goto out;
}
rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock;
/*
* goal is a group relative block number (if there is a goal)
* 0 < goal < EXT3_BLOCKS_PER_GROUP(sb)
* first block is a filesystem wide block number
* first block is the block number of the first block in this group
*/
group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
group * EXT3_BLOCKS_PER_GROUP(sb);
/*
* Basically we will allocate a new block from inode's reservation
* window.
*
* We need to allocate a new reservation window, if:
* a) inode does not have a reservation window; or
* b) last attempt to allocate a block from existing reservation
* failed; or
* c) we come here with a goal and with a reservation window
*
* We do not need to allocate a new reservation window if we come here
* at the beginning with a goal and the goal is inside the window, or
* we don't have a goal but already have a reservation window.
* then we could go to allocate from the reservation window directly.
*/
while (1) {
struct reserve_window rsv_copy;
unsigned int seq;
do {
seq = read_seqbegin(&my_rsv->rsv_seqlock);
rsv_copy._rsv_start = my_rsv->rsv_start;
rsv_copy._rsv_end = my_rsv->rsv_end;
} while (read_seqretry(&my_rsv->rsv_seqlock, seq));
if (rsv_is_empty(&rsv_copy) || (ret < 0) ||
!goal_in_my_reservation(&rsv_copy, goal, group, sb)) {
spin_lock(rsv_lock);
write_seqlock(&my_rsv->rsv_seqlock);
ret = alloc_new_reservation(my_rsv, goal, sb,
group, bitmap_bh);
rsv_copy._rsv_start = my_rsv->rsv_start;
rsv_copy._rsv_end = my_rsv->rsv_end;
write_sequnlock(&my_rsv->rsv_seqlock);
spin_unlock(rsv_lock);
if (ret < 0)
break; /* failed */
if (!goal_in_my_reservation(&rsv_copy, goal, group, sb))
goal = -1;
}
if ((rsv_copy._rsv_start >= group_first_block + EXT3_BLOCKS_PER_GROUP(sb))
|| (rsv_copy._rsv_end < group_first_block))
BUG();
ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, goal,
&rsv_copy);
if (ret >= 0) {
if (!read_seqretry(&my_rsv->rsv_seqlock, seq))
atomic_inc(&my_rsv->rsv_alloc_hit);
break; /* succeed */
}
}
out:
if (ret >= 0) {
BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for "
"bitmap block");
fatal = ext3_journal_dirty_metadata(handle, bitmap_bh);
if (fatal) {
*errp = fatal;
return -1;
}
return ret;
}
return goal;
fail_access:
BUFFER_TRACE(bitmap_bh, "journal_release_buffer");
ext3_journal_release_buffer(handle, bitmap_bh, credits);
fail:
return -1;
return ret;
}
static int ext3_has_free_blocks(struct ext3_sb_info *sbi)
......@@ -503,15 +1118,16 @@ int ext3_should_retry_alloc(struct super_block *sb, int *retries)
* bitmap, and then for any free bit if that fails.
* This function also updates quota and i_blocks field.
*/
int
ext3_new_block(handle_t *handle, struct inode *inode, unsigned long goal, int *errp)
int ext3_new_block(handle_t *handle, struct inode *inode,
unsigned long goal, int *errp)
{
struct buffer_head *bitmap_bh = NULL; /* bh */
struct buffer_head *gdp_bh; /* bh2 */
int group_no; /* i */
int ret_block; /* j */
int bgi; /* blockgroup iteration index */
int target_block; /* tmp */
struct buffer_head *bitmap_bh = NULL;
struct buffer_head *gdp_bh;
int group_no;
int goal_group;
int ret_block;
int bgi; /* blockgroup iteration index */
int target_block;
int fatal = 0, err;
int performed_allocation = 0;
int free_blocks;
......@@ -519,6 +1135,9 @@ ext3_new_block(handle_t *handle, struct inode *inode, unsigned long goal, int *e
struct ext3_group_desc *gdp;
struct ext3_super_block *es;
struct ext3_sb_info *sbi;
struct reserve_window_node *my_rsv = NULL;
struct reserve_window_node *rsv = &EXT3_I(inode)->i_rsv_window;
unsigned short windowsz = 0;
#ifdef EXT3FS_DEBUG
static int goal_hits, goal_attempts;
#endif
......@@ -540,7 +1159,18 @@ ext3_new_block(handle_t *handle, struct inode *inode, unsigned long goal, int *e
sbi = EXT3_SB(sb);
es = EXT3_SB(sb)->s_es;
ext3_debug("goal=%lu.\n", goal);
/*
* Allocate a block from reservation only when
* filesystem is mounted with reservation(default,-o reservation), and
* it's a regular file, and
* the desired window size is greater than 0 (One could use ioctl
* command EXT3_IOC_SETRSVSZ to set the window size to 0 to turn off
* reservation on that particular file)
*/
windowsz = atomic_read(&rsv->rsv_goal_size);
if (test_opt(sb, RESERVATION) &&
S_ISREG(inode->i_mode) && (windowsz > 0))
my_rsv = rsv;
if (!ext3_has_free_blocks(sbi)) {
*errp = -ENOSPC;
goto out;
......@@ -558,6 +1188,8 @@ ext3_new_block(handle_t *handle, struct inode *inode, unsigned long goal, int *e
if (!gdp)
goto io_error;
goal_group = group_no;
retry:
free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
if (free_blocks > 0) {
ret_block = ((goal - le32_to_cpu(es->s_first_data_block)) %
......@@ -565,8 +1197,8 @@ ext3_new_block(handle_t *handle, struct inode *inode, unsigned long goal, int *e
bitmap_bh = read_block_bitmap(sb, group_no);
if (!bitmap_bh)
goto io_error;
ret_block = ext3_try_to_allocate(sb, handle, group_no,
bitmap_bh, ret_block, &fatal);
ret_block = ext3_try_to_allocate_with_rsv(sb, handle, group_no,
bitmap_bh, ret_block, my_rsv, &fatal);
if (fatal)
goto out;
if (ret_block >= 0)
......@@ -587,21 +1219,37 @@ ext3_new_block(handle_t *handle, struct inode *inode, unsigned long goal, int *e
goto out;
}
free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
if (free_blocks <= 0)
/*
* skip this group if the number of
* free blocks is less than half of the reservation
* window size.
*/
if (free_blocks <= (windowsz/2))
continue;
brelse(bitmap_bh);
bitmap_bh = read_block_bitmap(sb, group_no);
if (!bitmap_bh)
goto io_error;
ret_block = ext3_try_to_allocate(sb, handle, group_no,
bitmap_bh, -1, &fatal);
ret_block = ext3_try_to_allocate_with_rsv(sb, handle, group_no,
bitmap_bh, -1, my_rsv, &fatal);
if (fatal)
goto out;
if (ret_block >= 0)
goto allocated;
}
/*
* We may end up a bogus ealier ENOSPC error due to
* filesystem is "full" of reservations, but
* there maybe indeed free blocks avaliable on disk
* In this case, we just forget about the reservations
* just do block allocation as without reservations.
*/
if (my_rsv) {
my_rsv = NULL;
group_no = goal_group;
goto retry;
}
/* No space left on the device */
*errp = -ENOSPC;
goto out;
......
......@@ -33,6 +33,10 @@
*/
static int ext3_release_file (struct inode * inode, struct file * filp)
{
/* if we are the last writer on the inode, drop the block reservation */
if ((filp->f_mode & FMODE_WRITE) &&
(atomic_read(&inode->i_writecount) == 1))
ext3_discard_reservation(inode);
if (is_dx(inode) && filp->private_data)
ext3_htree_free_dir_info(filp->private_data);
......
......@@ -582,6 +582,11 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode)
ei->i_file_acl = 0;
ei->i_dir_acl = 0;
ei->i_dtime = 0;
ei->i_rsv_window.rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
ei->i_rsv_window.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
atomic_set(&ei->i_rsv_window.rsv_goal_size, EXT3_DEFAULT_RESERVE_BLOCKS);
atomic_set(&ei->i_rsv_window.rsv_alloc_hit, 0);
seqlock_init(&ei->i_rsv_window.rsv_seqlock);
ei->i_block_group = group;
ext3_set_inode_flags(inode);
......
......@@ -178,17 +178,6 @@ static int ext3_journal_test_restart(handle_t *handle, struct inode *inode)
return ext3_journal_restart(handle, blocks_for_truncate(inode));
}
/*
* Called at each iput()
*
* The inode may be "bad" if ext3_read_inode() saw an error from
* ext3_get_inode(), so we need to check that to avoid freeing random disk
* blocks.
*/
void ext3_put_inode(struct inode *inode)
{
}
/*
* Called at the last iput() if i_nlink is zero.
*/
......@@ -2086,6 +2075,8 @@ void ext3_truncate(struct inode * inode)
if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
return;
ext3_discard_reservation(inode);
/*
* We have to lock the EOF page here, because lock_page() nests
* outside journal_start().
......@@ -2418,6 +2409,8 @@ void ext3_read_inode(struct inode * inode)
ei->i_acl = EXT3_ACL_NOT_CACHED;
ei->i_default_acl = EXT3_ACL_NOT_CACHED;
#endif
ei->i_rsv_window.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
if (ext3_get_inode_loc(inode, &iloc, 0))
goto bad_inode;
bh = iloc.bh;
......@@ -2478,7 +2471,10 @@ void ext3_read_inode(struct inode * inode)
ei->i_disksize = inode->i_size;
inode->i_generation = le32_to_cpu(raw_inode->i_generation);
ei->i_block_group = iloc.block_group;
ei->i_rsv_window.rsv_start = 0;
ei->i_rsv_window.rsv_end= 0;
atomic_set(&ei->i_rsv_window.rsv_goal_size, EXT3_DEFAULT_RESERVE_BLOCKS);
seqlock_init(&ei->i_rsv_window.rsv_seqlock);
/*
* NOTE! The in-memory inode i_data array is in little-endian order
* even on big-endian machines: we do NOT byteswap the block numbers!
......
......@@ -20,6 +20,7 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
{
struct ext3_inode_info *ei = EXT3_I(inode);
unsigned int flags;
unsigned short rsv_window_size;
ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg);
......@@ -151,6 +152,29 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
return ret;
}
#endif
case EXT3_IOC_GETRSVSZ:
if (test_opt(inode->i_sb, RESERVATION) && S_ISREG(inode->i_mode)) {
rsv_window_size = atomic_read(&ei->i_rsv_window.rsv_goal_size);
return put_user(rsv_window_size, (int *)arg);
}
return -ENOTTY;
case EXT3_IOC_SETRSVSZ:
if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
return -ENOTTY;
if (IS_RDONLY(inode))
return -EROFS;
if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
return -EACCES;
if (get_user(rsv_window_size, (int *)arg))
return -EFAULT;
if (rsv_window_size > EXT3_MAX_RESERVE_BLOCKS)
rsv_window_size = EXT3_MAX_RESERVE_BLOCKS;
atomic_set(&ei->i_rsv_window.rsv_goal_size, rsv_window_size);
return 0;
default:
return -ENOTTY;
}
......
......@@ -449,6 +449,7 @@ static struct inode *ext3_alloc_inode(struct super_block *sb)
ei->i_acl = EXT3_ACL_NOT_CACHED;
ei->i_default_acl = EXT3_ACL_NOT_CACHED;
#endif
ei->i_rsv_window.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
ei->vfs_inode.i_version = 1;
return &ei->vfs_inode;
}
......@@ -490,10 +491,9 @@ static void destroy_inodecache(void)
printk(KERN_INFO "ext3_inode_cache: not all structures were freed\n");
}
#ifdef CONFIG_EXT3_FS_POSIX_ACL
static void ext3_clear_inode(struct inode *inode)
{
#ifdef CONFIG_EXT3_FS_POSIX_ACL
if (EXT3_I(inode)->i_acl &&
EXT3_I(inode)->i_acl != EXT3_ACL_NOT_CACHED) {
posix_acl_release(EXT3_I(inode)->i_acl);
......@@ -504,11 +504,9 @@ static void ext3_clear_inode(struct inode *inode)
posix_acl_release(EXT3_I(inode)->i_default_acl);
EXT3_I(inode)->i_default_acl = EXT3_ACL_NOT_CACHED;
}
}
#else
# define ext3_clear_inode NULL
#endif
ext3_discard_reservation(inode);
}
#ifdef CONFIG_QUOTA
......@@ -558,7 +556,6 @@ static struct super_operations ext3_sops = {
.read_inode = ext3_read_inode,
.write_inode = ext3_write_inode,
.dirty_inode = ext3_dirty_inode,
.put_inode = ext3_put_inode,
.delete_inode = ext3_delete_inode,
.put_super = ext3_put_super,
.write_super = ext3_write_super,
......@@ -579,7 +576,8 @@ enum {
Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
Opt_nouid32, Opt_check, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov,
Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_noload,
Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
Opt_reservation, Opt_noreservation, Opt_noload,
Opt_commit, Opt_journal_update, Opt_journal_inum,
Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
......@@ -611,6 +609,8 @@ static match_table_t tokens = {
{Opt_nouser_xattr, "nouser_xattr"},
{Opt_acl, "acl"},
{Opt_noacl, "noacl"},
{Opt_reservation, "reservation"},
{Opt_noreservation, "noreservation"},
{Opt_noload, "noload"},
{Opt_commit, "commit=%u"},
{Opt_journal_update, "journal=update"},
......@@ -765,6 +765,12 @@ static int parse_options (char * options, struct super_block *sb,
printk("EXT3 (no)acl options not supported\n");
break;
#endif
case Opt_reservation:
set_opt(sbi->s_mount_opt, RESERVATION);
break;
case Opt_noreservation:
clear_opt(sbi->s_mount_opt, RESERVATION);
break;
case Opt_journal_update:
/* @@@ FIXME */
/* Eventually we will want to be able to create
......@@ -1469,6 +1475,19 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
sbi->s_gdb_count = db_count;
get_random_bytes(&sbi->s_next_generation, sizeof(u32));
spin_lock_init(&sbi->s_next_gen_lock);
/* per fileystem reservation list head & lock */
spin_lock_init(&sbi->s_rsv_window_lock);
sbi->s_rsv_window_root = RB_ROOT;
/* Add a single, static dummy reservation to the start of the
* reservation window list --- it gives us a placeholder for
* append-at-start-of-list which makes the allocation logic
* _much_ simpler. */
sbi->s_rsv_window_head.rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
sbi->s_rsv_window_head.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
atomic_set(&sbi->s_rsv_window_head.rsv_alloc_hit, 0);
atomic_set(&sbi->s_rsv_window_head.rsv_goal_size, 0);
rsv_window_add(sb, &sbi->s_rsv_window_head);
/*
* set up enough so that it can read an inode
*/
......
......@@ -32,6 +32,12 @@ struct statfs;
*/
#undef EXT3FS_DEBUG
/*
* Define EXT3_RESERVATION to reserve data blocks for expanding files
*/
#define EXT3_DEFAULT_RESERVE_BLOCKS 8
#define EXT3_MAX_RESERVE_BLOCKS 1024
#define EXT3_RESERVE_WINDOW_NOT_ALLOCATED 0
/*
* Always enable hashed directories
*/
......@@ -202,6 +208,8 @@ struct ext3_group_desc
#ifdef CONFIG_JBD_DEBUG
#define EXT3_IOC_WAIT_FOR_READONLY _IOR('f', 99, long)
#endif
#define EXT3_IOC_GETRSVSZ _IOR('f', 5, long)
#define EXT3_IOC_SETRSVSZ _IOW('f', 6, long)
/*
* Structure of an inode on the disk
......@@ -300,25 +308,26 @@ struct ext3_inode {
/*
* Mount flags
*/
#define EXT3_MOUNT_CHECK 0x0001 /* Do mount-time checks */
#define EXT3_MOUNT_OLDALLOC 0x0002 /* Don't use the new Orlov allocator */
#define EXT3_MOUNT_GRPID 0x0004 /* Create files with directory's group */
#define EXT3_MOUNT_DEBUG 0x0008 /* Some debugging messages */
#define EXT3_MOUNT_ERRORS_CONT 0x0010 /* Continue on errors */
#define EXT3_MOUNT_ERRORS_RO 0x0020 /* Remount fs ro on errors */
#define EXT3_MOUNT_ERRORS_PANIC 0x0040 /* Panic on errors */
#define EXT3_MOUNT_MINIX_DF 0x0080 /* Mimics the Minix statfs */
#define EXT3_MOUNT_NOLOAD 0x0100 /* Don't use existing journal*/
#define EXT3_MOUNT_ABORT 0x0200 /* Fatal error detected */
#define EXT3_MOUNT_DATA_FLAGS 0x0C00 /* Mode for data writes: */
#define EXT3_MOUNT_JOURNAL_DATA 0x0400 /* Write data to journal */
#define EXT3_MOUNT_ORDERED_DATA 0x0800 /* Flush data before commit */
#define EXT3_MOUNT_WRITEBACK_DATA 0x0C00 /* No data ordering */
#define EXT3_MOUNT_UPDATE_JOURNAL 0x1000 /* Update the journal format */
#define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */
#define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */
#define EXT3_MOUNT_POSIX_ACL 0x8000 /* POSIX Access Control Lists */
#define EXT3_MOUNT_BARRIER 0x10000 /* Use block barriers */
#define EXT3_MOUNT_CHECK 0x00001 /* Do mount-time checks */
#define EXT3_MOUNT_OLDALLOC 0x00002 /* Don't use the new Orlov allocator */
#define EXT3_MOUNT_GRPID 0x00004 /* Create files with directory's group */
#define EXT3_MOUNT_DEBUG 0x00008 /* Some debugging messages */
#define EXT3_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */
#define EXT3_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */
#define EXT3_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */
#define EXT3_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */
#define EXT3_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/
#define EXT3_MOUNT_ABORT 0x00200 /* Fatal error detected */
#define EXT3_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */
#define EXT3_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */
#define EXT3_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */
#define EXT3_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */
#define EXT3_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */
#define EXT3_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */
#define EXT3_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */
#define EXT3_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */
#define EXT3_MOUNT_RESERVATION 0x10000 /* Preallocation */
#define EXT3_MOUNT_BARRIER 0x20000 /* Use block barriers */
/* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
#ifndef _LINUX_EXT2_FS_H
......@@ -684,6 +693,7 @@ extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
unsigned int block_group,
struct buffer_head ** bh);
extern int ext3_should_retry_alloc(struct super_block *sb, int *retries);
extern void rsv_window_add(struct super_block *sb, struct reserve_window_node *rsv);
/* dir.c */
extern int ext3_check_dir_entry(const char *, struct inode *,
......@@ -722,7 +732,7 @@ extern int ext3_setattr (struct dentry *, struct iattr *);
extern void ext3_put_inode (struct inode *);
extern void ext3_delete_inode (struct inode *);
extern int ext3_sync_inode (handle_t *, struct inode *);
extern void ext3_discard_prealloc (struct inode *);
extern void ext3_discard_reservation (struct inode *);
extern void ext3_dirty_inode(struct inode *);
extern int ext3_change_inode_journal_flag(struct inode *, int);
extern void ext3_truncate (struct inode *);
......
......@@ -17,9 +17,27 @@
#define _LINUX_EXT3_FS_I
#include <linux/rwsem.h>
#include <linux/rbtree.h>
#include <linux/seqlock.h>
struct reserve_window {
__u32 _rsv_start; /* First byte reserved */
__u32 _rsv_end; /* Last byte reserved or 0 */
};
struct reserve_window_node {
struct rb_node rsv_node;
atomic_t rsv_goal_size;
atomic_t rsv_alloc_hit;
seqlock_t rsv_seqlock;
struct reserve_window rsv_window;
};
#define rsv_start rsv_window._rsv_start
#define rsv_end rsv_window._rsv_end
/*
* second extended file system inode data in memory
* third extended file system inode data in memory
*/
struct ext3_inode_info {
__le32 i_data[15]; /* unconverted */
......@@ -57,6 +75,9 @@ struct ext3_inode_info {
* allocation when we detect linearly ascending requests.
*/
__u32 i_next_alloc_goal;
/* block reservation window */
struct reserve_window_node i_rsv_window;
__u32 i_dir_start_lookup;
#ifdef CONFIG_EXT3_FS_XATTR
/*
......
......@@ -22,6 +22,7 @@
#include <linux/blockgroup_lock.h>
#include <linux/percpu_counter.h>
#endif
#include <linux/rbtree.h>
/*
* third extended-fs super-block data in memory
......@@ -59,6 +60,11 @@ struct ext3_sb_info {
struct percpu_counter s_dirs_counter;
struct blockgroup_lock s_blockgroup_lock;
/* root of the per fs reservation window tree */
spinlock_t s_rsv_window_lock;
struct rb_root s_rsv_window_root;
struct reserve_window_node s_rsv_window_head;
/* Journaling */
struct inode * s_journal_inode;
struct journal_s * s_journal;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment