Commit 2de8807b authored by Amir Goldstein, committed by Theodore Ts'o

ext4: synchronize ext4_mb_init_group() with buddy page lock

The old routines ext4_mb_[get|put]_buddy_cache_lock(), which used
to take grp->alloc_sem for all groups on the buddy page, have been
replaced with the routines ext4_mb_[get|put]_buddy_page_lock().

The new routines take both buddy and bitmap page locks to protect
against concurrent init of groups on the same buddy page.

The GROUP_NEED_INIT flag is tested again under page lock to check
if the group was initialized by another caller.
Signed-off-by: Amir Goldstein <amir73il@users.sf.net>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
parent e73a347b
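
As a reading aid before the diff itself, here is a small stand-alone user-space sketch of the synchronization pattern the commit message describes: the need-init state is checked once without the lock, then re-tested after taking the lock that covers the shared page, so only one of several concurrent callers performs the initialization. The sketch is illustrative only and is not ext4 code: the names group_desc, page_lock, need_init and init_group_buddy are invented here, a pthread mutex stands in for the buddy page lock, and an atomic flag stands in for the EXT4_MB_GRP_NEED_INIT group state bit.

/*
 * Illustrative user-space model of the pattern in this commit, NOT ext4
 * code: the "need init" flag is checked cheaply first, then re-tested
 * after taking the lock that covers the shared page, so concurrent
 * callers cannot both initialize the same group.  All names here are
 * stand-ins invented for the sketch.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct group_desc {
        pthread_mutex_t page_lock;      /* stand-in for the buddy page lock */
        atomic_bool need_init;          /* stand-in for EXT4_MB_GRP_NEED_INIT */
        int init_count;                 /* counts how often init really ran */
};

static void init_group_buddy(struct group_desc *grp)
{
        grp->init_count++;              /* the expensive init work goes here */
        atomic_store(&grp->need_init, false);
}

static void *init_group(void *arg)
{
        struct group_desc *grp = arg;

        if (!atomic_load(&grp->need_init))      /* cheap unlocked fast path */
                return NULL;

        pthread_mutex_lock(&grp->page_lock);
        if (atomic_load(&grp->need_init))       /* re-test under the lock */
                init_group_buddy(grp);
        pthread_mutex_unlock(&grp->page_lock);
        return NULL;
}

int main(void)
{
        struct group_desc grp = {
                .page_lock = PTHREAD_MUTEX_INITIALIZER,
                .need_init = true,
        };
        pthread_t t[4];

        for (int i = 0; i < 4; i++)
                pthread_create(&t[i], NULL, init_group, &grp);
        for (int i = 0; i < 4; i++)
                pthread_join(t[i], NULL);

        /* prints 1 no matter how the four threads interleave */
        printf("init ran %d time(s)\n", grp.init_count);
        return 0;
}

Built with something like cc -pthread, this prints "init ran 1 time(s)" for any interleaving; the patch below obtains the same guarantee by re-testing EXT4_MB_GRP_NEED_INIT only after ext4_mb_get_buddy_page_lock() has locked the bitmap page (and the buddy page, when it lives on a separate page).
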
@@ -957,22 +957,21 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 }
 
 /*
- * lock the group_info alloc_sem of all the groups
- * belonging to the same buddy cache page. This
- * make sure other parallel operation on the buddy
- * cache doesn't happen whild holding the buddy cache
- * lock
+ * Lock the buddy and bitmap pages. This make sure other parallel init_group
+ * on the same buddy page doesn't happen whild holding the buddy page lock.
+ * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap
+ * are on the same page e4b->bd_buddy_page is NULL and return value is 0.
  */
-static int ext4_mb_get_buddy_cache_lock(struct super_block *sb,
-                                        ext4_group_t group)
+static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
+                ext4_group_t group, struct ext4_buddy *e4b)
 {
-        int i;
-        int block, pnum;
+        struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
+        int block, pnum, poff;
         int blocks_per_page;
-        int groups_per_page;
-        ext4_group_t ngroups = ext4_get_groups_count(sb);
-        ext4_group_t first_group;
-        struct ext4_group_info *grp;
+        struct page *page;
+
+        e4b->bd_buddy_page = NULL;
+        e4b->bd_bitmap_page = NULL;
 
         blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
         /*
@@ -982,57 +981,40 @@ static int ext4_mb_get_buddy_cache_lock(struct super_block *sb,
          */
         block = group * 2;
         pnum = block / blocks_per_page;
-        first_group = pnum * blocks_per_page / 2;
-
-        groups_per_page = blocks_per_page >> 1;
-        if (groups_per_page == 0)
-                groups_per_page = 1;
-        /* read all groups the page covers into the cache */
-        for (i = 0; i < groups_per_page; i++) {
-
-                if ((first_group + i) >= ngroups)
-                        break;
-                grp = ext4_get_group_info(sb, first_group + i);
-                /* take all groups write allocation
-                 * semaphore. This make sure there is
-                 * no block allocation going on in any
-                 * of that groups
-                 */
-                down_write_nested(&grp->alloc_sem, i);
+        poff = block % blocks_per_page;
+        page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+        if (!page)
+                return -EIO;
+        BUG_ON(page->mapping != inode->i_mapping);
+        e4b->bd_bitmap_page = page;
+        e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
+
+        if (blocks_per_page >= 2) {
+                /* buddy and bitmap are on the same page */
+                return 0;
         }
-        return i;
+
+        block++;
+        pnum = block / blocks_per_page;
+        poff = block % blocks_per_page;
+        page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+        if (!page)
+                return -EIO;
+        BUG_ON(page->mapping != inode->i_mapping);
+        e4b->bd_buddy_page = page;
+        return 0;
 }
 
-static void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
-                                         ext4_group_t group, int locked_group)
+static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
 {
-        int i;
-        int block, pnum;
-        int blocks_per_page;
-        ext4_group_t first_group;
-        struct ext4_group_info *grp;
-
-        blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
-        /*
-         * the buddy cache inode stores the block bitmap
-         * and buddy information in consecutive blocks.
-         * So for each group we need two blocks.
-         */
-        block = group * 2;
-        pnum = block / blocks_per_page;
-        first_group = pnum * blocks_per_page / 2;
-        /* release locks on all the groups */
-        for (i = 0; i < locked_group; i++) {
-
-                grp = ext4_get_group_info(sb, first_group + i);
-                /* take all groups write allocation
-                 * semaphore. This make sure there is
-                 * no block allocation going on in any
-                 * of that groups
-                 */
-                up_write(&grp->alloc_sem);
+        if (e4b->bd_bitmap_page) {
+                unlock_page(e4b->bd_bitmap_page);
+                page_cache_release(e4b->bd_bitmap_page);
+        }
+        if (e4b->bd_buddy_page) {
+                unlock_page(e4b->bd_buddy_page);
+                page_cache_release(e4b->bd_buddy_page);
         }
-
 }
 
 /*
@@ -1044,93 +1026,60 @@ static noinline_for_stack
 int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
 {
-        int ret = 0;
-        void *bitmap;
-        int blocks_per_page;
-        int block, pnum, poff;
-        int num_grp_locked = 0;
         struct ext4_group_info *this_grp;
-        struct ext4_sb_info *sbi = EXT4_SB(sb);
-        struct inode *inode = sbi->s_buddy_cache;
-        struct page *page = NULL, *bitmap_page = NULL;
+        struct ext4_buddy e4b;
+        struct page *page;
+        int ret = 0;
 
         mb_debug(1, "init group %u\n", group);
-        blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
         this_grp = ext4_get_group_info(sb, group);
         /*
          * This ensures that we don't reinit the buddy cache
          * page which map to the group from which we are already
          * allocating. If we are looking at the buddy cache we would
          * have taken a reference using ext4_mb_load_buddy and that
-         * would have taken the alloc_sem lock.
+         * would have pinned buddy page to page cache.
          */
-        num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
-        if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
+        ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b);
+        if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
                 /*
                  * somebody initialized the group
                  * return without doing anything
                  */
-                ret = 0;
                 goto err;
         }
-        /*
-         * the buddy cache inode stores the block bitmap
-         * and buddy information in consecutive blocks.
-         * So for each group we need two blocks.
-         */
-        block = group * 2;
-        pnum = block / blocks_per_page;
-        poff = block % blocks_per_page;
-        page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
-        if (page) {
-                BUG_ON(page->mapping != inode->i_mapping);
-                ret = ext4_mb_init_cache(page, NULL);
-                if (ret) {
-                        unlock_page(page);
-                        goto err;
-                }
-                unlock_page(page);
-        }
-        if (page == NULL || !PageUptodate(page)) {
+
+        page = e4b.bd_bitmap_page;
+        ret = ext4_mb_init_cache(page, NULL);
+        if (ret)
+                goto err;
+        if (!PageUptodate(page)) {
                 ret = -EIO;
                 goto err;
         }
         mark_page_accessed(page);
-        bitmap_page = page;
-        bitmap = page_address(page) + (poff * sb->s_blocksize);
 
-        /* init buddy cache */
-        block++;
-        pnum = block / blocks_per_page;
-        poff = block % blocks_per_page;
-        page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
-        if (page == bitmap_page) {
+        if (e4b.bd_buddy_page == NULL) {
                 /*
                  * If both the bitmap and buddy are in
                  * the same page we don't need to force
                  * init the buddy
                  */
-                unlock_page(page);
-        } else if (page) {
-                BUG_ON(page->mapping != inode->i_mapping);
-                ret = ext4_mb_init_cache(page, bitmap);
-                if (ret) {
-                        unlock_page(page);
-                        goto err;
-                }
-                unlock_page(page);
+                ret = 0;
+                goto err;
         }
-        if (page == NULL || !PageUptodate(page)) {
+        /* init buddy cache */
+        page = e4b.bd_buddy_page;
+        ret = ext4_mb_init_cache(page, e4b.bd_bitmap);
+        if (ret)
+                goto err;
+        if (!PageUptodate(page)) {
                 ret = -EIO;
                 goto err;
         }
         mark_page_accessed(page);
 err:
-        ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
-        if (bitmap_page)
-                page_cache_release(bitmap_page);
-        if (page)
-                page_cache_release(page);
+        ext4_mb_put_buddy_page_lock(&e4b);
         return ret;
 }