Commit 22208ded, authored by Aneesh Kumar K.V, committed by Theodore Ts'o

ext4: Fix file fragmentation during large file write.

The range_cyclic writeback mode uses the address_space writeback_index
as the start index for writeback.  With delayed allocation we were
updating writeback_index incorrectly, resulting in highly fragmented
files.  This patch reduces the number of extents from 4000 to 27 for a
3GB file.
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
parent 17bc6c30
...@@ -1648,6 +1648,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) ...@@ -1648,6 +1648,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
int ret = 0, err, nr_pages, i; int ret = 0, err, nr_pages, i;
unsigned long index, end; unsigned long index, end;
struct pagevec pvec; struct pagevec pvec;
long pages_skipped;
BUG_ON(mpd->next_page <= mpd->first_page); BUG_ON(mpd->next_page <= mpd->first_page);
pagevec_init(&pvec, 0); pagevec_init(&pvec, 0);
...@@ -1655,7 +1656,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) ...@@ -1655,7 +1656,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
end = mpd->next_page - 1; end = mpd->next_page - 1;
while (index <= end) { while (index <= end) {
/* XXX: optimize tail */
/* /*
* We can use PAGECACHE_TAG_DIRTY lookup here because * We can use PAGECACHE_TAG_DIRTY lookup here because
* even though we have cleared the dirty flag on the page * even though we have cleared the dirty flag on the page
...@@ -1673,8 +1673,13 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) ...@@ -1673,8 +1673,13 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
for (i = 0; i < nr_pages; i++) { for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i]; struct page *page = pvec.pages[i];
pages_skipped = mpd->wbc->pages_skipped;
err = mapping->a_ops->writepage(page, mpd->wbc); err = mapping->a_ops->writepage(page, mpd->wbc);
if (!err) if (!err && (pages_skipped == mpd->wbc->pages_skipped))
/*
* have successfully written the page
* without skipping the same
*/
mpd->pages_written++; mpd->pages_written++;
/* /*
* In error case, we have to continue because * In error case, we have to continue because
...@@ -2110,7 +2115,6 @@ static int mpage_da_writepages(struct address_space *mapping, ...@@ -2110,7 +2115,6 @@ static int mpage_da_writepages(struct address_space *mapping,
struct writeback_control *wbc, struct writeback_control *wbc,
struct mpage_da_data *mpd) struct mpage_da_data *mpd)
{ {
long to_write;
int ret; int ret;
if (!mpd->get_block) if (!mpd->get_block)
...@@ -2125,19 +2129,18 @@ static int mpage_da_writepages(struct address_space *mapping, ...@@ -2125,19 +2129,18 @@ static int mpage_da_writepages(struct address_space *mapping,
mpd->pages_written = 0; mpd->pages_written = 0;
mpd->retval = 0; mpd->retval = 0;
to_write = wbc->nr_to_write;
ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd); ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
/* /*
* Handle last extent of pages * Handle last extent of pages
*/ */
if (!mpd->io_done && mpd->next_page != mpd->first_page) { if (!mpd->io_done && mpd->next_page != mpd->first_page) {
if (mpage_da_map_blocks(mpd) == 0) if (mpage_da_map_blocks(mpd) == 0)
mpage_da_submit_io(mpd); mpage_da_submit_io(mpd);
}
wbc->nr_to_write = to_write - mpd->pages_written; mpd->io_done = 1;
ret = MPAGE_DA_EXTENT_TAIL;
}
wbc->nr_to_write -= mpd->pages_written;
return ret; return ret;
} }
...@@ -2366,11 +2369,14 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode) ...@@ -2366,11 +2369,14 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
static int ext4_da_writepages(struct address_space *mapping, static int ext4_da_writepages(struct address_space *mapping,
struct writeback_control *wbc) struct writeback_control *wbc)
{ {
pgoff_t index;
int range_whole = 0;
handle_t *handle = NULL; handle_t *handle = NULL;
struct mpage_da_data mpd; struct mpage_da_data mpd;
struct inode *inode = mapping->host; struct inode *inode = mapping->host;
int no_nrwrite_index_update;
long pages_written = 0, pages_skipped;
int needed_blocks, ret = 0, nr_to_writebump = 0; int needed_blocks, ret = 0, nr_to_writebump = 0;
long to_write, pages_skipped = 0;
struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
/* /*
...@@ -2390,16 +2396,26 @@ static int ext4_da_writepages(struct address_space *mapping, ...@@ -2390,16 +2396,26 @@ static int ext4_da_writepages(struct address_space *mapping,
nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write; nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
wbc->nr_to_write = sbi->s_mb_stream_request; wbc->nr_to_write = sbi->s_mb_stream_request;
} }
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
if (wbc->range_cyclic)
pages_skipped = wbc->pages_skipped; index = mapping->writeback_index;
else
index = wbc->range_start >> PAGE_CACHE_SHIFT;
mpd.wbc = wbc; mpd.wbc = wbc;
mpd.inode = mapping->host; mpd.inode = mapping->host;
restart_loop: /*
to_write = wbc->nr_to_write; * we don't want write_cache_pages to update
while (!ret && to_write > 0) { * nr_to_write and writeback_index
*/
no_nrwrite_index_update = wbc->no_nrwrite_index_update;
wbc->no_nrwrite_index_update = 1;
pages_skipped = wbc->pages_skipped;
while (!ret && wbc->nr_to_write > 0) {
/* /*
* we insert one extent at a time. So we need * we insert one extent at a time. So we need
...@@ -2420,46 +2436,53 @@ static int ext4_da_writepages(struct address_space *mapping, ...@@ -2420,46 +2436,53 @@ static int ext4_da_writepages(struct address_space *mapping,
dump_stack(); dump_stack();
goto out_writepages; goto out_writepages;
} }
to_write -= wbc->nr_to_write;
mpd.get_block = ext4_da_get_block_write; mpd.get_block = ext4_da_get_block_write;
ret = mpage_da_writepages(mapping, wbc, &mpd); ret = mpage_da_writepages(mapping, wbc, &mpd);
ext4_journal_stop(handle); ext4_journal_stop(handle);
if (mpd.retval == -ENOSPC) if (mpd.retval == -ENOSPC) {
/* commit the transaction which would
* free blocks released in the transaction
* and try again
*/
jbd2_journal_force_commit_nested(sbi->s_journal); jbd2_journal_force_commit_nested(sbi->s_journal);
wbc->pages_skipped = pages_skipped;
/* reset the retry count */ ret = 0;
if (ret == MPAGE_DA_EXTENT_TAIL) { } else if (ret == MPAGE_DA_EXTENT_TAIL) {
/* /*
* got one extent now try with * got one extent now try with
* rest of the pages * rest of the pages
*/ */
to_write += wbc->nr_to_write; pages_written += mpd.pages_written;
wbc->pages_skipped = pages_skipped;
ret = 0; ret = 0;
} else if (wbc->nr_to_write) { } else if (wbc->nr_to_write)
/* /*
* There is no more writeout needed * There is no more writeout needed
* or we requested for a noblocking writeout * or we requested for a noblocking writeout
* and we found the device congested * and we found the device congested
*/ */
to_write += wbc->nr_to_write;
break; break;
}
wbc->nr_to_write = to_write;
}
if (!wbc->range_cyclic && (pages_skipped != wbc->pages_skipped)) {
/* We skipped pages in this loop */
wbc->nr_to_write = to_write +
wbc->pages_skipped - pages_skipped;
wbc->pages_skipped = pages_skipped;
goto restart_loop;
} }
if (pages_skipped != wbc->pages_skipped)
printk(KERN_EMERG "This should not happen leaving %s "
"with nr_to_write = %ld ret = %d\n",
__func__, wbc->nr_to_write, ret);
/* Update index */
index += pages_written;
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
/*
* set the writeback_index so that range_cyclic
* mode will write it back later
*/
mapping->writeback_index = index;
out_writepages: out_writepages:
wbc->nr_to_write = to_write - nr_to_writebump; if (!no_nrwrite_index_update)
wbc->no_nrwrite_index_update = 0;
wbc->nr_to_write -= nr_to_writebump;
return ret; return ret;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment