Commit dddbd6ac authored by Jan Kara, committed by Theodore Ts'o

ext4: avoid unnecessary transaction stalls during writeback

Currently ext4_writepages() submits all pages with a transaction started.
When no page needs block allocation or extent conversion, we can submit
all dirty pages in the inode while holding a single transaction handle,
and when the device is congested this can take a significant amount of
time. Thus ext4_writepages() can block transaction commits for extended
periods of time.

Take for example a simple benchmark simulating PostgreSQL database
(pgioperf in mmtest). The benchmark runs 16 processes doing random reads
from a huge file, one process doing random writes to the huge file, and
one process doing sequential writes to small files and frequently
running fsync. With an unpatched kernel, transaction commits take on
average ~18s with a standard deviation of ~41s; the top 5 commit times are:

274.466639s, 126.467347s, 86.992429s, 34.351563s, 31.517653s.

After this patch transaction commits take on average 0.1s with standard
deviation of 0.15s, top 5 commit times are:

0.563792s, 0.519980s, 0.509841s, 0.471700s, 0.469899s

[ Modified so we use an explicit do_map flag instead of relying on
  io_end not being allocated, since io_end->inode is needed for I/O
  error handling. -- tytso ]
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
parent 85c8f176
...@@ -1643,6 +1643,7 @@ struct mpage_da_data { ...@@ -1643,6 +1643,7 @@ struct mpage_da_data {
*/ */
struct ext4_map_blocks map; struct ext4_map_blocks map;
struct ext4_io_submit io_submit; /* IO submission data */ struct ext4_io_submit io_submit; /* IO submission data */
unsigned int do_map:1;
}; };
static void mpage_release_unused_pages(struct mpage_da_data *mpd, static void mpage_release_unused_pages(struct mpage_da_data *mpd,
...@@ -2179,6 +2180,9 @@ static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk, ...@@ -2179,6 +2180,9 @@ static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
/* First block in the extent? */ /* First block in the extent? */
if (map->m_len == 0) { if (map->m_len == 0) {
/* We cannot map unless handle is started... */
if (!mpd->do_map)
return false;
map->m_lblk = lblk; map->m_lblk = lblk;
map->m_len = 1; map->m_len = 1;
map->m_flags = bh->b_state & BH_FLAGS; map->m_flags = bh->b_state & BH_FLAGS;
...@@ -2231,6 +2235,9 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd, ...@@ -2231,6 +2235,9 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd,
/* Found extent to map? */ /* Found extent to map? */
if (mpd->map.m_len) if (mpd->map.m_len)
return 0; return 0;
/* Buffer needs mapping and handle is not started? */
if (!mpd->do_map)
return 0;
/* Everything mapped so far and we hit EOF */ /* Everything mapped so far and we hit EOF */
break; break;
} }
...@@ -2747,6 +2754,29 @@ static int ext4_writepages(struct address_space *mapping, ...@@ -2747,6 +2754,29 @@ static int ext4_writepages(struct address_space *mapping,
tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page); tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page);
done = false; done = false;
blk_start_plug(&plug); blk_start_plug(&plug);
/*
* First writeback pages that don't need mapping - we can avoid
* starting a transaction unnecessarily and also avoid being blocked
* in the block layer on device congestion while having transaction
* started.
*/
mpd.do_map = 0;
mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
if (!mpd.io_submit.io_end) {
ret = -ENOMEM;
goto unplug;
}
ret = mpage_prepare_extent_to_map(&mpd);
/* Submit prepared bio */
ext4_io_submit(&mpd.io_submit);
ext4_put_io_end_defer(mpd.io_submit.io_end);
mpd.io_submit.io_end = NULL;
/* Unlock pages we didn't use */
mpage_release_unused_pages(&mpd, false);
if (ret < 0)
goto unplug;
while (!done && mpd.first_page <= mpd.last_page) { while (!done && mpd.first_page <= mpd.last_page) {
/* For each extent of pages we use new io_end */ /* For each extent of pages we use new io_end */
mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
...@@ -2775,8 +2805,10 @@ static int ext4_writepages(struct address_space *mapping, ...@@ -2775,8 +2805,10 @@ static int ext4_writepages(struct address_space *mapping,
wbc->nr_to_write, inode->i_ino, ret); wbc->nr_to_write, inode->i_ino, ret);
/* Release allocated io_end */ /* Release allocated io_end */
ext4_put_io_end(mpd.io_submit.io_end); ext4_put_io_end(mpd.io_submit.io_end);
mpd.io_submit.io_end = NULL;
break; break;
} }
mpd.do_map = 1;
trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc); trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc);
ret = mpage_prepare_extent_to_map(&mpd); ret = mpage_prepare_extent_to_map(&mpd);
...@@ -2807,6 +2839,7 @@ static int ext4_writepages(struct address_space *mapping, ...@@ -2807,6 +2839,7 @@ static int ext4_writepages(struct address_space *mapping,
if (!ext4_handle_valid(handle) || handle->h_sync == 0) { if (!ext4_handle_valid(handle) || handle->h_sync == 0) {
ext4_journal_stop(handle); ext4_journal_stop(handle);
handle = NULL; handle = NULL;
mpd.do_map = 0;
} }
/* Submit prepared bio */ /* Submit prepared bio */
ext4_io_submit(&mpd.io_submit); ext4_io_submit(&mpd.io_submit);
...@@ -2824,6 +2857,7 @@ static int ext4_writepages(struct address_space *mapping, ...@@ -2824,6 +2857,7 @@ static int ext4_writepages(struct address_space *mapping,
ext4_journal_stop(handle); ext4_journal_stop(handle);
} else } else
ext4_put_io_end(mpd.io_submit.io_end); ext4_put_io_end(mpd.io_submit.io_end);
mpd.io_submit.io_end = NULL;
if (ret == -ENOSPC && sbi->s_journal) { if (ret == -ENOSPC && sbi->s_journal) {
/* /*
...@@ -2839,6 +2873,7 @@ static int ext4_writepages(struct address_space *mapping, ...@@ -2839,6 +2873,7 @@ static int ext4_writepages(struct address_space *mapping,
if (ret) if (ret)
break; break;
} }
unplug:
blk_finish_plug(&plug); blk_finish_plug(&plug);
if (!ret && !cycled && wbc->nr_to_write > 0) { if (!ret && !cycled && wbc->nr_to_write > 0) {
cycled = 1; cycled = 1;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment