Commit acb5f6f9 authored by Andrew Morton, committed by Arnaldo Carvalho de Melo

[PATCH] writeback tuning

Tune up the VM-based writeback a bit.

- Always use the multipage clustered-writeback function from within
  shrink_cache(), even if the page's mapping has a NULL ->vm_writeback().  So
  clustered writeback is turned on for all address_spaces, not just ext2.

  Subtle effect of this change: it is now the case that *all* writeback
  proceeds along the mapping->dirty_pages list.  The orderedness of the page
  LRUs no longer has an impact on disk scheduling.  So we only have one list
  to keep well-sorted rather than two, and churning pages around on the LRU
  will no longer damage write bandwidth - it's all up to the filesystem.
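
  In shrink_cache() terms, the dispatch becomes roughly the following (a
  condensed sketch of the shrink_cache() hunk further down; the locking and
  refcounting around the LRU scan are trimmed):

	int (*writeback)(struct page *, int *);
	int nr_to_write = SWAP_CLUSTER_MAX;	/* the new, smaller cluster */

	writeback = mapping->a_ops->vm_writeback;
	if (writeback == NULL)
		writeback = generic_vm_writeback;	/* used when ->vm_writeback is NULL */

	(*writeback)(page, &nr_to_write);	/* clustered: may write up to 32 pages
						 * from mapping->dirty_pages */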

- Decrease the clustered writeback from 1024 pages(!) to 32 pages.

  (1024 was a leftover from when this code was always dispatching writeback
  to a pdflush thread).

- Fix wakeup_bdflush() so that it actually does write something (duh).

  do_wp_page() needs to call balance_dirty_pages_ratelimited(), so we
  throttle mmap page-dirtiers in the same way as write(2) page-dirtiers.
  This may make wakeup_bdflush() obsolete, but it doesn't hurt.
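
  A minimal sketch of where that call sits (placement is illustrative; the
  real do_wp_page() has much more fault-handling context around it):

	/* the write fault has just produced a dirty page in a mapping */
	set_page_dirty(page);
	if (page->mapping)
		balance_dirty_pages_ratelimited(page->mapping);	/* same throttle as write(2) */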

- Convert generic_vm_writeback() to call ->writeback_mapping() directly,
  rather than going through writeback_single_inode().  This prevents memory
  allocators from blocking on the inode's I_LOCK.  But it does mean that two
  processes can be writing pages from the same mapping at the same time.  If
  filesystems care about this (for layout reasons) then they should serialise
  in their ->writeback_mapping a_op.

  This means that memory allocators will write back only pages, not pages
  and inodes.  There are no locks in that writeback path (except for request
  queue exhaustion).  This reduces memory allocation latency.
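
  The shape of that path is roughly the following (a simplified sketch - the
  real generic_vm_writeback() also keeps the inode's dirty state up to date;
  swap_vm_writeback() in the hunk below does the same thing for swap pages):

	static int generic_vm_writeback_sketch(struct page *page, int *nr_to_write)
	{
		struct address_space *mapping = page->mapping;

		unlock_page(page);	/* work against the mapping, not just this page */
		if (mapping->a_ops->writeback_mapping)
			return mapping->a_ops->writeback_mapping(mapping, nr_to_write);
		return generic_writeback_mapping(mapping, nr_to_write);
	}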

- Implement a new background_writeback() function, which, when kicked off,
  will perform writeback until dirty memory falls below the background
  threshold.
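
  The core of it is a loop of this shape (a sketch; nr_dirty_pages(),
  background_thresh_pages() and write_some_dirty_pages() are illustrative
  stand-ins, not the patch's exact interfaces):

	static void background_writeback_sketch(unsigned long unused)
	{
		while (nr_dirty_pages() > background_thresh_pages()) {
			int nr_to_write = 1024;		/* a modest batch per pass */

			write_some_dirty_pages(&nr_to_write);
			if (nr_to_write > 0)
				break;			/* nothing left to write */
		}
	}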

- Put written-back pages onto the remote end of the page LRU.  It
  does this in the slow-and-stupid way at present.  pagemap_lru_lock
  stress-relief is planned...

- Remove the funny writeback_unused_inodes() stuff from prune_icache().
  Writeback from wakeup_bdflush() and the `kupdate' function now just
  naturally cleanses the oldest inodes so we don't need to do anything
  there.

- Dirty memory balancing is still using magic numbers: "after you
  dirtied your 1,000th page, go write 1,500".  Obviously, this needs
  more work.
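
  Concretely, the current balancing amounts to something like this (a sketch
  of the magic numbers only; the real code is not restricted to writing the
  caller's own mapping, and takes more care with the counter):

	#define RATELIMIT_PAGES		1000	/* dirtyings allowed between balances */
	#define SYNC_WRITEBACK_PAGES	1500	/* pages the dirtier must then write */

	void balance_dirty_pages_ratelimited_sketch(struct address_space *mapping)
	{
		static int dirtied;

		if (++dirtied >= RATELIMIT_PAGES) {
			int nr_to_write = SYNC_WRITEBACK_PAGES;

			dirtied = 0;
			writeback_mapping(mapping, &nr_to_write);
		}
	}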
parent 17a74e88
@@ -2408,11 +2408,6 @@ asmlinkage long sys_bdflush(int func, long data)
 	return 0;
 }
 
-void wakeup_bdflush(void)
-{
-	pdflush_flush(0);
-}
-
 /*
  * Buffer-head allocation
  */
@@ -402,14 +402,6 @@ void prune_icache(int goal)
 	spin_unlock(&inode_lock);
 	dispose_list(freeable);
 
-	/*
-	 * If we didn't free enough clean inodes then schedule writeback of
-	 * the dirty inodes.  We cannot do it from here or we're either
-	 * synchronously dogslow or we deadlock with oom.
-	 */
-	if (goal)
-		pdflush_operation(try_to_writeback_unused_inodes, 0);
 }
 
 /*
@@ -46,17 +46,9 @@ static inline void wait_on_inode(struct inode *inode)
 /*
  * mm/page-writeback.c
  */
-/*
- * How much data to write out at a time in various places.  This isn't
- * really very important - it's just here to prevent any thread from
- * locking an inode for too long and blocking other threads which wish
- * to write the same file for allocation throttling purposes.
- */
-#define WRITEOUT_PAGES	((4096 * 1024) / PAGE_CACHE_SIZE)
-
 void balance_dirty_pages(struct address_space *mapping);
 void balance_dirty_pages_ratelimited(struct address_space *mapping);
-int pdflush_flush(unsigned long nr_pages);
 int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0);
+int writeback_mapping(struct address_space *mapping, int *nr_to_write);
 
 #endif		/* WRITEBACK_H */
@@ -453,9 +453,7 @@ EXPORT_SYMBOL(fail_writepage);
  */
 int filemap_fdatawrite(struct address_space *mapping)
 {
-	if (mapping->a_ops->writeback_mapping)
-		return mapping->a_ops->writeback_mapping(mapping, NULL);
-	return generic_writeback_mapping(mapping, NULL);
+	return writeback_mapping(mapping, NULL);
 }
 
 /**
@@ -31,7 +31,25 @@ static int swap_writepage(struct page *page)
 	return 0;
 }
 
+/*
+ * swapper_space doesn't have a real inode, so it gets a special vm_writeback()
+ * so we don't need swap special cases in generic_vm_writeback().
+ *
+ * FIXME: swap pages are locked, but not PageWriteback while under writeout.
+ * This will confuse throttling in shrink_cache().  It may be advantageous to
+ * set PG_writeback against swap pages while they're also locked.  Either that,
+ * or special-case swap pages in shrink_cache().
+ */
+static int swap_vm_writeback(struct page *page, int *nr_to_write)
+{
+	struct address_space *mapping = page->mapping;
+
+	unlock_page(page);
+	return generic_writeback_mapping(mapping, nr_to_write);
+}
+
 static struct address_space_operations swap_aops = {
+	vm_writeback:	swap_vm_writeback,
 	writepage:	swap_writepage,
 	sync_page:	block_sync_page,
 };
@@ -458,36 +458,21 @@ static int shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask,
 			 * pinned it and after the I/O to the page is finished,
 			 * so the direct writes to the page cannot get lost.
 			 */
-			struct address_space_operations *a_ops;
 			int (*writeback)(struct page *, int *);
-			int (*writepage)(struct page *);
+			const int nr_pages = SWAP_CLUSTER_MAX;
+			int nr_to_write = nr_pages;
 
-			/*
-			 * There's no guarantee that writeback() will actually
-			 * start I/O against *this* page.  Which is broken if we're
-			 * trying to free memory in a particular zone.  FIXME.
-			 */
-			a_ops = mapping->a_ops;
-			writeback = a_ops->vm_writeback;
-			writepage = a_ops->writepage;
-			if (writeback || writepage) {
-				SetPageLaunder(page);
-				page_cache_get(page);
-				spin_unlock(&pagemap_lru_lock);
+			writeback = mapping->a_ops->vm_writeback;
+			if (writeback == NULL)
+				writeback = generic_vm_writeback;
 
-				ClearPageDirty(page);
+			page_cache_get(page);
+			spin_unlock(&pagemap_lru_lock);
 
-				if (writeback) {
-					int nr_to_write = WRITEOUT_PAGES;
-
-					writeback(page, &nr_to_write);
-				} else {
-					writepage(page);
-				}
+			(*writeback)(page, &nr_to_write);
+			max_scan -= (nr_pages - nr_to_write);
 
-				page_cache_release(page);
-				spin_lock(&pagemap_lru_lock);
-				continue;
-			}
+			page_cache_release(page);
+			spin_lock(&pagemap_lru_lock);
+			continue;
 		}
 
 		/*
 		 * If the page has buffers, try to free the buffer mappings
@@ -648,6 +633,8 @@ static int shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask
 	if (nr_pages <= 0)
 		return 0;
 
+	wakeup_bdflush();
+
 	shrink_dcache_memory(priority, gfp_mask);
 
 	/* After shrinking the dcache, get rid of unused inodes too .. */