Commit 407ee6c8 authored by Andrew Morton's avatar Andrew Morton Committed by Linus Torvalds

[PATCH] low-latency page reclaim

Convert the VM to not wait on other people's dirty data.

 - If we find a dirty page and its queue is not congested, do some writeback.

 - If we find a dirty page and its queue _is_ congested then just
   refile the page.

 - If we find a PageWriteback page then just refile the page.

 - There is additional throttling for write(2) callers.  Within
   generic_file_write(), record their backing queue in ->current.
   Within page reclaim, if this task encounters a page which is dirty
   or under writeback on this queue, block on it.  This gives some more
   writer throttling and reduces the page refiling frequency.

It's somewhat CPU expensive - under really heavy load we only get a 50%
reclaim rate in pages coming off the tail of the LRU.  This can be
fixed by splitting the inactive list into reclaimable and
non-reclaimable lists.  But the CPU load isn't too bad, and latency is
much, much more important in these situations.

Example: with `mem=512m', running 4 instances of `dbench 100', 2.5.34
took 35 minutes to compile a kernel.  With this patch, it took three
minutes, 45 seconds.

I haven't done swapcache or MAP_SHARED pages yet.  If there's tons of
dirty swapcache or mmap data around we still stall heavily in page
reclaim.  That's less important.

This patch also has a tweak for swapless machines: don't even bother
bringing anon pages onto the inactive list if there is no swap online.
parent c9b22619
...@@ -28,7 +28,8 @@ ...@@ -28,7 +28,8 @@
#include <linux/smp_lock.h> #include <linux/smp_lock.h>
#include <linux/module.h> #include <linux/module.h>
#include <linux/suspend.h> #include <linux/suspend.h>
#include <linux/buffer_head.h> /* for fsync_bdev()/wakeup_bdflush() */ #include <linux/writeback.h>
#include <linux/buffer_head.h> /* for fsync_bdev() */
#include <linux/spinlock.h> #include <linux/spinlock.h>
...@@ -227,7 +228,7 @@ static void sysrq_handle_sync(int key, struct pt_regs *pt_regs, ...@@ -227,7 +228,7 @@ static void sysrq_handle_sync(int key, struct pt_regs *pt_regs,
struct tty_struct *tty) struct tty_struct *tty)
{ {
emergency_sync_scheduled = EMERG_SYNC; emergency_sync_scheduled = EMERG_SYNC;
wakeup_bdflush(); wakeup_bdflush(0);
} }
static struct sysrq_key_op sysrq_sync_op = { static struct sysrq_key_op sysrq_sync_op = {
handler: sysrq_handle_sync, handler: sysrq_handle_sync,
...@@ -239,7 +240,7 @@ static void sysrq_handle_mountro(int key, struct pt_regs *pt_regs, ...@@ -239,7 +240,7 @@ static void sysrq_handle_mountro(int key, struct pt_regs *pt_regs,
struct tty_struct *tty) struct tty_struct *tty)
{ {
emergency_sync_scheduled = EMERG_REMOUNT; emergency_sync_scheduled = EMERG_REMOUNT;
wakeup_bdflush(); wakeup_bdflush(0);
} }
static struct sysrq_key_op sysrq_mountro_op = { static struct sysrq_key_op sysrq_mountro_op = {
handler: sysrq_handle_mountro, handler: sysrq_handle_mountro,
......
...@@ -458,19 +458,17 @@ void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers) ...@@ -458,19 +458,17 @@ void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
} }
/* /*
* FIXME: What is this function actually trying to do? Why "zones[0]"? * Kick pdflush then try to free up some ZONE_NORMAL memory.
* Is it still correct/needed if/when blockdev mappings use GFP_HIGHUSER?
*/ */
static void free_more_memory(void) static void free_more_memory(void)
{ {
struct zone *zone; struct zone *zone;
zone = contig_page_data.node_zonelists[GFP_NOFS & GFP_ZONEMASK].zones[0]; zone = contig_page_data.node_zonelists[GFP_NOFS&GFP_ZONEMASK].zones[0];
wakeup_bdflush(1024);
wakeup_bdflush();
try_to_free_pages(zone, GFP_NOFS, 0);
blk_run_queues(); blk_run_queues();
yield(); yield();
try_to_free_pages(zone, GFP_NOFS, 0);
} }
/* /*
......
...@@ -163,7 +163,6 @@ struct buffer_head * __getblk(struct block_device *, sector_t, int); ...@@ -163,7 +163,6 @@ struct buffer_head * __getblk(struct block_device *, sector_t, int);
void __brelse(struct buffer_head *); void __brelse(struct buffer_head *);
void __bforget(struct buffer_head *); void __bforget(struct buffer_head *);
struct buffer_head *__bread(struct block_device *, sector_t block, int size); struct buffer_head *__bread(struct block_device *, sector_t block, int size);
void wakeup_bdflush(void);
struct buffer_head *alloc_buffer_head(void); struct buffer_head *alloc_buffer_head(void);
void free_buffer_head(struct buffer_head * bh); void free_buffer_head(struct buffer_head * bh);
void FASTCALL(unlock_buffer(struct buffer_head *bh)); void FASTCALL(unlock_buffer(struct buffer_head *bh));
......
...@@ -273,6 +273,7 @@ extern struct user_struct root_user; ...@@ -273,6 +273,7 @@ extern struct user_struct root_user;
#define INIT_USER (&root_user) #define INIT_USER (&root_user)
typedef struct prio_array prio_array_t; typedef struct prio_array prio_array_t;
struct backing_dev_info;
struct task_struct { struct task_struct {
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
...@@ -398,6 +399,7 @@ struct task_struct { ...@@ -398,6 +399,7 @@ struct task_struct {
/* journalling filesystem info */ /* journalling filesystem info */
void *journal_info; void *journal_info;
struct dentry *proc_dentry; struct dentry *proc_dentry;
struct backing_dev_info *backing_dev_info;
}; };
extern void __put_task_struct(struct task_struct *tsk); extern void __put_task_struct(struct task_struct *tsk);
......
...@@ -63,6 +63,8 @@ static inline void wait_on_inode(struct inode *inode) ...@@ -63,6 +63,8 @@ static inline void wait_on_inode(struct inode *inode)
/* /*
* mm/page-writeback.c * mm/page-writeback.c
*/ */
int wakeup_bdflush(long nr_pages);
/* These 5 are exported to sysctl. */ /* These 5 are exported to sysctl. */
extern int dirty_background_ratio; extern int dirty_background_ratio;
extern int dirty_async_ratio; extern int dirty_async_ratio;
......
...@@ -81,7 +81,6 @@ unsigned char software_suspend_enabled = 0; ...@@ -81,7 +81,6 @@ unsigned char software_suspend_enabled = 0;
#define TIMEOUT (6 * HZ) /* Timeout for stopping processes */ #define TIMEOUT (6 * HZ) /* Timeout for stopping processes */
#define ADDRESS(x) ((unsigned long) phys_to_virt(((x) << PAGE_SHIFT))) #define ADDRESS(x) ((unsigned long) phys_to_virt(((x) << PAGE_SHIFT)))
extern void wakeup_bdflush(void);
extern int C_A_D; extern int C_A_D;
/* References to section boundaries */ /* References to section boundaries */
......
...@@ -1755,6 +1755,9 @@ generic_file_write_nolock(struct file *file, const struct iovec *iov, ...@@ -1755,6 +1755,9 @@ generic_file_write_nolock(struct file *file, const struct iovec *iov,
if (unlikely(pos < 0)) if (unlikely(pos < 0))
return -EINVAL; return -EINVAL;
/* We can write back this queue in page reclaim */
current->backing_dev_info = mapping->backing_dev_info;
pagevec_init(&lru_pvec); pagevec_init(&lru_pvec);
if (unlikely(file->f_error)) { if (unlikely(file->f_error)) {
...@@ -1959,6 +1962,7 @@ generic_file_write_nolock(struct file *file, const struct iovec *iov, ...@@ -1959,6 +1962,7 @@ generic_file_write_nolock(struct file *file, const struct iovec *iov,
err = written ? written : status; err = written ? written : status;
out: out:
pagevec_lru_add(&lru_pvec); pagevec_lru_add(&lru_pvec);
current->backing_dev_info = 0;
return err; return err;
} }
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/module.h> #include <linux/module.h>
#include <linux/mempool.h> #include <linux/mempool.h>
#include <linux/buffer_head.h> /* for wakeup_bdflush() */ #include <linux/writeback.h>
static void add_element(mempool_t *pool, void *element) static void add_element(mempool_t *pool, void *element)
{ {
...@@ -210,7 +210,7 @@ void * mempool_alloc(mempool_t *pool, int gfp_mask) ...@@ -210,7 +210,7 @@ void * mempool_alloc(mempool_t *pool, int gfp_mask)
/* /*
* Kick the VM at this point. * Kick the VM at this point.
*/ */
wakeup_bdflush(); wakeup_bdflush(0);
spin_lock_irqsave(&pool->lock, flags); spin_lock_irqsave(&pool->lock, flags);
if (likely(pool->curr_nr)) { if (likely(pool->curr_nr)) {
......
...@@ -201,14 +201,19 @@ static void background_writeout(unsigned long _min_pages) ...@@ -201,14 +201,19 @@ static void background_writeout(unsigned long _min_pages)
} }
/* /*
* Start heavy writeback of everything. * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
* the whole world. Returns 0 if a pdflush thread was dispatched. Returns
* -1 if all pdflush threads were busy.
*/ */
void wakeup_bdflush(void) int wakeup_bdflush(long nr_pages)
{ {
if (nr_pages == 0) {
struct page_state ps; struct page_state ps;
get_page_state(&ps); get_page_state(&ps);
pdflush_operation(background_writeout, ps.nr_dirty); nr_pages = ps.nr_dirty;
}
return pdflush_operation(background_writeout, nr_pages);
} }
static struct timer_list wb_timer; static struct timer_list wb_timer;
......
...@@ -918,6 +918,26 @@ static int setup_swap_extents(struct swap_info_struct *sis) ...@@ -918,6 +918,26 @@ static int setup_swap_extents(struct swap_info_struct *sis)
return ret; return ret;
} }
#if 0 /* We don't need this yet */
#include <linux/backing-dev.h>
int page_queue_congested(struct page *page)
{
struct backing_dev_info *bdi;
BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */
bdi = page->mapping->backing_dev_info;
if (PageSwapCache(page)) {
swp_entry_t entry = { .val = page->index };
struct swap_info_struct *sis;
sis = get_swap_info_struct(swp_type(entry));
bdi = sis->bdev->bd_inode->i_mapping->backing_dev_info;
}
return bdi_write_congested(bdi);
}
#endif
asmlinkage long sys_swapoff(const char * specialfile) asmlinkage long sys_swapoff(const char * specialfile)
{ {
struct swap_info_struct * p = NULL; struct swap_info_struct * p = NULL;
......
...@@ -21,9 +21,11 @@ ...@@ -21,9 +21,11 @@
#include <linux/file.h> #include <linux/file.h>
#include <linux/writeback.h> #include <linux/writeback.h>
#include <linux/suspend.h> #include <linux/suspend.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h> /* for try_to_release_page() */ #include <linux/buffer_head.h> /* for try_to_release_page() */
#include <linux/mm_inline.h> #include <linux/mm_inline.h>
#include <linux/pagevec.h> #include <linux/pagevec.h>
#include <linux/backing-dev.h>
#include <linux/rmap-locking.h> #include <linux/rmap-locking.h>
#include <asm/pgalloc.h> #include <asm/pgalloc.h>
...@@ -32,11 +34,11 @@ ...@@ -32,11 +34,11 @@
/* /*
* The "priority" of VM scanning is how much of the queues we * The "priority" of VM scanning is how much of the queues we
* will scan in one go. A value of 6 for DEF_PRIORITY implies * will scan in one go. A value of 12 for DEF_PRIORITY implies
* that we'll scan 1/64th of the queues ("queue_length >> 6") * that we'll scan 1/4096th of the queues ("queue_length >> 12")
* during a normal aging round. * during a normal aging round.
*/ */
#define DEF_PRIORITY (6) #define DEF_PRIORITY 12
#ifdef ARCH_HAS_PREFETCH #ifdef ARCH_HAS_PREFETCH
#define prefetch_prev_lru_page(_page, _base, _field) \ #define prefetch_prev_lru_page(_page, _base, _field) \
...@@ -95,7 +97,7 @@ static inline int is_page_cache_freeable(struct page *page) ...@@ -95,7 +97,7 @@ static inline int is_page_cache_freeable(struct page *page)
static /* inline */ int static /* inline */ int
shrink_list(struct list_head *page_list, int nr_pages, shrink_list(struct list_head *page_list, int nr_pages,
unsigned int gfp_mask, int priority, int *max_scan) unsigned int gfp_mask, int *max_scan)
{ {
struct address_space *mapping; struct address_space *mapping;
LIST_HEAD(ret_pages); LIST_HEAD(ret_pages);
...@@ -117,10 +119,21 @@ shrink_list(struct list_head *page_list, int nr_pages, ...@@ -117,10 +119,21 @@ shrink_list(struct list_head *page_list, int nr_pages,
BUG_ON(PageActive(page)); BUG_ON(PageActive(page));
may_enter_fs = (gfp_mask & __GFP_FS) || may_enter_fs = (gfp_mask & __GFP_FS) ||
(PageSwapCache(page) && (gfp_mask & __GFP_IO)); (PageSwapCache(page) && (gfp_mask & __GFP_IO));
/*
* If the page is mapped into pagetables then wait on it, to
* throttle this allocator to the rate at which we can clear
* MAP_SHARED data. This will also throttle against swapcache
* writes.
*/
if (PageWriteback(page)) { if (PageWriteback(page)) {
if (may_enter_fs) if (may_enter_fs) {
wait_on_page_writeback(page); /* throttling */ if (page->pte.direct ||
else page->mapping->backing_dev_info ==
current->backing_dev_info) {
wait_on_page_writeback(page);
}
}
goto keep_locked; goto keep_locked;
} }
...@@ -172,15 +185,43 @@ shrink_list(struct list_head *page_list, int nr_pages, ...@@ -172,15 +185,43 @@ shrink_list(struct list_head *page_list, int nr_pages,
* will write it. So we're back to page-at-a-time writepage * will write it. So we're back to page-at-a-time writepage
* in LRU order. * in LRU order.
*/ */
if (PageDirty(page) && is_page_cache_freeable(page) && /*
mapping && may_enter_fs) { * If the page is dirty, only perform writeback if that write
* will be non-blocking. To prevent this allocation from being
* stalled by pagecache activity. But note that there may be
* stalls if we need to run get_block(). We could test
* PagePrivate for that.
*
* If this process is currently in generic_file_write() against
* this page's queue, we can perform writeback even if that
* will block.
*
* If the page is swapcache, write it back even if that would
* block, for some throttling. This happens by accident, because
* swap_backing_dev_info is bust: it doesn't reflect the
* congestion state of the swapdevs. Easy to fix, if needed.
* See swapfile.c:page_queue_congested().
*/
if (PageDirty(page)) {
int (*writeback)(struct page *, int (*writeback)(struct page *,
struct writeback_control *); struct writeback_control *);
struct backing_dev_info *bdi;
const int cluster_size = SWAP_CLUSTER_MAX; const int cluster_size = SWAP_CLUSTER_MAX;
struct writeback_control wbc = { struct writeback_control wbc = {
.nr_to_write = cluster_size, .nr_to_write = cluster_size,
}; };
if (!is_page_cache_freeable(page))
goto keep_locked;
if (!mapping)
goto keep_locked;
if (!may_enter_fs)
goto keep_locked;
bdi = mapping->backing_dev_info;
if (bdi != current->backing_dev_info &&
bdi_write_congested(bdi))
goto keep_locked;
writeback = mapping->a_ops->vm_writeback; writeback = mapping->a_ops->vm_writeback;
if (writeback == NULL) if (writeback == NULL)
writeback = generic_vm_writeback; writeback = generic_vm_writeback;
...@@ -279,7 +320,7 @@ shrink_list(struct list_head *page_list, int nr_pages, ...@@ -279,7 +320,7 @@ shrink_list(struct list_head *page_list, int nr_pages,
*/ */
static /* inline */ int static /* inline */ int
shrink_cache(int nr_pages, struct zone *zone, shrink_cache(int nr_pages, struct zone *zone,
unsigned int gfp_mask, int priority, int max_scan) unsigned int gfp_mask, int max_scan)
{ {
LIST_HEAD(page_list); LIST_HEAD(page_list);
struct pagevec pvec; struct pagevec pvec;
...@@ -298,9 +339,11 @@ shrink_cache(int nr_pages, struct zone *zone, ...@@ -298,9 +339,11 @@ shrink_cache(int nr_pages, struct zone *zone,
spin_lock_irq(&zone->lru_lock); spin_lock_irq(&zone->lru_lock);
while (max_scan > 0 && nr_pages > 0) { while (max_scan > 0 && nr_pages > 0) {
struct page *page; struct page *page;
int n = 0; int nr_taken = 0;
int nr_scan = 0;
while (n < nr_to_process && !list_empty(&zone->inactive_list)) { while (nr_scan++ < nr_to_process &&
!list_empty(&zone->inactive_list)) {
page = list_entry(zone->inactive_list.prev, page = list_entry(zone->inactive_list.prev,
struct page, lru); struct page, lru);
...@@ -318,18 +361,17 @@ shrink_cache(int nr_pages, struct zone *zone, ...@@ -318,18 +361,17 @@ shrink_cache(int nr_pages, struct zone *zone,
} }
list_add(&page->lru, &page_list); list_add(&page->lru, &page_list);
page_cache_get(page); page_cache_get(page);
n++; nr_taken++;
} }
zone->nr_inactive -= n; zone->nr_inactive -= nr_taken;
spin_unlock_irq(&zone->lru_lock); spin_unlock_irq(&zone->lru_lock);
if (list_empty(&page_list)) if (nr_taken == 0)
goto done; goto done;
max_scan -= n; max_scan -= nr_scan;
KERNEL_STAT_ADD(pgscan, n); KERNEL_STAT_ADD(pgscan, nr_scan);
nr_pages = shrink_list(&page_list, nr_pages, nr_pages = shrink_list(&page_list,nr_pages,gfp_mask,&max_scan);
gfp_mask, priority, &max_scan);
if (nr_pages <= 0 && list_empty(&page_list)) if (nr_pages <= 0 && list_empty(&page_list))
goto done; goto done;
...@@ -420,6 +462,15 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in) ...@@ -420,6 +462,15 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in)
} }
pte_chain_unlock(page); pte_chain_unlock(page);
} }
/*
* FIXME: need to consider page_count(page) here if/when we
* reap orphaned pages via the LRU (Daniel's locking stuff)
*/
if (total_swap_pages == 0 && !page->mapping &&
!PagePrivate(page)) {
list_add(&page->lru, &l_active);
continue;
}
list_add(&page->lru, &l_inactive); list_add(&page->lru, &l_inactive);
pgdeactivate++; pgdeactivate++;
} }
...@@ -470,11 +521,10 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in) ...@@ -470,11 +521,10 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in)
} }
static /* inline */ int static /* inline */ int
shrink_zone(struct zone *zone, int priority, shrink_zone(struct zone *zone, int max_scan,
unsigned int gfp_mask, int nr_pages) unsigned int gfp_mask, int nr_pages)
{ {
unsigned long ratio; unsigned long ratio;
int max_scan;
/* This is bogus for ZONE_HIGHMEM? */ /* This is bogus for ZONE_HIGHMEM? */
if (kmem_cache_reap(gfp_mask) >= nr_pages) if (kmem_cache_reap(gfp_mask) >= nr_pages)
...@@ -497,43 +547,50 @@ shrink_zone(struct zone *zone, int priority, ...@@ -497,43 +547,50 @@ shrink_zone(struct zone *zone, int priority,
atomic_sub(SWAP_CLUSTER_MAX, &zone->refill_counter); atomic_sub(SWAP_CLUSTER_MAX, &zone->refill_counter);
refill_inactive_zone(zone, SWAP_CLUSTER_MAX); refill_inactive_zone(zone, SWAP_CLUSTER_MAX);
} }
nr_pages = shrink_cache(nr_pages, zone, gfp_mask, max_scan);
max_scan = zone->nr_inactive / priority;
nr_pages = shrink_cache(nr_pages, zone,
gfp_mask, priority, max_scan);
if (nr_pages <= 0)
return 0;
wakeup_bdflush();
shrink_dcache_memory(priority, gfp_mask);
/* After shrinking the dcache, get rid of unused inodes too .. */
shrink_icache_memory(1, gfp_mask);
#ifdef CONFIG_QUOTA
shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
#endif
return nr_pages; return nr_pages;
} }
static int static int
shrink_caches(struct zone *classzone, int priority, shrink_caches(struct zone *classzone, int priority,
int gfp_mask, int nr_pages) int *total_scanned, int gfp_mask, int nr_pages)
{ {
struct zone *first_classzone; struct zone *first_classzone;
struct zone *zone; struct zone *zone;
first_classzone = classzone->zone_pgdat->node_zones; first_classzone = classzone->zone_pgdat->node_zones;
zone = classzone; for (zone = classzone; zone >= first_classzone; zone--) {
while (zone >= first_classzone && nr_pages > 0) { int max_scan;
if (zone->free_pages <= zone->pages_high) { int to_reclaim;
nr_pages = shrink_zone(zone, priority, int unreclaimed;
gfp_mask, nr_pages);
} to_reclaim = zone->pages_high - zone->free_pages;
zone--; if (to_reclaim < 0)
continue; /* zone has enough memory */
if (to_reclaim > SWAP_CLUSTER_MAX)
to_reclaim = SWAP_CLUSTER_MAX;
if (to_reclaim < nr_pages)
to_reclaim = nr_pages;
/*
* If we cannot reclaim `nr_pages' pages by scanning twice
* that many pages then fall back to the next zone.
*/
max_scan = zone->nr_inactive >> priority;
if (max_scan < to_reclaim * 2)
max_scan = to_reclaim * 2;
unreclaimed = shrink_zone(zone, max_scan, gfp_mask, to_reclaim);
nr_pages -= to_reclaim - unreclaimed;
*total_scanned += max_scan;
} }
shrink_dcache_memory(priority, gfp_mask);
shrink_icache_memory(1, gfp_mask);
#ifdef CONFIG_QUOTA
shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
#endif
return nr_pages; return nr_pages;
} }
...@@ -564,12 +621,25 @@ try_to_free_pages(struct zone *classzone, ...@@ -564,12 +621,25 @@ try_to_free_pages(struct zone *classzone,
KERNEL_STAT_INC(pageoutrun); KERNEL_STAT_INC(pageoutrun);
for (priority = DEF_PRIORITY; priority; priority--) { for (priority = DEF_PRIORITY; priority; priority--) {
nr_pages = shrink_caches(classzone, priority, int total_scanned = 0;
nr_pages = shrink_caches(classzone, priority, &total_scanned,
gfp_mask, nr_pages); gfp_mask, nr_pages);
if (nr_pages <= 0) if (nr_pages <= 0)
return 1; return 1;
if (total_scanned == 0)
return 1; /* All zones had enough free memory */
if (!(gfp_mask & __GFP_FS)) if (!(gfp_mask & __GFP_FS))
break; break; /* Let the caller handle it */
/*
* Try to write back as many pages as we just scanned. Not
* sure if that makes sense, but it's an attempt to avoid
* creating IO storms unnecessarily
*/
wakeup_bdflush(total_scanned);
/* Take a nap, wait for some writeback to complete */
blk_congestion_wait(WRITE, HZ/4);
} }
if (gfp_mask & __GFP_FS) if (gfp_mask & __GFP_FS)
out_of_memory(); out_of_memory();
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment