Commit 8f7a1404 authored by Andrew Morton, committed by Linus Torvalds

[PATCH] reduced and tunable swappiness

/proc/sys/vm/swappiness controls the VM's tendency to unmap pages and to
swap things out.

100 -> basically current 2.5 behaviour
0 -> not very swappy at all
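
For illustration only (this helper is not part of the patch), the knob can be
driven from userspace by writing a value in the 0..100 range to the procfs
file; a minimal, hypothetical C sketch:

#include <stdio.h>

/* Hypothetical helper: write a swappiness value (0..100) to the sysctl file. */
static int set_swappiness(int value)
{
        FILE *f = fopen("/proc/sys/vm/swappiness", "w");

        if (f == NULL)
                return -1;
        fprintf(f, "%d\n", value);      /* e.g. 0 -> not very swappy at all */
        return fclose(f);
}

int main(void)
{
        return set_swappiness(60) ? 1 : 0;      /* 60 is the in-kernel default */
}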

The mechanism used to control swappiness is to be reluctant to bring
mapped pages onto the inactive list, preferring to reclaim pagecache
instead.

The control for that mechanism is as follows:

- If there is a large amount of mapped memory in the machine, we
  prefer to bring mapped pages onto the inactive list.

- If page reclaim is under distress (more scanning is happening), then
  prefer to bring mapped pages onto the inactive list.  This is
  basically the 2.4 algorithm.

- If the /proc/sys/vm/swappiness control is high then prefer to bring
  mapped pages onto the inactive list.

The implementation is simple: calculate the above three things as
percentages and add them up.  If the total reaches 100% then start
reclaiming mapped pages.

The `proportion of mapped memory' is downgraded so that we don't swap
just because a lot of memory is mapped into pagetables - we still need
some VM distress before starting to swap that memory out.
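
As a standalone sketch (the helper name and parameters here are illustrative;
the real calculation lives in refill_inactive_zone() in the diff below), the
decision boils down to:

/*
 * Illustrative only - mirrors the calculation added to refill_inactive_zone().
 * nr_mapped and total_memory are in pages; priority runs from DEF_PRIORITY
 * (12) down towards 0 as reclaim gets into more trouble.
 */
static int should_reclaim_mapped(long nr_mapped, long total_memory,
                                 int priority, int swappiness)
{
        long distress = 100 >> priority;        /* 0: no trouble, 100: great trouble */
        long mapped_ratio = nr_mapped * 100 / total_memory;

        /* mapped memory counts at half weight; distress and swappiness in full */
        return mapped_ratio / 2 + distress + swappiness >= 100;
}

With the default vm_swappiness of 60 and no distress, mapped pages become
eligible once about 80% of memory is mapped; at vm_swappiness 100 they are
always eligible.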

For a while I was adding a little bias so that we prefer to unmap
file-backed memory before swapping out anon memory, because file-backed
memory can usually be evicted and re-established with one I/O rather
than two.  But it was unmapping executable text too easily, so here I
just treat them equally.
parent 5bbac23e
@@ -164,6 +164,7 @@ extern void swap_setup(void);
 /* linux/mm/vmscan.c */
 extern int try_to_free_pages(struct zone *, unsigned int, unsigned int);
 int shrink_all_memory(int nr_pages);
+extern int vm_swappiness;
 
 /* linux/mm/page_io.c */
 int swap_readpage(struct file *file, struct page *page);
...
@@ -153,6 +153,7 @@ enum
	VM_OVERCOMMIT_RATIO=16, /* percent of RAM to allow overcommit in */
	VM_PAGEBUF=17,          /* struct: Control pagebuf parameters */
	VM_HUGETLB_PAGES=18,    /* int: Number of available Huge Pages */
+	VM_SWAPPINESS=19,       /* Tendency to steal mapped memory */
 };
...
@@ -311,6 +311,9 @@ static ctl_table vm_table[] = {
	{ VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads",
	  &nr_pdflush_threads, sizeof nr_pdflush_threads,
	  0444 /* read-only*/, NULL, &proc_dointvec},
+	{VM_SWAPPINESS, "swappiness", &vm_swappiness, sizeof(vm_swappiness),
+	 0644, NULL, &proc_dointvec_minmax, &sysctl_intvec, NULL, &zero,
+	 &one_hundred },
 #ifdef CONFIG_HUGETLB_PAGE
	{VM_HUGETLB_PAGES, "nr_hugepages", &htlbpage_max, sizeof(int), 0644, NULL,
	 &proc_dointvec},
...
@@ -35,13 +35,18 @@
 #include <linux/swapops.h>
 
 /*
- * The "priority" of VM scanning is how much of the queues we
- * will scan in one go. A value of 12 for DEF_PRIORITY implies
- * that we'll scan 1/4096th of the queues ("queue_length >> 12")
- * during a normal aging round.
+ * The "priority" of VM scanning is how much of the queues we will scan in one
+ * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
+ * queues ("queue_length >> 12") during an aging round.
  */
 #define DEF_PRIORITY 12
 
+/*
+ * From 0 .. 100.  Higher means more swappy.
+ */
+int vm_swappiness = 60;
+static long total_memory;
+
 #ifdef ARCH_HAS_PREFETCH
 #define prefetch_prev_lru_page(_page, _base, _field) \
	do { \
@@ -101,7 +106,6 @@ static inline int is_page_cache_freeable(struct page *page)
	return page_count(page) - !!PagePrivate(page) == 2;
 }
 
-
 /*
  * shrink_list returns the number of reclaimed pages
  */
@@ -439,7 +443,8 @@ shrink_cache(const int nr_pages, struct zone *zone,
  * But we had to alter page->flags anyway.
  */
 static /* inline */ void
-refill_inactive_zone(struct zone *zone, const int nr_pages_in)
+refill_inactive_zone(struct zone *zone, const int nr_pages_in,
+			struct page_state *ps, int priority)
 {
	int pgdeactivate = 0;
	int nr_pages = nr_pages_in;
@@ -448,6 +453,10 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in)
	LIST_HEAD(l_active);	/* Pages to go onto the active_list */
	struct page *page;
	struct pagevec pvec;
+	int reclaim_mapped = 0;
+	long mapped_ratio;
+	long distress;
+	long swap_tendency;
 
	lru_add_drain();
	spin_lock_irq(&zone->lru_lock);
@@ -469,6 +478,37 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in)
	}
	spin_unlock_irq(&zone->lru_lock);
 
+	/*
+	 * `distress' is a measure of how much trouble we're having reclaiming
+	 * pages.  0 -> no problems.  100 -> great trouble.
+	 */
+	distress = 100 >> priority;
+
+	/*
+	 * The point of this algorithm is to decide when to start reclaiming
+	 * mapped memory instead of just pagecache.  Work out how much memory
+	 * is mapped.
+	 */
+	mapped_ratio = (ps->nr_mapped * 100) / total_memory;
+
+	/*
+	 * Now decide how much we really want to unmap some pages.  The mapped
+	 * ratio is downgraded - just because there's a lot of mapped memory
+	 * doesn't necessarily mean that page reclaim isn't succeeding.
+	 *
+	 * The distress ratio is important - we don't want to start going oom.
+	 *
+	 * A 100% value of vm_swappiness overrides this algorithm altogether.
+	 */
+	swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
+
+	/*
+	 * Now use this metric to decide whether to start moving mapped memory
+	 * onto the inactive list.
+	 */
+	if (swap_tendency >= 100)
+		reclaim_mapped = 1;
+
	while (!list_empty(&l_hold)) {
		page = list_entry(l_hold.prev, struct page, lru);
		list_del(&page->lru);
@@ -480,6 +520,10 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in)
				continue;
			}
			pte_chain_unlock(page);
+			if (!reclaim_mapped) {
+				list_add(&page->lru, &l_active);
+				continue;
+			}
		}
		/*
		 * FIXME: need to consider page_count(page) here if/when we
@@ -546,7 +590,7 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in)
  */
 static /* inline */ int
 shrink_zone(struct zone *zone, int max_scan, unsigned int gfp_mask,
-		const int nr_pages, int *nr_mapped)
+		const int nr_pages, int *nr_mapped, struct page_state *ps, int priority)
 {
	unsigned long ratio;
@@ -563,11 +607,23 @@ shrink_zone(struct zone *zone, int max_scan, unsigned int gfp_mask,
	ratio = (unsigned long)nr_pages * zone->nr_active /
				((zone->nr_inactive | 1) * 2);
	atomic_add(ratio+1, &zone->refill_counter);
-	while (atomic_read(&zone->refill_counter) > SWAP_CLUSTER_MAX) {
-		atomic_sub(SWAP_CLUSTER_MAX, &zone->refill_counter);
-		refill_inactive_zone(zone, SWAP_CLUSTER_MAX);
+	if (atomic_read(&zone->refill_counter) > SWAP_CLUSTER_MAX) {
+		int count;
+
+		/*
+		 * Don't try to bring down too many pages in one attempt.
+		 * If this fails, the caller will increase `priority' and
+		 * we'll try again, with an increased chance of reclaiming
+		 * mapped memory.
+		 */
+		count = atomic_read(&zone->refill_counter);
+		if (count > SWAP_CLUSTER_MAX * 4)
+			count = SWAP_CLUSTER_MAX * 4;
+		atomic_sub(count, &zone->refill_counter);
+		refill_inactive_zone(zone, count, ps, priority);
	}
-	return shrink_cache(nr_pages, zone, gfp_mask, max_scan, nr_mapped);
+	return shrink_cache(nr_pages, zone, gfp_mask,
+				max_scan, nr_mapped);
 }
 
 /*
@@ -603,7 +659,8 @@ static void shrink_slab(int total_scanned, int gfp_mask)
  */
 static int
 shrink_caches(struct zone *classzone, int priority, int *total_scanned,
-		int gfp_mask, const int nr_pages, int order)
+		int gfp_mask, const int nr_pages, int order,
+		struct page_state *ps)
 {
	struct zone *first_classzone;
	struct zone *zone;
@@ -630,7 +687,7 @@ shrink_caches(struct zone *classzone, int priority, int *total_scanned,
		if (max_scan < to_reclaim * 2)
			max_scan = to_reclaim * 2;
		ret += shrink_zone(zone, max_scan, gfp_mask,
-				to_reclaim, &nr_mapped);
+				to_reclaim, &nr_mapped, ps, priority);
		*total_scanned += max_scan;
		*total_scanned += nr_mapped;
		if (ret >= nr_pages)
@@ -666,12 +723,14 @@ try_to_free_pages(struct zone *classzone,
	inc_page_state(pageoutrun);
 
-	for (priority = DEF_PRIORITY; priority; priority--) {
+	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
		int total_scanned = 0;
+		struct page_state ps;
 
+		get_page_state(&ps);
		nr_reclaimed += shrink_caches(classzone, priority,
					&total_scanned, gfp_mask,
-					nr_pages, order);
+					nr_pages, order, &ps);
		if (nr_reclaimed >= nr_pages)
			return 1;
		if (total_scanned == 0)
@@ -704,7 +763,7 @@ try_to_free_pages(struct zone *classzone,
  *
  * Returns the number of pages which were actually freed.
  */
-static int balance_pgdat(pg_data_t *pgdat, int nr_pages)
+static int balance_pgdat(pg_data_t *pgdat, int nr_pages, struct page_state *ps)
 {
	int to_free = nr_pages;
	int priority;
@@ -729,7 +788,7 @@ static int balance_pgdat(pg_data_t *pgdat, int nr_pages)
			if (max_scan < to_reclaim * 2)
				max_scan = to_reclaim * 2;
			to_free -= shrink_zone(zone, max_scan, GFP_KSWAPD,
-					to_reclaim, &nr_mapped);
+					to_reclaim, &nr_mapped, ps, priority);
			shrink_slab(max_scan + nr_mapped, GFP_KSWAPD);
		}
		if (success)
@@ -778,12 +837,15 @@ int kswapd(void *p)
	tsk->flags |= PF_MEMALLOC|PF_KSWAPD;
 
	for ( ; ; ) {
+		struct page_state ps;
+
		if (current->flags & PF_FREEZE)
			refrigerator(PF_IOTHREAD);
		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
		schedule();
		finish_wait(&pgdat->kswapd_wait, &wait);
-		balance_pgdat(pgdat, 0);
+		get_page_state(&ps);
+		balance_pgdat(pgdat, 0, &ps);
		blk_run_queues();
	}
 }
@@ -801,8 +863,10 @@ int shrink_all_memory(int nr_pages)
	for_each_pgdat(pgdat) {
		int freed;
+		struct page_state ps;
 
-		freed = balance_pgdat(pgdat, nr_to_free);
+		get_page_state(&ps);
+		freed = balance_pgdat(pgdat, nr_to_free, &ps);
		ret += freed;
		nr_to_free -= freed;
		if (nr_to_free <= 0)
@@ -819,6 +883,7 @@ static int __init kswapd_init(void)
	swap_setup();
	for_each_pgdat(pgdat)
		kernel_thread(kswapd, pgdat, CLONE_KERNEL);
+	total_memory = nr_free_pagecache_pages();
	return 0;
 }
...