Commit 8f7a1404 authored by Andrew Morton, committed by Linus Torvalds

[PATCH] reduced and tunable swappiness

/proc/sys/vm/swappiness controls the VM's tendency to unmap pages and to
swap things out.

100 -> basically current 2.5 behaviour
0 -> not very swappy at all
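
Not part of the patch, just an illustration of how the knob would be set
from userspace once this is in (the value 10 is arbitrary); a minimal
sketch, equivalent to `echo 10 > /proc/sys/vm/swappiness':

	#include <stdio.h>

	int main(void)
	{
		/* needs root; the tunable accepts 0..100 */
		FILE *f = fopen("/proc/sys/vm/swappiness", "w");

		if (!f) {
			perror("/proc/sys/vm/swappiness");
			return 1;
		}
		fprintf(f, "%d\n", 10);
		fclose(f);
		return 0;
	}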

The mechanism used to control swappiness is reluctance to bring mapped
pages onto the inactive list: prefer to reclaim pagecache instead.

The control for that mechanism is as follows:

- If there is a large amount of mapped memory in the machine, we
  prefer to bring mapped pages onto the inactive list.

- If page reclaim is under distress (more scanning is happening) then
  prefer to bring mapped pages onto the inactive list.  This is
  basically the 2.4 algorithm, really.

- If the /proc/sys/vm/swappiness control is high then prefer to bring
  mapped pages onto the inactive list.

The implementation is simple: calculate the above three things as
percentages and add them up.  If that's over 100% then start reclaiming
mapped pages.

The `proportion of mapped memory' is downgraded so that we don't swap
just because a lot of memory is mapped into pagetables - we still need
some VM distress before starting to swap that memory out.
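
As a worked example (the 40% mapped figure is made up), with the default
vm_swappiness of 60 that the patch installs:

	swap_tendency = mapped_ratio/2 + distress + vm_swappiness
	              = 40/2 + (100 >> 12) + 60 =  80   (priority 12: leave mapped pages alone)
	              = 40/2 + (100 >>  2) + 60 = 105   (priority  2: start reclaiming mapped pages)

So with a moderate amount of mapped memory and the default setting,
mapped pages are only deactivated once reclaim has had to drop its
scanning priority a fair way.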

For a while I was adding a little bias so that we prefer to unmap
file-backed memory before swapping out anon memory, because file-backed
memory can usually be evicted and re-established with one I/O, not two.
But it was unmapping executable text too easily, so here I just treat
them equally.
parent 5bbac23e

include/linux/swap.h
@@ -164,6 +164,7 @@ extern void swap_setup(void);
/* linux/mm/vmscan.c */
extern int try_to_free_pages(struct zone *, unsigned int, unsigned int);
int shrink_all_memory(int nr_pages);
+extern int vm_swappiness;
/* linux/mm/page_io.c */
int swap_readpage(struct file *file, struct page *page);

include/linux/sysctl.h
@@ -153,6 +153,7 @@ enum
VM_OVERCOMMIT_RATIO=16, /* percent of RAM to allow overcommit in */
VM_PAGEBUF=17, /* struct: Control pagebuf parameters */
VM_HUGETLB_PAGES=18, /* int: Number of available Huge Pages */
+VM_SWAPPINESS=19, /* Tendency to steal mapped memory */
};

kernel/sysctl.c
@@ -311,6 +311,9 @@ static ctl_table vm_table[] = {
{ VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads",
&nr_pdflush_threads, sizeof nr_pdflush_threads,
0444 /* read-only*/, NULL, &proc_dointvec},
+{VM_SWAPPINESS, "swappiness", &vm_swappiness, sizeof(vm_swappiness),
+0644, NULL, &proc_dointvec_minmax, &sysctl_intvec, NULL, &zero,
+&one_hundred },
#ifdef CONFIG_HUGETLB_PAGE
{VM_HUGETLB_PAGES, "nr_hugepages", &htlbpage_max, sizeof(int), 0644, NULL,
&proc_dointvec},

mm/vmscan.c
@@ -35,13 +35,18 @@
#include <linux/swapops.h>
/*
-* The "priority" of VM scanning is how much of the queues we
-* will scan in one go. A value of 12 for DEF_PRIORITY implies
-* that we'll scan 1/4096th of the queues ("queue_length >> 12")
-* during a normal aging round.
+* The "priority" of VM scanning is how much of the queues we will scan in one
+* go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
+* queues ("queue_length >> 12") during an aging round.
*/
#define DEF_PRIORITY 12
+/*
+* From 0 .. 100. Higher means more swappy.
+*/
+int vm_swappiness = 60;
+static long total_memory;
#ifdef ARCH_HAS_PREFETCH
#define prefetch_prev_lru_page(_page, _base, _field) \
do { \
@@ -101,7 +106,6 @@ static inline int is_page_cache_freeable(struct page *page)
return page_count(page) - !!PagePrivate(page) == 2;
}
/*
* shrink_list returns the number of reclaimed pages
*/
@@ -439,7 +443,8 @@ shrink_cache(const int nr_pages, struct zone *zone,
* But we had to alter page->flags anyway.
*/
static /* inline */ void
-refill_inactive_zone(struct zone *zone, const int nr_pages_in)
+refill_inactive_zone(struct zone *zone, const int nr_pages_in,
+struct page_state *ps, int priority)
{
int pgdeactivate = 0;
int nr_pages = nr_pages_in;
@@ -448,6 +453,10 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in)
LIST_HEAD(l_active); /* Pages to go onto the active_list */
struct page *page;
struct pagevec pvec;
+int reclaim_mapped = 0;
+long mapped_ratio;
+long distress;
+long swap_tendency;
lru_add_drain();
spin_lock_irq(&zone->lru_lock);
@@ -469,6 +478,37 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in)
}
spin_unlock_irq(&zone->lru_lock);
+/*
+* `distress' is a measure of how much trouble we're having reclaiming
+* pages. 0 -> no problems. 100 -> great trouble.
+*/
+distress = 100 >> priority;
+/*
+* The point of this algorithm is to decide when to start reclaiming
+* mapped memory instead of just pagecache. Work out how much memory
+* is mapped.
+*/
+mapped_ratio = (ps->nr_mapped * 100) / total_memory;
+/*
+* Now decide how much we really want to unmap some pages. The mapped
+* ratio is downgraded - just because there's a lot of mapped memory
+* doesn't necessarily mean that page reclaim isn't succeeding.
+*
+* The distress ratio is important - we don't want to start going oom.
+*
+* A 100% value of vm_swappiness overrides this algorithm altogether.
+*/
+swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
+/*
+* Now use this metric to decide whether to start moving mapped memory
+* onto the inactive list.
+*/
+if (swap_tendency >= 100)
+reclaim_mapped = 1;
while (!list_empty(&l_hold)) {
page = list_entry(l_hold.prev, struct page, lru);
list_del(&page->lru);
@@ -480,6 +520,10 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in)
continue;
}
pte_chain_unlock(page);
+if (!reclaim_mapped) {
+list_add(&page->lru, &l_active);
+continue;
+}
}
/*
* FIXME: need to consider page_count(page) here if/when we
@@ -546,7 +590,7 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in)
*/
static /* inline */ int
shrink_zone(struct zone *zone, int max_scan, unsigned int gfp_mask,
-const int nr_pages, int *nr_mapped)
+const int nr_pages, int *nr_mapped, struct page_state *ps, int priority)
{
unsigned long ratio;
@@ -563,11 +607,23 @@ shrink_zone(struct zone *zone, int max_scan, unsigned int gfp_mask,
ratio = (unsigned long)nr_pages * zone->nr_active /
((zone->nr_inactive | 1) * 2);
atomic_add(ratio+1, &zone->refill_counter);
-while (atomic_read(&zone->refill_counter) > SWAP_CLUSTER_MAX) {
-atomic_sub(SWAP_CLUSTER_MAX, &zone->refill_counter);
-refill_inactive_zone(zone, SWAP_CLUSTER_MAX);
+if (atomic_read(&zone->refill_counter) > SWAP_CLUSTER_MAX) {
+int count;
+/*
+* Don't try to bring down too many pages in one attempt.
+* If this fails, the caller will increase `priority' and
+* we'll try again, with an increased chance of reclaiming
+* mapped memory.
+*/
+count = atomic_read(&zone->refill_counter);
+if (count > SWAP_CLUSTER_MAX * 4)
+count = SWAP_CLUSTER_MAX * 4;
+atomic_sub(count, &zone->refill_counter);
+refill_inactive_zone(zone, count, ps, priority);
}
-return shrink_cache(nr_pages, zone, gfp_mask, max_scan, nr_mapped);
+return shrink_cache(nr_pages, zone, gfp_mask,
+max_scan, nr_mapped);
}
/*
@@ -603,7 +659,8 @@ static void shrink_slab(int total_scanned, int gfp_mask)
*/
static int
shrink_caches(struct zone *classzone, int priority, int *total_scanned,
-int gfp_mask, const int nr_pages, int order)
+int gfp_mask, const int nr_pages, int order,
+struct page_state *ps)
{
struct zone *first_classzone;
struct zone *zone;
@@ -630,7 +687,7 @@ shrink_caches(struct zone *classzone, int priority, int *total_scanned,
if (max_scan < to_reclaim * 2)
max_scan = to_reclaim * 2;
ret += shrink_zone(zone, max_scan, gfp_mask,
-to_reclaim, &nr_mapped);
+to_reclaim, &nr_mapped, ps, priority);
*total_scanned += max_scan;
*total_scanned += nr_mapped;
if (ret >= nr_pages)
@@ -666,12 +723,14 @@ try_to_free_pages(struct zone *classzone,
inc_page_state(pageoutrun);
-for (priority = DEF_PRIORITY; priority; priority--) {
+for (priority = DEF_PRIORITY; priority >= 0; priority--) {
int total_scanned = 0;
+struct page_state ps;
+get_page_state(&ps);
nr_reclaimed += shrink_caches(classzone, priority,
&total_scanned, gfp_mask,
-nr_pages, order);
+nr_pages, order, &ps);
if (nr_reclaimed >= nr_pages)
return 1;
if (total_scanned == 0)
@@ -704,7 +763,7 @@ try_to_free_pages(struct zone *classzone,
*
* Returns the number of pages which were actually freed.
*/
-static int balance_pgdat(pg_data_t *pgdat, int nr_pages)
+static int balance_pgdat(pg_data_t *pgdat, int nr_pages, struct page_state *ps)
{
int to_free = nr_pages;
int priority;
@@ -729,7 +788,7 @@ static int balance_pgdat(pg_data_t *pgdat, int nr_pages)
if (max_scan < to_reclaim * 2)
max_scan = to_reclaim * 2;
to_free -= shrink_zone(zone, max_scan, GFP_KSWAPD,
-to_reclaim, &nr_mapped);
+to_reclaim, &nr_mapped, ps, priority);
shrink_slab(max_scan + nr_mapped, GFP_KSWAPD);
}
if (success)
@@ -778,12 +837,15 @@ int kswapd(void *p)
tsk->flags |= PF_MEMALLOC|PF_KSWAPD;
for ( ; ; ) {
+struct page_state ps;
if (current->flags & PF_FREEZE)
refrigerator(PF_IOTHREAD);
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
schedule();
finish_wait(&pgdat->kswapd_wait, &wait);
-balance_pgdat(pgdat, 0);
+get_page_state(&ps);
+balance_pgdat(pgdat, 0, &ps);
blk_run_queues();
}
}
@@ -801,8 +863,10 @@ int shrink_all_memory(int nr_pages)
for_each_pgdat(pgdat) {
int freed;
+struct page_state ps;
-freed = balance_pgdat(pgdat, nr_to_free);
+get_page_state(&ps);
+freed = balance_pgdat(pgdat, nr_to_free, &ps);
ret += freed;
nr_to_free -= freed;
if (nr_to_free <= 0)
@@ -819,6 +883,7 @@ static int __init kswapd_init(void)
swap_setup();
for_each_pgdat(pgdat)
kernel_thread(kswapd, pgdat, CLONE_KERNEL);
+total_memory = nr_free_pagecache_pages();
return 0;
}