Commit 36fb7f84 authored by Andrew Morton, committed by Linus Torvalds

[PATCH] handle zones which are full of unreclaimable pages

This patch is a general solution to the situation where a zone is full
of pinned pages.

This can come about if:

a) Someone has allocated all of ZONE_DMA for IO buffers

b) Some application is mlocking some memory and a zone ends up full
   of mlocked pages (can happen on a 1G ia32 system)

c) All of ZONE_HIGHMEM is pinned in hugetlb pages (can happen on 1G
   machines)

We currently burn 10% of CPU in kswapd when this happens, although the
situation is quite hard to trigger.

The algorithm is:

- If page reclaim has scanned 2 * the total number of pages in the
  zone and there have been no pages freed in that zone then mark the
  zone as "all unreclaimable".

- When a zone is "all unreclaimable" page reclaim almost ignores it.
  We will perform a "light" scan at DEF_PRIORITY (typically 1/4096'th of
  the zone, or 64 pages) and then forget about the zone.

- When a batch of pages are freed into the zone, clear its "all
  unreclaimable" state and start full scanning again.  The assumption
  being that some state change has come about which will make reclaim
  successful again.

  So if a "light scan" actually frees some pages, the zone will revert to
  normal state immediately.

So we're effectively putting the zone into "low power" mode, and lightly
polling it to see if something has changed.

The code works OK, but is quite hard to test - I mainly tested it by
pinning all highmem in hugetlb pages.
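
To make the three rules above concrete, here is a small, self-contained C
sketch (not kernel code, and not part of this patch) of the zone state
machine they describe.  The zone fields mirror the ones the patch adds
below (all_unreclaimable, pages_scanned, present_pages); the zone_sim
struct and the sim_* helpers are made-up stand-ins for the real reclaim
paths, and DEF_PRIORITY is hard-coded to the kernel's usual value of 12:

/*
 * Standalone illustration of the "all unreclaimable" state machine:
 * mark a zone dead after scanning 2 * present_pages with no progress,
 * poll it only at DEF_PRIORITY afterwards, and clear the state as soon
 * as pages are freed back into it.
 */
#include <stdio.h>

#define DEF_PRIORITY 12		/* lowest-urgency scan, as in the kernel */

struct zone_sim {
	unsigned long present_pages;
	unsigned long pages_scanned;	/* since last successful free */
	int all_unreclaimable;		/* all pages pinned? */
};

/* Rule 3: freeing pages clears the "dead" state and the scan counter. */
static void sim_free_pages(struct zone_sim *z)
{
	z->all_unreclaimable = 0;
	z->pages_scanned = 0;
}

/* Rules 1 and 2: one reclaim pass over the zone at a given priority. */
static void sim_reclaim_pass(struct zone_sim *z, int priority,
			     unsigned long scanned, unsigned long freed)
{
	if (z->all_unreclaimable && priority != DEF_PRIORITY)
		return;		/* only the light DEF_PRIORITY poll runs */

	z->pages_scanned += scanned;
	if (freed)
		sim_free_pages(z);
	else if (z->pages_scanned > z->present_pages * 2)
		z->all_unreclaimable = 1;	/* give up: poll from now on */
}

int main(void)
{
	struct zone_sim z = { .present_pages = 4096 };
	int priority;

	/* Nothing reclaimable: the zone ends up marked all unreclaimable. */
	for (priority = DEF_PRIORITY; priority >= 0; priority--)
		sim_reclaim_pass(&z, priority, 1024, 0);
	printf("all_unreclaimable=%d\n", z.all_unreclaimable);

	/* A later batch of frees flips it back to normal scanning. */
	sim_free_pages(&z);
	printf("all_unreclaimable=%d\n", z.all_unreclaimable);
	return 0;
}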
parent fee2b68d
@@ -84,6 +84,8 @@ struct zone {
 	atomic_t		refill_counter;
 	unsigned long		nr_active;
 	unsigned long		nr_inactive;
+	int			all_unreclaimable; /* All pages pinned */
+	unsigned long		pages_scanned;	   /* since last reclaim */
 	ZONE_PADDING(_pad2_)

@@ -203,6 +205,7 @@ memclass(struct zone *pgzone, struct zone *classzone)
 void get_zone_counts(unsigned long *active, unsigned long *inactive);
 void build_all_zonelists(void);
+void wakeup_kswapd(struct zone *zone);

 /**
  * for_each_pgdat - helper macro to iterate over all nodes

@@ -167,6 +167,12 @@ static inline void free_pages_check(const char *function, struct page *page)
  * Frees a list of pages.
  * Assumes all pages on list are in same zone, and of same order.
  * count is the number of pages to free, or 0 for all on the list.
+ *
+ * If the zone was previously in an "all pages pinned" state then look to
+ * see if this freeing clears that state.
+ *
+ * And clear the zone's pages_scanned counter, to hold off the "all pages are
+ * pinned" detection logic.
  */
 static int
 free_pages_bulk(struct zone *zone, int count,

@@ -181,6 +187,8 @@ free_pages_bulk(struct zone *zone, int count,
 	base = zone->zone_mem_map;
 	area = zone->free_area + order;
 	spin_lock_irqsave(&zone->lock, flags);
+	zone->all_unreclaimable = 0;
+	zone->pages_scanned = 0;
 	while (!list_empty(list) && count--) {
 		page = list_entry(list->prev, struct page, list);
 		/* have to delete it as __free_pages_bulk list manipulates */

@@ -464,12 +472,8 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 	}

 	/* we're somewhat low on memory, failed to find what we needed */
-	for (i = 0; zones[i] != NULL; i++) {
-		struct zone *z = zones[i];
-		if (z->free_pages <= z->pages_low &&
-		    waitqueue_active(&z->zone_pgdat->kswapd_wait))
-			wake_up_interruptible(&z->zone_pgdat->kswapd_wait);
-	}
+	for (i = 0; zones[i] != NULL; i++)
+		wakeup_kswapd(zones[i]);

 	/* Go through the zonelist again, taking __GFP_HIGH into account */
 	min = 1UL << order;

@@ -468,6 +468,7 @@ shrink_cache(const int nr_pages, struct zone *zone,
 		nr_taken++;
 	}
 	zone->nr_inactive -= nr_taken;
+	zone->pages_scanned += nr_taken;
 	spin_unlock_irq(&zone->lru_lock);

 	if (nr_taken == 0)

@@ -720,6 +721,9 @@ shrink_zone(struct zone *zone, int max_scan, unsigned int gfp_mask,
  * satisfy the `incremental min' zone defense algorithm.
  *
  * Returns the number of reclaimed pages.
+ *
+ * If a zone is deemed to be full of pinned pages then just give it a light
+ * scan then give up on it.
  */
 static int
 shrink_caches(struct zone *classzone, int priority, int *total_scanned,

@@ -735,6 +739,9 @@ shrink_caches(struct zone *classzone, int priority, int *total_scanned,
 		int nr_mapped = 0;
 		int max_scan;

+		if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+			continue;	/* Let kswapd poll it */
+
 		/*
 		 * If we cannot reclaim `nr_pages' pages by scanning twice
 		 * that many pages then fall back to the next zone.

@@ -817,6 +824,14 @@ try_to_free_pages(struct zone *classzone,
  * special.
  *
  * Returns the number of pages which were actually freed.
+ *
+ * There is special handling here for zones which are full of pinned pages.
+ * This can happen if the pages are all mlocked, or if they are all used by
+ * device drivers (say, ZONE_DMA).  Or if they are all in use by hugetlb.
+ * What we do is to detect the case where all pages in the zone have been
+ * scanned twice and there has been zero successful reclaim.  Mark the zone as
+ * dead and from now on, only perform a short scan.  Basically we're polling
+ * the zone for when the problem goes away.
  */
 static int balance_pgdat(pg_data_t *pgdat, int nr_pages, struct page_state *ps)
 {

@@ -833,6 +848,9 @@ static int balance_pgdat(pg_data_t *pgdat, int nr_pages, struct page_state *ps)
 			int max_scan;
 			int to_reclaim;

+			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+				continue;
+
 			if (nr_pages && to_free > 0) {	/* Software suspend */
 				to_reclaim = min(to_free, SWAP_CLUSTER_MAX*8);
 			} else {			/* Zone balancing */

@@ -849,6 +867,10 @@ static int balance_pgdat(pg_data_t *pgdat, int nr_pages, struct page_state *ps)
 			to_free -= shrink_zone(zone, max_scan, GFP_KSWAPD,
 					to_reclaim, &nr_mapped, ps, priority);
 			shrink_slab(max_scan + nr_mapped, GFP_KSWAPD);
+			if (zone->all_unreclaimable)
+				continue;
+			if (zone->pages_scanned > zone->present_pages * 2)
+				zone->all_unreclaimable = 1;
 		}
 		if (all_zones_ok)
 			break;

@@ -909,6 +931,18 @@ int kswapd(void *p)
 	}
 }

+/*
+ * A zone is low on free memory, so wake its kswapd task to service it.
+ */
+void wakeup_kswapd(struct zone *zone)
+{
+	if (zone->free_pages > zone->pages_low)
+		return;
+	if (!waitqueue_active(&zone->zone_pgdat->kswapd_wait))
+		return;
+	wake_up_interruptible(&zone->zone_pgdat->kswapd_wait);
+}
+
 #ifdef CONFIG_SOFTWARE_SUSPEND
 /*
  * Try to free `nr_pages' of memory, system-wide.  Returns the number of freed

@@ -938,7 +972,6 @@ int shrink_all_memory(int nr_pages)
 static int __init kswapd_init(void)
 {
 	pg_data_t *pgdat;
-	printk("Starting kswapd\n");
 	swap_setup();
 	for_each_pgdat(pgdat)
 		kernel_thread(kswapd, pgdat, CLONE_KERNEL);