Commit d867ca5c authored by Andrew Morton, committed by Linus Torvalds

[PATCH] vmscan: zone pressure simplification and fix

The zone->pressure field is supposed to record the amount of reclaim pressure
which this zone is under.  We need this info so we know whether to unmap
pages from pagetables right from the outset of a balance_pgdat() or
try_to_free_pages() invocation.

The problem with the current code is that the exponential average gets tugged
around too much: as we perform the increasing-priority scan, the pressure
metric is made artificially low by the early part of the scan.

So instead what we do here is to record within the zone the scanning priority
from the zone's previous scan.  It is defined as the priority at which the
zone achieved the "enough pages free" state.  This prev_priority is used on
the next scan for the do-we-need-to-be-unmapping-pages decision.
parent b25bb608
...@@ -89,17 +89,24 @@ struct zone { ...@@ -89,17 +89,24 @@ struct zone {
ZONE_PADDING(_pad2_) ZONE_PADDING(_pad2_)
/* /*
* measure of scanning intensity for this zone. It is calculated * prev_priority holds the scanning priority for this zone. It is
* as exponentially decaying average of the scanning priority * defined as the scanning priority at which we achieved our reclaim
* required to free enough pages in this zone * target at the previous try_to_free_pages() or balance_pgdat()
* (zone_adj_pressure()). * invokation.
*
* We use prev_priority as a measure of how much stress page reclaim is
* under - it drives the swappiness decision: whether to unmap mapped
* pages.
* *
* 0 --- low pressure * temp_priority is used to remember the scanning priority at which
* this zone was successfully refilled to free_pages == pages_high.
* *
* (DEF_PRIORITY << 10) --- high pressure * Access to both these fields is quite racy even on uniprocessor. But
* it is expected to average out OK.
*/ */
int pressure; int temp_priority;
int prev_priority;
/* /*
* free areas of different sizes * free areas of different sizes
......
...@@ -79,25 +79,6 @@ static long total_memory; ...@@ -79,25 +79,6 @@ static long total_memory;
#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0) #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
#endif #endif
/*
* exponentially decaying average
*/
static inline int expavg(int avg, int val)
{
return ((val - avg) >> 1) + avg;
}
static void zone_adj_pressure(struct zone *zone, int priority)
{
zone->pressure = expavg(zone->pressure,
(DEF_PRIORITY - priority) << 10);
}
static int pressure_to_priority(int pressure)
{
return DEF_PRIORITY - (pressure >> 10);
}
/* /*
* The list of shrinker callbacks used by to apply pressure to * The list of shrinker callbacks used by to apply pressure to
* ageable caches. * ageable caches.
...@@ -646,7 +627,7 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in, ...@@ -646,7 +627,7 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in,
* `distress' is a measure of how much trouble we're having reclaiming * `distress' is a measure of how much trouble we're having reclaiming
* pages. 0 -> no problems. 100 -> great trouble. * pages. 0 -> no problems. 100 -> great trouble.
*/ */
distress = 100 >> pressure_to_priority(zone->pressure); distress = 100 >> zone->prev_priority;
/* /*
* The point of this algorithm is to decide when to start reclaiming * The point of this algorithm is to decide when to start reclaiming
...@@ -830,6 +811,9 @@ shrink_caches(struct zone *classzone, int priority, int *total_scanned, ...@@ -830,6 +811,9 @@ shrink_caches(struct zone *classzone, int priority, int *total_scanned,
int nr_mapped = 0; int nr_mapped = 0;
int max_scan; int max_scan;
if (zone->free_pages < zone->pages_high)
zone->temp_priority = priority;
if (zone->all_unreclaimable && priority != DEF_PRIORITY) if (zone->all_unreclaimable && priority != DEF_PRIORITY)
continue; /* Let kswapd poll it */ continue; /* Let kswapd poll it */
...@@ -843,10 +827,8 @@ shrink_caches(struct zone *classzone, int priority, int *total_scanned, ...@@ -843,10 +827,8 @@ shrink_caches(struct zone *classzone, int priority, int *total_scanned,
ret += shrink_zone(zone, max_scan, gfp_mask, ret += shrink_zone(zone, max_scan, gfp_mask,
to_reclaim, &nr_mapped, ps, priority); to_reclaim, &nr_mapped, ps, priority);
*total_scanned += max_scan + nr_mapped; *total_scanned += max_scan + nr_mapped;
if (ret >= nr_pages) { if (ret >= nr_pages)
zone_adj_pressure(zone, priority);
break; break;
}
} }
return ret; return ret;
} }
...@@ -880,6 +862,9 @@ int try_to_free_pages(struct zone *cz, ...@@ -880,6 +862,9 @@ int try_to_free_pages(struct zone *cz,
inc_page_state(allocstall); inc_page_state(allocstall);
for (zone = cz; zone >= cz->zone_pgdat->node_zones; --zone)
zone->temp_priority = DEF_PRIORITY;
for (priority = DEF_PRIORITY; priority >= 0; priority--) { for (priority = DEF_PRIORITY; priority >= 0; priority--) {
int total_scanned = 0; int total_scanned = 0;
struct page_state ps; struct page_state ps;
...@@ -912,9 +897,9 @@ int try_to_free_pages(struct zone *cz, ...@@ -912,9 +897,9 @@ int try_to_free_pages(struct zone *cz,
} }
if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY))
out_of_memory(); out_of_memory();
for (zone = cz; zone >= cz->zone_pgdat->node_zones; -- zone)
zone_adj_pressure(zone, -1);
out: out:
for (zone = cz; zone >= cz->zone_pgdat->node_zones; --zone)
zone->prev_priority = zone->temp_priority;
return ret; return ret;
} }
...@@ -945,6 +930,12 @@ static int balance_pgdat(pg_data_t *pgdat, int nr_pages, struct page_state *ps) ...@@ -945,6 +930,12 @@ static int balance_pgdat(pg_data_t *pgdat, int nr_pages, struct page_state *ps)
inc_page_state(pageoutrun); inc_page_state(pageoutrun);
for (i = 0; i < pgdat->nr_zones; i++) {
struct zone *zone = pgdat->node_zones + i;
zone->temp_priority = DEF_PRIORITY;
}
for (priority = DEF_PRIORITY; priority; priority--) { for (priority = DEF_PRIORITY; priority; priority--) {
int all_zones_ok = 1; int all_zones_ok = 1;
...@@ -961,11 +952,10 @@ static int balance_pgdat(pg_data_t *pgdat, int nr_pages, struct page_state *ps) ...@@ -961,11 +952,10 @@ static int balance_pgdat(pg_data_t *pgdat, int nr_pages, struct page_state *ps)
to_reclaim = min(to_free, SWAP_CLUSTER_MAX*8); to_reclaim = min(to_free, SWAP_CLUSTER_MAX*8);
} else { /* Zone balancing */ } else { /* Zone balancing */
to_reclaim = zone->pages_high-zone->free_pages; to_reclaim = zone->pages_high-zone->free_pages;
if (to_reclaim <= 0) { if (to_reclaim <= 0)
zone_adj_pressure(zone, priority);
continue; continue;
}
} }
zone->temp_priority = priority;
all_zones_ok = 0; all_zones_ok = 0;
max_scan = zone->nr_inactive >> priority; max_scan = zone->nr_inactive >> priority;
if (max_scan < to_reclaim * 2) if (max_scan < to_reclaim * 2)
...@@ -989,13 +979,11 @@ static int balance_pgdat(pg_data_t *pgdat, int nr_pages, struct page_state *ps) ...@@ -989,13 +979,11 @@ static int balance_pgdat(pg_data_t *pgdat, int nr_pages, struct page_state *ps)
if (to_free > 0) if (to_free > 0)
blk_congestion_wait(WRITE, HZ/10); blk_congestion_wait(WRITE, HZ/10);
} }
if (priority <= 0) {
for (i = 0; i < pgdat->nr_zones; i++) {
struct zone *zone = pgdat->node_zones + i;
if (zone->free_pages < zone->pages_high) for (i = 0; i < pgdat->nr_zones; i++) {
zone_adj_pressure(zone, -1); struct zone *zone = pgdat->node_zones + i;
}
zone->prev_priority = zone->temp_priority;
} }
return nr_pages - to_free; return nr_pages - to_free;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment