Commit 4ffeaf35 authored by Mel Gorman, committed by Linus Torvalds

mm: page_alloc: reduce cost of the fair zone allocation policy

The fair zone allocation policy round-robins allocations between zones
within a node to avoid age inversion problems during reclaim.  If the
first allocation fails, the batch counts are reset and a second attempt
is made before entering the slow path.
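
As a rough illustration of that scheme, a minimal userspace sketch (the
fzone struct, the zone names and the batch sizes are invented for the
example; this is not the kernel code).  The per-zone batches spread
allocations over the local zones, and a failed fair pass is answered by
resetting every batch and trying once more before the slow path:

	#include <stdbool.h>
	#include <stdio.h>

	struct fzone { const char *name; long alloc_batch; long free_pages; };

	/* Take 2^order pages from the first local zone with batch left. */
	static bool fair_alloc(struct fzone *zones, int nr, int order)
	{
		for (int i = 0; i < nr; i++) {
			struct fzone *z = &zones[i];

			if (z->alloc_batch <= 0 || z->free_pages < (1 << order))
				continue;
			z->alloc_batch -= 1 << order;
			z->free_pages -= 1 << order;
			printf("order-%d page from %s\n", order, z->name);
			return true;
		}
		return false;		/* every local batch is spent */
	}

	int main(void)
	{
		struct fzone zones[] = { { "Normal", 4, 64 }, { "DMA32", 4, 64 } };

		for (int i = 0; i < 12; i++) {
			if (!fair_alloc(zones, 2, 0)) {
				/* batches expired: reset them all and make a
				 * second attempt before the slow path */
				zones[0].alloc_batch = zones[1].alloc_batch = 4;
				fair_alloc(zones, 2, 0);
			}
		}
		return 0;
	}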

One assumption made with this scheme is that batches expire at roughly
the same time and the resets each time are justified.  This assumption
does not hold when zones reach their low watermark as the batches will
be consumed at uneven rates.  Allocation failures due to watermark
depletion result in additional zonelist scans for the reset and another
watermark check before hitting the slowpath.
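
A minimal sketch of the cheaper scheme used below, under the same
simplified userspace model (the field and helper names loosely mirror
the patch, the rest is invented for illustration).  Depletion is
recorded once, when the batch is charged, and the batches are only
reset and rescanned when the fair pass actually skipped a depleted
zone:

	#include <stdbool.h>

	struct fzone { long alloc_batch; long free_pages; bool fair_depleted; };

	/* Charged at allocation time: the flag is set once, when the batch
	 * runs out, so the fair pass tests a flag instead of re-reading the
	 * counter on every walk of the zonelist. */
	static void charge_batch(struct fzone *z, int order)
	{
		z->alloc_batch -= 1 << order;
		if (z->alloc_batch <= 0)
			z->fair_depleted = true;
	}

	/* Fair pass: skip depleted zones but count the skips, so the caller
	 * resets the batches and rescans only when fairness actually got in
	 * the way, not on every failed attempt. */
	static bool fair_pass(struct fzone *zones, int nr, int order, int *nr_skipped)
	{
		for (int i = 0; i < nr; i++) {
			if (zones[i].fair_depleted) {
				(*nr_skipped)++;
				continue;
			}
			if (zones[i].free_pages < (1 << order))
				continue;
			zones[i].free_pages -= 1 << order;
			charge_batch(&zones[i], order);
			return true;
		}
		return false;
	}

	int main(void)
	{
		struct fzone zones[2] = { { 4, 64, false }, { 4, 64, false } };
		int nr_skipped = 0;

		while (fair_pass(zones, 2, 0, &nr_skipped))
			;
		/* Reset the batches and rescan only if zones were skipped for
		 * fairness; otherwise go straight to the slower paths. */
		return nr_skipped ? 0 : 1;
	}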

On UMA, the benefit is negligible -- around 0.25%.  On a 4-socket NUMA
machine it is variable due to the variability of measuring overhead with
the vmstat changes.  The system CPU overhead comparison looks like

          3.16.0-rc3  3.16.0-rc3  3.16.0-rc3
             vanilla   vmstat-v5 lowercost-v5
User          746.94      774.56      802.00
System      65336.22    32847.27    40852.33
Elapsed     27553.52    27415.04    27368.46

However, it is worth noting that the overall benchmark still completed
faster, and intuitively it makes sense to take as few passes as possible
through the zonelists.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent f7b5d647
@@ -534,6 +534,7 @@ typedef enum {
 	ZONE_WRITEBACK,		/* reclaim scanning has recently found
 				 * many pages under writeback
				 */
+	ZONE_FAIR_DEPLETED,	/* fair zone policy batch depleted */
 } zone_flags_t;

 static inline void zone_set_flag(struct zone *zone, zone_flags_t flag)
@@ -571,6 +572,11 @@ static inline int zone_is_reclaim_locked(const struct zone *zone)
 	return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
 }

+static inline int zone_is_fair_depleted(const struct zone *zone)
+{
+	return test_bit(ZONE_FAIR_DEPLETED, &zone->flags);
+}
+
 static inline int zone_is_oom_locked(const struct zone *zone)
 {
 	return test_bit(ZONE_OOM_LOCKED, &zone->flags);
...
@@ -1612,6 +1612,9 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
 	}

 	__mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
+	if (zone_page_state(zone, NR_ALLOC_BATCH) == 0 &&
+	    !zone_is_fair_depleted(zone))
+		zone_set_flag(zone, ZONE_FAIR_DEPLETED);

 	__count_zone_vm_events(PGALLOC, zone, 1 << order);
 	zone_statistics(preferred_zone, zone, gfp_flags);
@@ -1923,6 +1926,18 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
 #endif	/* CONFIG_NUMA */

+static void reset_alloc_batches(struct zone *preferred_zone)
+{
+	struct zone *zone = preferred_zone->zone_pgdat->node_zones;
+
+	do {
+		mod_zone_page_state(zone, NR_ALLOC_BATCH,
+			high_wmark_pages(zone) - low_wmark_pages(zone) -
+			atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
+		zone_clear_flag(zone, ZONE_FAIR_DEPLETED);
+	} while (zone++ != preferred_zone);
+}
+
 /*
  * get_page_from_freelist goes through the zonelist trying to allocate
  * a page.
@@ -1940,8 +1955,12 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
 	int did_zlc_setup = 0;		/* just call zlc_setup() one time */
 	bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
				(gfp_mask & __GFP_WRITE);
+	int nr_fair_skipped = 0;
+	bool zonelist_rescan;

 zonelist_scan:
+	zonelist_rescan = false;
+
 	/*
 	 * Scan zonelist, looking for a zone with enough free.
 	 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
@@ -1966,9 +1985,11 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
 		if (alloc_flags & ALLOC_FAIR) {
 			if (!zone_local(preferred_zone, zone))
 				break;
-			if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
+			if (zone_is_fair_depleted(zone)) {
+				nr_fair_skipped++;
 				continue;
+			}
 		}
 		/*
 		 * When allocating a page cache page for writing, we
 		 * want to get it from a zone that is within its dirty
@@ -2073,13 +2094,7 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
 			zlc_mark_zone_full(zonelist, z);
 	}

-	if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
-		/* Disable zlc cache for second zonelist scan */
-		zlc_active = 0;
-		goto zonelist_scan;
-	}
-
-	if (page)
+	if (page) {
 		/*
 		 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
 		 * necessary to allocate the page. The expectation is
@@ -2088,8 +2103,37 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
 		 * for !PFMEMALLOC purposes.
 		 */
 		page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
-
-	return page;
+		return page;
+	}
+
+	/*
+	 * The first pass makes sure allocations are spread fairly within the
+	 * local node.  However, the local node might have free pages left
+	 * after the fairness batches are exhausted, and remote zones haven't
+	 * even been considered yet.  Try once more without fairness, and
+	 * include remote zones now, before entering the slowpath and waking
+	 * kswapd: prefer spilling to a remote zone over swapping locally.
+	 */
+	if (alloc_flags & ALLOC_FAIR) {
+		alloc_flags &= ~ALLOC_FAIR;
+		if (nr_fair_skipped) {
+			zonelist_rescan = true;
+			reset_alloc_batches(preferred_zone);
+		}
+		if (nr_online_nodes > 1)
+			zonelist_rescan = true;
+	}
+
+	if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) {
+		/* Disable zlc cache for second zonelist scan */
+		zlc_active = 0;
+		zonelist_rescan = true;
+	}
+
+	if (zonelist_rescan)
+		goto zonelist_scan;
+
+	return NULL;
 }

 /*
@@ -2410,28 +2454,6 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
 	return page;
 }

-static void reset_alloc_batches(struct zonelist *zonelist,
-				enum zone_type high_zoneidx,
-				struct zone *preferred_zone)
-{
-	struct zoneref *z;
-	struct zone *zone;
-
-	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
-		/*
-		 * Only reset the batches of zones that were actually
-		 * considered in the fairness pass, we don't want to
-		 * trash fairness information for zones that are not
-		 * actually part of this zonelist's round-robin cycle.
-		 */
-		if (!zone_local(preferred_zone, zone))
-			continue;
-		mod_zone_page_state(zone, NR_ALLOC_BATCH,
-			high_wmark_pages(zone) - low_wmark_pages(zone) -
-			atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
-	}
-}
-
 static void wake_all_kswapds(unsigned int order,
			     struct zonelist *zonelist,
			     enum zone_type high_zoneidx,
@@ -2767,28 +2789,11 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 	if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
 		alloc_flags |= ALLOC_CMA;
 #endif
-retry:
 	/* First allocation attempt */
 	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
			zonelist, high_zoneidx, alloc_flags,
			preferred_zone, classzone_idx, migratetype);
 	if (unlikely(!page)) {
-		/*
-		 * The first pass makes sure allocations are spread
-		 * fairly within the local node.  However, the local
-		 * node might have free pages left after the fairness
-		 * batches are exhausted, and remote zones haven't
-		 * even been considered yet.  Try once more without
-		 * fairness, and include remote zones now, before
-		 * entering the slowpath and waking kswapd: prefer
-		 * spilling to a remote zone over swapping locally.
-		 */
-		if (alloc_flags & ALLOC_FAIR) {
-			reset_alloc_batches(zonelist, high_zoneidx,
-						preferred_zone);
-			alloc_flags &= ~ALLOC_FAIR;
-			goto retry;
-		}
 		/*
 		 * Runtime PM, block IO and its error handling path
 		 * can deadlock because I/O on the device might not
...