Commit 74f44822 authored by Mel Gorman's avatar Mel Gorman Committed by Linus Torvalds

mm/page_alloc: introduce vm.percpu_pagelist_high_fraction

This introduces a new sysctl vm.percpu_pagelist_high_fraction.  It is
similar to the old vm.percpu_pagelist_fraction.  The old sysctl increased
both pcp->batch and pcp->high with the higher pcp->high potentially
reducing zone->lock contention.  However, the higher pcp->batch value also
potentially increased allocation latency while the PCP was refilled.  This
sysctl only adjusts pcp->high so that zone->lock contention is potentially
reduced but allocation latency during a PCP refill remains the same.

  # grep -E "high:|batch" /proc/zoneinfo | tail -2
              high:  649
              batch: 63

  # sysctl vm.percpu_pagelist_high_fraction=8
  # grep -E "high:|batch" /proc/zoneinfo | tail -2
              high:  35071
              batch: 63

  # sysctl vm.percpu_pagelist_high_fraction=64
              high:  4383
              batch: 63

  # sysctl vm.percpu_pagelist_high_fraction=0
              high:  649
              batch: 63

[mgorman@techsingularity.net: fix documentation]
  Link: https://lkml.kernel.org/r/20210528151010.GQ30378@techsingularity.net

Link: https://lkml.kernel.org/r/20210525080119.5455-7-mgorman@techsingularity.netSigned-off-by: default avatarMel Gorman <mgorman@techsingularity.net>
Acked-by: default avatarDave Hansen <dave.hansen@linux.intel.com>
Acked-by: default avatarVlastimil Babka <vbabka@suse.cz>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
parent c49c2c47
......@@ -64,6 +64,7 @@ Currently, these files are in /proc/sys/vm:
- overcommit_ratio
- page-cluster
- panic_on_oom
- percpu_pagelist_high_fraction
- stat_interval
- stat_refresh
- numa_stat
......@@ -789,6 +790,26 @@ panic_on_oom=2+kdump gives you very strong tool to investigate
why oom happens. You can get snapshot.
percpu_pagelist_high_fraction
=============================
This is the fraction of pages in each zone that are can be stored to
per-cpu page lists. It is an upper boundary that is divided depending
on the number of online CPUs. The min value for this is 8 which means
that we do not allow more than 1/8th of pages in each zone to be stored
on per-cpu page lists. This entry only changes the value of hot per-cpu
page lists. A user can specify a number like 100 to allocate 1/100th of
each zone between per-cpu lists.
The batch value of each per-cpu page list remains the same regardless of
the value of the high fraction so allocation latencies are unaffected.
The initial value is zero. Kernel uses this value to set the high pcp->high
mark based on the low watermark for the zone and the number of local
online CPUs. If the user writes '0' to this sysctl, it will revert to
this default behavior.
stat_interval
=============
......
......@@ -1029,12 +1029,15 @@ int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, void *,
extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES];
int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, void *,
size_t *, loff_t *);
int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *, int,
void *, size_t *, loff_t *);
int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int,
void *, size_t *, loff_t *);
int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
void *, size_t *, loff_t *);
int numa_zonelist_order_handler(struct ctl_table *, int,
void *, size_t *, loff_t *);
extern int percpu_pagelist_high_fraction;
extern char numa_zonelist_order[];
#define NUMA_ZONELIST_ORDER_LEN 16
......
......@@ -2908,6 +2908,14 @@ static struct ctl_table vm_table[] = {
.extra1 = SYSCTL_ONE,
.extra2 = &one_thousand,
},
{
.procname = "percpu_pagelist_high_fraction",
.data = &percpu_pagelist_high_fraction,
.maxlen = sizeof(percpu_pagelist_high_fraction),
.mode = 0644,
.proc_handler = percpu_pagelist_high_fraction_sysctl_handler,
.extra1 = SYSCTL_ZERO,
},
{
.procname = "page_lock_unfairness",
.data = &sysctl_page_lock_unfairness,
......
......@@ -120,6 +120,7 @@ typedef int __bitwise fpi_t;
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
struct pagesets {
local_lock_t lock;
......@@ -192,6 +193,7 @@ EXPORT_SYMBOL(_totalram_pages);
unsigned long totalreserve_pages __read_mostly;
unsigned long totalcma_pages __read_mostly;
int percpu_pagelist_high_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
EXPORT_SYMBOL(init_on_alloc);
......@@ -6725,17 +6727,32 @@ static int zone_highsize(struct zone *zone, int batch, int cpu_online)
#ifdef CONFIG_MMU
int high;
int nr_local_cpus;
unsigned long total_pages;
if (!percpu_pagelist_high_fraction) {
/*
* By default, the high value of the pcp is based on the zone
* low watermark so that if they are full then background
* reclaim will not be started prematurely.
*/
total_pages = low_wmark_pages(zone);
} else {
/*
* If percpu_pagelist_high_fraction is configured, the high
* value is based on a fraction of the managed pages in the
* zone.
*/
total_pages = zone_managed_pages(zone) / percpu_pagelist_high_fraction;
}
/*
* The high value of the pcp is based on the zone low watermark
* so that if they are full then background reclaim will not be
* started prematurely. The value is split across all online CPUs
* local to the zone. Note that early in boot that CPUs may not be
* online yet and that during CPU hotplug that the cpumask is not
* yet updated when a CPU is being onlined.
* Split the high value across all online CPUs local to the zone. Note
* that early in boot that CPUs may not be online yet and that during
* CPU hotplug that the cpumask is not yet updated when a CPU is being
* onlined.
*/
nr_local_cpus = max(1U, cpumask_weight(cpumask_of_node(zone_to_nid(zone)))) + cpu_online;
high = low_wmark_pages(zone) / nr_local_cpus;
high = total_pages / nr_local_cpus;
/*
* Ensure high is at least batch*4. The multiple is based on the
......@@ -8500,6 +8517,44 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
return 0;
}
/*
* percpu_pagelist_high_fraction - changes the pcp->high for each zone on each
* cpu. It is the fraction of total pages in each zone that a hot per cpu
* pagelist can have before it gets flushed back to buddy allocator.
*/
int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *table,
int write, void *buffer, size_t *length, loff_t *ppos)
{
struct zone *zone;
int old_percpu_pagelist_high_fraction;
int ret;
mutex_lock(&pcp_batch_high_lock);
old_percpu_pagelist_high_fraction = percpu_pagelist_high_fraction;
ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
if (!write || ret < 0)
goto out;
/* Sanity checking to avoid pcp imbalance */
if (percpu_pagelist_high_fraction &&
percpu_pagelist_high_fraction < MIN_PERCPU_PAGELIST_HIGH_FRACTION) {
percpu_pagelist_high_fraction = old_percpu_pagelist_high_fraction;
ret = -EINVAL;
goto out;
}
/* No change? */
if (percpu_pagelist_high_fraction == old_percpu_pagelist_high_fraction)
goto out;
for_each_populated_zone(zone)
zone_set_pageset_high_and_batch(zone, 0);
out:
mutex_unlock(&pcp_batch_high_lock);
return ret;
}
#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
/*
* Returns the number of pages that arch has reserved but
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment