Commit 96fa06f4 authored by Andrea Arcangeli, committed by Linus Torvalds

[PATCH] mm: rework lower-zone protection initialisation

- Rename various fields related to the lower-zone protection code to sync
  up with 2.4.

- Remove the automatic determination of the values of the per-zone
  protection levels from a single tunable.  Replace this with a simple
  per-zone sysctl.
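
For illustration only (not part of the patch): a minimal userspace sketch, assuming the default ratios { 256, 32 } and a hypothetical 1G layout (16M DMA, 784M normal, 224M highmem, 4K pages - the same example used in the mm/page_alloc.c comment below), of how each zone's lowmem_reserve[] would come out. The standalone program and its zone sizes are assumptions for the example; only the inner loop mirrors the walk performed by setup_per_zone_lowmem_reserve().

#include <stdio.h>

#define MAX_NR_ZONES 3	/* DMA, NORMAL, HIGHMEM */

int main(void)
{
	/* hypothetical zone sizes in 4K pages: 16M DMA, 784M normal, 224M high */
	unsigned long present[MAX_NR_ZONES] = { 4096, 200704, 57344 };
	/* default lowmem_reserve_ratio values from the patch */
	unsigned long ratio[MAX_NR_ZONES - 1] = { 256, 32 };
	/* reserve[zone][classzone_idx], like zone->lowmem_reserve[] */
	unsigned long reserve[MAX_NR_ZONES][MAX_NR_ZONES] = { { 0 } };
	int j, idx;

	/*
	 * Same walk as setup_per_zone_lowmem_reserve(): for an allocation
	 * whose classzone is j, every lower zone idx reserves
	 * (pages sitting above idx) / ratio[idx] of its own memory.
	 */
	for (j = 0; j < MAX_NR_ZONES; j++) {
		unsigned long pages = present[j];

		for (idx = j - 1; idx >= 0; idx--) {
			reserve[idx][j] = pages / ratio[idx];
			pages += present[idx];
		}
	}

	/*
	 * Prints 784 and 1008 pages (~3M and ~4M) reserved in DMA against
	 * NORMAL and HIGHMEM allocations, and 1792 pages (7M) reserved in
	 * NORMAL against HIGHMEM allocations.
	 */
	for (j = 0; j < MAX_NR_ZONES; j++) {
		printf("zone %d lowmem_reserve[]:", j);
		for (idx = 0; idx < MAX_NR_ZONES; idx++)
			printf(" %lu", reserve[j][idx]);
		printf("\n");
	}
	return 0;
}
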
Signed-off-by: Andrea Arcangeli <andrea@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent be1d0e0e
@@ -112,18 +112,14 @@ struct zone {
 	unsigned long free_pages;
 	unsigned long pages_min, pages_low, pages_high;
 	/*
-	 * protection[] is a pre-calculated number of extra pages that must be
-	 * available in a zone in order for __alloc_pages() to allocate memory
-	 * from the zone. i.e., for a GFP_KERNEL alloc of "order" there must
-	 * be "(1<<order) + protection[ZONE_NORMAL]" free pages in the zone
-	 * for us to choose to allocate the page from that zone.
-	 *
-	 * It uses both min_free_kbytes and sysctl_lower_zone_protection.
-	 * The protection values are recalculated if either of these values
-	 * change. The array elements are in zonelist order:
-	 * [0] == GFP_DMA, [1] == GFP_KERNEL, [2] == GFP_HIGHMEM.
+	 * We don't know if the memory that we're going to allocate will be freeable
+	 * or/and it will be released eventually, so to avoid totally wasting several
+	 * GB of ram we must reserve some of the lower zone memory (otherwise we risk
+	 * to run OOM on the lower zones despite there's tons of freeable ram
+	 * on the higher zones). This array is recalculated at runtime if the
+	 * sysctl_lowmem_reserve_ratio sysctl changes.
 	 */
-	unsigned long protection[MAX_NR_ZONES];
+	unsigned long lowmem_reserve[MAX_NR_ZONES];
 	struct per_cpu_pageset pageset[NR_CPUS];
@@ -368,7 +364,8 @@ struct ctl_table;
 struct file;
 int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *,
 	void __user *, size_t *, loff_t *);
-int lower_zone_protection_sysctl_handler(struct ctl_table *, int, struct file *,
+extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
+int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *,
 	void __user *, size_t *, loff_t *);
 #include <linux/topology.h>
@@ -160,7 +160,7 @@ enum
 	VM_PAGEBUF=17, /* struct: Control pagebuf parameters */
 	VM_HUGETLB_PAGES=18, /* int: Number of available Huge Pages */
 	VM_SWAPPINESS=19, /* Tendency to steal mapped memory */
-	VM_LOWER_ZONE_PROTECTION=20,/* Amount of protection of lower zones */
+	VM_LOWMEM_RESERVE_RATIO=20,/* reservation ratio for lower memory zones */
 	VM_MIN_FREE_KBYTES=21, /* Minimum free kilobytes to maintain */
 	VM_MAX_MAP_COUNT=22, /* int: Maximum number of mmaps/address-space */
 	VM_LAPTOP_MODE=23, /* vm laptop mode */
@@ -61,7 +61,6 @@ extern int core_uses_pid;
 extern char core_pattern[];
 extern int cad_pid;
 extern int pid_max;
-extern int sysctl_lower_zone_protection;
 extern int min_free_kbytes;
 extern int printk_ratelimit_jiffies;
 extern int printk_ratelimit_burst;
@@ -745,14 +744,13 @@ static ctl_table vm_table[] = {
 	},
 #endif
 	{
-		.ctl_name = VM_LOWER_ZONE_PROTECTION,
-		.procname = "lower_zone_protection",
-		.data = &sysctl_lower_zone_protection,
-		.maxlen = sizeof(sysctl_lower_zone_protection),
+		.ctl_name = VM_LOWMEM_RESERVE_RATIO,
+		.procname = "lowmem_reserve_ratio",
+		.data = &sysctl_lowmem_reserve_ratio,
+		.maxlen = sizeof(sysctl_lowmem_reserve_ratio),
 		.mode = 0644,
-		.proc_handler = &lower_zone_protection_sysctl_handler,
+		.proc_handler = &lowmem_reserve_ratio_sysctl_handler,
 		.strategy = &sysctl_intvec,
-		.extra1 = &zero,
 	},
 	{
 		.ctl_name = VM_MIN_FREE_KBYTES,
@@ -44,7 +44,15 @@ struct pglist_data *pgdat_list;
 unsigned long totalram_pages;
 unsigned long totalhigh_pages;
 long nr_swap_pages;
-int sysctl_lower_zone_protection = 0;
+/*
+ * results with 256, 32 in the lowmem_reserve sysctl:
+ * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
+ * 1G machine -> (16M dma, 784M normal, 224M high)
+ * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
+ * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
+ * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA
+ */
+int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 };
 EXPORT_SYMBOL(totalram_pages);
 EXPORT_SYMBOL(nr_swap_pages);
@@ -654,7 +662,7 @@ buffered_rmqueue(struct zone *zone, int order, int gfp_flags)
  * of the allocation.
  */
 int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
-		int alloc_type, int can_try_harder, int gfp_high)
+		int classzone_idx, int can_try_harder, int gfp_high)
 {
 	/* free_pages my go negative - that's OK */
 	long min = mark, free_pages = z->free_pages - (1 << order) + 1;
@@ -665,7 +673,7 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 	if (can_try_harder)
 		min -= min / 4;
-	if (free_pages <= min + z->protection[alloc_type])
+	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
 		return 0;
 	for (o = 0; o < order; o++) {
 		/* At the next order, this order's pages become unavailable */
@@ -682,19 +690,6 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 /*
  * This is the 'heart' of the zoned buddy allocator.
- *
- * Herein lies the mysterious "incremental min". That's the
- *
- *	local_low = z->pages_low;
- *	min += local_low;
- *
- * thing. The intent here is to provide additional protection to low zones for
- * allocation requests which _could_ use higher zones. So a GFP_HIGHMEM
- * request is not allowed to dip as deeply into the normal zone as a GFP_KERNEL
- * request. This preserves additional space in those lower zones for requests
- * which really do need memory from those zones. It means that on a decent
- * sized machine, GFP_HIGHMEM and GFP_KERNEL requests basically leave the DMA
- * zone untouched.
  */
 struct page * fastcall
 __alloc_pages(unsigned int gfp_mask, unsigned int order,
@@ -706,7 +701,7 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 	struct reclaim_state reclaim_state;
 	struct task_struct *p = current;
 	int i;
-	int alloc_type;
+	int classzone_idx;
 	int do_retry;
 	int can_try_harder;
@@ -726,13 +721,13 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 		return NULL;
 	}
-	alloc_type = zone_idx(zones[0]);
+	classzone_idx = zone_idx(zones[0]);
 	/* Go through the zonelist once, looking for a zone with enough free */
 	for (i = 0; (z = zones[i]) != NULL; i++) {
 		if (!zone_watermark_ok(z, order, z->pages_low,
-				alloc_type, 0, 0))
+				classzone_idx, 0, 0))
 			continue;
 		page = buffered_rmqueue(z, order, gfp_mask);
@@ -749,8 +744,8 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 	 */
 	for (i = 0; (z = zones[i]) != NULL; i++) {
 		if (!zone_watermark_ok(z, order, z->pages_min,
-				alloc_type, can_try_harder,
-				gfp_mask & __GFP_HIGH))
+				classzone_idx, can_try_harder,
+				gfp_mask & __GFP_HIGH))
 			continue;
 		page = buffered_rmqueue(z, order, gfp_mask);
@@ -787,8 +782,8 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 	/* go through the zonelist yet one more time */
 	for (i = 0; (z = zones[i]) != NULL; i++) {
 		if (!zone_watermark_ok(z, order, z->pages_min,
-				alloc_type, can_try_harder,
-				gfp_mask & __GFP_HIGH))
+				classzone_idx, can_try_harder,
+				gfp_mask & __GFP_HIGH))
 			continue;
 		page = buffered_rmqueue(z, order, gfp_mask);
@@ -1210,9 +1205,9 @@ void show_free_areas(void)
 			zone->pages_scanned,
 			(zone->all_unreclaimable ? "yes" : "no")
 			);
-		printk("protections[]:");
+		printk("lowmem_reserve[]:");
 		for (i = 0; i < MAX_NR_ZONES; i++)
-			printk(" %lu", zone->protection[i]);
+			printk(" %lu", zone->lowmem_reserve[i]);
 		printk("\n");
 	}
@@ -1884,87 +1879,29 @@ void __init page_alloc_init(void)
 	hotcpu_notifier(page_alloc_cpu_notify, 0);
 }
-static unsigned long higherzone_val(struct zone *z, int max_zone,
-		int alloc_type)
-{
-	int z_idx = zone_idx(z);
-	struct zone *higherzone;
-	unsigned long pages;
-	/* there is no higher zone to get a contribution from */
-	if (z_idx == MAX_NR_ZONES-1)
-		return 0;
-	higherzone = &z->zone_pgdat->node_zones[z_idx+1];
-	/* We always start with the higher zone's protection value */
-	pages = higherzone->protection[alloc_type];
-	/*
-	 * We get a lower-zone-protection contribution only if there are
-	 * pages in the higher zone and if we're not the highest zone
-	 * in the current zonelist. e.g., never happens for GFP_DMA. Happens
-	 * only for ZONE_DMA in a GFP_KERNEL allocation and happens for ZONE_DMA
-	 * and ZONE_NORMAL for a GFP_HIGHMEM allocation.
-	 */
-	if (higherzone->present_pages && z_idx < alloc_type)
-		pages += higherzone->pages_low * sysctl_lower_zone_protection;
-	return pages;
-}
 /*
- * setup_per_zone_protection - called whenver min_free_kbytes or
- *	sysctl_lower_zone_protection changes. Ensures that each zone
- *	has a correct pages_protected value, so an adequate number of
+ * setup_per_zone_lowmem_reserve - called whenever
+ *	sysctl_lower_zone_reserve_ratio changes. Ensures that each zone
+ *	has a correct pages reserved value, so an adequate number of
  *	pages are left in the zone after a successful __alloc_pages().
- *
- *	This algorithm is way confusing. I tries to keep the same behavior
- *	as we had with the incremental min iterative algorithm.
  */
-static void setup_per_zone_protection(void)
+static void setup_per_zone_lowmem_reserve(void)
 {
 	struct pglist_data *pgdat;
-	struct zone *zones, *zone;
-	int max_zone;
-	int i, j;
+	int j, idx;
 	for_each_pgdat(pgdat) {
-		zones = pgdat->node_zones;
+		for (j = 0; j < MAX_NR_ZONES; j++) {
+			struct zone * zone = pgdat->node_zones + j;
+			unsigned long present_pages = zone->present_pages;
-		for (i = 0, max_zone = 0; i < MAX_NR_ZONES; i++)
-			if (zones[i].present_pages)
-				max_zone = i;
+			zone->lowmem_reserve[j] = 0;
-		/*
-		 * For each of the different allocation types:
-		 * GFP_DMA -> GFP_KERNEL -> GFP_HIGHMEM
-		 */
-		for (i = 0; i < GFP_ZONETYPES; i++) {
-			/*
-			 * For each of the zones:
-			 * ZONE_HIGHMEM -> ZONE_NORMAL -> ZONE_DMA
-			 */
-			for (j = MAX_NR_ZONES-1; j >= 0; j--) {
-				zone = &zones[j];
-				/*
-				 * We never protect zones that don't have memory
-				 * in them (j>max_zone) or zones that aren't in
-				 * the zonelists for a certain type of
-				 * allocation (j>=i). We have to assign these
-				 * to zero because the lower zones take
-				 * contributions from the higher zones.
-				 */
-				if (j > max_zone || j >= i) {
-					zone->protection[i] = 0;
-					continue;
-				}
-				/*
-				 * The contribution of the next higher zone
-				 */
-				zone->protection[i] = higherzone_val(zone,
-						max_zone, i);
+			for (idx = j-1; idx >= 0; idx--) {
+				struct zone * lower_zone = pgdat->node_zones + idx;
+				lower_zone->lowmem_reserve[j] = present_pages / sysctl_lowmem_reserve_ratio[idx];
+				present_pages += lower_zone->present_pages;
 			}
 		}
 	}
@@ -2059,7 +1996,7 @@ static int __init init_per_zone_pages_min(void)
 	if (min_free_kbytes > 65536)
 		min_free_kbytes = 65536;
 	setup_per_zone_pages_min();
-	setup_per_zone_protection();
+	setup_per_zone_lowmem_reserve();
 	return 0;
 }
 module_init(init_per_zone_pages_min)
@@ -2074,20 +2011,23 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
 {
 	proc_dointvec(table, write, file, buffer, length, ppos);
 	setup_per_zone_pages_min();
-	setup_per_zone_protection();
 	return 0;
 }
 /*
- * lower_zone_protection_sysctl_handler - just a wrapper around
- *	proc_dointvec() so that we can call setup_per_zone_protection()
- *	whenever sysctl_lower_zone_protection changes.
+ * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
+ *	proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
+ *	whenever sysctl_lowmem_reserve_ratio changes.
+ *
+ * The reserve ratio obviously has absolutely no relation with the
+ * pages_min watermarks. The lowmem reserve ratio can only make sense
+ * if in function of the boot time zone sizes.
  */
-int lower_zone_protection_sysctl_handler(ctl_table *table, int write,
+int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
 	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
 {
 	proc_dointvec_minmax(table, write, file, buffer, length, ppos);
-	setup_per_zone_protection();
+	setup_per_zone_lowmem_reserve();
 	return 0;
 }