Commit 96fa06f4 authored by Andrea Arcangeli, committed by Linus Torvalds

[PATCH] mm: rework lower-zone protection initialisation

- Rename various fields related to the lower-zone protection code to sync
  up with 2.4.

- Remove the automatic determination of the values of the per-zone
  protection levels from a single tunable.  Replace this with a simple
  per-zone sysctl.
Signed-off-by: Andrea Arcangeli <andrea@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent be1d0e0e
@@ -112,18 +112,14 @@ struct zone {
 	unsigned long		free_pages;
 	unsigned long		pages_min, pages_low, pages_high;
 	/*
-	 * protection[] is a pre-calculated number of extra pages that must be
-	 * available in a zone in order for __alloc_pages() to allocate memory
-	 * from the zone. i.e., for a GFP_KERNEL alloc of "order" there must
-	 * be "(1<<order) + protection[ZONE_NORMAL]" free pages in the zone
-	 * for us to choose to allocate the page from that zone.
-	 *
-	 * It uses both min_free_kbytes and sysctl_lower_zone_protection.
-	 * The protection values are recalculated if either of these values
-	 * change.  The array elements are in zonelist order:
-	 * [0] == GFP_DMA, [1] == GFP_KERNEL, [2] == GFP_HIGHMEM.
+	 * We don't know if the memory that we're going to allocate will be freeable
+	 * or/and it will be released eventually, so to avoid totally wasting several
+	 * GB of ram we must reserve some of the lower zone memory (otherwise we risk
+	 * to run OOM on the lower zones despite there's tons of freeable ram
+	 * on the higher zones). This array is recalculated at runtime if the
+	 * sysctl_lowmem_reserve_ratio sysctl changes.
 	 */
-	unsigned long protection[MAX_NR_ZONES];
+	unsigned long		lowmem_reserve[MAX_NR_ZONES];
 
 	struct per_cpu_pageset	pageset[NR_CPUS];
@@ -368,7 +364,8 @@ struct ctl_table;
 struct file;
 int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *,
 					void __user *, size_t *, loff_t *);
-int lower_zone_protection_sysctl_handler(struct ctl_table *, int, struct file *,
+extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
+int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *,
 					void __user *, size_t *, loff_t *);
 
 #include <linux/topology.h>
...
@@ -160,7 +160,7 @@ enum
 	VM_PAGEBUF=17,		/* struct: Control pagebuf parameters */
 	VM_HUGETLB_PAGES=18,	/* int: Number of available Huge Pages */
 	VM_SWAPPINESS=19,	/* Tendency to steal mapped memory */
-	VM_LOWER_ZONE_PROTECTION=20,/* Amount of protection of lower zones */
+	VM_LOWMEM_RESERVE_RATIO=20,/* reservation ratio for lower memory zones */
 	VM_MIN_FREE_KBYTES=21,	/* Minimum free kilobytes to maintain */
 	VM_MAX_MAP_COUNT=22,	/* int: Maximum number of mmaps/address-space */
 	VM_LAPTOP_MODE=23,	/* vm laptop mode */
...
@@ -61,7 +61,6 @@ extern int core_uses_pid;
 extern char core_pattern[];
 extern int cad_pid;
 extern int pid_max;
-extern int sysctl_lower_zone_protection;
 extern int min_free_kbytes;
 extern int printk_ratelimit_jiffies;
 extern int printk_ratelimit_burst;
@@ -745,14 +744,13 @@ static ctl_table vm_table[] = {
 	},
 #endif
 	{
-		.ctl_name	= VM_LOWER_ZONE_PROTECTION,
-		.procname	= "lower_zone_protection",
-		.data		= &sysctl_lower_zone_protection,
-		.maxlen		= sizeof(sysctl_lower_zone_protection),
+		.ctl_name	= VM_LOWMEM_RESERVE_RATIO,
+		.procname	= "lowmem_reserve_ratio",
+		.data		= &sysctl_lowmem_reserve_ratio,
+		.maxlen		= sizeof(sysctl_lowmem_reserve_ratio),
 		.mode		= 0644,
-		.proc_handler	= &lower_zone_protection_sysctl_handler,
+		.proc_handler	= &lowmem_reserve_ratio_sysctl_handler,
 		.strategy	= &sysctl_intvec,
-		.extra1		= &zero,
 	},
 	{
 		.ctl_name	= VM_MIN_FREE_KBYTES,
...
@@ -44,7 +44,15 @@ struct pglist_data *pgdat_list;
 unsigned long totalram_pages;
 unsigned long totalhigh_pages;
 long nr_swap_pages;
-int sysctl_lower_zone_protection = 0;
+/*
+ * results with 256, 32 in the lowmem_reserve sysctl:
+ *	1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
+ *	1G machine -> (16M dma, 784M normal, 224M high)
+ *	NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
+ *	HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
+ *	HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA
+ */
+int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 };
 
 EXPORT_SYMBOL(totalram_pages);
 EXPORT_SYMBOL(nr_swap_pages);
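The reserve figures in the comment above can be reproduced by hand. The following small userspace sketch is an editorial illustration only (not part of the patch); it assumes the 16M/784M/224M zone split from the comment, expressed in 4 KiB pages, and applies the same ratio arithmetic that setup_per_zone_lowmem_reserve() uses further down:

#include <stdio.h>

#define MAX_NR_ZONES 3

int main(void)
{
	/* Zone sizes from the comment above, in 4 KiB pages: 16M, 784M, 224M. */
	const char *name[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
	unsigned long present[MAX_NR_ZONES] = { 16UL * 256, 784UL * 256, 224UL * 256 };
	int ratio[MAX_NR_ZONES - 1] = { 256, 32 };	/* the sysctl defaults */
	unsigned long reserve[MAX_NR_ZONES][MAX_NR_ZONES] = { { 0 } };
	int j, idx;

	/* Same arithmetic as setup_per_zone_lowmem_reserve() further down:
	 * for each allocation class j, walk the lower zones and reserve the
	 * accumulated higher-zone memory divided by the lower zone's ratio. */
	for (j = 0; j < MAX_NR_ZONES; j++) {
		unsigned long pages = present[j];

		for (idx = j - 1; idx >= 0; idx--) {
			reserve[idx][j] = pages / ratio[idx];
			pages += present[idx];
		}
	}

	for (j = 0; j < MAX_NR_ZONES; j++)
		printf("%-8s lowmem_reserve[]: %lu %lu %lu (pages)\n", name[j],
		       reserve[j][0], reserve[j][1], reserve[j][2]);
	return 0;
}

It prints 784 pages (~3 MB) kept in ZONE_DMA against NORMAL allocations, 1008 pages (~3.9 MB) in ZONE_DMA and 1792 pages (7 MB) in ZONE_NORMAL against HIGHMEM allocations, i.e. the 784M/256, (224M+784M)/256 and 224M/32 figures from the comment.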
@@ -654,7 +662,7 @@ buffered_rmqueue(struct zone *zone, int order, int gfp_flags)
  * of the allocation.
  */
 int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
-		      int alloc_type, int can_try_harder, int gfp_high)
+		      int classzone_idx, int can_try_harder, int gfp_high)
 {
 	/* free_pages my go negative - that's OK */
 	long min = mark, free_pages = z->free_pages - (1 << order) + 1;
@@ -665,7 +673,7 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 	if (can_try_harder)
 		min -= min / 4;
 
-	if (free_pages <= min + z->protection[alloc_type])
+	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
 		return 0;
 	for (o = 0; o < order; o++) {
 		/* At the next order, this order's pages become unavailable */
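The changed line charges each request against the reserve indexed by its own classzone: zone z may satisfy the allocation only while it keeps lowmem_reserve[classzone_idx] pages free on top of the watermark. Below is a stripped-down, stand-alone sketch of just that comparison (editorial illustration with a hypothetical struct; the real function above additionally walks the lower orders):

#include <stdbool.h>

struct zone_sketch {
	long free_pages;
	unsigned long lowmem_reserve[3];	/* indexed by the allocation's classzone */
};

bool watermark_ok_sketch(const struct zone_sketch *z, unsigned long mark,
			 int classzone_idx)
{
	/* e.g. a GFP_HIGHMEM request (classzone_idx == 2) falling back to
	 * ZONE_DMA must leave mark + lowmem_reserve[2] pages untouched there. */
	return z->free_pages > (long)(mark + z->lowmem_reserve[classzone_idx]);
}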
@@ -682,19 +690,6 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 
 /*
  * This is the 'heart' of the zoned buddy allocator.
- *
- * Herein lies the mysterious "incremental min".  That's the
- *
- *	local_low = z->pages_low;
- *	min += local_low;
- *
- * thing.  The intent here is to provide additional protection to low zones for
- * allocation requests which _could_ use higher zones.  So a GFP_HIGHMEM
- * request is not allowed to dip as deeply into the normal zone as a GFP_KERNEL
- * request.  This preserves additional space in those lower zones for requests
- * which really do need memory from those zones.  It means that on a decent
- * sized machine, GFP_HIGHMEM and GFP_KERNEL requests basically leave the DMA
- * zone untouched.
  */
 struct page * fastcall
 __alloc_pages(unsigned int gfp_mask, unsigned int order,
@@ -706,7 +701,7 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 	struct reclaim_state reclaim_state;
 	struct task_struct *p = current;
 	int i;
-	int alloc_type;
+	int classzone_idx;
 	int do_retry;
 	int can_try_harder;
@@ -726,13 +721,13 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 		return NULL;
 	}
 
-	alloc_type = zone_idx(zones[0]);
+	classzone_idx = zone_idx(zones[0]);
 
 	/* Go through the zonelist once, looking for a zone with enough free */
 	for (i = 0; (z = zones[i]) != NULL; i++) {
 		if (!zone_watermark_ok(z, order, z->pages_low,
-				       alloc_type, 0, 0))
+				       classzone_idx, 0, 0))
 			continue;
 
 		page = buffered_rmqueue(z, order, gfp_mask);
@@ -749,7 +744,7 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 	 */
 	for (i = 0; (z = zones[i]) != NULL; i++) {
 		if (!zone_watermark_ok(z, order, z->pages_min,
-				       alloc_type, can_try_harder,
+				       classzone_idx, can_try_harder,
 				       gfp_mask & __GFP_HIGH))
 			continue;
@@ -787,7 +782,7 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 		/* go through the zonelist yet one more time */
 		for (i = 0; (z = zones[i]) != NULL; i++) {
 			if (!zone_watermark_ok(z, order, z->pages_min,
-					       alloc_type, can_try_harder,
+					       classzone_idx, can_try_harder,
 					       gfp_mask & __GFP_HIGH))
 				continue;
@@ -1210,9 +1205,9 @@ void show_free_areas(void)
 			zone->pages_scanned,
 			(zone->all_unreclaimable ? "yes" : "no")
 			);
-		printk("protections[]:");
+		printk("lowmem_reserve[]:");
 		for (i = 0; i < MAX_NR_ZONES; i++)
-			printk(" %lu", zone->protection[i]);
+			printk(" %lu", zone->lowmem_reserve[i]);
 		printk("\n");
 	}
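For the 1G example above, the per-zone line that show_free_areas() now emits would look roughly like this (illustrative values in pages; the zone each line belongs to is named by the surrounding per-zone block, shown here after the arrow for clarity only):

	lowmem_reserve[]: 0 784 1008    <- ZONE_DMA
	lowmem_reserve[]: 0 0 1792      <- ZONE_NORMAL
	lowmem_reserve[]: 0 0 0         <- ZONE_HIGHMEM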
@@ -1884,87 +1879,29 @@ void __init page_alloc_init(void)
 	hotcpu_notifier(page_alloc_cpu_notify, 0);
 }
 
-static unsigned long higherzone_val(struct zone *z, int max_zone,
-					int alloc_type)
-{
-	int z_idx = zone_idx(z);
-	struct zone *higherzone;
-	unsigned long pages;
-
-	/* there is no higher zone to get a contribution from */
-	if (z_idx == MAX_NR_ZONES-1)
-		return 0;
-
-	higherzone = &z->zone_pgdat->node_zones[z_idx+1];
-
-	/* We always start with the higher zone's protection value */
-	pages = higherzone->protection[alloc_type];
-
-	/*
-	 * We get a lower-zone-protection contribution only if there are
-	 * pages in the higher zone and if we're not the highest zone
-	 * in the current zonelist.  e.g., never happens for GFP_DMA. Happens
-	 * only for ZONE_DMA in a GFP_KERNEL allocation and happens for ZONE_DMA
-	 * and ZONE_NORMAL for a GFP_HIGHMEM allocation.
-	 */
-	if (higherzone->present_pages && z_idx < alloc_type)
-		pages += higherzone->pages_low * sysctl_lower_zone_protection;
-
-	return pages;
-}
-
 /*
- * setup_per_zone_protection - called whenver min_free_kbytes or
- * sysctl_lower_zone_protection changes.  Ensures that each zone
- * has a correct pages_protected value, so an adequate number of
- * pages are left in the zone after a successful __alloc_pages().
- *
- * This algorithm is way confusing.  I tries to keep the same behavior
- * as we had with the incremental min iterative algorithm.
+ * setup_per_zone_lowmem_reserve - called whenever
+ *	sysctl_lower_zone_reserve_ratio changes.  Ensures that each zone
+ *	has a correct pages reserved value, so an adequate number of
+ *	pages are left in the zone after a successful __alloc_pages().
  */
-static void setup_per_zone_protection(void)
+static void setup_per_zone_lowmem_reserve(void)
 {
 	struct pglist_data *pgdat;
-	struct zone *zones, *zone;
-	int max_zone;
-	int i, j;
+	int j, idx;
 
 	for_each_pgdat(pgdat) {
-		zones = pgdat->node_zones;
-
-		for (i = 0, max_zone = 0; i < MAX_NR_ZONES; i++)
-			if (zones[i].present_pages)
-				max_zone = i;
-
-		/*
-		 * For each of the different allocation types:
-		 * GFP_DMA -> GFP_KERNEL -> GFP_HIGHMEM
-		 */
-		for (i = 0; i < GFP_ZONETYPES; i++) {
-			/*
-			 * For each of the zones:
-			 * ZONE_HIGHMEM -> ZONE_NORMAL -> ZONE_DMA
-			 */
-			for (j = MAX_NR_ZONES-1; j >= 0; j--) {
-				zone = &zones[j];
-
-				/*
-				 * We never protect zones that don't have memory
-				 * in them (j>max_zone) or zones that aren't in
-				 * the zonelists for a certain type of
-				 * allocation (j>=i).  We have to assign these
-				 * to zero because the lower zones take
-				 * contributions from the higher zones.
-				 */
-				if (j > max_zone || j >= i) {
-					zone->protection[i] = 0;
-					continue;
-				}
-				/*
-				 * The contribution of the next higher zone
-				 */
-				zone->protection[i] = higherzone_val(zone,
-					max_zone, i);
-			}
+		for (j = 0; j < MAX_NR_ZONES; j++) {
+			struct zone * zone = pgdat->node_zones + j;
+			unsigned long present_pages = zone->present_pages;
+
+			zone->lowmem_reserve[j] = 0;
+
+			for (idx = j-1; idx >= 0; idx--) {
+				struct zone * lower_zone = pgdat->node_zones + idx;
+
+				lower_zone->lowmem_reserve[j] = present_pages / sysctl_lowmem_reserve_ratio[idx];
+				present_pages += lower_zone->present_pages;
+			}
 		}
 	}
 }
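The net effect of the new loop can be stated in closed form. For a zone i and an allocation class j higher up the zonelist:

	zone[i].lowmem_reserve[j] = (present_pages[i+1] + ... + present_pages[j]) / sysctl_lowmem_reserve_ratio[i]    for j > i

lowmem_reserve[j] is explicitly zeroed for j == i, and entries below a zone's own index are never written here. In other words, the more memory an allocation could have taken from higher zones, the more of each lower zone is kept out of its reach, scaled down by that lower zone's ratio; everything is derived purely from boot-time zone sizes, independent of min_free_kbytes.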
@@ -2059,7 +1996,7 @@ static int __init init_per_zone_pages_min(void)
 	if (min_free_kbytes > 65536)
 		min_free_kbytes = 65536;
 	setup_per_zone_pages_min();
-	setup_per_zone_protection();
+	setup_per_zone_lowmem_reserve();
 	return 0;
 }
 module_init(init_per_zone_pages_min)
@@ -2074,20 +2011,23 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
 {
 	proc_dointvec(table, write, file, buffer, length, ppos);
 	setup_per_zone_pages_min();
-	setup_per_zone_protection();
 	return 0;
 }
 
 /*
- * lower_zone_protection_sysctl_handler - just a wrapper around
- * proc_dointvec() so that we can call setup_per_zone_protection()
- * whenever sysctl_lower_zone_protection changes.
+ * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
+ *	proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
+ *	whenever sysctl_lowmem_reserve_ratio changes.
+ *
+ * The reserve ratio obviously has absolutely no relation with the
+ * pages_min watermarks. The lowmem reserve ratio can only make sense
+ * if in function of the boot time zone sizes.
  */
-int lower_zone_protection_sysctl_handler(ctl_table *table, int write,
+int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
 	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
 {
 	proc_dointvec_minmax(table, write, file, buffer, length, ppos);
-	setup_per_zone_protection();
+	setup_per_zone_lowmem_reserve();
 	return 0;
 }
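From userspace the new knob shows up as /proc/sys/vm/lowmem_reserve_ratio, one integer per lower zone with the DMA ratio first. A minimal tuning sketch, assuming a kernel with this patch applied (illustration only); writing the file goes through lowmem_reserve_ratio_sysctl_handler() above and recomputes every zone's lowmem_reserve[]:

#include <stdio.h>

int main(void)
{
	/* Halve the DMA ratio from 256 to 128: the reserve is
	 * present_pages / ratio, so a smaller ratio reserves MORE low
	 * memory. The second value is the ZONE_NORMAL ratio (default 32). */
	FILE *f = fopen("/proc/sys/vm/lowmem_reserve_ratio", "w");

	if (!f) {
		perror("lowmem_reserve_ratio");
		return 1;
	}
	fprintf(f, "128 32\n");
	return fclose(f) ? 1 : 0;
}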