Commit 6b19b0c2 authored by Tejun Heo

x86, percpu: setup reserved percpu area for x86_64

Impact: fix relocation overflow during module load

x86_64 uses 32bit relocations for symbol access, so static percpu
symbols, whether in core or modules, must be inside 2GB of the percpu
segment base, which the dynamic percpu allocator doesn't guarantee.
This patch makes x86_64 reserve PERCPU_MODULE_RESERVE bytes in the
first chunk so that module percpu areas are always allocated from the
first chunk which is always inside the relocatable range.

This problem exists for any percpu allocator but is easily triggered
when using the embedding allocator because the second chunk is located
beyond 2GB on it.

This patch also changes the meaning of PERCPU_DYNAMIC_RESERVE such
that it only indicates the size of the area to reserve for dynamic
allocation as static and dynamic areas can be separate.  The new
PERCPU_DYNAMIC_RESERVE is increased by 4k for both 32 and 64bits as
the reserved area separation eats away some allocatable space and
having slightly more headroom (currently between 4 and 8k after
minimal boot sans module area) makes sense for common case
performance.

x86_32 can address anywhere from anywhere and doesn't need reserving.

Mike Galbraith first reported the problem and bisected it to the
embedding percpu allocator commit.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Mike Galbraith <efault@gmx.de>
Reported-by: Jaswinder Singh Rajput <jaswinder@kernel.org>
parent edcb4639
...@@ -42,6 +42,19 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = { ...@@ -42,6 +42,19 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
}; };
EXPORT_SYMBOL(__per_cpu_offset); EXPORT_SYMBOL(__per_cpu_offset);
/*
* On x86_64 symbols referenced from code should be reachable using
* 32bit relocations. Reserve space for static percpu variables in
* modules so that they are always served from the first chunk which
* is located at the percpu segment base. On x86_32, anything can
* address anywhere. No need to reserve space in the first chunk.
*/
#ifdef CONFIG_X86_64
#define PERCPU_FIRST_CHUNK_RESERVE PERCPU_MODULE_RESERVE
#else
#define PERCPU_FIRST_CHUNK_RESERVE 0
#endif
/** /**
* pcpu_need_numa - determine percpu allocation needs to consider NUMA * pcpu_need_numa - determine percpu allocation needs to consider NUMA
* *
...@@ -141,7 +154,7 @@ static ssize_t __init setup_pcpu_remap(size_t static_size) ...@@ -141,7 +154,7 @@ static ssize_t __init setup_pcpu_remap(size_t static_size)
{ {
static struct vm_struct vm; static struct vm_struct vm;
pg_data_t *last; pg_data_t *last;
size_t ptrs_size; size_t ptrs_size, dyn_size;
unsigned int cpu; unsigned int cpu;
ssize_t ret; ssize_t ret;
...@@ -169,12 +182,14 @@ static ssize_t __init setup_pcpu_remap(size_t static_size) ...@@ -169,12 +182,14 @@ static ssize_t __init setup_pcpu_remap(size_t static_size)
* Currently supports only single page. Supporting multiple * Currently supports only single page. Supporting multiple
* pages won't be too difficult if it ever becomes necessary. * pages won't be too difficult if it ever becomes necessary.
*/ */
pcpur_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE); pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
PERCPU_DYNAMIC_RESERVE);
if (pcpur_size > PMD_SIZE) { if (pcpur_size > PMD_SIZE) {
pr_warning("PERCPU: static data is larger than large page, " pr_warning("PERCPU: static data is larger than large page, "
"can't use large page\n"); "can't use large page\n");
return -EINVAL; return -EINVAL;
} }
dyn_size = pcpur_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
/* allocate pointer array and alloc large pages */ /* allocate pointer array and alloc large pages */
ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0])); ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0]));
...@@ -217,8 +232,9 @@ static ssize_t __init setup_pcpu_remap(size_t static_size) ...@@ -217,8 +232,9 @@ static ssize_t __init setup_pcpu_remap(size_t static_size)
pr_info("PERCPU: Remapped at %p with large pages, static data " pr_info("PERCPU: Remapped at %p with large pages, static data "
"%zu bytes\n", vm.addr, static_size); "%zu bytes\n", vm.addr, static_size);
ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, 0, PMD_SIZE, ret = pcpu_setup_first_chunk(pcpur_get_page, static_size,
pcpur_size - static_size, vm.addr, NULL); PERCPU_FIRST_CHUNK_RESERVE,
PMD_SIZE, dyn_size, vm.addr, NULL);
goto out_free_ar; goto out_free_ar;
enomem: enomem:
...@@ -276,9 +292,10 @@ static ssize_t __init setup_pcpu_embed(size_t static_size) ...@@ -276,9 +292,10 @@ static ssize_t __init setup_pcpu_embed(size_t static_size)
return -EINVAL; return -EINVAL;
/* allocate and copy */ /* allocate and copy */
pcpue_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE); pcpue_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
PERCPU_DYNAMIC_RESERVE);
pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
dyn_size = pcpue_size - static_size; dyn_size = pcpue_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size, pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size,
PAGE_SIZE); PAGE_SIZE);
...@@ -297,7 +314,8 @@ static ssize_t __init setup_pcpu_embed(size_t static_size) ...@@ -297,7 +314,8 @@ static ssize_t __init setup_pcpu_embed(size_t static_size)
pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size);
return pcpu_setup_first_chunk(pcpue_get_page, static_size, 0, return pcpu_setup_first_chunk(pcpue_get_page, static_size,
PERCPU_FIRST_CHUNK_RESERVE,
pcpue_unit_size, dyn_size, pcpue_unit_size, dyn_size,
pcpue_ptr, NULL); pcpue_ptr, NULL);
} }
...@@ -356,8 +374,9 @@ static ssize_t __init setup_pcpu_4k(size_t static_size) ...@@ -356,8 +374,9 @@ static ssize_t __init setup_pcpu_4k(size_t static_size)
pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
pcpu4k_nr_static_pages, static_size); pcpu4k_nr_static_pages, static_size);
ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, 0, -1, -1, ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size,
NULL, pcpu4k_populate_pte); PERCPU_FIRST_CHUNK_RESERVE, -1, -1, NULL,
pcpu4k_populate_pte);
goto out_free_ar; goto out_free_ar;
enomem: enomem:
......
...@@ -85,31 +85,20 @@ ...@@ -85,31 +85,20 @@
/* /*
* PERCPU_DYNAMIC_RESERVE indicates the amount of free area to piggy * PERCPU_DYNAMIC_RESERVE indicates the amount of free area to piggy
* back on the first chunk if arch is manually allocating and mapping * back on the first chunk for dynamic percpu allocation if arch is
* it for faster access (as a part of large page mapping for example). * manually allocating and mapping it for faster access (as a part of
* Note that dynamic percpu allocator covers both static and dynamic * large page mapping for example).
* areas, so these values are bigger than PERCPU_MODULE_RESERVE.
* *
* On typical configuration with modules, the following values leave * The following values give between one and two pages of free space
* about 8k of free space on the first chunk after boot on both x86_32 * after typical minimal boot (2-way SMP, single disk and NIC) with
* and 64 when module support is enabled. When module support is * both defconfig and a distro config on x86_64 and 32. More
* disabled, it's much tighter. * intelligent way to determine this would be nice.
*/ */
#ifndef PERCPU_DYNAMIC_RESERVE #if BITS_PER_LONG > 32
# if BITS_PER_LONG > 32 #define PERCPU_DYNAMIC_RESERVE (20 << 10)
# ifdef CONFIG_MODULES #else
# define PERCPU_DYNAMIC_RESERVE (24 << 10) #define PERCPU_DYNAMIC_RESERVE (12 << 10)
# else #endif
# define PERCPU_DYNAMIC_RESERVE (16 << 10)
# endif
# else
# ifdef CONFIG_MODULES
# define PERCPU_DYNAMIC_RESERVE (16 << 10)
# else
# define PERCPU_DYNAMIC_RESERVE (8 << 10)
# endif
# endif
#endif /* PERCPU_DYNAMIC_RESERVE */
extern void *pcpu_base_addr; extern void *pcpu_base_addr;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment