Commit ccea34b5 authored by Tejun Heo's avatar Tejun Heo

percpu: finer grained locking to break deadlock and allow atomic free

Impact: fix deadlock and allow atomic free

Percpu allocation always uses GFP_KERNEL and whole alloc/free paths
were protected by single mutex.  All percpu allocations have been from
GFP_KERNEL-safe context and the original allocator had this assumption
too.  However, by protecting both alloc and free paths with the same
mutex, the new allocator creates free -> alloc -> GFP_KERNEL
dependency which the original allocator didn't have.  This can lead to
deadlock if free is called from FS or IO paths.  Also, in general,
allocators are expected to allow free to be called from atomic
context.

This patch implements finer grained locking to break the deadlock and
allow atomic free.  For details, please read the "Synchronization
rules" comment.

While at it, also add CONTEXT: to function comments to describe which
context they expect to be called from and what they do to it.

This problem was reported by Thomas Gleixner and Peter Zijlstra.

  http://thread.gmane.org/gmane.linux.kernel/802384Signed-off-by: default avatarTejun Heo <tj@kernel.org>
Reported-by: default avatarThomas Gleixner <tglx@linutronix.de>
Reported-by: default avatarPeter Zijlstra <peterz@infradead.org>
parent a56dbddf
...@@ -62,6 +62,7 @@ ...@@ -62,6 +62,7 @@
#include <linux/pfn.h> #include <linux/pfn.h>
#include <linux/rbtree.h> #include <linux/rbtree.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h> #include <linux/vmalloc.h>
#include <linux/workqueue.h> #include <linux/workqueue.h>
...@@ -101,20 +102,28 @@ static struct pcpu_chunk *pcpu_reserved_chunk; ...@@ -101,20 +102,28 @@ static struct pcpu_chunk *pcpu_reserved_chunk;
static int pcpu_reserved_chunk_limit; static int pcpu_reserved_chunk_limit;
/* /*
* One mutex to rule them all. * Synchronization rules.
* *
* The following mutex is grabbed in the outermost public alloc/free * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former
* interface functions and released only when the operation is * protects allocation/reclaim paths, chunks and chunk->page arrays.
* complete. As such, every function in this file other than the * The latter is a spinlock and protects the index data structures -
* outermost functions are called under pcpu_mutex. * chunk slots, rbtree, chunks and area maps in chunks.
* *
* It can easily be switched to use spinlock such that only the area * During allocation, pcpu_alloc_mutex is kept locked all the time and
* allocation and page population commit are protected with it doing * pcpu_lock is grabbed and released as necessary. All actual memory
* actual [de]allocation without holding any lock. However, given * allocations are done using GFP_KERNEL with pcpu_lock released.
* what this allocator does, I think it's better to let them run *
* sequentially. * Free path accesses and alters only the index data structures, so it
* can be safely called from atomic context. When memory needs to be
* returned to the system, free path schedules reclaim_work which
* grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be
* reclaimed, release both locks and frees the chunks. Note that it's
* necessary to grab both locks to remove a chunk from circulation as
* allocation path might be referencing the chunk with only
* pcpu_alloc_mutex locked.
*/ */
static DEFINE_MUTEX(pcpu_mutex); static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */
static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */
static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */ static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */
...@@ -176,6 +185,9 @@ static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, ...@@ -176,6 +185,9 @@ static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
* kzalloc() is used; otherwise, vmalloc() is used. The returned * kzalloc() is used; otherwise, vmalloc() is used. The returned
* memory is always zeroed. * memory is always zeroed.
* *
* CONTEXT:
* Does GFP_KERNEL allocation.
*
* RETURNS: * RETURNS:
* Pointer to the allocated area on success, NULL on failure. * Pointer to the allocated area on success, NULL on failure.
*/ */
...@@ -215,6 +227,9 @@ static void pcpu_mem_free(void *ptr, size_t size) ...@@ -215,6 +227,9 @@ static void pcpu_mem_free(void *ptr, size_t size)
* New slot according to the changed state is determined and @chunk is * New slot according to the changed state is determined and @chunk is
* moved to the slot. Note that the reserved chunk is never put on * moved to the slot. Note that the reserved chunk is never put on
* chunk slots. * chunk slots.
*
* CONTEXT:
* pcpu_lock.
*/ */
static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
{ {
...@@ -260,6 +275,9 @@ static struct rb_node **pcpu_chunk_rb_search(void *addr, ...@@ -260,6 +275,9 @@ static struct rb_node **pcpu_chunk_rb_search(void *addr,
* searchs for the chunk with the highest start address which isn't * searchs for the chunk with the highest start address which isn't
* beyond @addr. * beyond @addr.
* *
* CONTEXT:
* pcpu_lock.
*
* RETURNS: * RETURNS:
* The address of the found chunk. * The address of the found chunk.
*/ */
...@@ -300,6 +318,9 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) ...@@ -300,6 +318,9 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
* @new: chunk to insert * @new: chunk to insert
* *
* Insert @new into address rb tree. * Insert @new into address rb tree.
*
* CONTEXT:
* pcpu_lock.
*/ */
static void pcpu_chunk_addr_insert(struct pcpu_chunk *new) static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
{ {
...@@ -319,6 +340,10 @@ static void pcpu_chunk_addr_insert(struct pcpu_chunk *new) ...@@ -319,6 +340,10 @@ static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
* A single allocation can split an area into three areas, so this * A single allocation can split an area into three areas, so this
* function makes sure that @chunk->map has at least two extra slots. * function makes sure that @chunk->map has at least two extra slots.
* *
* CONTEXT:
* pcpu_alloc_mutex, pcpu_lock. pcpu_lock is released and reacquired
* if area map is extended.
*
* RETURNS: * RETURNS:
* 0 if noop, 1 if successfully extended, -errno on failure. * 0 if noop, 1 if successfully extended, -errno on failure.
*/ */
...@@ -332,13 +357,25 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk) ...@@ -332,13 +357,25 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
if (chunk->map_alloc >= chunk->map_used + 2) if (chunk->map_alloc >= chunk->map_used + 2)
return 0; return 0;
spin_unlock_irq(&pcpu_lock);
new_alloc = PCPU_DFL_MAP_ALLOC; new_alloc = PCPU_DFL_MAP_ALLOC;
while (new_alloc < chunk->map_used + 2) while (new_alloc < chunk->map_used + 2)
new_alloc *= 2; new_alloc *= 2;
new = pcpu_mem_alloc(new_alloc * sizeof(new[0])); new = pcpu_mem_alloc(new_alloc * sizeof(new[0]));
if (!new) if (!new) {
spin_lock_irq(&pcpu_lock);
return -ENOMEM; return -ENOMEM;
}
/*
* Acquire pcpu_lock and switch to new area map. Only free
* could have happened inbetween, so map_used couldn't have
* grown.
*/
spin_lock_irq(&pcpu_lock);
BUG_ON(new_alloc < chunk->map_used + 2);
size = chunk->map_alloc * sizeof(chunk->map[0]); size = chunk->map_alloc * sizeof(chunk->map[0]);
memcpy(new, chunk->map, size); memcpy(new, chunk->map, size);
...@@ -371,6 +408,9 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk) ...@@ -371,6 +408,9 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
* is inserted after the target block. * is inserted after the target block.
* *
* @chunk->map must have enough free slots to accomodate the split. * @chunk->map must have enough free slots to accomodate the split.
*
* CONTEXT:
* pcpu_lock.
*/ */
static void pcpu_split_block(struct pcpu_chunk *chunk, int i, static void pcpu_split_block(struct pcpu_chunk *chunk, int i,
int head, int tail) int head, int tail)
...@@ -406,6 +446,9 @@ static void pcpu_split_block(struct pcpu_chunk *chunk, int i, ...@@ -406,6 +446,9 @@ static void pcpu_split_block(struct pcpu_chunk *chunk, int i,
* *
* @chunk->map must have at least two free slots. * @chunk->map must have at least two free slots.
* *
* CONTEXT:
* pcpu_lock.
*
* RETURNS: * RETURNS:
* Allocated offset in @chunk on success, -1 if no matching area is * Allocated offset in @chunk on success, -1 if no matching area is
* found. * found.
...@@ -495,6 +538,9 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) ...@@ -495,6 +538,9 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
* Free area starting from @freeme to @chunk. Note that this function * Free area starting from @freeme to @chunk. Note that this function
* only modifies the allocation map. It doesn't depopulate or unmap * only modifies the allocation map. It doesn't depopulate or unmap
* the area. * the area.
*
* CONTEXT:
* pcpu_lock.
*/ */
static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
{ {
...@@ -580,6 +626,9 @@ static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, ...@@ -580,6 +626,9 @@ static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
* For each cpu, depopulate and unmap pages [@page_start,@page_end) * For each cpu, depopulate and unmap pages [@page_start,@page_end)
* from @chunk. If @flush is true, vcache is flushed before unmapping * from @chunk. If @flush is true, vcache is flushed before unmapping
* and tlb after. * and tlb after.
*
* CONTEXT:
* pcpu_alloc_mutex.
*/ */
static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size, static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
bool flush) bool flush)
...@@ -658,6 +707,9 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) ...@@ -658,6 +707,9 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
* *
* For each cpu, populate and map pages [@page_start,@page_end) into * For each cpu, populate and map pages [@page_start,@page_end) into
* @chunk. The area is cleared on return. * @chunk. The area is cleared on return.
*
* CONTEXT:
* pcpu_alloc_mutex, does GFP_KERNEL allocation.
*/ */
static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
{ {
...@@ -748,15 +800,16 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) ...@@ -748,15 +800,16 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
* @align: alignment of area (max PAGE_SIZE) * @align: alignment of area (max PAGE_SIZE)
* @reserved: allocate from the reserved chunk if available * @reserved: allocate from the reserved chunk if available
* *
* Allocate percpu area of @size bytes aligned at @align. Might * Allocate percpu area of @size bytes aligned at @align.
* sleep. Might trigger writeouts. *
* CONTEXT:
* Does GFP_KERNEL allocation.
* *
* RETURNS: * RETURNS:
* Percpu pointer to the allocated area on success, NULL on failure. * Percpu pointer to the allocated area on success, NULL on failure.
*/ */
static void *pcpu_alloc(size_t size, size_t align, bool reserved) static void *pcpu_alloc(size_t size, size_t align, bool reserved)
{ {
void *ptr = NULL;
struct pcpu_chunk *chunk; struct pcpu_chunk *chunk;
int slot, off; int slot, off;
...@@ -766,27 +819,37 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved) ...@@ -766,27 +819,37 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
return NULL; return NULL;
} }
mutex_lock(&pcpu_mutex); mutex_lock(&pcpu_alloc_mutex);
spin_lock_irq(&pcpu_lock);
/* serve reserved allocations from the reserved chunk if available */ /* serve reserved allocations from the reserved chunk if available */
if (reserved && pcpu_reserved_chunk) { if (reserved && pcpu_reserved_chunk) {
chunk = pcpu_reserved_chunk; chunk = pcpu_reserved_chunk;
if (size > chunk->contig_hint || if (size > chunk->contig_hint ||
pcpu_extend_area_map(chunk) < 0) pcpu_extend_area_map(chunk) < 0)
goto out_unlock; goto fail_unlock;
off = pcpu_alloc_area(chunk, size, align); off = pcpu_alloc_area(chunk, size, align);
if (off >= 0) if (off >= 0)
goto area_found; goto area_found;
goto out_unlock; goto fail_unlock;
} }
restart:
/* search through normal chunks */ /* search through normal chunks */
for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) { for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
list_for_each_entry(chunk, &pcpu_slot[slot], list) { list_for_each_entry(chunk, &pcpu_slot[slot], list) {
if (size > chunk->contig_hint) if (size > chunk->contig_hint)
continue; continue;
if (pcpu_extend_area_map(chunk) < 0)
goto out_unlock; switch (pcpu_extend_area_map(chunk)) {
case 0:
break;
case 1:
goto restart; /* pcpu_lock dropped, restart */
default:
goto fail_unlock;
}
off = pcpu_alloc_area(chunk, size, align); off = pcpu_alloc_area(chunk, size, align);
if (off >= 0) if (off >= 0)
goto area_found; goto area_found;
...@@ -794,27 +857,36 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved) ...@@ -794,27 +857,36 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
} }
/* hmmm... no space left, create a new chunk */ /* hmmm... no space left, create a new chunk */
spin_unlock_irq(&pcpu_lock);
chunk = alloc_pcpu_chunk(); chunk = alloc_pcpu_chunk();
if (!chunk) if (!chunk)
goto out_unlock; goto fail_unlock_mutex;
spin_lock_irq(&pcpu_lock);
pcpu_chunk_relocate(chunk, -1); pcpu_chunk_relocate(chunk, -1);
pcpu_chunk_addr_insert(chunk); pcpu_chunk_addr_insert(chunk);
goto restart;
off = pcpu_alloc_area(chunk, size, align);
if (off < 0)
goto out_unlock;
area_found: area_found:
spin_unlock_irq(&pcpu_lock);
/* populate, map and clear the area */ /* populate, map and clear the area */
if (pcpu_populate_chunk(chunk, off, size)) { if (pcpu_populate_chunk(chunk, off, size)) {
spin_lock_irq(&pcpu_lock);
pcpu_free_area(chunk, off); pcpu_free_area(chunk, off);
goto out_unlock; goto fail_unlock;
} }
ptr = __addr_to_pcpu_ptr(chunk->vm->addr + off); mutex_unlock(&pcpu_alloc_mutex);
out_unlock:
mutex_unlock(&pcpu_mutex); return __addr_to_pcpu_ptr(chunk->vm->addr + off);
return ptr;
fail_unlock:
spin_unlock_irq(&pcpu_lock);
fail_unlock_mutex:
mutex_unlock(&pcpu_alloc_mutex);
return NULL;
} }
/** /**
...@@ -825,6 +897,9 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved) ...@@ -825,6 +897,9 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
* Allocate percpu area of @size bytes aligned at @align. Might * Allocate percpu area of @size bytes aligned at @align. Might
* sleep. Might trigger writeouts. * sleep. Might trigger writeouts.
* *
* CONTEXT:
* Does GFP_KERNEL allocation.
*
* RETURNS: * RETURNS:
* Percpu pointer to the allocated area on success, NULL on failure. * Percpu pointer to the allocated area on success, NULL on failure.
*/ */
...@@ -843,6 +918,9 @@ EXPORT_SYMBOL_GPL(__alloc_percpu); ...@@ -843,6 +918,9 @@ EXPORT_SYMBOL_GPL(__alloc_percpu);
* percpu area if arch has set it up; otherwise, allocation is served * percpu area if arch has set it up; otherwise, allocation is served
* from the same dynamic area. Might sleep. Might trigger writeouts. * from the same dynamic area. Might sleep. Might trigger writeouts.
* *
* CONTEXT:
* Does GFP_KERNEL allocation.
*
* RETURNS: * RETURNS:
* Percpu pointer to the allocated area on success, NULL on failure. * Percpu pointer to the allocated area on success, NULL on failure.
*/ */
...@@ -856,6 +934,9 @@ void *__alloc_reserved_percpu(size_t size, size_t align) ...@@ -856,6 +934,9 @@ void *__alloc_reserved_percpu(size_t size, size_t align)
* @work: unused * @work: unused
* *
* Reclaim all fully free chunks except for the first one. * Reclaim all fully free chunks except for the first one.
*
* CONTEXT:
* workqueue context.
*/ */
static void pcpu_reclaim(struct work_struct *work) static void pcpu_reclaim(struct work_struct *work)
{ {
...@@ -863,7 +944,8 @@ static void pcpu_reclaim(struct work_struct *work) ...@@ -863,7 +944,8 @@ static void pcpu_reclaim(struct work_struct *work)
struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1]; struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1];
struct pcpu_chunk *chunk, *next; struct pcpu_chunk *chunk, *next;
mutex_lock(&pcpu_mutex); mutex_lock(&pcpu_alloc_mutex);
spin_lock_irq(&pcpu_lock);
list_for_each_entry_safe(chunk, next, head, list) { list_for_each_entry_safe(chunk, next, head, list) {
WARN_ON(chunk->immutable); WARN_ON(chunk->immutable);
...@@ -876,7 +958,8 @@ static void pcpu_reclaim(struct work_struct *work) ...@@ -876,7 +958,8 @@ static void pcpu_reclaim(struct work_struct *work)
list_move(&chunk->list, &todo); list_move(&chunk->list, &todo);
} }
mutex_unlock(&pcpu_mutex); spin_unlock_irq(&pcpu_lock);
mutex_unlock(&pcpu_alloc_mutex);
list_for_each_entry_safe(chunk, next, &todo, list) { list_for_each_entry_safe(chunk, next, &todo, list) {
pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false); pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
...@@ -888,18 +971,22 @@ static void pcpu_reclaim(struct work_struct *work) ...@@ -888,18 +971,22 @@ static void pcpu_reclaim(struct work_struct *work)
* free_percpu - free percpu area * free_percpu - free percpu area
* @ptr: pointer to area to free * @ptr: pointer to area to free
* *
* Free percpu area @ptr. Might sleep. * Free percpu area @ptr.
*
* CONTEXT:
* Can be called from atomic context.
*/ */
void free_percpu(void *ptr) void free_percpu(void *ptr)
{ {
void *addr = __pcpu_ptr_to_addr(ptr); void *addr = __pcpu_ptr_to_addr(ptr);
struct pcpu_chunk *chunk; struct pcpu_chunk *chunk;
unsigned long flags;
int off; int off;
if (!ptr) if (!ptr)
return; return;
mutex_lock(&pcpu_mutex); spin_lock_irqsave(&pcpu_lock, flags);
chunk = pcpu_chunk_addr_search(addr); chunk = pcpu_chunk_addr_search(addr);
off = addr - chunk->vm->addr; off = addr - chunk->vm->addr;
...@@ -917,7 +1004,7 @@ void free_percpu(void *ptr) ...@@ -917,7 +1004,7 @@ void free_percpu(void *ptr)
} }
} }
mutex_unlock(&pcpu_mutex); spin_unlock_irqrestore(&pcpu_lock, flags);
} }
EXPORT_SYMBOL_GPL(free_percpu); EXPORT_SYMBOL_GPL(free_percpu);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment