Commit 279ce7b2, authored by Rusty Russell, committed by Linus Torvalds

[PATCH] Hotplug CPUs: Other CPU_DEAD Notifiers

Various files keep per-cpu caches which need to be freed/moved when a
CPU goes down.  All under CONFIG_HOTPLUG_CPU ifdefs.

scsi.c: drain dead cpu's scsi_done_q onto this cpu.

buffer.c: brelse the bh_lrus queue for dead cpu.

timer.c: migrate timers from dead cpu, being careful of lock order vs
	__mod_timer.

radix_tree.c: free dead cpu's radix_tree_preloads

page_alloc.c: empty dead cpu's nr_pagecache_local into nr_pagecache, and
	free pages on cpu's local cache.

slab.c: stop reap_timer for dead cpu, adjust each cache's free limit, and
	free each slab cache's per-cpu block.

swap.c: drain dead cpu's lru_add_pvecs into ours, and empty its committed_space
	counter into global counter.

dev.c: drain device queues from dead cpu into this one.

flow.c: drain dead cpu's flow cache.
parent 9bd3badf
......@@ -53,6 +53,8 @@
#include <linux/spinlock.h>
#include <linux/kmod.h>
#include <linux/interrupt.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <scsi/scsi_host.h>
#include "scsi.h"
......@@ -1130,6 +1132,38 @@ int scsi_device_cancel(struct scsi_device *sdev, int recovery)
return 0;
}
#ifdef CONFIG_HOTPLUG_CPU
/*
 * CPU hotplug notifier for the SCSI core.
 *
 * On CPU_DEAD, the scsi_done_q of the CPU identified by @hcpu is spliced
 * onto the current CPU's scsi_done_q and SCSI_SOFTIRQ is raised, all with
 * local interrupts disabled.  Every other action is ignored.
 *
 * Always returns NOTIFY_OK.
 */
static int scsi_cpu_notify(struct notifier_block *self,
			   unsigned long action, void *hcpu)
{
	int dead_cpu = (unsigned long)hcpu;

	if (action != CPU_DEAD)
		return NOTIFY_OK;

	/* Interrupts stay off while this CPU's queue is being modified. */
	local_irq_disable();
	list_splice_init(&per_cpu(scsi_done_q, dead_cpu),
			 &__get_cpu_var(scsi_done_q));
	raise_softirq_irqoff(SCSI_SOFTIRQ);
	local_irq_enable();

	return NOTIFY_OK;
}
/* Notifier block that routes CPU hotplug events to scsi_cpu_notify(). */
static struct notifier_block __devinitdata scsi_cpu_nb = {
.notifier_call = scsi_cpu_notify,
};
/* Hook the notifier block into / out of the CPU notifier chain. */
#define register_scsi_cpu() register_cpu_notifier(&scsi_cpu_nb)
#define unregister_scsi_cpu() unregister_cpu_notifier(&scsi_cpu_nb)
#else
/* Without CONFIG_HOTPLUG_CPU both hooks expand to nothing. */
#define register_scsi_cpu()
#define unregister_scsi_cpu()
#endif /* CONFIG_HOTPLUG_CPU */
MODULE_DESCRIPTION("SCSI core");
MODULE_LICENSE("GPL");
......@@ -1164,6 +1198,7 @@ static int __init init_scsi(void)
devfs_mk_dir("scsi");
open_softirq(SCSI_SOFTIRQ, scsi_softirq, NULL);
register_scsi_cpu();
printk(KERN_NOTICE "SCSI subsystem initialized\n");
return 0;
......@@ -1191,6 +1226,7 @@ static void __exit exit_scsi(void)
devfs_remove("scsi");
scsi_exit_procfs();
scsi_exit_queue();
unregister_scsi_cpu();
}
subsys_initcall(init_scsi);
......
......@@ -3024,6 +3024,26 @@ init_buffer_head(void *data, kmem_cache_t *cachep, unsigned long flags)
}
}
#ifdef CONFIG_HOTPLUG_CPU
/*
 * Release every buffer head held in the bh_lrus array of @cpu and
 * clear each slot afterwards.
 */
static void buffer_exit_cpu(int cpu)
{
	struct bh_lru *lru = &per_cpu(bh_lrus, cpu);
	int slot = 0;

	while (slot < BH_LRU_SIZE) {
		brelse(lru->bhs[slot]);
		lru->bhs[slot] = NULL;
		slot++;
	}
}
/*
 * CPU hotplug notifier for the buffer cache.
 *
 * On CPU_DEAD, hands the CPU number carried in @hcpu to buffer_exit_cpu().
 * All other actions are ignored.  Always returns NOTIFY_OK.
 */
static int buffer_cpu_notify(struct notifier_block *self,
			     unsigned long action, void *hcpu)
{
	switch (action) {
	case CPU_DEAD:
		buffer_exit_cpu((unsigned long)hcpu);
		break;
	default:
		break;
	}

	return NOTIFY_OK;
}
#endif /* CONFIG_HOTPLUG_CPU */
void __init buffer_init(void)
{
......@@ -3041,6 +3061,7 @@ void __init buffer_init(void)
*/
nrpages = (nr_free_buffer_pages() * 10) / 100;
max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
hotcpu_notifier(buffer_cpu_notify, 0);
}
EXPORT_SYMBOL(__bforget);
......
......@@ -1223,6 +1223,72 @@ static void __devinit init_timers_cpu(int cpu)
base->timer_jiffies = jiffies;
}
#ifdef CONFIG_HOTPLUG_CPU
/*
 * Move the timers queued on @head, one at a time from the front, onto
 * @new_base.
 *
 * Each timer's lock is only try-locked.  If a trylock fails the function
 * stops immediately and returns 0, leaving the remaining timers on @head.
 * Returns 1 once @head is empty.
 */
static int migrate_timer_list(tvec_base_t *new_base, struct list_head *head)
{
	for (;;) {
		struct timer_list *t;

		if (list_empty(head))
			return 1;

		t = list_entry(head->next, struct timer_list, entry);

		/*
		 * This takes the timer lock in the reverse of __mod_timer's
		 * order, so never block on it: give up and report failure.
		 */
		if (!spin_trylock(&t->lock))
			break;

		list_del(&t->entry);
		internal_add_timer(new_base, t);
		t->base = new_base;
		spin_unlock(&t->lock);
	}

	return 0;
}
/*
 * Move all timers queued on the tvec base of @cpu onto the current CPU's
 * tvec base.
 *
 * @cpu must already be offline (enforced with BUG_ON).  Both base locks
 * are held, with local interrupts disabled, while the tv1..tv5 vectors
 * are walked.  If migrate_timer_list() returns 0 for any list, both base
 * locks are dropped and the whole pass is retried from the "again" label.
 */
static void __devinit migrate_timers(int cpu)
{
tvec_base_t *old_base;
tvec_base_t *new_base;
int i;
/* The source CPU must no longer be online. */
BUG_ON(cpu_online(cpu));
old_base = &per_cpu(tvec_bases, cpu);
/* Paired with put_cpu_var() on the success path below. */
new_base = &get_cpu_var(tvec_bases);
local_irq_disable();
again:
/* Prevent deadlocks via ordering by old_base < new_base. */
if (old_base < new_base) {
spin_lock(&new_base->lock);
spin_lock(&old_base->lock);
} else {
spin_lock(&old_base->lock);
spin_lock(&new_base->lock);
}
/* The old base must not have a timer marked as running. */
if (old_base->running_timer)
BUG();
/* tv1 has TVR_SIZE lists; tv2..tv5 are walked with TVN_SIZE each. */
for (i = 0; i < TVR_SIZE; i++)
if (!migrate_timer_list(new_base, old_base->tv1.vec + i))
goto unlock_again;
for (i = 0; i < TVN_SIZE; i++)
if (!migrate_timer_list(new_base, old_base->tv2.vec + i)
|| !migrate_timer_list(new_base, old_base->tv3.vec + i)
|| !migrate_timer_list(new_base, old_base->tv4.vec + i)
|| !migrate_timer_list(new_base, old_base->tv5.vec + i))
goto unlock_again;
/* Every list was migrated: release locks and re-enable interrupts. */
spin_unlock(&old_base->lock);
spin_unlock(&new_base->lock);
local_irq_enable();
put_cpu_var(tvec_bases);
return;
unlock_again:
/* Avoid deadlock with __mod_timer, by backing off. */
spin_unlock(&old_base->lock);
spin_unlock(&new_base->lock);
cpu_relax();
/* Interrupts are still disabled here; retry the whole pass. */
goto again;
}
#endif /* CONFIG_HOTPLUG_CPU */
static int __devinit timer_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
......@@ -1231,6 +1297,11 @@ static int __devinit timer_cpu_notify(struct notifier_block *self,
case CPU_UP_PREPARE:
init_timers_cpu(cpu);
break;
#ifdef CONFIG_HOTPLUG_CPU
case CPU_DEAD:
migrate_timers(cpu);
break;
#endif
default:
break;
}
......
......@@ -24,6 +24,8 @@
#include <linux/radix-tree.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/gfp.h>
#include <linux/string.h>
......@@ -420,6 +422,28 @@ static __init void radix_tree_init_maxindex(void)
height_to_maxindex[i] = __maxindex(i);
}
#ifdef CONFIG_HOTPLUG_CPU
/*
 * CPU hotplug notifier for the radix tree code.
 *
 * On CPU_DEAD, frees every node left in the radix_tree_preloads pool of
 * the CPU identified by @hcpu back to radix_tree_node_cachep, working
 * from the highest used slot down and clearing each slot as it goes.
 * All other actions are ignored.  Always returns NOTIFY_OK.
 */
static int radix_tree_callback(struct notifier_block *nfb,
			       unsigned long action,
			       void *hcpu)
{
	int cpu = (long)hcpu;
	struct radix_tree_preload *pool;

	if (action != CPU_DEAD)
		return NOTIFY_OK;

	/* Free per-cpu pool of preloaded nodes */
	pool = &per_cpu(radix_tree_preloads, cpu);
	for (; pool->nr; pool->nr--) {
		kmem_cache_free(radix_tree_node_cachep,
				pool->nodes[pool->nr - 1]);
		pool->nodes[pool->nr - 1] = NULL;
	}

	return NOTIFY_OK;
}
#endif /* CONFIG_HOTPLUG_CPU */
void __init radix_tree_init(void)
{
radix_tree_node_cachep = kmem_cache_create("radix_tree_node",
......@@ -428,4 +452,5 @@ void __init radix_tree_init(void)
if (!radix_tree_node_cachep)
panic ("Failed to create radix_tree_node cache\n");
radix_tree_init_maxindex();
hotcpu_notifier(radix_tree_callback, 0);
}
......@@ -1716,9 +1716,29 @@ struct seq_operations vmstat_op = {
#endif /* CONFIG_PROC_FS */
#ifdef CONFIG_HOTPLUG_CPU
/*
 * CPU hotplug notifier for the page allocator.
 *
 * On CPU_DEAD:
 *  - adds the CPU's nr_pagecache_local value to the global nr_pagecache
 *    counter and resets the per-cpu value to zero;
 *  - calls __drain_pages() for that CPU with local interrupts disabled.
 * All other actions are ignored.  Always returns NOTIFY_OK.
 */
static int page_alloc_cpu_notify(struct notifier_block *self,
				 unsigned long action, void *hcpu)
{
	int dead_cpu = (unsigned long)hcpu;
	long *local_count;

	if (action != CPU_DEAD)
		return NOTIFY_OK;

	/* Fold the local pagecache count into the global one. */
	local_count = &per_cpu(nr_pagecache_local, dead_cpu);
	atomic_add(*local_count, &nr_pagecache);
	*local_count = 0;

	local_irq_disable();
	__drain_pages(dead_cpu);
	local_irq_enable();

	return NOTIFY_OK;
}
#endif /* CONFIG_HOTPLUG_CPU */
/*
 * Boot-time setup for the page allocator's CPU hotplug handling:
 * installs page_alloc_cpu_notify via hotcpu_notifier().
 */
void __init page_alloc_init(void)
{
/* NOTE(review): second argument is presumably the notifier priority - confirm. */
hotcpu_notifier(page_alloc_cpu_notify, 0);
}
/*
......
......@@ -589,12 +589,19 @@ static void __init start_cpu_timer(int cpu)
}
}
/*
* Note: if someone calls kmem_cache_alloc() on the new
* cpu before the cpuup callback had a chance to allocate
* the head arrays, it will oops.
* Is CPU_ONLINE early enough?
*/
#ifdef CONFIG_HOTPLUG_CPU
/*
 * Stop the reap timer belonging to @cpu.
 *
 * Does nothing when the timer has no function set.  Otherwise the timer
 * is deleted synchronously, a warning is emitted if it is still pending
 * afterwards, and its function pointer is cleared.
 */
static void stop_cpu_timer(int cpu)
{
	struct timer_list *reap = &per_cpu(reap_timers, cpu);

	if (!reap->function)
		return;

	del_timer_sync(reap);
	WARN_ON(timer_pending(reap));
	reap->function = NULL;
}
#endif
static int __devinit cpuup_callback(struct notifier_block *nfb,
unsigned long action,
void *hcpu)
......@@ -630,18 +637,28 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
case CPU_ONLINE:
start_cpu_timer(cpu);
break;
#ifdef CONFIG_HOTPLUG_CPU
case CPU_DEAD:
stop_cpu_timer(cpu);
/* fall thru */
case CPU_UP_CANCELED:
down(&cache_chain_sem);
list_for_each_entry(cachep, &cache_chain, next) {
struct array_cache *nc;
spin_lock_irq(&cachep->spinlock);
/* cpu is dead; no one can alloc from it. */
nc = cachep->array[cpu];
cachep->array[cpu] = NULL;
cachep->free_limit -= cachep->batchcount;
free_block(cachep, ac_entry(nc), nc->avail);
spin_unlock_irq(&cachep->spinlock);
kfree(nc);
}
up(&cache_chain_sem);
break;
#endif
}
return NOTIFY_OK;
bad:
......@@ -1486,6 +1503,9 @@ int kmem_cache_destroy (kmem_cache_t * cachep)
return 1;
}
/* no cpu_online check required here since we clear the percpu
* array on cpu offline and set this to NULL.
*/
for (i = 0; i < NR_CPUS; i++)
kfree(cachep->array[i]);
......
......@@ -27,6 +27,9 @@
#include <linux/module.h>
#include <linux/percpu_counter.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/init.h>
/* How many pages do we try to swap or page in/out together? */
int page_cluster;
......@@ -381,7 +384,37 @@ void vm_acct_memory(long pages)
preempt_enable();
}
EXPORT_SYMBOL(vm_acct_memory);
#endif
#ifdef CONFIG_HOTPLUG_CPU
static void lru_drain_cache(unsigned int cpu)
{
struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu);
/* CPU is dead, so no locking needed. */
if (pagevec_count(pvec))
__pagevec_lru_add(pvec);
pvec = &per_cpu(lru_add_active_pvecs, cpu);
if (pagevec_count(pvec))
__pagevec_lru_add_active(pvec);
}
/*
 * CPU hotplug notifier for swap.c.
 *
 * On CPU_DEAD, the committed_space value cached for the CPU in @hcpu is
 * added to the global vm_committed_space counter and reset to zero, and
 * that CPU's LRU-add pagevecs are drained via lru_drain_cache().
 * All other actions are ignored.  Always returns NOTIFY_OK.
 */
static int cpu_swap_callback(struct notifier_block *nfb,
			     unsigned long action,
			     void *hcpu)
{
	long dead_cpu = (long)hcpu;
	long *cached;

	if (action != CPU_DEAD)
		return NOTIFY_OK;

	/* Hand the CPU's cached committed space back to the central pool. */
	cached = &per_cpu(committed_space, dead_cpu);
	atomic_add(*cached, &vm_committed_space);
	*cached = 0;

	lru_drain_cache(dead_cpu);

	return NOTIFY_OK;
}
#endif /* CONFIG_HOTPLUG_CPU */
#endif /* CONFIG_SMP */
#ifdef CONFIG_SMP
void percpu_counter_mod(struct percpu_counter *fbc, long amount)
......@@ -420,4 +453,5 @@ void __init swap_setup(void)
* Right now other parts of the system means that we
* _really_ don't want to cluster much more
*/
hotcpu_notifier(cpu_swap_callback, 0);
}
......@@ -76,6 +76,7 @@
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/config.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
......@@ -3131,6 +3132,52 @@ int unregister_netdevice(struct net_device *dev)
return 0;
}
#ifdef CONFIG_HOTPLUG_CPU
/*
 * CPU hotplug notifier for the networking core.
 *
 * On CPU_DEAD, with local interrupts disabled, the completion_queue and
 * output_queue of the CPU in @ocpu are appended to the tails of the
 * current CPU's queues and NET_TX_SOFTIRQ is raised.  After interrupts
 * are re-enabled, every skb still on the old CPU's input_pkt_queue is
 * dequeued and passed to netif_rx().
 * All other actions are ignored.  Always returns NOTIFY_OK.
 */
static int dev_cpu_callback(struct notifier_block *nfb,
			    unsigned long action,
			    void *ocpu)
{
	unsigned int this_cpu, dead_cpu = (unsigned long)ocpu;
	struct softnet_data *self_sd, *dead_sd;
	struct sk_buff **skb_tail;
	struct net_device **dev_tail;
	struct sk_buff *skb;

	if (action != CPU_DEAD)
		return NOTIFY_OK;

	local_irq_disable();
	this_cpu = smp_processor_id();
	self_sd = &per_cpu(softnet_data, this_cpu);
	dead_sd = &per_cpu(softnet_data, dead_cpu);

	/* Walk to the tail of our completion_queue ... */
	for (skb_tail = &self_sd->completion_queue; *skb_tail;
	     skb_tail = &(*skb_tail)->next)
		;
	/* ... and hang the offline CPU's completion queue off it. */
	*skb_tail = dead_sd->completion_queue;
	dead_sd->completion_queue = NULL;

	/* Same for the output_queue, which is linked through next_sched. */
	for (dev_tail = &self_sd->output_queue; *dev_tail;
	     dev_tail = &(*dev_tail)->next_sched)
		;
	*dev_tail = dead_sd->output_queue;
	dead_sd->output_queue = NULL;

	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_enable();

	/* Process offline CPU's input_pkt_queue */
	for (;;) {
		skb = __skb_dequeue(&dead_sd->input_pkt_queue);
		if (!skb)
			break;
		netif_rx(skb);
	}

	return NOTIFY_OK;
}
#endif /* CONFIG_HOTPLUG_CPU */
/*
* Initialize the DEV module. At boot time this walks the device list and
......@@ -3195,6 +3242,7 @@ static int __init net_dev_init(void)
open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);
hotcpu_notifier(dev_cpu_callback, 0);
dst_init();
dev_mcast_init();
rc = 0;
......
......@@ -326,6 +326,17 @@ static void __devinit flow_cache_cpu_prepare(int cpu)
tasklet_init(tasklet, flow_cache_flush_tasklet, 0);
}
#ifdef CONFIG_HOTPLUG_CPU
/*
 * CPU hotplug notifier for the flow cache.
 *
 * On CPU_DEAD, calls __flow_cache_shrink() for the CPU in @hcpu with a
 * second argument of 0 (presumably "keep no entries" - confirm against
 * __flow_cache_shrink).  All other actions are ignored.
 * Always returns NOTIFY_OK.
 */
static int flow_cache_cpu(struct notifier_block *nfb,
			  unsigned long action,
			  void *hcpu)
{
	unsigned long dead_cpu = (unsigned long)hcpu;

	if (action != CPU_DEAD)
		return NOTIFY_OK;

	__flow_cache_shrink(dead_cpu, 0);
	return NOTIFY_OK;
}
#endif /* CONFIG_HOTPLUG_CPU */
static int __init flow_cache_init(void)
{
int i;
......@@ -350,6 +361,7 @@ static int __init flow_cache_init(void)
for_each_cpu(i)
flow_cache_cpu_prepare(i);
hotcpu_notifier(flow_cache_cpu, 0);
return 0;
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment