[PATCH] Hotplug CPUs: Other CPU_DEAD Notifiers

Various files keep per-cpu caches which need to be freed/moved when a CPU goes down. All under CONFIG_HOTPLUG_CPU ifdefs. scsi.c: drain dead cpu's scsi_done_q onto this cpu. buffer.c: brelse the bh_lrus queue for dead cpu. timer.c: migrate timers from dead cpu, being careful of lock order vs __mod_timer. radix_tree.c: free dead cpu's radix_tree_preloads page_alloc.c: empty dead cpu's nr_pagecache_local into nr_pagecache, and free pages on cpu's local cache. slab.c: stop reap_timer for dead cpu, adjust each cache's free limit, and free each slab cache's per-cpu block. swap.c: drain dead cpu's lru_add_pvecs into ours, and empty its committed_space counter into global counter. dev.c: drain device queues from dead cpu into this one. flow.c: drain dead cpu's flow cache.

[PATCH] Hotplug CPUs: Other CPU_DEAD Notifiers
Various files keep per-cpu caches which need to be freed/moved when a CPU goes down. All under CONFIG_HOTPLUG_CPU ifdefs. scsi.c: drain dead cpu's scsi_done_q onto this cpu. buffer.c: brelse the bh_lrus queue for dead cpu. timer.c: migrate timers from dead cpu, being careful of lock order vs __mod_timer. radix_tree.c: free dead cpu's radix_tree_preloads page_alloc.c: empty dead cpu's nr_pagecache_local into nr_pagecache, and free pages on cpu's local cache. slab.c: stop reap_timer for dead cpu, adjust each cache's free limit, and free each slab cache's per-cpu block. swap.c: drain dead cpu's lru_add_pvecs into ours, and empty its committed_space counter into global counter. dev.c: drain device queues from dead cpu into this one. flow.c: drain dead cpu's flow cache.
279ce7b2 · Rusty Russell · Linus Torvalds · 9bd3badf · 279ce7b2 · 279ce7b2
Commit 279ce7b2 authored Mar 18, 2004 by Rusty Russell Committed by Linus Torvalds Mar 18, 2004
9 changed files
--- a/drivers/scsi/scsi.c
+++ b/drivers/scsi/scsi.c
@@ -53,6 +53,8 @@
 #include <linux/spinlock.h>
 #include <linux/kmod.h>
 #include <linux/interrupt.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>

 #include <scsi/scsi_host.h>
 #include "scsi.h"
@@ -1130,6 +1132,38 @@ int scsi_device_cancel(struct scsi_device *sdev, int recovery)
 	return 0;
 }

+#ifdef CONFIG_HOTPLUG_CPU
+static int scsi_cpu_notify(struct notifier_block *self,
+			   unsigned long action, void *hcpu)
+{
+	int cpu = (unsigned long)hcpu;
+
+	switch(action) {
+	case CPU_DEAD:
+		/* Drain scsi_done_q. */
+		local_irq_disable();
+		list_splice_init(&per_cpu(scsi_done_q, cpu),
+				 &__get_cpu_var(scsi_done_q));
+		raise_softirq_irqoff(SCSI_SOFTIRQ);
+		local_irq_enable();
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __devinitdata scsi_cpu_nb = {
+	.notifier_call	= scsi_cpu_notify,
+};
+
+#define register_scsi_cpu() register_cpu_notifier(&scsi_cpu_nb)
+#define unregister_scsi_cpu() unregister_cpu_notifier(&scsi_cpu_nb)
+#else
+#define register_scsi_cpu()
+#define unregister_scsi_cpu()
+#endif /* CONFIG_HOTPLUG_CPU */
+
 MODULE_DESCRIPTION("SCSI core");
 MODULE_LICENSE("GPL");

@@ -1164,6 +1198,7 @@ static int __init init_scsi(void)

 	devfs_mk_dir("scsi");
 	open_softirq(SCSI_SOFTIRQ, scsi_softirq, NULL);
+	register_scsi_cpu();
 	printk(KERN_NOTICE "SCSI subsystem initialized\n");
 	return 0;

@@ -1191,6 +1226,7 @@ static void __exit exit_scsi(void)
 	devfs_remove("scsi");
 	scsi_exit_procfs();
 	scsi_exit_queue();
+	unregister_scsi_cpu();
 }

 subsys_initcall(init_scsi);

--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3024,6 +3024,26 @@ init_buffer_head(void *data, kmem_cache_t *cachep, unsigned long flags)
 	}
 }

+#ifdef CONFIG_HOTPLUG_CPU
+static void buffer_exit_cpu(int cpu)
+{
+	int i;
+	struct bh_lru *b = &per_cpu(bh_lrus, cpu);
+
+	for (i = 0; i < BH_LRU_SIZE; i++) {
+		brelse(b->bhs[i]);
+		b->bhs[i] = NULL;
+	}
+}
+
+static int buffer_cpu_notify(struct notifier_block *self,
+			      unsigned long action, void *hcpu)
+{
+	if (action == CPU_DEAD)
+		buffer_exit_cpu((unsigned long)hcpu);
+	return NOTIFY_OK;
+}
+#endif /* CONFIG_HOTPLUG_CPU */

 void __init buffer_init(void)
 {
@@ -3041,6 +3061,7 @@ void __init buffer_init(void)
 	 */
 	nrpages = (nr_free_buffer_pages() * 10) / 100;
 	max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
+	hotcpu_notifier(buffer_cpu_notify, 0);
 }

 EXPORT_SYMBOL(__bforget);

--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1223,6 +1223,72 @@ static void __devinit init_timers_cpu(int cpu)
 	base->timer_jiffies = jiffies;
 }

+#ifdef CONFIG_HOTPLUG_CPU
+static int migrate_timer_list(tvec_base_t *new_base, struct list_head *head)
+{
+	struct timer_list *timer;
+
+	while (!list_empty(head)) {
+		timer = list_entry(head->next, struct timer_list, entry);
+		/* We're locking backwards from __mod_timer order here,
+		   beware deadlock. */
+		if (!spin_trylock(&timer->lock))
+			return 0;
+		list_del(&timer->entry);
+		internal_add_timer(new_base, timer);
+		timer->base = new_base;
+		spin_unlock(&timer->lock);
+	}
+	return 1;
+}
+
+static void __devinit migrate_timers(int cpu)
+{
+	tvec_base_t *old_base;
+	tvec_base_t *new_base;
+	int i;
+
+	BUG_ON(cpu_online(cpu));
+	old_base = &per_cpu(tvec_bases, cpu);
+	new_base = &get_cpu_var(tvec_bases);
+
+	local_irq_disable();
+again:
+	/* Prevent deadlocks via ordering by old_base < new_base. */
+	if (old_base < new_base) {
+		spin_lock(&new_base->lock);
+		spin_lock(&old_base->lock);
+	} else {
+		spin_lock(&old_base->lock);
+		spin_lock(&new_base->lock);
+	}
+
+	if (old_base->running_timer)
+		BUG();
+	for (i = 0; i < TVR_SIZE; i++)
+		if (!migrate_timer_list(new_base, old_base->tv1.vec + i))
+			goto unlock_again;
+	for (i = 0; i < TVN_SIZE; i++)
+		if (!migrate_timer_list(new_base, old_base->tv2.vec + i)
+		    || !migrate_timer_list(new_base, old_base->tv3.vec + i)
+		    || !migrate_timer_list(new_base, old_base->tv4.vec + i)
+		    || !migrate_timer_list(new_base, old_base->tv5.vec + i))
+			goto unlock_again;
+	spin_unlock(&old_base->lock);
+	spin_unlock(&new_base->lock);
+	local_irq_enable();
+	put_cpu_var(tvec_bases);
+	return;
+
+unlock_again:
+	/* Avoid deadlock with __mod_timer, by backing off. */
+	spin_unlock(&old_base->lock);
+	spin_unlock(&new_base->lock);
+	cpu_relax();
+	goto again;
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
 static int __devinit timer_cpu_notify(struct notifier_block *self, 
 				unsigned long action, void *hcpu)
 {
@@ -1231,6 +1297,11 @@ static int __devinit timer_cpu_notify(struct notifier_block *self,
 	case CPU_UP_PREPARE:
 		init_timers_cpu(cpu);
 		break;
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_DEAD:
+		migrate_timers(cpu);
+		break;
+#endif
 	default:
 		break;
 	}

--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -24,6 +24,8 @@
 #include <linux/radix-tree.h>
 #include <linux/percpu.h>
 #include <linux/slab.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
 #include <linux/gfp.h>
 #include <linux/string.h>

@@ -420,6 +422,28 @@ static __init void radix_tree_init_maxindex(void)
 		height_to_maxindex[i] = __maxindex(i);
 }

+#ifdef CONFIG_HOTPLUG_CPU
+static int radix_tree_callback(struct notifier_block *nfb,
+                            unsigned long action,
+                            void *hcpu)
+{
+       int cpu = (long)hcpu;
+       struct radix_tree_preload *rtp;
+
+       /* Free per-cpu pool of perloaded nodes */
+       if (action == CPU_DEAD) {
+               rtp = &per_cpu(radix_tree_preloads, cpu);
+               while (rtp->nr) {
+                       kmem_cache_free(radix_tree_node_cachep,
+                                       rtp->nodes[rtp->nr-1]);
+                       rtp->nodes[rtp->nr-1] = NULL;
+                       rtp->nr--;
+               }
+       }
+       return NOTIFY_OK;
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
 void __init radix_tree_init(void)
 {
 	radix_tree_node_cachep = kmem_cache_create("radix_tree_node",
@@ -428,4 +452,5 @@ void __init radix_tree_init(void)
 	if (!radix_tree_node_cachep)
 		panic ("Failed to create radix_tree_node cache\n");
 	radix_tree_init_maxindex();
+	hotcpu_notifier(radix_tree_callback, 0);
 }
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1716,9 +1716,29 @@ struct seq_operations vmstat_op = {

 #endif /* CONFIG_PROC_FS */

+#ifdef CONFIG_HOTPLUG_CPU
+static int page_alloc_cpu_notify(struct notifier_block *self,
+				 unsigned long action, void *hcpu)
+{
+	int cpu = (unsigned long)hcpu;
+	long *count;
+
+	if (action == CPU_DEAD) {
+		/* Drain local pagecache count. */
+		count = &per_cpu(nr_pagecache_local, cpu);
+		atomic_add(*count, &nr_pagecache);
+		*count = 0;
+		local_irq_disable();
+		__drain_pages(cpu);
+		local_irq_enable();
+	}
+	return NOTIFY_OK;
+}
+#endif /* CONFIG_HOTPLUG_CPU */

 void __init page_alloc_init(void)
 {
+	hotcpu_notifier(page_alloc_cpu_notify, 0);
 }

 /*

--- a/mm/slab.c
+++ b/mm/slab.c
@@ -589,12 +589,19 @@ static void __init start_cpu_timer(int cpu)
 	}
 }

-/*
- * Note: if someone calls kmem_cache_alloc() on the new
- * cpu before the cpuup callback had a chance to allocate
- * the head arrays, it will oops.
- * Is CPU_ONLINE early enough?
- */
+#ifdef CONFIG_HOTPLUG_CPU
+static void stop_cpu_timer(int cpu)
+{
+	struct timer_list *rt = &per_cpu(reap_timers, cpu);
+
+	if (rt->function) {
+		del_timer_sync(rt);
+		WARN_ON(timer_pending(rt));
+		rt->function = NULL;
+	}
+}
+#endif
+
 static int __devinit cpuup_callback(struct notifier_block *nfb,
 				  unsigned long action,
 				  void *hcpu)
@@ -630,18 +637,28 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
 	case CPU_ONLINE:
 		start_cpu_timer(cpu);
 		break;
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_DEAD:
+		stop_cpu_timer(cpu);
+		/* fall thru */
 	case CPU_UP_CANCELED:
 		down(&cache_chain_sem);

 		list_for_each_entry(cachep, &cache_chain, next) {
 			struct array_cache *nc;

+			spin_lock_irq(&cachep->spinlock);
+			/* cpu is dead; no one can alloc from it. */
 			nc = cachep->array[cpu];
 			cachep->array[cpu] = NULL;
+			cachep->free_limit -= cachep->batchcount;
+			free_block(cachep, ac_entry(nc), nc->avail);
+			spin_unlock_irq(&cachep->spinlock);
 			kfree(nc);
 		}
 		up(&cache_chain_sem);
 		break;
+#endif
 	}
 	return NOTIFY_OK;
 bad:
@@ -1486,6 +1503,9 @@ int kmem_cache_destroy (kmem_cache_t * cachep)
 		return 1;
 	}

+	/* no cpu_online check required here since we clear the percpu
+	 * array on cpu offline and set this to NULL.
+	 */
 	for (i = 0; i < NR_CPUS; i++)
 		kfree(cachep->array[i]);


--- a/mm/swap.c
+++ b/mm/swap.c
@@ -27,6 +27,9 @@
 #include <linux/module.h>
 #include <linux/percpu_counter.h>
 #include <linux/percpu.h>
+#include <linux/cpu.h>
+#include <linux/notifier.h>
+#include <linux/init.h>

 /* How many pages do we try to swap or page in/out together? */
 int page_cluster;
@@ -381,7 +384,37 @@ void vm_acct_memory(long pages)
 	preempt_enable();
 }
 EXPORT_SYMBOL(vm_acct_memory);
-#endif
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void lru_drain_cache(unsigned int cpu)
+{
+	struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu);
+
+	/* CPU is dead, so no locking needed. */
+	if (pagevec_count(pvec))
+		__pagevec_lru_add(pvec);
+	pvec = &per_cpu(lru_add_active_pvecs, cpu);
+	if (pagevec_count(pvec))
+		__pagevec_lru_add_active(pvec);
+}
+
+/* Drop the CPU's cached committed space back into the central pool. */
+static int cpu_swap_callback(struct notifier_block *nfb,
+			     unsigned long action,
+			     void *hcpu)
+{
+	long *committed;
+
+	committed = &per_cpu(committed_space, (long)hcpu);
+	if (action == CPU_DEAD) {
+		atomic_add(*committed, &vm_committed_space);
+		*committed = 0;
+		lru_drain_cache((long)hcpu);
+	}
+	return NOTIFY_OK;
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+#endif /* CONFIG_SMP */

 #ifdef CONFIG_SMP
 void percpu_counter_mod(struct percpu_counter *fbc, long amount)
@@ -420,4 +453,5 @@ void __init swap_setup(void)
 	 * Right now other parts of the system means that we
 	 * _really_ don't want to cluster much more
 	 */
+	hotcpu_notifier(cpu_swap_callback, 0);
 }
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -76,6 +76,7 @@
 #include <asm/system.h>
 #include <asm/bitops.h>
 #include <linux/config.h>
+#include <linux/cpu.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
@@ -3131,6 +3132,52 @@ int unregister_netdevice(struct net_device *dev)
 	return 0;
 }

+#ifdef CONFIG_HOTPLUG_CPU
+static int dev_cpu_callback(struct notifier_block *nfb,
+			    unsigned long action,
+			    void *ocpu)
+{
+	struct sk_buff **list_skb;
+	struct net_device **list_net;
+	struct sk_buff *skb;
+	unsigned int cpu, oldcpu = (unsigned long)ocpu;
+	struct softnet_data *sd, *oldsd;
+
+	if (action != CPU_DEAD)
+		return NOTIFY_OK;
+
+	local_irq_disable();
+	cpu = smp_processor_id();
+	sd = &per_cpu(softnet_data, cpu);
+	oldsd = &per_cpu(softnet_data, oldcpu);
+
+	/* Find end of our completion_queue. */
+	list_skb = &sd->completion_queue;
+	while (*list_skb)
+		list_skb = &(*list_skb)->next;
+	/* Append completion queue from offline CPU. */
+	*list_skb = oldsd->completion_queue;
+	oldsd->completion_queue = NULL;
+
+	/* Find end of our output_queue. */
+	list_net = &sd->output_queue;
+	while (*list_net)
+		list_net = &(*list_net)->next_sched;
+	/* Append output queue from offline CPU. */
+	*list_net = oldsd->output_queue;
+	oldsd->output_queue = NULL;
+
+	raise_softirq_irqoff(NET_TX_SOFTIRQ);
+	local_irq_enable();
+
+	/* Process offline CPU's input_pkt_queue */
+	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
+		netif_rx(skb);
+
+	return NOTIFY_OK;
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+

 /*
 *	Initialize the DEV module. At boot time this walks the device list and
@@ -3195,6 +3242,7 @@ static int __init net_dev_init(void)
 	open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
 	open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);

+	hotcpu_notifier(dev_cpu_callback, 0);
 	dst_init();
 	dev_mcast_init();
 	rc = 0;

--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -326,6 +326,17 @@ static void __devinit flow_cache_cpu_prepare(int cpu)
 	tasklet_init(tasklet, flow_cache_flush_tasklet, 0);
 }

+#ifdef CONFIG_HOTPLUG_CPU
+static int flow_cache_cpu(struct notifier_block *nfb,
+			  unsigned long action,
+			  void *hcpu)
+{
+	if (action == CPU_DEAD)
+		__flow_cache_shrink((unsigned long)hcpu, 0);
+	return NOTIFY_OK;
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
 static int __init flow_cache_init(void)
 {
 	int i;
@@ -350,6 +361,7 @@ static int __init flow_cache_init(void)
 	for_each_cpu(i)
 		flow_cache_cpu_prepare(i);

+	hotcpu_notifier(flow_cache_cpu, 0);
 	return 0;
 }