Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar: "The main changes in this cycle were: - another round of rq-clock handling debugging, robustization and fixes - PELT accounting improvements - CPU hotplug related ->cpus_allowed affinity handling fixes all around the tree - ... plus misc fixes, cleanups and updates" * 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (35 commits) sched/x86: Update reschedule warning text crypto: N2 - Replace racy task affinity logic cpufreq/sparc-us2e: Replace racy task affinity logic cpufreq/sparc-us3: Replace racy task affinity logic cpufreq/sh: Replace racy task affinity logic cpufreq/ia64: Replace racy task affinity logic ACPI/processor: Replace racy task affinity logic ACPI/processor: Fix error handling in __acpi_processor_start() sparc/sysfs: Replace racy task affinity logic powerpc/smp: Replace open coded task affinity logic ia64/sn/hwperf: Replace racy task affinity logic ia64/salinfo: Replace racy task affinity logic workqueue: Provide work_on_cpu_safe() ia64/topology: Remove cpus_allowed manipulation sched/fair: Move the PELT constants into a generated header sched/fair: Increase PELT accuracy for small tasks sched/fair: Fix comments sched/Documentation: Add 'sched-pelt' tool sched/fair: Fix corner case in __accumulate_sum() sched/core: Remove 'task' parameter and rename tsk_restore_flags() to current_restore_flags() ...

Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar: "The main changes in this cycle were: - another round of rq-clock handling debugging, robustization and fixes - PELT accounting improvements - CPU hotplug related ->cpus_allowed affinity handling fixes all around the tree - ... plus misc fixes, cleanups and updates" * 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (35 commits) sched/x86: Update reschedule warning text crypto: N2 - Replace racy task affinity logic cpufreq/sparc-us2e: Replace racy task affinity logic cpufreq/sparc-us3: Replace racy task affinity logic cpufreq/sh: Replace racy task affinity logic cpufreq/ia64: Replace racy task affinity logic ACPI/processor: Replace racy task affinity logic ACPI/processor: Fix error handling in __acpi_processor_start() sparc/sysfs: Replace racy task affinity logic powerpc/smp: Replace open coded task affinity logic ia64/sn/hwperf: Replace racy task affinity logic ia64/salinfo: Replace racy task affinity logic workqueue: Provide work_on_cpu_safe() ia64/topology: Remove cpus_allowed manipulation sched/fair: Move the PELT constants into a generated header sched/fair: Increase PELT accuracy for small tasks sched/fair: Fix comments sched/Documentation: Add 'sched-pelt' tool sched/fair: Fix corner case in __accumulate_sum() sched/core: Remove 'task' parameter and rename tsk_restore_flags() to current_restore_flags() ...
3527d3e9 · Linus Torvalds · 3711c94f · 21173d0b · 3527d3e9 · 3527d3e9
Commit 3527d3e9 authored May 01, 2017 by Linus Torvalds
29 changed files
--- a/Documentation/scheduler/sched-pelt.c
+++ b/Documentation/scheduler/sched-pelt.c
+/*
+ * The following program is used to generate the constants for
+ * computing sched averages.
+ *
+ * ==============================================================
+ *		C program (compile with -lm)
+ * ==============================================================
+ */
+#include <math.h>
+#include <stdio.h>
+#define HALFLIFE 32
+#define SHIFT 32
+double y;
+/*
+ * Print the runnable_avg_yN_inv[] table on stdout: for every i in
+ * [0, HALFLIFE) the value y^i scaled by 2^32 - 1 and converted to an
+ * unsigned int, six entries per output line.
+ *
+ * Reads the file-level 'y', which main() sets before calling this.
+ */
+void calc_runnable_avg_yN_inv(void)
+{
+	int i;
+	unsigned int x;
+	printf("static const u32 runnable_avg_yN_inv[] = {");
+	for (i = 0; i < HALFLIFE; i++) {
+		/*
+		 * Build the 2^32 - 1 scale factor as unsigned long long:
+		 * shifting an unsigned long by 32 is undefined behaviour on
+		 * hosts where that type is only 32 bits wide.
+		 */
+		x = ((1ULL<<32)-1)*pow(y, i);
+		/* Start a new, tab-indented line every six entries. */
+		if (i % 6 == 0) printf("\n\t");
+		printf("0x%8x, ", x);
+	}
+	printf("\n};\n\n");
+}
+/* Running partial sum; starts at the 1024 contribution of one full period. */
+int sum = 1024;
+/*
+ * Print the runnable_avg_yN_sum[] table on stdout: a leading 0 followed by
+ * HALFLIFE entries, each the running value of 'sum' after one more decay
+ * step by the file-level 'y'.
+ *
+ * Updates the file-level 'sum' in place; calc_accumulated_sum_32() reads the
+ * final value.
+ */
+void calc_runnable_avg_yN_sum(void)
+{
+	int period;
+	printf("static const u32 runnable_avg_yN_sum[] = {\n\t    0,");
+	for (period = 1; period <= HALFLIFE; period++) {
+		/*
+		 * The first step only decays the initial value; every later
+		 * step decays the running sum and adds one more decayed
+		 * 1024 contribution.
+		 */
+		if (period > 1)
+			sum = sum*y + 1024*y;
+		else
+			sum *= y;
+		/* Break the output line before every eleventh entry. */
+		if (period % 11 == 0)
+			printf("\n\t");
+		printf("%5d,", sum);
+	}
+	printf("\n};\n\n");
+}
+int n = -1;
+/* first period */
+long max = 1024;
+/*
+ * Repeatedly apply the fixed-point step max = max*y + 1024 until the value
+ * stops changing, then print LOAD_AVG_PERIOD and the converged LOAD_AVG_MAX
+ * on stdout.
+ *
+ * Reads the file-level 'y'. Updates the file-level 'max' and 'n'; on return
+ * 'n' is one less than the loop counter at which convergence was detected,
+ * and calc_accumulated_sum_32() reads it.
+ */
+void calc_converged_max(void)
+{
+	long last = 0;
+	/*
+	 * y scaled by 2^32 - 1. Kept in unsigned long long so that neither
+	 * the 32-bit shift nor the multiplication below overflows on hosts
+	 * where long is only 32 bits wide.
+	 */
+	unsigned long long y_inv = ((1ULL<<32)-1)*y;
+	for (; ; n++) {
+		/* The very first pass (n == -1) only records the start value. */
+		if (n > -1) {
+			/*
+			 * This is the same as:
+			 * max = max*y + 1024;
+			 */
+			max = ((max*y_inv)>>SHIFT) + 1024;
+		}
+		if (last == max)
+			break;
+		last = max;
+	}
+	n--;
+	printf("#define LOAD_AVG_PERIOD %d\n", HALFLIFE);
+	printf("#define LOAD_AVG_MAX %ld\n", max);
+//	printf("#define LOAD_AVG_MAX_N %d\n\n", n);
+}
+/*
+ * Print the __accumulated_sum_N32[] table on stdout: a leading 0 followed by
+ * n/HALFLIFE + 1 entries. The first entry is the file-level 'sum'; each later
+ * entry is half of the previous one plus 'sum'.
+ *
+ * Reads the file-level 'sum' and 'n' as left by calc_runnable_avg_yN_sum()
+ * and calc_converged_max().
+ */
+void calc_accumulated_sum_32(void)
+{
+	const int entries = n/HALFLIFE+1;
+	int idx, acc = sum;
+	printf("static const u32 __accumulated_sum_N32[] = {\n\t     0,");
+	for (idx = 1; idx <= entries; idx++) {
+		/* Entry 1 is 'sum' itself; halving starts with entry 2. */
+		if (idx != 1)
+			acc = acc/2 + sum;
+		/* Break the output line before every sixth entry. */
+		if (idx % 6 == 0)
+			printf("\n\t");
+		printf("%6d,", acc);
+	}
+	printf("\n};\n\n");
+}
+/*
+ * Write the generated PELT constants to stdout: a "generated, do not modify"
+ * banner, the runnable_avg_yN_inv[] table, then the LOAD_AVG_PERIOD and
+ * LOAD_AVG_MAX defines. The other two generators are left disabled.
+ *
+ * Returns 0 so that callers (shell, make) see a well-defined success status;
+ * 'void main' leaves the exit status unspecified.
+ */
+int main(void)
+{
+	printf("/* Generated by Documentation/scheduler/sched-pelt; do not modify. */\n\n");
+	/* Per-period decay factor: y^HALFLIFE == 0.5. */
+	y = pow(0.5, 1/(double)HALFLIFE);
+	calc_runnable_avg_yN_inv();
+//	calc_runnable_avg_yN_sum();
+	calc_converged_max();
+//	calc_accumulated_sum_32();
+	return 0;
+}
--- a/arch/ia64/kernel/salinfo.c
+++ b/arch/ia64/kernel/salinfo.c
@@ -179,14 +179,14 @@ struct salinfo_platform_oemdata_parms {
 	const u8 *efi_guid;
 	u8 **oemdata;
 	u64 *oemdata_size;
-	int ret;
 };
-static void
+static long
 salinfo_platform_oemdata_cpu(void *context)
 {
 	struct salinfo_platform_oemdata_parms *parms = context;
-	parms->ret = salinfo_platform_oemdata(parms->efi_guid, parms->oemdata, parms->oemdata_size);
+	return salinfo_platform_oemdata(parms->efi_guid, parms->oemdata, parms->oemdata_size);
 }
 static void
@@ -380,16 +380,7 @@ salinfo_log_release(struct inode *inode, struct file *file)
 	return 0;
 }
-static void
+static long
-call_on_cpu(int cpu, void (*fn)(void *), void *arg)
-{
-	cpumask_t save_cpus_allowed = current->cpus_allowed;
-	set_cpus_allowed_ptr(current, cpumask_of(cpu));
-	(*fn)(arg);
-	set_cpus_allowed_ptr(current, &save_cpus_allowed);
-}
-static void
 salinfo_log_read_cpu(void *context)
 {
 	struct salinfo_data *data = context;
@@ -399,6 +390,7 @@ salinfo_log_read_cpu(void *context)
 	/* Clear corrected errors as they are read from SAL */
 	if (rh->severity == sal_log_severity_corrected)
 		ia64_sal_clear_state_info(data->type);
+	return 0;
 }
 static void
@@ -430,7 +422,7 @@ salinfo_log_new_read(int cpu, struct salinfo_data *data)
 	spin_unlock_irqrestore(&data_saved_lock, flags);
 	if (!data->saved_num)
-		call_on_cpu(cpu, salinfo_log_read_cpu, data);
+		work_on_cpu_safe(cpu, salinfo_log_read_cpu, data);
 	if (!data->log_size) {
 		data->state = STATE_NO_DATA;
 		cpumask_clear_cpu(cpu, &data->cpu_event);
@@ -459,11 +451,13 @@ salinfo_log_read(struct file *file, char __user *buffer, size_t count, loff_t *p
 	return simple_read_from_buffer(buffer, count, ppos, buf, bufsize);
 }
-static void
+static long
 salinfo_log_clear_cpu(void *context)
 {
 	struct salinfo_data *data = context;
 	ia64_sal_clear_state_info(data->type);
+	return 0;
 }
 static int
@@ -486,7 +480,7 @@ salinfo_log_clear(struct salinfo_data *data, int cpu)
 	rh = (sal_log_record_header_t *)(data->log_buffer);
 	/* Corrected errors have already been cleared from SAL */
 	if (rh->severity != sal_log_severity_corrected)
-		call_on_cpu(cpu, salinfo_log_clear_cpu, data);
+		work_on_cpu_safe(cpu, salinfo_log_clear_cpu, data);
 	/* clearing a record may make a new record visible */
 	salinfo_log_new_read(cpu, data);
 	if (data->state == STATE_LOG_RECORD) {
@@ -531,9 +525,8 @@ salinfo_log_write(struct file *file, const char __user *buffer, size_t count, lo
 				.oemdata = &data->oemdata,
 				.oemdata_size = &data->oemdata_size
 			};
-			call_on_cpu(cpu, salinfo_platform_oemdata_cpu, &parms);
+			count = work_on_cpu_safe(cpu, salinfo_platform_oemdata_cpu,
-			if (parms.ret)
+						 &parms);
-				count = parms.ret;
 		} else
 			data->oemdata_size = 0;
 	} else

--- a/arch/ia64/kernel/topology.c
+++ b/arch/ia64/kernel/topology.c
@@ -355,18 +355,12 @@ static int cache_add_dev(unsigned int cpu)
 	unsigned long i, j;
 	struct cache_info *this_object;
 	int retval = 0;
-	cpumask_t oldmask;
 	if (all_cpu_cache_info[cpu].kobj.parent)
 		return 0;
-	oldmask = current->cpus_allowed;
-	retval = set_cpus_allowed_ptr(current, cpumask_of(cpu));
-	if (unlikely(retval))
-		return retval;
 	retval = cpu_cache_sysfs_init(cpu);
-	set_cpus_allowed_ptr(current, &oldmask);
 	if (unlikely(retval < 0))
 		return retval;

--- a/arch/ia64/sn/kernel/sn2/sn_hwperf.c
+++ b/arch/ia64/sn/kernel/sn2/sn_hwperf.c
@@ -598,12 +598,17 @@ static void sn_hwperf_call_sal(void *info)
 	op_info->ret = r;
 }
+static long sn_hwperf_call_sal_work(void *info)
+{
+	sn_hwperf_call_sal(info);
+	return 0;
+}
 static int sn_hwperf_op_cpu(struct sn_hwperf_op_info *op_info)
 {
 	u32 cpu;
 	u32 use_ipi;
 	int r = 0;
-	cpumask_t save_allowed;
 	cpu = (op_info->a->arg & SN_HWPERF_ARG_CPU_MASK) >> 32;
 	use_ipi = op_info->a->arg & SN_HWPERF_ARG_USE_IPI_MASK;
@@ -629,13 +634,9 @@ static int sn_hwperf_op_cpu(struct sn_hwperf_op_info *op_info)
 			/* use an interprocessor interrupt to call SAL */
 			smp_call_function_single(cpu, sn_hwperf_call_sal,
 				op_info, 1);
-		}
+		} else {
-		else {
+			/* Call on the target CPU */
-			/* migrate the task before calling SAL */ 
+			work_on_cpu_safe(cpu, sn_hwperf_call_sal_work, op_info);
-			save_allowed = current->cpus_allowed;
-			set_cpus_allowed_ptr(current, cpumask_of(cpu));
-			sn_hwperf_call_sal(op_info);
-			set_cpus_allowed_ptr(current, &save_allowed);
 		}
 	}
 	r = op_info->ret;

--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -787,24 +787,21 @@ static struct sched_domain_topology_level powerpc_topology[] = {
 	{ NULL, },
 };
-void __init smp_cpus_done(unsigned int max_cpus)
+static __init long smp_setup_cpu_workfn(void *data __always_unused)
 {
-	cpumask_var_t old_mask;
+	smp_ops->setup_cpu(boot_cpuid);
+	return 0;
+}
-	/* We want the setup_cpu() here to be called from CPU 0, but our
+void __init smp_cpus_done(unsigned int max_cpus)
-	 * init thread may have been "borrowed" by another CPU in the meantime
+{
-	 * se we pin us down to CPU 0 for a short while
+	/*
+	 * We want the setup_cpu() here to be called on the boot CPU, but
+	 * init might run on any CPU, so make sure it's invoked on the boot
+	 * CPU.
 	 */
-	alloc_cpumask_var(&old_mask, GFP_NOWAIT);
-	cpumask_copy(old_mask, &current->cpus_allowed);
-	set_cpus_allowed_ptr(current, cpumask_of(boot_cpuid));
 	if (smp_ops && smp_ops->setup_cpu)
-		smp_ops->setup_cpu(boot_cpuid);
+		work_on_cpu_safe(boot_cpuid, smp_setup_cpu_workfn, NULL);
-	set_cpus_allowed_ptr(current, old_mask);
-	free_cpumask_var(old_mask);
 	if (smp_ops && smp_ops->bringup_done)
 		smp_ops->bringup_done();
@@ -812,7 +809,6 @@ void __init smp_cpus_done(unsigned int max_cpus)
 	dump_numa_cpu_topology();
 	set_sched_topology(powerpc_topology);
 }
 #ifdef CONFIG_HOTPLUG_CPU

--- a/arch/sparc/kernel/sysfs.c
+++ b/arch/sparc/kernel/sysfs.c
@@ -98,27 +98,7 @@ static struct attribute_group mmu_stat_group = {
 	.name = "mmu_stats",
 };
-/* XXX convert to rusty's on_one_cpu */
+static long read_mmustat_enable(void *data __maybe_unused)
-static unsigned long run_on_cpu(unsigned long cpu,
-			        unsigned long (*func)(unsigned long),
-				unsigned long arg)
-{
-	cpumask_t old_affinity;
-	unsigned long ret;
-	cpumask_copy(&old_affinity, &current->cpus_allowed);
-	/* should return -EINVAL to userspace */
-	if (set_cpus_allowed_ptr(current, cpumask_of(cpu)))
-		return 0;
-	ret = func(arg);
-	set_cpus_allowed_ptr(current, &old_affinity);
-	return ret;
-}
-static unsigned long read_mmustat_enable(unsigned long junk)
 {
 	unsigned long ra = 0;
@@ -127,11 +107,11 @@ static unsigned long read_mmustat_enable(unsigned long junk)
 	return ra != 0;
 }
-static unsigned long write_mmustat_enable(unsigned long val)
+static long write_mmustat_enable(void *data)
 {
-	unsigned long ra, orig_ra;
+	unsigned long ra, orig_ra, *val = data;
-	if (val)
+	if (*val)
 		ra = __pa(&per_cpu(mmu_stats, smp_processor_id()));
 	else
 		ra = 0UL;
@@ -142,7 +122,8 @@ static unsigned long write_mmustat_enable(unsigned long val)
 static ssize_t show_mmustat_enable(struct device *s,
 				struct device_attribute *attr, char *buf)
 {
-	unsigned long val = run_on_cpu(s->id, read_mmustat_enable, 0);
+	long val = work_on_cpu(s->id, read_mmustat_enable, NULL);
 	return sprintf(buf, "%lx\n", val);
 }
@@ -150,13 +131,15 @@ static ssize_t store_mmustat_enable(struct device *s,
 			struct device_attribute *attr, const char *buf,
 			size_t count)
 {
-	unsigned long val, err;
+	unsigned long val;
-	int ret = sscanf(buf, "%lu", &val);
+	long err;
+	int ret;
+	ret = sscanf(buf, "%lu", &val);
 	if (ret != 1)
 		return -EINVAL;
-	err = run_on_cpu(s->id, write_mmustat_enable, val);
+	err = work_on_cpu(s->id, write_mmustat_enable, &val);
 	if (err)
 		return -EIO;

--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -124,7 +124,7 @@ static bool smp_no_nmi_ipi = false;
 static void native_smp_send_reschedule(int cpu)
 {
 	if (unlikely(cpu_is_offline(cpu))) {
-		WARN_ON(1);
+		WARN(1, "sched: Unexpected reschedule of offline CPU#%d!\n", cpu);
 		return;
 	}
 	apic->send_IPI(cpu, RESCHEDULE_VECTOR);

--- a/drivers/acpi/processor_driver.c
+++ b/drivers/acpi/processor_driver.c
@@ -251,6 +251,9 @@ static int __acpi_processor_start(struct acpi_device *device)
 	if (ACPI_SUCCESS(status))
 		return 0;
+	result = -ENODEV;
+	acpi_pss_perf_exit(pr, device);
 err_power_exit:
 	acpi_processor_power_exit(pr);
 	return result;
@@ -259,11 +262,16 @@ static int __acpi_processor_start(struct acpi_device *device)
 static int acpi_processor_start(struct device *dev)
 {
 	struct acpi_device *device = ACPI_COMPANION(dev);
+	int ret;
 	if (!device)
 		return -ENODEV;
-	return __acpi_processor_start(device);
+	/* Protect against concurrent CPU hotplug operations */
+	get_online_cpus();
+	ret = __acpi_processor_start(device);
+	put_online_cpus();
+	return ret;
 }
 static int acpi_processor_stop(struct device *dev)

--- a/drivers/acpi/processor_throttling.c
+++ b/drivers/acpi/processor_throttling.c
@@ -62,8 +62,8 @@ struct acpi_processor_throttling_arg {
 #define THROTTLING_POSTCHANGE      (2)
 static int acpi_processor_get_throttling(struct acpi_processor *pr);
-int acpi_processor_set_throttling(struct acpi_processor *pr,
+static int __acpi_processor_set_throttling(struct acpi_processor *pr,
-						int state, bool force);
+					   int state, bool force, bool direct);
 static int acpi_processor_update_tsd_coord(void)
 {
@@ -891,7 +891,8 @@ static int acpi_processor_get_throttling_ptc(struct acpi_processor *pr)
 			ACPI_DEBUG_PRINT((ACPI_DB_INFO,
 				"Invalid throttling state, reset\n"));
 			state = 0;
-			ret = acpi_processor_set_throttling(pr, state, true);
+			ret = __acpi_processor_set_throttling(pr, state, true,
+							      true);
 			if (ret)
 				return ret;
 		}
@@ -901,36 +902,31 @@ static int acpi_processor_get_throttling_ptc(struct acpi_processor *pr)
 	return 0;
 }
-static int acpi_processor_get_throttling(struct acpi_processor *pr)
+static long __acpi_processor_get_throttling(void *data)
 {
-	cpumask_var_t saved_mask;
+	struct acpi_processor *pr = data;
-	int ret;
+	return pr->throttling.acpi_processor_get_throttling(pr);
+}
+static int acpi_processor_get_throttling(struct acpi_processor *pr)
+{
 	if (!pr)
 		return -EINVAL;
 	if (!pr->flags.throttling)
 		return -ENODEV;
-	if (!alloc_cpumask_var(&saved_mask, GFP_KERNEL))
-		return -ENOMEM;
 	/*
-	 * Migrate task to the cpu pointed by pr.
+	 * This is either called from the CPU hotplug callback of
+	 * processor_driver or via the ACPI probe function. In the latter
+	 * case the CPU is not guaranteed to be online. Both call sites are
+	 * protected against CPU hotplug.
 	 */
-	cpumask_copy(saved_mask, &current->cpus_allowed);
+	if (!cpu_online(pr->id))
-	/* FIXME: use work_on_cpu() */
-	if (set_cpus_allowed_ptr(current, cpumask_of(pr->id))) {
-		/* Can't migrate to the target pr->id CPU. Exit */
-		free_cpumask_var(saved_mask);
 		return -ENODEV;
-	}
-	ret = pr->throttling.acpi_processor_get_throttling(pr);
-	/* restore the previous state */
-	set_cpus_allowed_ptr(current, saved_mask);
-	free_cpumask_var(saved_mask);
-	return ret;
+	return work_on_cpu(pr->id, __acpi_processor_get_throttling, pr);
 }
 static int acpi_processor_get_fadt_info(struct acpi_processor *pr)
@@ -1080,8 +1076,15 @@ static long acpi_processor_throttling_fn(void *data)
 			arg->target_state, arg->force);
 }
-int acpi_processor_set_throttling(struct acpi_processor *pr,
+static int call_on_cpu(int cpu, long (*fn)(void *), void *arg, bool direct)
-						int state, bool force)
+{
+	if (direct)
+		return fn(arg);
+	return work_on_cpu(cpu, fn, arg);
+}
+static int __acpi_processor_set_throttling(struct acpi_processor *pr,
+					   int state, bool force, bool direct)
 {
 	int ret = 0;
 	unsigned int i;
@@ -1130,7 +1133,8 @@ int acpi_processor_set_throttling(struct acpi_processor *pr,
 		arg.pr = pr;
 		arg.target_state = state;
 		arg.force = force;
-		ret = work_on_cpu(pr->id, acpi_processor_throttling_fn, &arg);
+		ret = call_on_cpu(pr->id, acpi_processor_throttling_fn, &arg,
+				  direct);
 	} else {
 		/*
 		 * When the T-state coordination is SW_ALL or HW_ALL,
@@ -1163,8 +1167,8 @@ int acpi_processor_set_throttling(struct acpi_processor *pr,
 			arg.pr = match_pr;
 			arg.target_state = state;
 			arg.force = force;
-			ret = work_on_cpu(pr->id, acpi_processor_throttling_fn,
+			ret = call_on_cpu(pr->id, acpi_processor_throttling_fn,
-				&arg);
+					  &arg, direct);
 		}
 	}
 	/*
@@ -1182,6 +1186,12 @@ int acpi_processor_set_throttling(struct acpi_processor *pr,
 	return ret;
 }
+int acpi_processor_set_throttling(struct acpi_processor *pr, int state,
+				  bool force)
+{
+	return __acpi_processor_set_throttling(pr, state, force, false);
+}
 int acpi_processor_get_throttling_info(struct acpi_processor *pr)
 {
 	int result = 0;

--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -381,7 +381,7 @@ static int sock_xmit(struct nbd_device *nbd, int index, int send,
 			*sent += result;
 	} while (msg_data_left(&msg));
-	tsk_restore_flags(current, pflags, PF_MEMALLOC);
+	current_restore_flags(pflags, PF_MEMALLOC);
 	return result;
 }

--- a/drivers/cpufreq/ia64-acpi-cpufreq.c
+++ b/drivers/cpufreq/ia64-acpi-cpufreq.c
@@ -34,6 +34,11 @@ struct cpufreq_acpi_io {
 	unsigned int				resume;
 };
+struct cpufreq_acpi_req {
+	unsigned int		cpu;
+	unsigned int		state;
+};
 static struct cpufreq_acpi_io	*acpi_io_data[NR_CPUS];
 static struct cpufreq_driver acpi_cpufreq_driver;
@@ -83,8 +88,7 @@ processor_get_pstate (
 static unsigned
 extract_clock (
 	struct cpufreq_acpi_io *data,
-	unsigned value,
+	unsigned value)
-	unsigned int cpu)
 {
 	unsigned long i;
@@ -98,60 +102,43 @@ extract_clock (
 }
-static unsigned int
+static long
 processor_get_freq (
-	struct cpufreq_acpi_io	*data,
+	void *arg)
-	unsigned int		cpu)
 {
-	int			ret = 0;
+	struct cpufreq_acpi_req *req = arg;
-	u32			value = 0;
+	unsigned int		cpu = req->cpu;
-	cpumask_t		saved_mask;
+	struct cpufreq_acpi_io	*data = acpi_io_data[cpu];
-	unsigned long 		clock_freq;
+	u32			value;
+	int			ret;
 	pr_debug("processor_get_freq\n");
-	saved_mask = current->cpus_allowed;
-	set_cpus_allowed_ptr(current, cpumask_of(cpu));
 	if (smp_processor_id() != cpu)
-		goto migrate_end;
+		return -EAGAIN;
 	/* processor_get_pstate gets the instantaneous frequency */
 	ret = processor_get_pstate(&value);
 	if (ret) {
-		set_cpus_allowed_ptr(current, &saved_mask);
 		pr_warn("get performance failed with error %d\n", ret);
-		ret = 0;
-		goto migrate_end;
-	}
-	clock_freq = extract_clock(data, value, cpu);
-	ret = (clock_freq*1000);
-migrate_end:
-	set_cpus_allowed_ptr(current, &saved_mask);
 		return ret;
+	}
+	return 1000 * extract_clock(data, value);
 }
-static int
+static long
 processor_set_freq (
-	struct cpufreq_acpi_io	*data,
+	void *arg)
-	struct cpufreq_policy   *policy,
-	int			state)
 {
-	int			ret = 0;
+	struct cpufreq_acpi_req *req = arg;
-	u32			value = 0;
+	unsigned int		cpu = req->cpu;
-	cpumask_t		saved_mask;
+	struct cpufreq_acpi_io	*data = acpi_io_data[cpu];
-	int			retval;
+	int			ret, state = req->state;
+	u32			value;
 	pr_debug("processor_set_freq\n");
+	if (smp_processor_id() != cpu)
-	saved_mask = current->cpus_allowed;
+		return -EAGAIN;
-	set_cpus_allowed_ptr(current, cpumask_of(policy->cpu));
-	if (smp_processor_id() != policy->cpu) {
-		retval = -EAGAIN;
-		goto migrate_end;
-	}
 	if (state == data->acpi_data.state) {
 		if (unlikely(data->resume)) {
@@ -159,8 +146,7 @@ processor_set_freq (
 			data->resume = 0;
 		} else {
 			pr_debug("Already at target state (P%d)\n", state);
-			retval = 0;
+			return 0;
-			goto migrate_end;
 		}
 	}
@@ -171,7 +157,6 @@ processor_set_freq (
 	 * First we write the target state's 'control' value to the
 	 * control_register.
 	 */
 	value = (u32) data->acpi_data.states[state].control;
 	pr_debug("Transitioning to state: 0x%08x\n", value);
@@ -179,17 +164,11 @@ processor_set_freq (
 	ret = processor_set_pstate(value);
 	if (ret) {
 		pr_warn("Transition failed with error %d\n", ret);
-		retval = -ENODEV;
+		return -ENODEV;
-		goto migrate_end;
 	}
 	data->acpi_data.state = state;
+	return 0;
-	retval = 0;
-migrate_end:
-	set_cpus_allowed_ptr(current, &saved_mask);
-	return (retval);
 }
@@ -197,11 +176,13 @@ static unsigned int
 acpi_cpufreq_get (
 	unsigned int		cpu)
 {
-	struct cpufreq_acpi_io *data = acpi_io_data[cpu];
+	struct cpufreq_acpi_req req;
+	long ret;
-	pr_debug("acpi_cpufreq_get\n");
+	req.cpu = cpu;
+	ret = work_on_cpu(cpu, processor_get_freq, &req);
-	return processor_get_freq(data, cpu);
+	return ret > 0 ? (unsigned int) ret : 0;
 }
@@ -210,7 +191,12 @@ acpi_cpufreq_target (
 	struct cpufreq_policy   *policy,
 	unsigned int index)
 {
-	return processor_set_freq(acpi_io_data[policy->cpu], policy, index);
+	struct cpufreq_acpi_req req;
+	req.cpu = policy->cpu;
+	req.state = index;
+	return work_on_cpu(req.cpu, processor_set_freq, &req);
 }
 static int

--- a/drivers/cpufreq/sh-cpufreq.c
+++ b/drivers/cpufreq/sh-cpufreq.c
@@ -30,54 +30,63 @@
 static DEFINE_PER_CPU(struct clk, sh_cpuclk);
+struct cpufreq_target {
+	struct cpufreq_policy	*policy;
+	unsigned int		freq;
+};
 static unsigned int sh_cpufreq_get(unsigned int cpu)
 {
 	return (clk_get_rate(&per_cpu(sh_cpuclk, cpu)) + 500) / 1000;
 }
-/*
+static long __sh_cpufreq_target(void *arg)
- * Here we notify other drivers of the proposed change and the final change.
- */
-static int sh_cpufreq_target(struct cpufreq_policy *policy,
-			     unsigned int target_freq,
-			     unsigned int relation)
 {
-	unsigned int cpu = policy->cpu;
+	struct cpufreq_target *target = arg;
+	struct cpufreq_policy *policy = target->policy;
+	int cpu = policy->cpu;
 	struct clk *cpuclk = &per_cpu(sh_cpuclk, cpu);
-	cpumask_t cpus_allowed;
 	struct cpufreq_freqs freqs;
 	struct device *dev;
 	long freq;
-	cpus_allowed = current->cpus_allowed;
+	if (smp_processor_id() != cpu)
-	set_cpus_allowed_ptr(current, cpumask_of(cpu));
+		return -ENODEV;
-	BUG_ON(smp_processor_id() != cpu);
 	dev = get_cpu_device(cpu);
 	/* Convert target_freq from kHz to Hz */
-	freq = clk_round_rate(cpuclk, target_freq * 1000);
+	freq = clk_round_rate(cpuclk, target->freq * 1000);
 	if (freq < (policy->min * 1000) || freq > (policy->max * 1000))
 		return -EINVAL;
-	dev_dbg(dev, "requested frequency %u Hz\n", target_freq * 1000);
+	dev_dbg(dev, "requested frequency %u Hz\n", target->freq * 1000);
 	freqs.old	= sh_cpufreq_get(cpu);
 	freqs.new	= (freq + 500) / 1000;
 	freqs.flags	= 0;
-	cpufreq_freq_transition_begin(policy, &freqs);
+	cpufreq_freq_transition_begin(target->policy, &freqs);
-	set_cpus_allowed_ptr(current, &cpus_allowed);
 	clk_set_rate(cpuclk, freq);
-	cpufreq_freq_transition_end(policy, &freqs, 0);
+	cpufreq_freq_transition_end(target->policy, &freqs, 0);
 	dev_dbg(dev, "set frequency %lu Hz\n", freq);
 	return 0;
 }
+/*
+ * Here we notify other drivers of the proposed change and the final change.
+ */
+static int sh_cpufreq_target(struct cpufreq_policy *policy,
+			     unsigned int target_freq,
+			     unsigned int relation)
+{
+	struct cpufreq_target data = { .policy = policy, .freq = target_freq };
+	return work_on_cpu(policy->cpu, __sh_cpufreq_target, &data);
+}
 static int sh_cpufreq_verify(struct cpufreq_policy *policy)
 {
 	struct clk *cpuclk = &per_cpu(sh_cpuclk, policy->cpu);

--- a/drivers/cpufreq/sparc-us2e-cpufreq.c
+++ b/drivers/cpufreq/sparc-us2e-cpufreq.c
@@ -118,10 +118,6 @@ static void us2e_transition(unsigned long estar, unsigned long new_bits,
 			    unsigned long clock_tick,
 			    unsigned long old_divisor, unsigned long divisor)
 {
-	unsigned long flags;
-	local_irq_save(flags);
 	estar &= ~ESTAR_MODE_DIV_MASK;
 	/* This is based upon the state transition diagram in the IIe manual.  */
@@ -152,8 +148,6 @@ static void us2e_transition(unsigned long estar, unsigned long new_bits,
 	} else {
 		BUG();
 	}
-	local_irq_restore(flags);
 }
 static unsigned long index_to_estar_mode(unsigned int index)
@@ -229,48 +223,51 @@ static unsigned long estar_to_divisor(unsigned long estar)
 	return ret;
 }
+static void __us2e_freq_get(void *arg)
+{
+	unsigned long *estar = arg;
+	*estar = read_hbreg(HBIRD_ESTAR_MODE_ADDR);
+}
 static unsigned int us2e_freq_get(unsigned int cpu)
 {
-	cpumask_t cpus_allowed;
 	unsigned long clock_tick, estar;
-	cpumask_copy(&cpus_allowed, &current->cpus_allowed);
-	set_cpus_allowed_ptr(current, cpumask_of(cpu));
 	clock_tick = sparc64_get_clock_tick(cpu) / 1000;
-	estar = read_hbreg(HBIRD_ESTAR_MODE_ADDR);
+	if (smp_call_function_single(cpu, __us2e_freq_get, &estar, 1))
+		return 0;
-	set_cpus_allowed_ptr(current, &cpus_allowed);
 	return clock_tick / estar_to_divisor(estar);
 }
-static int us2e_freq_target(struct cpufreq_policy *policy, unsigned int index)
+static void __us2e_freq_target(void *arg)
 {
-	unsigned int cpu = policy->cpu;
+	unsigned int cpu = smp_processor_id();
+	unsigned int *index = arg;
 	unsigned long new_bits, new_freq;
 	unsigned long clock_tick, divisor, old_divisor, estar;
-	cpumask_t cpus_allowed;
-	cpumask_copy(&cpus_allowed, &current->cpus_allowed);
-	set_cpus_allowed_ptr(current, cpumask_of(cpu));
 	new_freq = clock_tick = sparc64_get_clock_tick(cpu) / 1000;
-	new_bits = index_to_estar_mode(index);
+	new_bits = index_to_estar_mode(*index);
-	divisor = index_to_divisor(index);
+	divisor = index_to_divisor(*index);
 	new_freq /= divisor;
 	estar = read_hbreg(HBIRD_ESTAR_MODE_ADDR);
 	old_divisor = estar_to_divisor(estar);
-	if (old_divisor != divisor)
+	if (old_divisor != divisor) {
 		us2e_transition(estar, new_bits, clock_tick * 1000,
 				old_divisor, divisor);
+	}
+}
-	set_cpus_allowed_ptr(current, &cpus_allowed);
+static int us2e_freq_target(struct cpufreq_policy *policy, unsigned int index)
+{
+	unsigned int cpu = policy->cpu;
-	return 0;
+	return smp_call_function_single(cpu, __us2e_freq_target, &index, 1);
 }
 static int __init us2e_freq_cpu_init(struct cpufreq_policy *policy)

--- a/drivers/cpufreq/sparc-us3-cpufreq.c
+++ b/drivers/cpufreq/sparc-us3-cpufreq.c
@@ -35,22 +35,28 @@ static struct us3_freq_percpu_info *us3_freq_table;
 #define SAFARI_CFG_DIV_32	0x0000000080000000UL
 #define SAFARI_CFG_DIV_MASK	0x00000000C0000000UL
-static unsigned long read_safari_cfg(void)
+static void read_safari_cfg(void *arg)
 {
-	unsigned long ret;
+	unsigned long ret, *val = arg;
 	__asm__ __volatile__("ldxa	[%%g0] %1, %0"
 			     : "=&r" (ret)
 			     : "i" (ASI_SAFARI_CONFIG));
-	return ret;
+	*val = ret;
 }
-static void write_safari_cfg(unsigned long val)
+static void update_safari_cfg(void *arg)
 {
+	unsigned long reg, *new_bits = arg;
+	read_safari_cfg(&reg);
+	reg &= ~SAFARI_CFG_DIV_MASK;
+	reg |= *new_bits;
 	__asm__ __volatile__("stxa	%0, [%%g0] %1\n\t"
 			     "membar	#Sync"
 			     : /* no outputs */
-			     : "r" (val), "i" (ASI_SAFARI_CONFIG)
+			     : "r" (reg), "i" (ASI_SAFARI_CONFIG)
 			     : "memory");
 }
@@ -78,29 +84,17 @@ static unsigned long get_current_freq(unsigned int cpu, unsigned long safari_cfg
 static unsigned int us3_freq_get(unsigned int cpu)
 {
-	cpumask_t cpus_allowed;
 	unsigned long reg;
-	unsigned int ret;
-	cpumask_copy(&cpus_allowed, &current->cpus_allowed);
-	set_cpus_allowed_ptr(current, cpumask_of(cpu));
-	reg = read_safari_cfg();
-	ret = get_current_freq(cpu, reg);
-	set_cpus_allowed_ptr(current, &cpus_allowed);
-	return ret;
+	if (smp_call_function_single(cpu, read_safari_cfg, &reg, 1))
+		return 0;
+	return get_current_freq(cpu, reg);
 }
 static int us3_freq_target(struct cpufreq_policy *policy, unsigned int index)
 {
 	unsigned int cpu = policy->cpu;
-	unsigned long new_bits, new_freq, reg;
+	unsigned long new_bits, new_freq;
-	cpumask_t cpus_allowed;
-	cpumask_copy(&cpus_allowed, &current->cpus_allowed);
-	set_cpus_allowed_ptr(current, cpumask_of(cpu));
 	new_freq = sparc64_get_clock_tick(cpu) / 1000;
 	switch (index) {
@@ -121,15 +115,7 @@ static int us3_freq_target(struct cpufreq_policy *policy, unsigned int index)
 		BUG();
 	}
-	reg = read_safari_cfg();
+	return smp_call_function_single(cpu, update_safari_cfg, &new_bits, 1);
-	reg &= ~SAFARI_CFG_DIV_MASK;
-	reg |= new_bits;
-	write_safari_cfg(reg);
-	set_cpus_allowed_ptr(current, &cpus_allowed);
-	return 0;
 }
 static int __init us3_freq_cpu_init(struct cpufreq_policy *policy)

--- a/drivers/crypto/n2_core.c
+++ b/drivers/crypto/n2_core.c
@@ -65,6 +65,11 @@ struct spu_queue {
 	struct list_head	list;
 };
+struct spu_qreg {
+	struct spu_queue	*queue;
+	unsigned long		type;
+};
 static struct spu_queue **cpu_to_cwq;
 static struct spu_queue **cpu_to_mau;
@@ -1631,31 +1636,27 @@ static void queue_cache_destroy(void)
 	kmem_cache_destroy(queue_cache[HV_NCS_QTYPE_CWQ - 1]);
 }
-static int spu_queue_register(struct spu_queue *p, unsigned long q_type)
+static long spu_queue_register_workfn(void *arg)
 {
-	cpumask_var_t old_allowed;
+	struct spu_qreg *qr = arg;
+	struct spu_queue *p = qr->queue;
+	unsigned long q_type = qr->type;
 	unsigned long hv_ret;
-	if (cpumask_empty(&p->sharing))
-		return -EINVAL;
-	if (!alloc_cpumask_var(&old_allowed, GFP_KERNEL))
-		return -ENOMEM;
-	cpumask_copy(old_allowed, &current->cpus_allowed);
-	set_cpus_allowed_ptr(current, &p->sharing);
 	hv_ret = sun4v_ncs_qconf(q_type, __pa(p->q),
 				 CWQ_NUM_ENTRIES, &p->qhandle);
 	if (!hv_ret)
 		sun4v_ncs_sethead_marker(p->qhandle, 0);
-	set_cpus_allowed_ptr(current, old_allowed);
+	return hv_ret ? -EINVAL : 0;
+}
-	free_cpumask_var(old_allowed);
+static int spu_queue_register(struct spu_queue *p, unsigned long q_type)
+{
+	int cpu = cpumask_any_and(&p->sharing, cpu_online_mask);
+	struct spu_qreg qr = { .queue = p, .type = q_type };
-	return (hv_ret ? -EINVAL : 0);
+	return work_on_cpu_safe(cpu, spu_queue_register_workfn, &qr);
 }
 static int spu_queue_setup(struct spu_queue *p)

--- a/drivers/scsi/iscsi_tcp.c
+++ b/drivers/scsi/iscsi_tcp.c
@@ -387,7 +387,7 @@ static int iscsi_sw_tcp_pdu_xmit(struct iscsi_task *task)
 		rc = 0;
 	}
-	tsk_restore_flags(current, pflags, PF_MEMALLOC);
+	current_restore_flags(pflags, PF_MEMALLOC);
 	return rc;
 }

--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1004,7 +1004,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 	else
 		err = nfserrno(host_err);
 	if (test_bit(RQ_LOCAL, &rqstp->rq_flags))
-		tsk_restore_flags(current, pflags, PF_LESS_THROTTLE);
+		current_restore_flags(pflags, PF_LESS_THROTTLE);
 	return err;
 }

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1290,10 +1290,10 @@ TASK_PFA_TEST(LMK_WAITING, lmk_waiting)
 TASK_PFA_SET(LMK_WAITING, lmk_waiting)
 static inline void
-tsk_restore_flags(struct task_struct *task, unsigned long orig_flags, unsigned long flags)
+current_restore_flags(unsigned long orig_flags, unsigned long flags)
 {
-	task->flags &= ~flags;
+	current->flags &= ~flags;	/* clear the bits selected by @flags ... */
-	task->flags |= orig_flags & flags;
+	current->flags |= orig_flags & flags;	/* ... then set them back to their @orig_flags values */
 }
 extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);

--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -608,8 +608,13 @@ static inline long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
 {
 	return fn(arg);
 }
+static inline long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg)
+{
+	return fn(arg);	/* @cpu is unused in this variant: @fn is called directly */
+}
 #else
 long work_on_cpu(int cpu, long (*fn)(void *), void *arg);
+long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg);
 #endif /* CONFIG_SMP */
 #ifdef CONFIG_FREEZER

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -56,6 +56,13 @@ SCHED_FEAT(TTWU_QUEUE, true)
 */
 SCHED_FEAT(SIS_AVG_CPU, false)
+/*
+ * Issue a WARN when we do multiple update_rq_clock() calls
+ * in a single rq->lock section. Default disabled because the
+ * annotations are not complete.
+ */
+SCHED_FEAT(WARN_DOUBLE_CLOCK, false)
 #ifdef HAVE_RT_PUSH_IPI
 /*
 * In order to avoid a thundering herd attack of CPUs that are

--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1927,6 +1927,87 @@ static int find_next_push_cpu(struct rq *rq)
 #define RT_PUSH_IPI_EXECUTING		1
 #define RT_PUSH_IPI_RESTART		2
+/*
+ * When a high priority task schedules out from a CPU and a lower priority
+ * task is scheduled in, a check is made to see if there are any RT tasks
+ * on other CPUs that are waiting to run because a higher priority RT task
+ * is currently running on its CPU. In this case, the CPU with multiple RT
+ * tasks queued on it (overloaded) needs to be notified that a CPU has opened
+ * up that may be able to run one of its non-running queued RT tasks.
+ *
+ * On large CPU boxes, there's the case that several CPUs could schedule
+ * a lower priority task at the same time, in which case it will look for
+ * any overloaded CPUs that it could pull a task from. To do this, the runqueue
+ * lock must be taken from that overloaded CPU. Having 10s of CPUs all fighting
+ * for a single overloaded CPU's runqueue lock can produce a large latency.
+ * (This has actually been observed on large boxes running cyclictest).
+ * Instead of taking the runqueue lock of the overloaded CPU, each of the
+ * CPUs that scheduled a lower priority task simply sends an IPI to the
+ * overloaded CPU. An IPI is much cheaper than taking a runqueue lock with
+ * lots of contention. The overloaded CPU will look to push its non-running
+ * RT task off, and if it does, it can then ignore the other IPIs coming
+ * in, and just pass those IPIs off to any other overloaded CPU.
+ *
+ * When a CPU schedules a lower priority task, it only sends an IPI to
+ * the "next" CPU that has overloaded RT tasks. This prevents IPI storms,
+ * as having 10 CPUs scheduling lower priority tasks and 10 CPUs with
+ * RT overloaded tasks, would cause 100 IPIs to go out at once.
+ *
+ * The overloaded RT CPU, when receiving an IPI, will try to push off its
+ * overloaded RT tasks and then send an IPI to the next CPU that has
+ * overloaded RT tasks. This stops when all CPUs with overloaded RT tasks
+ * have completed. Just because a CPU may have pushed off its own overloaded
+ * RT task does not mean it should stop sending the IPI around to other
+ * overloaded CPUs. There may be another RT task waiting to run on one of
+ * those CPUs that is of higher priority than the one that was just
+ * pushed.
+ *
+ * An optimization that could possibly be made is to make a CPU array similar
+ * to the cpupri array mask of all running RT tasks, but for the overloaded
+ * case, then the IPI could be sent to only the CPU with the highest priority
+ * RT task waiting, and that CPU could send off further IPIs to the CPU with
+ * the next highest waiting task. Since the overloaded case is much less likely
+ * to happen, the complexity of this implementation may not be worth it.
+ * Instead, just send an IPI around to all overloaded CPUs.
+ *
+ * The rq->rt.push_flags holds the status of the IPI that is going around.
+ * A run queue can only send out a single IPI at a time. The possible flags
+ * for rq->rt.push_flags are:
+ *
+ *    (None or zero):		No IPI is going around for the current rq
+ *    RT_PUSH_IPI_EXECUTING:	An IPI for the rq is being passed around
+ *    RT_PUSH_IPI_RESTART:	The priority of the running task for the rq
+ *				has changed, and the IPI should restart
+ *				circulating the overloaded CPUs again.
+ *
+ * rq->rt.push_cpu contains the CPU that is being sent the IPI. It is updated
+ * before sending to the next CPU.
+ *
+ * Instead of having all CPUs that schedule a lower priority task send
+ * an IPI to the same "first" CPU in the RT overload mask, they send it
+ * to the next overloaded CPU after their own CPU. This helps distribute
+ * the work when there's more than one overloaded CPU and multiple CPUs
+ * scheduling in lower priority tasks.
+ *
+ * When a rq schedules a lower priority task than what was currently
+ * running, the next CPU with overloaded RT tasks is examined first.
+ * That is, if CPU 1 and 5 are overloaded, and CPU 3 schedules a lower
+ * priority task, it will send an IPI first to CPU 5, then CPU 5 will
+ * send to CPU 1 if it is still overloaded. CPU 1 will clear the
+ * rq->rt.push_flags if RT_PUSH_IPI_RESTART is not set.
+ *
+ * The first CPU to notice IPI_RESTART is set, will clear that flag and then
+ * send an IPI to the next overloaded CPU after the rq->cpu and not the next
+ * CPU after push_cpu. That is, if CPU 1, 4 and 5 are overloaded when CPU 3
+ * schedules a lower priority task, and the IPI_RESTART gets set while the
+ * handling is being done on CPU 5, it will clear the flag and send it back to
+ * CPU 4 instead of CPU 1.
+ *
+ * Note, the above logic can be disabled by turning off the sched_feature
+ * RT_PUSH_IPI. Then the rq lock of the overloaded CPU will simply be
+ * taken by the CPU requesting a pull and the waiting RT task will be pulled
+ * by that CPU. This may be fine for machines with few CPUs.
+ */
 static void tell_cpu_to_push(struct rq *rq)
 {
 	int cpu;

--- a/kernel/sched/sched-pelt.h
+++ b/kernel/sched/sched-pelt.h
+/* Generated by Documentation/scheduler/sched-pelt; do not modify. */
+static const u32 runnable_avg_yN_inv[] = {
+	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
+	0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
+	0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
+	0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
+	0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
+	0x85aac367, 0x82cd8698,
+};
+#define LOAD_AVG_PERIOD 32
+#define LOAD_AVG_MAX 47742
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1331,15 +1331,17 @@ extern const u32 sched_prio_to_wmult[40];
 #define DEQUEUE_SLEEP		0x01
 #define DEQUEUE_SAVE		0x02 /* matches ENQUEUE_RESTORE */
 #define DEQUEUE_MOVE		0x04 /* matches ENQUEUE_MOVE */
+#define DEQUEUE_NOCLOCK		0x08 /* matches ENQUEUE_NOCLOCK */
 #define ENQUEUE_WAKEUP		0x01
 #define ENQUEUE_RESTORE		0x02
 #define ENQUEUE_MOVE		0x04
+#define ENQUEUE_NOCLOCK		0x08 /* matches DEQUEUE_NOCLOCK */
-#define ENQUEUE_HEAD		0x08
+#define ENQUEUE_HEAD		0x10
-#define ENQUEUE_REPLENISH	0x10
+#define ENQUEUE_REPLENISH	0x20
 #ifdef CONFIG_SMP
-#define ENQUEUE_MIGRATED	0x20
+#define ENQUEUE_MIGRATED	0x40
 #else
 #define ENQUEUE_MIGRATED	0x00
 #endif
@@ -1624,6 +1626,7 @@ static inline void sched_avg_update(struct rq *rq) { }
 struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
 	__acquires(rq->lock);
 struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
 	__acquires(p->pi_lock)
 	__acquires(rq->lock);
@@ -1645,6 +1648,62 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
 	raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
 }
+static inline void
+rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
+	__acquires(rq->lock)
+{
+	raw_spin_lock_irqsave(&rq->lock, rf->flags);	/* rf->flags is handed to the irqsave variant */
+	rq_pin_lock(rq, rf);	/* pin only once rq->lock is held */
+}
+static inline void
+rq_lock_irq(struct rq *rq, struct rq_flags *rf)
+	__acquires(rq->lock)
+{
+	raw_spin_lock_irq(&rq->lock);	/* _irq variant: rf->flags is not passed here */
+	rq_pin_lock(rq, rf);	/* pin only once rq->lock is held */
+}
+static inline void
+rq_lock(struct rq *rq, struct rq_flags *rf)
+	__acquires(rq->lock)
+{
+	raw_spin_lock(&rq->lock);	/* plain variant: no IRQ-state argument */
+	rq_pin_lock(rq, rf);	/* pin only once rq->lock is held */
+}
+static inline void
+rq_relock(struct rq *rq, struct rq_flags *rf)
+	__acquires(rq->lock)
+{
+	raw_spin_lock(&rq->lock);
+	rq_repin_lock(rq, rf);	/* repin here, where rq_lock() calls rq_pin_lock() */
+}
+static inline void
+rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf)
+	__releases(rq->lock)
+{
+	rq_unpin_lock(rq, rf);	/* unpin while rq->lock is still held */
+	raw_spin_unlock_irqrestore(&rq->lock, rf->flags);	/* same rf->flags field rq_lock_irqsave() passes */
+}
+static inline void
+rq_unlock_irq(struct rq *rq, struct rq_flags *rf)
+	__releases(rq->lock)
+{
+	rq_unpin_lock(rq, rf);	/* unpin while rq->lock is still held */
+	raw_spin_unlock_irq(&rq->lock);	/* counterpart of rq_lock_irq() */
+}
+static inline void
+rq_unlock(struct rq *rq, struct rq_flags *rf)
+	__releases(rq->lock)
+{
+	rq_unpin_lock(rq, rf);	/* unpin while rq->lock is still held */
+	raw_spin_unlock(&rq->lock);	/* counterpart of rq_lock() / rq_relock() */
+}
 #ifdef CONFIG_SMP
 #ifdef CONFIG_PREEMPT

--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -309,7 +309,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
 	account_irq_exit_time(current);
 	__local_bh_enable(SOFTIRQ_OFFSET);
 	WARN_ON_ONCE(in_interrupt());
-	tsk_restore_flags(current, old_flags, PF_MEMALLOC);
+	current_restore_flags(old_flags, PF_MEMALLOC);
 }
 asmlinkage __visible void do_softirq(void)

--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4734,6 +4734,29 @@ long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
 	return wfc.ret;
 }
 EXPORT_SYMBOL_GPL(work_on_cpu);
+/**
+ * work_on_cpu_safe - run a function in thread context on a particular cpu
+ * @cpu: the cpu to run on
+ * @fn:  the function to run
+ * @arg: the function argument
+ *
+ * Disables CPU hotplug and calls work_on_cpu(). The caller must not hold
+ * any locks which would prevent @fn from completing.
+ *
+ * Return: The value @fn returns, or -ENODEV if @cpu is not online.
+ */
+long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg)
+{
+	long ret = -ENODEV;	/* result when @cpu is not online */
+	get_online_cpus();
+	if (cpu_online(cpu))	/* checked between get/put_online_cpus() */
+		ret = work_on_cpu(cpu, fn, arg);
+	put_online_cpus();
+	return ret;
+}
+EXPORT_SYMBOL_GPL(work_on_cpu_safe);
 #endif /* CONFIG_SMP */
 #ifdef CONFIG_FREEZER

--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4243,7 +4243,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
 		 */
 		current->flags |= PF_MEMALLOC;
 		ret = __netif_receive_skb_core(skb, true);
-		tsk_restore_flags(current, pflags, PF_MEMALLOC);
+		current_restore_flags(pflags, PF_MEMALLOC);
 	} else
 		ret = __netif_receive_skb_core(skb, false);

--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -325,7 +325,7 @@ int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 	current->flags |= PF_MEMALLOC;
 	ret = sk->sk_backlog_rcv(sk, skb);
-	tsk_restore_flags(current, pflags, PF_MEMALLOC);
+	current_restore_flags(pflags, PF_MEMALLOC);
 	return ret;
 }