Commit 97db62cc authored by Ingo Molnar's avatar Ingo Molnar Committed by Linus Torvalds

[PATCH] scheduler fixes

 - introduce a new type of context-switch locking; this is a must-have
   for ia64 and sparc64.

 - load_balance() bug noticed by Scott Rhine and myself: scan the
   whole list to find 'imbalance' number of tasks to pull, not just
   the tail of the list.

 - sched_yield() fix: use current->array not rq->active.
parent 9e7cec88
...@@ -11,11 +11,6 @@ ...@@ -11,11 +11,6 @@
struct task_struct; /* one of the stranger aspects of C forward declarations.. */ struct task_struct; /* one of the stranger aspects of C forward declarations.. */
extern void FASTCALL(__switch_to(struct task_struct *prev, struct task_struct *next)); extern void FASTCALL(__switch_to(struct task_struct *prev, struct task_struct *next));
/*
 * Scheduler hook macros: the two schedule-level hooks and
 * prepare_arch_switch() expand to an empty statement, while
 * finish_arch_switch() releases the runqueue lock with spin_unlock_irq().
 */
#define prepare_arch_schedule(prev) do { } while(0)
#define finish_arch_schedule(prev) do { } while(0)
#define prepare_arch_switch(rq) do { } while(0)
#define finish_arch_switch(rq) spin_unlock_irq(&(rq)->lock)
#define switch_to(prev,next,last) do { \ #define switch_to(prev,next,last) do { \
asm volatile("pushl %%esi\n\t" \ asm volatile("pushl %%esi\n\t" \
"pushl %%edi\n\t" \ "pushl %%edi\n\t" \
......
...@@ -83,11 +83,6 @@ extern void cacheable_memzero(void *p, unsigned int nb); ...@@ -83,11 +83,6 @@ extern void cacheable_memzero(void *p, unsigned int nb);
struct device_node; struct device_node;
extern void note_scsi_host(struct device_node *, void *); extern void note_scsi_host(struct device_node *, void *);
/*
 * Scheduler hook macros: the two schedule-level hooks and
 * prepare_arch_switch() expand to an empty statement, while
 * finish_arch_switch() releases the runqueue lock with spin_unlock_irq().
 */
#define prepare_arch_schedule(prev) do { } while(0)
#define finish_arch_schedule(prev) do { } while(0)
#define prepare_arch_switch(rq) do { } while(0)
#define finish_arch_switch(rq) spin_unlock_irq(&(rq)->lock)
struct task_struct; struct task_struct;
extern void __switch_to(struct task_struct *, struct task_struct *); extern void __switch_to(struct task_struct *, struct task_struct *);
#define switch_to(prev, next, last) __switch_to((prev), (next)) #define switch_to(prev, next, last) __switch_to((prev), (next))
......
...@@ -18,11 +18,6 @@ ...@@ -18,11 +18,6 @@
#endif #endif
#include <linux/kernel.h> #include <linux/kernel.h>
/*
 * Scheduler hook macros: the two schedule-level hooks and
 * prepare_arch_switch() expand to an empty statement, while
 * finish_arch_switch() releases the runqueue lock with spin_unlock_irq().
 */
#define prepare_arch_schedule(prev) do { } while (0)
#define finish_arch_schedule(prev) do { } while (0)
#define prepare_arch_switch(rq) do { } while (0)
#define finish_arch_switch(rq) spin_unlock_irq(&(rq)->lock)
#define switch_to(prev,next,last) do { \ #define switch_to(prev,next,last) do { \
if (prev == next) \ if (prev == next) \
break; \ break; \
......
...@@ -18,11 +18,6 @@ ...@@ -18,11 +18,6 @@
#endif #endif
#include <linux/kernel.h> #include <linux/kernel.h>
/*
 * Scheduler hook macros: the two schedule-level hooks and
 * prepare_arch_switch() expand to an empty statement, while
 * finish_arch_switch() releases the runqueue lock with spin_unlock_irq().
 */
#define prepare_arch_schedule(prev) do { } while (0)
#define finish_arch_schedule(prev) do { } while (0)
#define prepare_arch_switch(rq) do { } while (0)
#define finish_arch_switch(rq) spin_unlock_irq(&(rq)->lock)
#define switch_to(prev,next),last do { \ #define switch_to(prev,next),last do { \
if (prev == next) \ if (prev == next) \
break; \ break; \
......
...@@ -140,13 +140,17 @@ extern void __flushw_user(void); ...@@ -140,13 +140,17 @@ extern void __flushw_user(void);
#define flush_user_windows flushw_user #define flush_user_windows flushw_user
#define flush_register_windows flushw_all #define flush_register_windows flushw_all
#define prepare_arch_schedule(prev) task_lock(prev) #define prepare_arch_switch(rq, next) \
#define finish_arch_schedule(prev) task_unlock(prev) do { spin_lock(&(next)->switch_lock); \
#define prepare_arch_switch(rq) \ spin_unlock(&(rq)->lock); \
do { spin_unlock(&(rq)->lock); \ flushw_all(); \
flushw_all(); \
} while (0) } while (0)
#define finish_arch_switch(rq) local_irq_enable()
#define finish_arch_switch(rq, prev) \
do { spin_unlock_irq(&(prev)->switch_lock); \
} while (0)
#ifndef CONFIG_DEBUG_SPINLOCK #ifndef CONFIG_DEBUG_SPINLOCK
#define CHECK_LOCKS(PREV) do { } while(0) #define CHECK_LOCKS(PREV) do { } while(0)
......
...@@ -13,11 +13,6 @@ ...@@ -13,11 +13,6 @@
#define LOCK_PREFIX "" #define LOCK_PREFIX ""
#endif #endif
/*
 * Scheduler hook macros: the two schedule-level hooks and
 * prepare_arch_switch() expand to an empty statement, while
 * finish_arch_switch() releases the runqueue lock with spin_unlock_irq().
 */
#define prepare_arch_schedule(prev) do { } while(0)
#define finish_arch_schedule(prev) do { } while(0)
#define prepare_arch_switch(rq) do { } while(0)
#define finish_arch_switch(rq) spin_unlock_irq(&(rq)->lock)
#define __STR(x) #x #define __STR(x) #x
#define STR(x) __STR(x) #define STR(x) __STR(x)
......
...@@ -47,7 +47,7 @@ ...@@ -47,7 +47,7 @@
lock_depth: -1, \ lock_depth: -1, \
prio: MAX_PRIO-20, \ prio: MAX_PRIO-20, \
static_prio: MAX_PRIO-20, \ static_prio: MAX_PRIO-20, \
policy: SCHED_OTHER, \ policy: SCHED_NORMAL, \
cpus_allowed: -1, \ cpus_allowed: -1, \
mm: NULL, \ mm: NULL, \
active_mm: &init_mm, \ active_mm: &init_mm, \
...@@ -78,6 +78,7 @@ ...@@ -78,6 +78,7 @@
pending: { NULL, &tsk.pending.head, {{0}}}, \ pending: { NULL, &tsk.pending.head, {{0}}}, \
blocked: {{0}}, \ blocked: {{0}}, \
alloc_lock: SPIN_LOCK_UNLOCKED, \ alloc_lock: SPIN_LOCK_UNLOCKED, \
switch_lock: SPIN_LOCK_UNLOCKED, \
journal_info: NULL, \ journal_info: NULL, \
} }
......
...@@ -116,7 +116,7 @@ extern unsigned long nr_uninterruptible(void); ...@@ -116,7 +116,7 @@ extern unsigned long nr_uninterruptible(void);
/* /*
* Scheduling policies * Scheduling policies
*/ */
#define SCHED_OTHER 0 #define SCHED_NORMAL 0
#define SCHED_FIFO 1 #define SCHED_FIFO 1
#define SCHED_RR 2 #define SCHED_RR 2
...@@ -207,7 +207,7 @@ struct signal_struct { ...@@ -207,7 +207,7 @@ struct signal_struct {
/* /*
* Priority of a process goes from 0..MAX_PRIO-1, valid RT * Priority of a process goes from 0..MAX_PRIO-1, valid RT
* priority is 0..MAX_RT_PRIO-1, and SCHED_OTHER tasks are * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL tasks are
* in the range MAX_RT_PRIO..MAX_PRIO-1. Priority values * in the range MAX_RT_PRIO..MAX_PRIO-1. Priority values
* are inverted: lower p->prio value means higher priority. * are inverted: lower p->prio value means higher priority.
* *
...@@ -264,7 +264,7 @@ struct task_struct { ...@@ -264,7 +264,7 @@ struct task_struct {
unsigned long policy; unsigned long policy;
unsigned long cpus_allowed; unsigned long cpus_allowed;
unsigned int time_slice; unsigned int time_slice, first_time_slice;
struct list_head tasks; struct list_head tasks;
...@@ -361,6 +361,8 @@ struct task_struct { ...@@ -361,6 +361,8 @@ struct task_struct {
u32 self_exec_id; u32 self_exec_id;
/* Protection of (de-)allocation: mm, files, fs, tty */ /* Protection of (de-)allocation: mm, files, fs, tty */
spinlock_t alloc_lock; spinlock_t alloc_lock;
/* context-switch lock */
spinlock_t switch_lock;
/* journalling filesystem info */ /* journalling filesystem info */
void *journal_info; void *journal_info;
......
...@@ -184,7 +184,7 @@ void reparent_to_init(void) ...@@ -184,7 +184,7 @@ void reparent_to_init(void)
current->exit_signal = SIGCHLD; current->exit_signal = SIGCHLD;
current->ptrace = 0; current->ptrace = 0;
if ((current->policy == SCHED_OTHER) && (task_nice(current) < 0)) if ((current->policy == SCHED_NORMAL) && (task_nice(current) < 0))
set_user_nice(current, 0); set_user_nice(current, 0);
/* cpus_allowed? */ /* cpus_allowed? */
/* rt_priority? */ /* rt_priority? */
......
...@@ -611,7 +611,6 @@ struct task_struct *do_fork(unsigned long clone_flags, ...@@ -611,7 +611,6 @@ struct task_struct *do_fork(unsigned long clone_flags,
unsigned long stack_size) unsigned long stack_size)
{ {
int retval; int retval;
unsigned long flags;
struct task_struct *p = NULL; struct task_struct *p = NULL;
struct completion vfork; struct completion vfork;
...@@ -675,6 +674,7 @@ struct task_struct *do_fork(unsigned long clone_flags, ...@@ -675,6 +674,7 @@ struct task_struct *do_fork(unsigned long clone_flags,
init_completion(&vfork); init_completion(&vfork);
} }
spin_lock_init(&p->alloc_lock); spin_lock_init(&p->alloc_lock);
spin_lock_init(&p->switch_lock);
clear_tsk_thread_flag(p,TIF_SIGPENDING); clear_tsk_thread_flag(p,TIF_SIGPENDING);
init_sigpending(&p->pending); init_sigpending(&p->pending);
...@@ -740,8 +740,13 @@ struct task_struct *do_fork(unsigned long clone_flags, ...@@ -740,8 +740,13 @@ struct task_struct *do_fork(unsigned long clone_flags,
* total amount of pending timeslices in the system doesnt change, * total amount of pending timeslices in the system doesnt change,
* resulting in more scheduling fairness. * resulting in more scheduling fairness.
*/ */
local_irq_save(flags); local_irq_disable();
p->time_slice = (current->time_slice + 1) >> 1; p->time_slice = (current->time_slice + 1) >> 1;
/*
* The remainder of the first timeslice might be recovered by
* the parent if the child exits early enough.
*/
p->first_time_slice = 1;
current->time_slice >>= 1; current->time_slice >>= 1;
p->sleep_timestamp = jiffies; p->sleep_timestamp = jiffies;
if (!current->time_slice) { if (!current->time_slice) {
...@@ -753,11 +758,10 @@ struct task_struct *do_fork(unsigned long clone_flags, ...@@ -753,11 +758,10 @@ struct task_struct *do_fork(unsigned long clone_flags,
current->time_slice = 1; current->time_slice = 1;
preempt_disable(); preempt_disable();
scheduler_tick(0, 0); scheduler_tick(0, 0);
local_irq_restore(flags); local_irq_enable();
preempt_enable(); preempt_enable();
} else } else
local_irq_restore(flags); local_irq_enable();
/* /*
* Ok, add it to the run-queues and make it * Ok, add it to the run-queues and make it
* visible to the rest of the system. * visible to the rest of the system.
......
...@@ -190,16 +190,19 @@ int request_module(const char * module_name) ...@@ -190,16 +190,19 @@ int request_module(const char * module_name)
pid_t pid; pid_t pid;
int waitpid_result; int waitpid_result;
sigset_t tmpsig; sigset_t tmpsig;
int i; int i, ret;
static atomic_t kmod_concurrent = ATOMIC_INIT(0); static atomic_t kmod_concurrent = ATOMIC_INIT(0);
#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ #define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
static int kmod_loop_msg; static int kmod_loop_msg;
unsigned long saved_policy = current->policy;
current->policy = SCHED_NORMAL;
/* Don't allow request_module() before the root fs is mounted! */ /* Don't allow request_module() before the root fs is mounted! */
if ( ! current->fs->root ) { if ( ! current->fs->root ) {
printk(KERN_ERR "request_module[%s]: Root fs not mounted\n", printk(KERN_ERR "request_module[%s]: Root fs not mounted\n",
module_name); module_name);
return -EPERM; ret = -EPERM;
goto out;
} }
/* If modprobe needs a service that is in a module, we get a recursive /* If modprobe needs a service that is in a module, we get a recursive
...@@ -220,14 +223,16 @@ int request_module(const char * module_name) ...@@ -220,14 +223,16 @@ int request_module(const char * module_name)
printk(KERN_ERR printk(KERN_ERR
"kmod: runaway modprobe loop assumed and stopped\n"); "kmod: runaway modprobe loop assumed and stopped\n");
atomic_dec(&kmod_concurrent); atomic_dec(&kmod_concurrent);
return -ENOMEM; ret = -ENOMEM;
goto out;
} }
pid = kernel_thread(exec_modprobe, (void*) module_name, 0); pid = kernel_thread(exec_modprobe, (void*) module_name, 0);
if (pid < 0) { if (pid < 0) {
printk(KERN_ERR "request_module[%s]: fork failed, errno %d\n", module_name, -pid); printk(KERN_ERR "request_module[%s]: fork failed, errno %d\n", module_name, -pid);
atomic_dec(&kmod_concurrent); atomic_dec(&kmod_concurrent);
return pid; ret = pid;
goto out;
} }
/* Block everything but SIGKILL/SIGSTOP */ /* Block everything but SIGKILL/SIGSTOP */
...@@ -250,7 +255,10 @@ int request_module(const char * module_name) ...@@ -250,7 +255,10 @@ int request_module(const char * module_name)
printk(KERN_ERR "request_module[%s]: waitpid(%d,...) failed, errno %d\n", printk(KERN_ERR "request_module[%s]: waitpid(%d,...) failed, errno %d\n",
module_name, pid, -waitpid_result); module_name, pid, -waitpid_result);
} }
return 0; ret = 0;
out:
current->policy = saved_policy;
return ret;
} }
#endif /* CONFIG_KMOD */ #endif /* CONFIG_KMOD */
......
...@@ -12,8 +12,8 @@ ...@@ -12,8 +12,8 @@
* 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
* hybrid priority-list and round-robin design with * hybrid priority-list and round-robin design with
* an array-switch method of distributing timeslices * an array-switch method of distributing timeslices
* and per-CPU runqueues. Additional code by Davide * and per-CPU runqueues. Cleanups and useful suggestions
* Libenzi, Robert Love, and Rusty Russell. * by Davide Libenzi, preemptible kernel bits by Robert Love.
*/ */
#include <linux/mm.h> #include <linux/mm.h>
...@@ -102,16 +102,23 @@ ...@@ -102,16 +102,23 @@
((p)->prio <= (p)->static_prio - DELTA(p)) ((p)->prio <= (p)->static_prio - DELTA(p))
/* /*
* TASK_TIMESLICE scales user-nice values [ -20 ... 19 ] * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ]
* to time slice values. * to time slice values.
* *
* The higher a process's priority, the bigger timeslices * The higher a process's priority, the bigger timeslices
* it gets during one round of execution. But even the lowest * it gets during one round of execution. But even the lowest
* priority process gets MIN_TIMESLICE worth of execution time. * priority process gets MIN_TIMESLICE worth of execution time.
*
* task_timeslice() is the interface that is used by the scheduler.
*/ */
#define TASK_TIMESLICE(p) (MIN_TIMESLICE + \ #define BASE_TIMESLICE(p) (MIN_TIMESLICE + \
((MAX_TIMESLICE - MIN_TIMESLICE) * (MAX_PRIO-1-(p)->static_prio)/39)) ((MAX_TIMESLICE - MIN_TIMESLICE) * (MAX_PRIO-1-(p)->static_prio)/(MAX_USER_PRIO - 1)))
/*
 * task_timeslice - timeslice handed to task p.
 *
 * Currently this is simply BASE_TIMESLICE(p), i.e. it is derived from
 * p->static_prio only.
 */
static inline unsigned int task_timeslice(task_t *p)
{
return BASE_TIMESLICE(p);
}
/* /*
* These are the runqueue data structures: * These are the runqueue data structures:
...@@ -136,13 +143,15 @@ struct prio_array { ...@@ -136,13 +143,15 @@ struct prio_array {
*/ */
struct runqueue { struct runqueue {
spinlock_t lock; spinlock_t lock;
unsigned long nr_running, nr_switches, expired_timestamp; unsigned long nr_running, nr_switches, expired_timestamp,
signed long nr_uninterruptible; nr_uninterruptible;
task_t *curr, *idle; task_t *curr, *idle;
prio_array_t *active, *expired, arrays[2]; prio_array_t *active, *expired, arrays[2];
int prev_nr_running[NR_CPUS]; int prev_nr_running[NR_CPUS];
task_t *migration_thread; task_t *migration_thread;
list_t migration_queue; list_t migration_queue;
} ____cacheline_aligned; } ____cacheline_aligned;
static struct runqueue runqueues[NR_CPUS] __cacheline_aligned; static struct runqueue runqueues[NR_CPUS] __cacheline_aligned;
...@@ -153,6 +162,15 @@ static struct runqueue runqueues[NR_CPUS] __cacheline_aligned; ...@@ -153,6 +162,15 @@ static struct runqueue runqueues[NR_CPUS] __cacheline_aligned;
#define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define cpu_curr(cpu) (cpu_rq(cpu)->curr)
#define rt_task(p) ((p)->prio < MAX_RT_PRIO) #define rt_task(p) ((p)->prio < MAX_RT_PRIO)
/*
 * Default context-switch locking.
 *
 * An architecture may supply its own prepare_arch_switch() /
 * finish_arch_switch() pair; otherwise the defaults below are used:
 * nothing is done before the switch, and the runqueue lock is released
 * with spin_unlock_irq() after it.  The second argument of
 * finish_arch_switch() is the task that was switched away from (callers
 * pass 'prev'); the default implementation ignores it.
 *
 * task_running() has its own guard so that an architecture which
 * overrides the switch hooks without defining task_running() still
 * gets a definition instead of a build failure.
 * NOTE(review): an architecture whose hooks drop the runqueue lock
 * before the switch has completed probably wants a stricter
 * task_running() of its own - confirm per architecture.
 */
#ifndef prepare_arch_switch
# define prepare_arch_switch(rq, next) do { } while(0)
# define finish_arch_switch(rq, prev) spin_unlock_irq(&(rq)->lock)
#endif
#ifndef task_running
# define task_running(rq, p) ((rq)->curr == (p))
#endif
/* /*
* task_rq_lock - lock the runqueue a given task resides on and disable * task_rq_lock - lock the runqueue a given task resides on and disable
* interrupts. Note the ordering: we can safely lookup the task_rq without * interrupts. Note the ordering: we can safely lookup the task_rq without
...@@ -307,7 +325,7 @@ void wait_task_inactive(task_t * p) ...@@ -307,7 +325,7 @@ void wait_task_inactive(task_t * p)
repeat: repeat:
preempt_disable(); preempt_disable();
rq = task_rq(p); rq = task_rq(p);
if (unlikely(rq->curr == p)) { if (unlikely(task_running(rq, p))) {
cpu_relax(); cpu_relax();
/* /*
* enable/disable preemption just to make this * enable/disable preemption just to make this
...@@ -318,7 +336,7 @@ void wait_task_inactive(task_t * p) ...@@ -318,7 +336,7 @@ void wait_task_inactive(task_t * p)
goto repeat; goto repeat;
} }
rq = task_rq_lock(p, &flags); rq = task_rq_lock(p, &flags);
if (unlikely(rq->curr == p)) { if (unlikely(task_running(rq, p))) {
task_rq_unlock(rq, &flags); task_rq_unlock(rq, &flags);
preempt_enable(); preempt_enable();
goto repeat; goto repeat;
...@@ -326,6 +344,7 @@ void wait_task_inactive(task_t * p) ...@@ -326,6 +344,7 @@ void wait_task_inactive(task_t * p)
task_rq_unlock(rq, &flags); task_rq_unlock(rq, &flags);
preempt_enable(); preempt_enable();
} }
#endif
/* /*
* Kick the remote CPU if the task is running currently, * Kick the remote CPU if the task is running currently,
...@@ -338,10 +357,9 @@ void wait_task_inactive(task_t * p) ...@@ -338,10 +357,9 @@ void wait_task_inactive(task_t * p)
*/ */
void kick_if_running(task_t * p) void kick_if_running(task_t * p)
{ {
if (p == task_rq(p)->curr) if ((task_running(task_rq(p), p)) && (p->thread_info->cpu != smp_processor_id()))
resched_task(p); resched_task(p);
} }
#endif
/* /*
* Wake up a process. Put it on the run-queue if it's not * Wake up a process. Put it on the run-queue if it's not
...@@ -350,6 +368,8 @@ void kick_if_running(task_t * p) ...@@ -350,6 +368,8 @@ void kick_if_running(task_t * p)
* progress), and as such you're allowed to do the simpler * progress), and as such you're allowed to do the simpler
* "current->state = TASK_RUNNING" to mark yourself runnable * "current->state = TASK_RUNNING" to mark yourself runnable
* without the overhead of this. * without the overhead of this.
*
* returns failure only if the task is already active.
*/ */
static int try_to_wake_up(task_t * p, int sync) static int try_to_wake_up(task_t * p, int sync)
{ {
...@@ -366,7 +386,7 @@ static int try_to_wake_up(task_t * p, int sync) ...@@ -366,7 +386,7 @@ static int try_to_wake_up(task_t * p, int sync)
* Fast-migrate the task if it's not running or runnable * Fast-migrate the task if it's not running or runnable
* currently. Do not violate hard affinity. * currently. Do not violate hard affinity.
*/ */
if (unlikely(sync && (rq->curr != p) && if (unlikely(sync && !task_running(rq, p) &&
(task_cpu(p) != smp_processor_id()) && (task_cpu(p) != smp_processor_id()) &&
(p->cpus_allowed & (1UL << smp_processor_id())))) { (p->cpus_allowed & (1UL << smp_processor_id())))) {
...@@ -377,9 +397,7 @@ static int try_to_wake_up(task_t * p, int sync) ...@@ -377,9 +397,7 @@ static int try_to_wake_up(task_t * p, int sync)
if (old_state == TASK_UNINTERRUPTIBLE) if (old_state == TASK_UNINTERRUPTIBLE)
rq->nr_uninterruptible--; rq->nr_uninterruptible--;
activate_task(p, rq); activate_task(p, rq);
/*
* If sync is set, a resched_task() is a NOOP
*/
if (p->prio < rq->curr->prio) if (p->prio < rq->curr->prio)
resched_task(rq->curr); resched_task(rq->curr);
success = 1; success = 1;
...@@ -428,9 +446,11 @@ void wake_up_forked_process(task_t * p) ...@@ -428,9 +446,11 @@ void wake_up_forked_process(task_t * p)
void sched_exit(task_t * p) void sched_exit(task_t * p)
{ {
local_irq_disable(); local_irq_disable();
current->time_slice += p->time_slice; if (p->first_time_slice) {
if (unlikely(current->time_slice > MAX_TIMESLICE)) current->time_slice += p->time_slice;
current->time_slice = MAX_TIMESLICE; if (unlikely(current->time_slice > MAX_TIMESLICE))
current->time_slice = MAX_TIMESLICE;
}
local_irq_enable(); local_irq_enable();
/* /*
* If the child was a (relative-) CPU hog then decrease * If the child was a (relative-) CPU hog then decrease
...@@ -444,8 +464,7 @@ void sched_exit(task_t * p) ...@@ -444,8 +464,7 @@ void sched_exit(task_t * p)
#if CONFIG_SMP || CONFIG_PREEMPT #if CONFIG_SMP || CONFIG_PREEMPT
asmlinkage void schedule_tail(task_t *prev) asmlinkage void schedule_tail(task_t *prev)
{ {
finish_arch_switch(this_rq()); finish_arch_switch(this_rq(), prev);
finish_arch_schedule(prev);
} }
#endif #endif
...@@ -502,7 +521,42 @@ unsigned long nr_context_switches(void) ...@@ -502,7 +521,42 @@ unsigned long nr_context_switches(void)
return sum; return sum;
} }
/*
 * double_rq_lock - take the locks of two runqueues
 *
 * When both arguments name the same runqueue its lock is taken once;
 * otherwise the lock of the runqueue at the lower address is taken
 * first.  Interrupts are not touched here (unlike task_rq_lock), so
 * the caller has to disable them before calling.
 */
static inline void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
{
	runqueue_t *first, *second;

	if (rq1 == rq2) {
		spin_lock(&rq1->lock);
		return;
	}

	/* Order the two locks by runqueue address. */
	first = (rq1 < rq2) ? rq1 : rq2;
	second = (rq1 < rq2) ? rq2 : rq1;

	spin_lock(&first->lock);
	spin_lock(&second->lock);
}
/*
 * double_rq_unlock - drop the locks of two runqueues
 *
 * rq1's lock is always released; rq2's lock is released as well unless
 * both arguments name the same runqueue.  Interrupts are not restored
 * here (unlike task_rq_unlock), so the caller has to do that afterwards.
 */
static inline void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
{
	spin_unlock(&rq1->lock);

	/* Same runqueue passed twice: there is only one lock to drop. */
	if (rq1 == rq2)
		return;

	spin_unlock(&rq2->lock);
}
#if CONFIG_SMP #if CONFIG_SMP
/* /*
* Lock the busiest runqueue as well, this_rq is locked already. * Lock the busiest runqueue as well, this_rq is locked already.
* Recalculate nr_running if we have to drop the runqueue lock. * Recalculate nr_running if we have to drop the runqueue lock.
...@@ -526,22 +580,10 @@ static inline unsigned int double_lock_balance(runqueue_t *this_rq, ...@@ -526,22 +580,10 @@ static inline unsigned int double_lock_balance(runqueue_t *this_rq,
return nr_running; return nr_running;
} }
/* static inline runqueue_t *find_busiest_queue(runqueue_t *this_rq, int this_cpu, int idle, int *imbalance)
* Current runqueue is empty, or rebalance tick: if there is an
* inbalance (current runqueue is too short) then pull from
* busiest runqueue(s).
*
* We call this with the current runqueue locked,
* irqs disabled.
*/
static void load_balance(runqueue_t *this_rq, int idle)
{ {
int imbalance, nr_running, load, max_load, int nr_running, load, max_load, i;
idx, i, this_cpu = smp_processor_id();
task_t *next = this_rq->idle, *tmp;
runqueue_t *busiest, *rq_src; runqueue_t *busiest, *rq_src;
prio_array_t *array;
list_t *head, *curr;
/* /*
* We search all runqueues to find the most busy one. * We search all runqueues to find the most busy one.
...@@ -590,21 +632,67 @@ static void load_balance(runqueue_t *this_rq, int idle) ...@@ -590,21 +632,67 @@ static void load_balance(runqueue_t *this_rq, int idle)
} }
if (likely(!busiest)) if (likely(!busiest))
return; goto out;
imbalance = (max_load - nr_running) / 2; *imbalance = (max_load - nr_running) / 2;
/* It needs an at least ~25% imbalance to trigger balancing. */ /* It needs an at least ~25% imbalance to trigger balancing. */
if (!idle && (imbalance < (max_load + 3)/4)) if (!idle && (*imbalance < (max_load + 3)/4)) {
return; busiest = NULL;
goto out;
}
nr_running = double_lock_balance(this_rq, busiest, this_cpu, idle, nr_running); nr_running = double_lock_balance(this_rq, busiest, this_cpu, idle, nr_running);
/* /*
* Make sure nothing changed since we checked the * Make sure nothing changed since we checked the
* runqueue length. * runqueue length.
*/ */
if (busiest->nr_running <= nr_running + 1) if (busiest->nr_running <= nr_running + 1) {
goto out_unlock; spin_unlock(&busiest->lock);
busiest = NULL;
}
out:
return busiest;
}
/*
 * Move a task from a remote runqueue to the local runqueue.
 * Both runqueues must be locked.
 *
 * src_rq:    runqueue the task is taken from (its nr_running is decremented)
 * src_array: priority array that p is dequeued from
 * p:         the task being moved
 * this_rq:   destination runqueue; p is enqueued on its active array and
 *            its nr_running is incremented
 * this_cpu:  CPU number recorded in p via set_task_cpu()
 *
 * If p ends up with a numerically lower prio than the task currently
 * running on this_rq, a reschedule is requested.
 */
static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, runqueue_t *this_rq, int this_cpu)
{
dequeue_task(p, src_array);
src_rq->nr_running--;
set_task_cpu(p, this_cpu);
this_rq->nr_running++;
enqueue_task(p, this_rq->active);
/*
* Note that idle threads have a prio of MAX_PRIO, for this test
* to be always true for them.
*/
if (p->prio < this_rq->curr->prio)
set_need_resched();
}
/*
* Current runqueue is empty, or rebalance tick: if there is an
* inbalance (current runqueue is too short) then pull from
* busiest runqueue(s).
*
* We call this with the current runqueue locked,
* irqs disabled.
*/
static void load_balance(runqueue_t *this_rq, int idle)
{
int imbalance, idx, this_cpu = smp_processor_id();
runqueue_t *busiest;
prio_array_t *array;
list_t *head, *curr;
task_t *tmp;
busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance);
if (!busiest)
goto out;
/* /*
* We first consider expired tasks. Those will likely not be * We first consider expired tasks. Those will likely not be
...@@ -647,36 +735,28 @@ static void load_balance(runqueue_t *this_rq, int idle) ...@@ -647,36 +735,28 @@ static void load_balance(runqueue_t *this_rq, int idle)
#define CAN_MIGRATE_TASK(p,rq,this_cpu) \ #define CAN_MIGRATE_TASK(p,rq,this_cpu) \
((jiffies - (p)->sleep_timestamp > cache_decay_ticks) && \ ((jiffies - (p)->sleep_timestamp > cache_decay_ticks) && \
((p) != (rq)->curr) && \ !task_running(rq, p) && \
((p)->cpus_allowed & (1UL << (this_cpu)))) ((p)->cpus_allowed & (1UL << (this_cpu))))
curr = curr->prev;
if (!CAN_MIGRATE_TASK(tmp, busiest, this_cpu)) { if (!CAN_MIGRATE_TASK(tmp, busiest, this_cpu)) {
curr = curr->next;
if (curr != head) if (curr != head)
goto skip_queue; goto skip_queue;
idx++; idx++;
goto skip_bitmap; goto skip_bitmap;
} }
next = tmp; pull_task(busiest, array, tmp, this_rq, this_cpu);
/*
* take the task out of the other runqueue and
* put it into this one:
*/
dequeue_task(next, array);
busiest->nr_running--;
set_task_cpu(next, this_cpu);
this_rq->nr_running++;
enqueue_task(next, this_rq->active);
if (next->prio < current->prio)
set_need_resched();
if (!idle && --imbalance) { if (!idle && --imbalance) {
if (array == busiest->expired) { if (curr != head)
array = busiest->active; goto skip_queue;
goto new_array; idx++;
} goto skip_bitmap;
} }
out_unlock: out_unlock:
spin_unlock(&busiest->lock); spin_unlock(&busiest->lock);
out:
;
} }
/* /*
...@@ -691,13 +771,13 @@ static void load_balance(runqueue_t *this_rq, int idle) ...@@ -691,13 +771,13 @@ static void load_balance(runqueue_t *this_rq, int idle)
#define BUSY_REBALANCE_TICK (HZ/4 ?: 1) #define BUSY_REBALANCE_TICK (HZ/4 ?: 1)
#define IDLE_REBALANCE_TICK (HZ/1000 ?: 1) #define IDLE_REBALANCE_TICK (HZ/1000 ?: 1)
static inline void idle_tick(void) static inline void idle_tick(runqueue_t *rq)
{ {
if (jiffies % IDLE_REBALANCE_TICK) if (jiffies % IDLE_REBALANCE_TICK)
return; return;
spin_lock(&this_rq()->lock); spin_lock(&rq->lock);
load_balance(this_rq(), 1); load_balance(rq, 1);
spin_unlock(&this_rq()->lock); spin_unlock(&rq->lock);
} }
#endif #endif
...@@ -720,7 +800,7 @@ static inline void idle_tick(void) ...@@ -720,7 +800,7 @@ static inline void idle_tick(void)
* This function gets called by the timer code, with HZ frequency. * This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled. * We call it with interrupts disabled.
*/ */
void scheduler_tick(int user_tick, int system) void scheduler_tick(int user_ticks, int sys_ticks)
{ {
int cpu = smp_processor_id(); int cpu = smp_processor_id();
runqueue_t *rq = this_rq(); runqueue_t *rq = this_rq();
...@@ -728,18 +808,18 @@ void scheduler_tick(int user_tick, int system) ...@@ -728,18 +808,18 @@ void scheduler_tick(int user_tick, int system)
if (p == rq->idle) { if (p == rq->idle) {
/* note: this timer irq context must be accounted for as well */ /* note: this timer irq context must be accounted for as well */
if (irq_count() >= 2*HARDIRQ_OFFSET) if (irq_count() - HARDIRQ_OFFSET >= SOFTIRQ_OFFSET)
kstat.per_cpu_system[cpu] += system; kstat.per_cpu_system[cpu] += sys_ticks;
#if CONFIG_SMP #if CONFIG_SMP
idle_tick(); idle_tick(rq);
#endif #endif
return; return;
} }
if (TASK_NICE(p) > 0) if (TASK_NICE(p) > 0)
kstat.per_cpu_nice[cpu] += user_tick; kstat.per_cpu_nice[cpu] += user_ticks;
else else
kstat.per_cpu_user[cpu] += user_tick; kstat.per_cpu_user[cpu] += user_ticks;
kstat.per_cpu_system[cpu] += system; kstat.per_cpu_system[cpu] += sys_ticks;
/* Task might have expired already, but not scheduled off yet */ /* Task might have expired already, but not scheduled off yet */
if (p->array != rq->active) { if (p->array != rq->active) {
...@@ -753,7 +833,8 @@ void scheduler_tick(int user_tick, int system) ...@@ -753,7 +833,8 @@ void scheduler_tick(int user_tick, int system)
* FIFO tasks have no timeslices. * FIFO tasks have no timeslices.
*/ */
if ((p->policy == SCHED_RR) && !--p->time_slice) { if ((p->policy == SCHED_RR) && !--p->time_slice) {
p->time_slice = TASK_TIMESLICE(p); p->time_slice = task_timeslice(p);
p->first_time_slice = 0;
set_tsk_need_resched(p); set_tsk_need_resched(p);
/* put it at the end of the queue: */ /* put it at the end of the queue: */
...@@ -776,7 +857,8 @@ void scheduler_tick(int user_tick, int system) ...@@ -776,7 +857,8 @@ void scheduler_tick(int user_tick, int system)
dequeue_task(p, rq->active); dequeue_task(p, rq->active);
set_tsk_need_resched(p); set_tsk_need_resched(p);
p->prio = effective_prio(p); p->prio = effective_prio(p);
p->time_slice = TASK_TIMESLICE(p); p->time_slice = task_timeslice(p);
p->first_time_slice = 0;
if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
if (!rq->expired_timestamp) if (!rq->expired_timestamp)
...@@ -818,7 +900,6 @@ asmlinkage void schedule(void) ...@@ -818,7 +900,6 @@ asmlinkage void schedule(void)
rq = this_rq(); rq = this_rq();
release_kernel_lock(prev); release_kernel_lock(prev);
prepare_arch_schedule(prev);
prev->sleep_timestamp = jiffies; prev->sleep_timestamp = jiffies;
spin_lock_irq(&rq->lock); spin_lock_irq(&rq->lock);
...@@ -875,14 +956,13 @@ asmlinkage void schedule(void) ...@@ -875,14 +956,13 @@ asmlinkage void schedule(void)
rq->nr_switches++; rq->nr_switches++;
rq->curr = next; rq->curr = next;
prepare_arch_switch(rq); prepare_arch_switch(rq, next);
prev = context_switch(prev, next); prev = context_switch(prev, next);
barrier(); barrier();
rq = this_rq(); rq = this_rq();
finish_arch_switch(rq); finish_arch_switch(rq, prev);
} else } else
spin_unlock_irq(&rq->lock); spin_unlock_irq(&rq->lock);
finish_arch_schedule(prev);
reacquire_kernel_lock(current); reacquire_kernel_lock(current);
preempt_enable_no_resched(); preempt_enable_no_resched();
...@@ -1114,7 +1194,8 @@ void set_user_nice(task_t *p, long nice) ...@@ -1114,7 +1194,8 @@ void set_user_nice(task_t *p, long nice)
* If the task is running and lowered its priority, * If the task is running and lowered its priority,
* or increased its priority then reschedule its CPU: * or increased its priority then reschedule its CPU:
*/ */
if ((NICE_TO_PRIO(nice) < p->static_prio) || (p == rq->curr)) if ((NICE_TO_PRIO(nice) < p->static_prio) ||
task_running(rq, p))
resched_task(rq->curr); resched_task(rq->curr);
} }
out_unlock: out_unlock:
...@@ -1228,18 +1309,18 @@ static int setscheduler(pid_t pid, int policy, struct sched_param *param) ...@@ -1228,18 +1309,18 @@ static int setscheduler(pid_t pid, int policy, struct sched_param *param)
else { else {
retval = -EINVAL; retval = -EINVAL;
if (policy != SCHED_FIFO && policy != SCHED_RR && if (policy != SCHED_FIFO && policy != SCHED_RR &&
policy != SCHED_OTHER) policy != SCHED_NORMAL)
goto out_unlock; goto out_unlock;
} }
/* /*
* Valid priorities for SCHED_FIFO and SCHED_RR are * Valid priorities for SCHED_FIFO and SCHED_RR are
* 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_OTHER is 0. * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
*/ */
retval = -EINVAL; retval = -EINVAL;
if (lp.sched_priority < 0 || lp.sched_priority > MAX_USER_RT_PRIO-1) if (lp.sched_priority < 0 || lp.sched_priority > MAX_USER_RT_PRIO-1)
goto out_unlock; goto out_unlock;
if ((policy == SCHED_OTHER) != (lp.sched_priority == 0)) if ((policy == SCHED_NORMAL) != (lp.sched_priority == 0))
goto out_unlock; goto out_unlock;
retval = -EPERM; retval = -EPERM;
...@@ -1260,7 +1341,7 @@ static int setscheduler(pid_t pid, int policy, struct sched_param *param) ...@@ -1260,7 +1341,7 @@ static int setscheduler(pid_t pid, int policy, struct sched_param *param)
retval = 0; retval = 0;
p->policy = policy; p->policy = policy;
p->rt_priority = lp.sched_priority; p->rt_priority = lp.sched_priority;
if (policy != SCHED_OTHER) if (policy != SCHED_NORMAL)
p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority; p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority;
else else
p->prio = p->static_prio; p->prio = p->static_prio;
...@@ -1439,39 +1520,18 @@ asmlinkage long sys_sched_yield(void) ...@@ -1439,39 +1520,18 @@ asmlinkage long sys_sched_yield(void)
prio_array_t *array = current->array; prio_array_t *array = current->array;
/* /*
* There are three levels of how a yielding task will give up * We implement yielding by moving the task into the expired
* the current CPU: * queue.
* *
* #1 - it decreases its priority by one. This priority loss is * (special rule: RT tasks will just roundrobin in the active
* temporary, it's recovered once the current timeslice * array.)
* expires.
*
* #2 - once it has reached the lowest priority level,
* it will give up timeslices one by one. (We do not
* want to give them up all at once, it's gradual,
* to protect the casual yield()er.)
*
* #3 - once all timeslices are gone we put the process into
* the expired array.
*
* (special rule: RT tasks do not lose any priority, they just
* roundrobin on their current priority level.)
*/ */
if (likely(current->prio == MAX_PRIO-1)) { if (likely(!rt_task(current))) {
if (current->time_slice <= 1) { dequeue_task(current, array);
dequeue_task(current, rq->active); enqueue_task(current, rq->expired);
enqueue_task(current, rq->expired);
} else
current->time_slice--;
} else if (unlikely(rt_task(current))) {
list_move_tail(&current->run_list, array->queue + current->prio);
} else { } else {
list_del(&current->run_list); list_del(&current->run_list);
if (list_empty(array->queue + current->prio))
__clear_bit(current->prio, array->bitmap);
current->prio++;
list_add_tail(&current->run_list, array->queue + current->prio); list_add_tail(&current->run_list, array->queue + current->prio);
__set_bit(current->prio, array->bitmap);
} }
/* /*
* Since we are going to call schedule() anyway, there's * Since we are going to call schedule() anyway, there's
...@@ -1506,7 +1566,7 @@ asmlinkage long sys_sched_get_priority_max(int policy) ...@@ -1506,7 +1566,7 @@ asmlinkage long sys_sched_get_priority_max(int policy)
case SCHED_RR: case SCHED_RR:
ret = MAX_USER_RT_PRIO-1; ret = MAX_USER_RT_PRIO-1;
break; break;
case SCHED_OTHER: case SCHED_NORMAL:
ret = 0; ret = 0;
break; break;
} }
...@@ -1522,7 +1582,7 @@ asmlinkage long sys_sched_get_priority_min(int policy) ...@@ -1522,7 +1582,7 @@ asmlinkage long sys_sched_get_priority_min(int policy)
case SCHED_RR: case SCHED_RR:
ret = 1; ret = 1;
break; break;
case SCHED_OTHER: case SCHED_NORMAL:
ret = 0; ret = 0;
} }
return ret; return ret;
...@@ -1548,7 +1608,7 @@ asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec *interval) ...@@ -1548,7 +1608,7 @@ asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec *interval)
goto out_unlock; goto out_unlock;
jiffies_to_timespec(p->policy & SCHED_FIFO ? jiffies_to_timespec(p->policy & SCHED_FIFO ?
0 : TASK_TIMESLICE(p), &t); 0 : task_timeslice(p), &t);
read_unlock(&tasklist_lock); read_unlock(&tasklist_lock);
retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
out_nounlock: out_nounlock:
...@@ -1652,40 +1712,6 @@ void show_state(void) ...@@ -1652,40 +1712,6 @@ void show_state(void)
read_unlock(&tasklist_lock); read_unlock(&tasklist_lock);
} }
/*
 * double_rq_lock - take the locks of two runqueues
 *
 * When both arguments name the same runqueue its lock is taken once;
 * otherwise the lock of the runqueue at the lower address is taken
 * first.  Interrupts are not touched here (unlike task_rq_lock), so
 * the caller has to disable them before calling.
 */
static inline void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
{
	runqueue_t *first, *second;

	if (rq1 == rq2) {
		spin_lock(&rq1->lock);
		return;
	}

	/* Order the two locks by runqueue address. */
	first = (rq1 < rq2) ? rq1 : rq2;
	second = (rq1 < rq2) ? rq2 : rq1;

	spin_lock(&first->lock);
	spin_lock(&second->lock);
}
/*
 * double_rq_unlock - release the locks of two runqueues
 *
 * rq1's lock is always released; rq2's lock is released afterwards
 * only when it is a different runqueue.
 *
 * Note: unlike task_rq_unlock, interrupts are not restored here; the
 * caller has to do that manually after calling.
 */
static inline void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
{
	spin_unlock(&rq1->lock);

	/* Both arguments are the same runqueue: nothing left to release. */
	if (rq1 == rq2)
		return;

	spin_unlock(&rq2->lock);
}
void __init init_idle(task_t *idle, int cpu) void __init init_idle(task_t *idle, int cpu)
{ {
runqueue_t *idle_rq = cpu_rq(cpu), *rq = cpu_rq(task_cpu(idle)); runqueue_t *idle_rq = cpu_rq(cpu), *rq = cpu_rq(task_cpu(idle));
...@@ -1712,57 +1738,6 @@ void __init init_idle(task_t *idle, int cpu) ...@@ -1712,57 +1738,6 @@ void __init init_idle(task_t *idle, int cpu)
#endif #endif
} }
extern void init_timervecs(void);
extern void timer_bh(void);
extern void tqueue_bh(void);
extern void immediate_bh(void);
/*
 * sched_init - boot-time setup of the scheduler data structures
 *
 * Initializes the runqueue of each of the NR_CPUS CPUs, makes the
 * currently executing task the running and idle task of this CPU's
 * runqueue, and registers the timer-related bottom halves.
 */
void __init sched_init(void)
{
	runqueue_t *rq;
	int cpu, idx, prio;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		prio_array_t *array;

		rq = cpu_rq(cpu);
		/* arrays[0] starts out as the active array, arrays[1] as the expired one. */
		rq->active = &rq->arrays[0];
		rq->expired = &rq->arrays[1];
		spin_lock_init(&rq->lock);
		INIT_LIST_HEAD(&rq->migration_queue);

		for (idx = 0; idx < 2; idx++) {
			array = &rq->arrays[idx];

			/* Every priority level starts with an empty queue and a clear bit. */
			for (prio = 0; prio < MAX_PRIO; prio++) {
				INIT_LIST_HEAD(&array->queue[prio]);
				__clear_bit(prio, array->bitmap);
			}
			/* Bit MAX_PRIO stays set as the delimiter for the bit search. */
			__set_bit(MAX_PRIO, array->bitmap);
		}
	}

	/*
	 * A little magic is needed to get the first process right in
	 * SMP mode: the current task becomes both the running task
	 * and the idle task of this CPU's runqueue.
	 */
	rq = this_rq();
	rq->curr = current;
	rq->idle = current;
	set_task_cpu(current, smp_processor_id());
	wake_up_process(current);

	init_timervecs();
	init_bh(TIMER_BH, timer_bh);
	init_bh(TQUEUE_BH, tqueue_bh);
	init_bh(IMMEDIATE_BH, immediate_bh);

	/*
	 * The boot idle thread does lazy MMU switching as well:
	 */
	atomic_inc(&init_mm.mm_count);
	enter_lazy_tlb(&init_mm, current, smp_processor_id());
}
#if CONFIG_SMP #if CONFIG_SMP
/* /*
...@@ -1821,7 +1796,7 @@ void set_cpus_allowed(task_t *p, unsigned long new_mask) ...@@ -1821,7 +1796,7 @@ void set_cpus_allowed(task_t *p, unsigned long new_mask)
* If the task is not on a runqueue (and not running), then * If the task is not on a runqueue (and not running), then
* it is sufficient to simply update the task's cpu field. * it is sufficient to simply update the task's cpu field.
*/ */
if (!p->array && (p != rq->curr)) { if (!p->array && !task_running(rq, p)) {
set_task_cpu(p, __ffs(p->cpus_allowed)); set_task_cpu(p, __ffs(p->cpus_allowed));
task_rq_unlock(rq, &flags); task_rq_unlock(rq, &flags);
goto out; goto out;
...@@ -1939,3 +1914,55 @@ void __init migration_init(void) ...@@ -1939,3 +1914,55 @@ void __init migration_init(void)
} }
} }
#endif #endif
extern void init_timervecs(void);
extern void timer_bh(void);
extern void tqueue_bh(void);
extern void immediate_bh(void);
/*
 * sched_init - boot-time setup of the scheduler data structures
 *
 * Initializes the runqueue of each of the NR_CPUS CPUs, makes the
 * currently executing task the running and idle task of this CPU's
 * runqueue, and registers the timer-related bottom halves.
 */
void __init sched_init(void)
{
	runqueue_t *rq;
	int cpu, idx, prio;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		prio_array_t *array;

		rq = cpu_rq(cpu);
		/* arrays[0] starts out as the active array, arrays[1] as the expired one. */
		rq->active = &rq->arrays[0];
		rq->expired = &rq->arrays[1];
		spin_lock_init(&rq->lock);
		INIT_LIST_HEAD(&rq->migration_queue);

		for (idx = 0; idx < 2; idx++) {
			array = &rq->arrays[idx];

			/* Every priority level starts with an empty queue and a clear bit. */
			for (prio = 0; prio < MAX_PRIO; prio++) {
				INIT_LIST_HEAD(&array->queue[prio]);
				__clear_bit(prio, array->bitmap);
			}
			/* Bit MAX_PRIO stays set as the delimiter for the bit search. */
			__set_bit(MAX_PRIO, array->bitmap);
		}
	}

	/*
	 * A little magic is needed to get the first process right in
	 * SMP mode: the current task becomes both the running task
	 * and the idle task of this CPU's runqueue.
	 */
	rq = this_rq();
	rq->curr = current;
	rq->idle = current;
	set_task_cpu(current, smp_processor_id());
	wake_up_process(current);

	init_timervecs();
	init_bh(TIMER_BH, timer_bh);
	init_bh(TQUEUE_BH, tqueue_bh);
	init_bh(IMMEDIATE_BH, immediate_bh);

	/*
	 * The boot idle thread does lazy MMU switching as well:
	 */
	atomic_inc(&init_mm.mm_count);
	enter_lazy_tlb(&init_mm, current, smp_processor_id());
}
...@@ -500,7 +500,6 @@ inline void signal_wake_up(struct task_struct *t) ...@@ -500,7 +500,6 @@ inline void signal_wake_up(struct task_struct *t)
{ {
set_tsk_thread_flag(t,TIF_SIGPENDING); set_tsk_thread_flag(t,TIF_SIGPENDING);
#ifdef CONFIG_SMP
/* /*
* If the task is running on a different CPU * If the task is running on a different CPU
* force a reschedule on the other CPU to make * force a reschedule on the other CPU to make
...@@ -511,9 +510,8 @@ inline void signal_wake_up(struct task_struct *t) ...@@ -511,9 +510,8 @@ inline void signal_wake_up(struct task_struct *t)
* process of changing - but no harm is done by that * process of changing - but no harm is done by that
* other than doing an extra (lightweight) IPI interrupt. * other than doing an extra (lightweight) IPI interrupt.
*/ */
if ((t->state == TASK_RUNNING) && (t->thread_info->cpu != smp_processor_id())) if (t->state == TASK_RUNNING)
kick_if_running(t); kick_if_running(t);
#endif
if (t->state & TASK_INTERRUPTIBLE) { if (t->state & TASK_INTERRUPTIBLE) {
wake_up_process(t); wake_up_process(t);
return; return;
......
...@@ -888,7 +888,7 @@ asmlinkage long sys_nanosleep(struct timespec *rqtp, struct timespec *rmtp) ...@@ -888,7 +888,7 @@ asmlinkage long sys_nanosleep(struct timespec *rqtp, struct timespec *rmtp)
if (t.tv_sec == 0 && t.tv_nsec <= 2000000L && if (t.tv_sec == 0 && t.tv_nsec <= 2000000L &&
current->policy != SCHED_OTHER) current->policy != SCHED_NORMAL)
{ {
/* /*
* Short delay requests up to 2 ms will be handled with * Short delay requests up to 2 ms will be handled with
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment