Commit b29739f9 authored by Ingo Molnar, committed by Linus Torvalds

[PATCH] pi-futex: scheduler support for pi

Add framework to boost/unboost the priority of RT tasks.

This consists of:

 - caching the 'normal' priority in ->normal_prio
 - providing functions to set/get the priority of the task
 - making sched_setscheduler() aware of boosting

The effective_prio() cleanups also fix a priority-calculation bug in
set_user_nice(), pointed out by Andrey Gelman.
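
As a rough illustration of the bookkeeping this introduces, here is a minimal
userspace C sketch (the struct and helper names below are invented for the
example; only the MAX_RT_PRIO/MAX_PRIO values and the MAX_RT_PRIO-1 -
rt_priority mapping mirror the kernel): a PI boost changes only ->prio, while
->normal_prio keeps the value to restore on unboost.

#include <stdio.h>

/* Kernel-internal priority scale: 0..99 are RT, 100..139 map to nice -20..19. */
#define MAX_RT_PRIO 100
#define MAX_PRIO    (MAX_RT_PRIO + 40)

struct task {
        int static_prio;   /* from the nice value, never boosted     */
        int normal_prio;   /* cached priority ignoring PI boosting   */
        int prio;          /* effective priority the scheduler uses  */
        int boosted;       /* nonzero while holding a PI boost       */
};

/* Without boosting, the effective priority just follows normal_prio. */
static void update_prio(struct task *t)
{
        t->normal_prio = t->static_prio;   /* no interactivity bonus in this sketch */
        if (!t->boosted)
                t->prio = t->normal_prio;
}

/* A PI boost only touches ->prio; ->normal_prio remembers what to restore. */
static void boost(struct task *t, int rt_priority)
{
        t->boosted = 1;
        t->prio = MAX_RT_PRIO - 1 - rt_priority;
}

static void unboost(struct task *t)
{
        t->boosted = 0;
        t->prio = t->normal_prio;
}

int main(void)
{
        struct task t = { .static_prio = 120 };   /* nice 0 */

        update_prio(&t);
        printf("before boost:  prio=%d normal=%d\n", t.prio, t.normal_prio);
        boost(&t, 50);                            /* boosted to RT priority 50 -> prio 49 */
        printf("boosted:       prio=%d normal=%d\n", t.prio, t.normal_prio);
        unboost(&t);
        printf("after unboost: prio=%d normal=%d\n", t.prio, t.normal_prio);
        return 0;
}

In the patch itself, sched_setscheduler() and set_user_nice() recompute
->normal_prio, and rt_mutex_getprio() decides whether ->prio may drop back
to it.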

has_rt_policy() fix: Peter Williams <pwil3058@bigpond.net.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Andrey Gelman <agelman@012.net.il>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent 77ba89c5
@@ -87,6 +87,7 @@ extern struct group_info init_groups;
         .lock_depth = -1, \
         .prio = MAX_PRIO-20, \
         .static_prio = MAX_PRIO-20, \
+        .normal_prio = MAX_PRIO-20, \
         .policy = SCHED_NORMAL, \
         .cpus_allowed = CPU_MASK_ALL, \
         .mm = NULL, \
@@ -122,6 +123,7 @@ extern struct group_info init_groups;
         .journal_info = NULL, \
         .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \
         .fs_excl = ATOMIC_INIT(0), \
+        .pi_lock = SPIN_LOCK_UNLOCKED, \
 }
......
@@ -495,8 +495,11 @@ struct signal_struct {
 #define MAX_PRIO        (MAX_RT_PRIO + 40)
 
-#define rt_task(p)      (unlikely((p)->prio < MAX_RT_PRIO))
+#define rt_prio(prio)   unlikely((prio) < MAX_RT_PRIO)
+#define rt_task(p)      rt_prio((p)->prio)
 #define batch_task(p)   (unlikely((p)->policy == SCHED_BATCH))
+#define has_rt_policy(p) \
+        unlikely((p)->policy != SCHED_NORMAL && (p)->policy != SCHED_BATCH)
 
 /*
  * Some day this will be a full-fledged user tracking system..
@@ -725,7 +728,7 @@ struct task_struct {
 #endif
 #endif
         int load_weight;        /* for niceness load balancing purposes */
-        int prio, static_prio;
+        int prio, static_prio, normal_prio;
         struct list_head run_list;
         prio_array_t *array;
@@ -852,6 +855,9 @@ struct task_struct {
         /* Protection of (de-)allocation: mm, files, fs, tty, keyrings */
         spinlock_t alloc_lock;
 
+        /* Protection of the PI data structures: */
+        spinlock_t pi_lock;
+
 #ifdef CONFIG_DEBUG_MUTEXES
         /* mutex deadlock detection */
         struct mutex_waiter *blocked_on;
@@ -1018,6 +1024,17 @@ static inline void idle_task_exit(void) {}
 #endif
 extern void sched_idle_next(void);
 
+#ifdef CONFIG_RT_MUTEXES
+extern int rt_mutex_getprio(task_t *p);
+extern void rt_mutex_setprio(task_t *p, int prio);
+#else
+static inline int rt_mutex_getprio(task_t *p)
+{
+        return p->normal_prio;
+}
+#endif
+
 extern void set_user_nice(task_t *p, long nice);
 extern int task_prio(const task_t *p);
 extern int task_nice(const task_t *p);
......
@@ -354,6 +354,25 @@ static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
 }
 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
 
+/*
+ * __task_rq_lock - lock the runqueue a given task resides on.
+ * Must be called interrupts disabled.
+ */
+static inline runqueue_t *__task_rq_lock(task_t *p)
+        __acquires(rq->lock)
+{
+        struct runqueue *rq;
+
+repeat_lock_task:
+        rq = task_rq(p);
+        spin_lock(&rq->lock);
+        if (unlikely(rq != task_rq(p))) {
+                spin_unlock(&rq->lock);
+                goto repeat_lock_task;
+        }
+        return rq;
+}
+
 /*
  * task_rq_lock - lock the runqueue a given task resides on and disable
  * interrupts. Note the ordering: we can safely lookup the task_rq without
@@ -375,6 +394,12 @@ static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
         return rq;
 }
 
+static inline void __task_rq_unlock(runqueue_t *rq)
+        __releases(rq->lock)
+{
+        spin_unlock(&rq->lock);
+}
+
 static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
         __releases(rq->lock)
 {
@@ -638,7 +663,7 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
 }
 
 /*
- * effective_prio - return the priority that is based on the static
+ * __normal_prio - return the priority that is based on the static
  * priority but is modified by bonuses/penalties.
  *
  * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
@@ -651,13 +676,11 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
  *
  * Both properties are important to certain workloads.
  */
-static int effective_prio(task_t *p)
+
+static inline int __normal_prio(task_t *p)
 {
         int bonus, prio;
 
-        if (rt_task(p))
-                return p->prio;
-
         bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
 
         prio = p->static_prio - bonus;
@@ -692,7 +715,7 @@ static int effective_prio(task_t *p)
 static void set_load_weight(task_t *p)
 {
-        if (rt_task(p)) {
+        if (has_rt_policy(p)) {
 #ifdef CONFIG_SMP
                 if (p == task_rq(p)->migration_thread)
                         /*
@@ -730,6 +753,44 @@ static inline void dec_nr_running(task_t *p, runqueue_t *rq)
         dec_raw_weighted_load(rq, p);
 }
 
+/*
+ * Calculate the expected normal priority: i.e. priority
+ * without taking RT-inheritance into account. Might be
+ * boosted by interactivity modifiers. Changes upon fork,
+ * setprio syscalls, and whenever the interactivity
+ * estimator recalculates.
+ */
+static inline int normal_prio(task_t *p)
+{
+        int prio;
+
+        if (has_rt_policy(p))
+                prio = MAX_RT_PRIO-1 - p->rt_priority;
+        else
+                prio = __normal_prio(p);
+        return prio;
+}
+
+/*
+ * Calculate the current priority, i.e. the priority
+ * taken into account by the scheduler. This value might
+ * be boosted by RT tasks, or might be boosted by
+ * interactivity modifiers. Will be RT if the task got
+ * RT-boosted. If not then it returns p->normal_prio.
+ */
+static int effective_prio(task_t *p)
+{
+        p->normal_prio = normal_prio(p);
+        /*
+         * If we are RT tasks or we were boosted to RT priority,
+         * keep the priority unchanged. Otherwise, update priority
+         * to the normal priority:
+         */
+        if (!rt_prio(p->prio))
+                return p->normal_prio;
+        return p->prio;
+}
+
 /*
  * __activate_task - move a task to the runqueue.
  */
@@ -752,6 +813,10 @@ static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
         inc_nr_running(p, rq);
 }
 
+/*
+ * Recalculate p->normal_prio and p->prio after having slept,
+ * updating the sleep-average too:
+ */
 static int recalc_task_prio(task_t *p, unsigned long long now)
 {
         /* Caller must always ensure 'now >= p->timestamp' */
@@ -1448,6 +1513,12 @@ void fastcall sched_fork(task_t *p, int clone_flags)
          * event cannot wake it up and insert it on the runqueue either.
          */
         p->state = TASK_RUNNING;
+
+        /*
+         * Make sure we do not leak PI boosting priority to the child:
+         */
+        p->prio = current->normal_prio;
+
         INIT_LIST_HEAD(&p->run_list);
         p->array = NULL;
 #ifdef CONFIG_SCHEDSTATS
@@ -1527,6 +1598,7 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
                 __activate_task(p, rq);
         else {
                 p->prio = current->prio;
+                p->normal_prio = current->normal_prio;
                 list_add_tail(&p->run_list, &current->run_list);
                 p->array = current->array;
                 p->array->nr_active++;
@@ -3668,12 +3740,65 @@ long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
 EXPORT_SYMBOL(sleep_on_timeout);
 
+#ifdef CONFIG_RT_MUTEXES
+
+/*
+ * rt_mutex_setprio - set the current priority of a task
+ * @p: task
+ * @prio: prio value (kernel-internal form)
+ *
+ * This function changes the 'effective' priority of a task. It does
+ * not touch ->normal_prio like __setscheduler().
+ *
+ * Used by the rt_mutex code to implement priority inheritance logic.
+ */
+void rt_mutex_setprio(task_t *p, int prio)
+{
+        unsigned long flags;
+        prio_array_t *array;
+        runqueue_t *rq;
+        int oldprio;
+
+        BUG_ON(prio < 0 || prio > MAX_PRIO);
+
+        rq = task_rq_lock(p, &flags);
+
+        oldprio = p->prio;
+        array = p->array;
+        if (array)
+                dequeue_task(p, array);
+        p->prio = prio;
+
+        if (array) {
+                /*
+                 * If changing to an RT priority then queue it
+                 * in the active array!
+                 */
+                if (rt_task(p))
+                        array = rq->active;
+                enqueue_task(p, array);
+                /*
+                 * Reschedule if we are currently running on this runqueue and
+                 * our priority decreased, or if we are not currently running on
+                 * this runqueue and our priority is higher than the current's
+                 */
+                if (task_running(rq, p)) {
+                        if (p->prio > oldprio)
+                                resched_task(rq->curr);
+                } else if (TASK_PREEMPTS_CURR(p, rq))
+                        resched_task(rq->curr);
+        }
+        task_rq_unlock(rq, &flags);
+}
+
+#endif
+
 void set_user_nice(task_t *p, long nice)
 {
         unsigned long flags;
         prio_array_t *array;
         runqueue_t *rq;
-        int old_prio, new_prio, delta;
+        int old_prio, delta;
 
         if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
                 return;
@@ -3688,7 +3813,7 @@ void set_user_nice(task_t *p, long nice)
          * it wont have any effect on scheduling until the task is
          * not SCHED_NORMAL/SCHED_BATCH:
          */
-        if (rt_task(p)) {
+        if (has_rt_policy(p)) {
                 p->static_prio = NICE_TO_PRIO(nice);
                 goto out_unlock;
         }
@@ -3698,12 +3823,11 @@ void set_user_nice(task_t *p, long nice)
                 dec_raw_weighted_load(rq, p);
         }
 
-        old_prio = p->prio;
-        new_prio = NICE_TO_PRIO(nice);
-        delta = new_prio - old_prio;
         p->static_prio = NICE_TO_PRIO(nice);
         set_load_weight(p);
-        p->prio += delta;
+        old_prio = p->prio;
+        p->prio = effective_prio(p);
+        delta = p->prio - old_prio;
 
         if (array) {
                 enqueue_task(p, array);
@@ -3718,7 +3842,6 @@ void set_user_nice(task_t *p, long nice)
 out_unlock:
         task_rq_unlock(rq, &flags);
 }
-
 EXPORT_SYMBOL(set_user_nice);
 
 /*
@@ -3833,16 +3956,14 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
         BUG_ON(p->array);
 
         p->policy = policy;
         p->rt_priority = prio;
-        if (policy != SCHED_NORMAL && policy != SCHED_BATCH) {
-                p->prio = MAX_RT_PRIO-1 - p->rt_priority;
-        } else {
-                p->prio = p->static_prio;
-                /*
-                 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
-                 */
-                if (policy == SCHED_BATCH)
-                        p->sleep_avg = 0;
-        }
+        p->normal_prio = normal_prio(p);
+        /* we are holding p->pi_lock already */
+        p->prio = rt_mutex_getprio(p);
+        /*
+         * SCHED_BATCH tasks are treated as perpetual CPU hogs:
+         */
+        if (policy == SCHED_BATCH)
+                p->sleep_avg = 0;
         set_load_weight(p);
 }
@@ -3911,15 +4032,21 @@ int sched_setscheduler(struct task_struct *p, int policy,
         retval = security_task_setscheduler(p, policy, param);
         if (retval)
                 return retval;
+        /*
+         * make sure no PI-waiters arrive (or leave) while we are
+         * changing the priority of the task:
+         */
+        spin_lock_irqsave(&p->pi_lock, flags);
         /*
          * To be able to change p->policy safely, the apropriate
          * runqueue lock must be held.
          */
-        rq = task_rq_lock(p, &flags);
+        rq = __task_rq_lock(p);
         /* recheck policy now with rq lock held */
         if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
                 policy = oldpolicy = -1;
-                task_rq_unlock(rq, &flags);
+                __task_rq_unlock(rq);
+                spin_unlock_irqrestore(&p->pi_lock, flags);
                 goto recheck;
         }
         array = p->array;
@@ -3940,7 +4067,9 @@ int sched_setscheduler(struct task_struct *p, int policy,
                 } else if (TASK_PREEMPTS_CURR(p, rq))
                         resched_task(rq->curr);
         }
-        task_rq_unlock(rq, &flags);
+        __task_rq_unlock(rq);
+        spin_unlock_irqrestore(&p->pi_lock, flags);
+
         return 0;
 }
 EXPORT_SYMBOL_GPL(sched_setscheduler);
@@ -4575,7 +4704,7 @@ void __devinit init_idle(task_t *idle, int cpu)
         idle->timestamp = sched_clock();
         idle->sleep_avg = 0;
         idle->array = NULL;
-        idle->prio = MAX_PRIO;
+        idle->prio = idle->normal_prio = MAX_PRIO;
         idle->state = TASK_RUNNING;
         idle->cpus_allowed = cpumask_of_cpu(cpu);
         set_task_cpu(idle, cpu);
@@ -6582,7 +6711,8 @@ void normalize_rt_tasks(void)
                 if (!rt_task(p))
                         continue;
 
-                rq = task_rq_lock(p, &flags);
+                spin_lock_irqsave(&p->pi_lock, flags);
+                rq = __task_rq_lock(p);
 
                 array = p->array;
                 if (array)
@@ -6593,7 +6723,8 @@ void normalize_rt_tasks(void)
                         resched_task(rq->curr);
                 }
 
-                task_rq_unlock(rq, &flags);
+                __task_rq_unlock(rq);
+                spin_unlock_irqrestore(&p->pi_lock, flags);
         }
         read_unlock_irq(&tasklist_lock);
 }
......