Commit 99e97b86 authored by Linus Torvalds

Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  sched: fix typo in sched-rt-group.txt file
  ftrace: fix typo about map of kernel priority in ftrace.txt file.
  sched: properly define the sched_group::cpumask and sched_domain::span fields
  sched, timers: cleanup avenrun users
  sched, timers: move calc_load() to scheduler
  sched: Don't export sched_mc_power_savings on multi-socket single core system
  sched: emit thread info flags with stack trace
  sched: rt: document the risk of small values in the bandwidth settings
  sched: Replace first_cpu() with cpumask_first() in ILB nomination code
  sched: remove extra call overhead for schedule()
  sched: use group_first_cpu() instead of cpumask_first(sched_group_cpus())
  wait: don't use __wake_up_common()
  sched: Nominate a power-efficient ilb in select_nohz_balancer()
  sched: Nominate idle load balancer from a semi-idle package.
  sched: remove redundant hierarchy walk in check_preempt_wakeup
parents 82782ca7 f04d82b7
--- a/Documentation/scheduler/sched-rt-group.txt
+++ b/Documentation/scheduler/sched-rt-group.txt
@@ -4,6 +4,7 @@
 
 CONTENTS
 ========
+0. WARNING
 1. Overview
   1.1 The problem
   1.2 The solution
@@ -14,6 +15,23 @@ CONTENTS
 3. Future plans
 
+0. WARNING
+==========
+
+ Fiddling with these settings can result in an unstable system; the knobs are
+ root-only and assume root knows what he is doing.
+
+Most notable:
+
+* very small values in sched_rt_period_us can result in an unstable
+  system when the period is smaller than either the available hrtimer
+  resolution, or the time it takes to handle the budget refresh itself.
+
+* very small values in sched_rt_runtime_us can result in an unstable
+  system when the runtime is so small the system has difficulty making
+  forward progress (NOTE: the migration thread and kstopmachine both
+  are real-time processes).
+
 1. Overview
 ===========
@@ -169,7 +187,7 @@ get their allocated time.
 Implementing SCHED_EDF might take a while to complete. Priority Inheritance is
 the biggest challenge as the current linux PI infrastructure is geared towards
-the limited static priority levels 0-139. With deadline scheduling you need to
+the limited static priority levels 0-99. With deadline scheduling you need to
 do deadline inheritance (since priority is inversely proportional to the
 deadline delta (deadline - now).

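The knobs the warning refers to are exposed as sysctls under /proc/sys/kernel/. A minimal userspace sketch (not part of the patch) that reads them and computes the resulting RT bandwidth share:

/* Reads the global RT bandwidth knobs discussed in the warning above.
 * Assumes a kernel that exposes them under /proc/sys/kernel/. */
#include <stdio.h>

static long read_knob(const char *path)
{
	long val = -1;
	FILE *f = fopen(path, "r");

	if (!f)
		return -1;
	if (fscanf(f, "%ld", &val) != 1)
		val = -1;
	fclose(f);
	return val;
}

int main(void)
{
	long period  = read_knob("/proc/sys/kernel/sched_rt_period_us");
	long runtime = read_knob("/proc/sys/kernel/sched_rt_runtime_us");

	printf("sched_rt_period_us:  %ld\n", period);
	printf("sched_rt_runtime_us: %ld\n", runtime);

	/* Defaults are typically 1000000/950000, i.e. RT tasks may
	 * consume at most 95% of each second. */
	if (period > 0 && runtime >= 0)
		printf("RT share: %.1f%%\n", 100.0 * runtime / period);
	return 0;
}
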
--- a/Documentation/ftrace.txt
+++ b/Documentation/ftrace.txt
@@ -518,9 +518,18 @@ priority with zero (0) being the highest priority and the nice
 values starting at 100 (nice -20). Below is a quick chart to map
 the kernel priority to user land priorities.
 
-  Kernel priority: 0 to 99    ==> user RT priority 99 to 0
-  Kernel priority: 100 to 139 ==> user nice -20 to 19
-  Kernel priority: 140        ==> idle task priority
+   Kernel Space                      User Space
+ ===============================================================
+   0(high) to  98(low)      user RT priority 99(high) to 1(low)
+                            with SCHED_RR or SCHED_FIFO
+ ---------------------------------------------------------------
+  99                        sched_priority is not used in scheduling
+                            decisions (it must be specified as 0)
+ ---------------------------------------------------------------
+ 100(high) to 139(low)      user nice -20(high) to 19(low)
+ ---------------------------------------------------------------
+ 140                        idle task priority
+ ---------------------------------------------------------------
 
 The task states are:

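The corrected chart is a pair of offset mappings. A small standalone sketch (illustrative only, not from the patch; describe_kernel_prio() is a hypothetical helper) that encodes it:

#include <stdio.h>

/* Maps a kernel-internal priority (0-140) to its user-space meaning,
 * following the chart above. */
static void describe_kernel_prio(int kprio)
{
	if (kprio >= 0 && kprio <= 98)
		/* RT range: kernel 0 is user rtprio 99, kernel 98 is 1 */
		printf("kernel %3d -> SCHED_FIFO/SCHED_RR rtprio %d\n",
		       kprio, 99 - kprio);
	else if (kprio == 99)
		printf("kernel  99 -> sched_priority unused (must be 0)\n");
	else if (kprio <= 139)
		/* nice range: kernel 100 is nice -20, kernel 139 is 19 */
		printf("kernel %3d -> SCHED_NORMAL nice %d\n",
		       kprio, kprio - 120);
	else
		printf("kernel 140 -> idle task priority\n");
}

int main(void)
{
	int samples[] = { 0, 50, 98, 99, 100, 120, 139, 140 };
	int i;

	for (i = 0; i < (int)(sizeof(samples) / sizeof(samples[0])); i++)
		describe_kernel_prio(samples[i]);
	return 0;
}
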
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -203,7 +203,8 @@ struct pci_bus;
 void x86_pci_root_bus_res_quirks(struct pci_bus *b);
 
 #ifdef CONFIG_SMP
-#define mc_capable()	(cpumask_weight(cpu_core_mask(0)) != nr_cpu_ids)
+#define mc_capable()	((boot_cpu_data.x86_max_cores > 1) && \
+			(cpumask_weight(cpu_core_mask(0)) != nr_cpu_ids))
 #define smt_capable()	(smp_num_siblings > 1)
 #endif

--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -12,20 +12,14 @@
 static int loadavg_proc_show(struct seq_file *m, void *v)
 {
-	int a, b, c;
-	unsigned long seq;
-
-	do {
-		seq = read_seqbegin(&xtime_lock);
-		a = avenrun[0] + (FIXED_1/200);
-		b = avenrun[1] + (FIXED_1/200);
-		c = avenrun[2] + (FIXED_1/200);
-	} while (read_seqretry(&xtime_lock, seq));
-
-	seq_printf(m, "%d.%02d %d.%02d %d.%02d %ld/%d %d\n",
-		LOAD_INT(a), LOAD_FRAC(a),
-		LOAD_INT(b), LOAD_FRAC(b),
-		LOAD_INT(c), LOAD_FRAC(c),
+	unsigned long avnrun[3];
+
+	get_avenrun(avnrun, FIXED_1/200, 0);
+
+	seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %ld/%d %d\n",
+		LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
+		LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
+		LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
 		nr_running(), nr_threads,
 		task_active_pid_ns(current)->last_pid);
 	return 0;

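The %lu.%02lu output relies on the kernel's 11-bit fixed-point load format. A standalone demo (not from the patch) of LOAD_INT()/LOAD_FRAC() and the FIXED_1/200 rounding offset that loadavg_proc_show() passes to get_avenrun():

#include <stdio.h>

#define FSHIFT		11
#define FIXED_1		(1UL << FSHIFT)		/* 1.0 as fixed point */
#define LOAD_INT(x)	((x) >> FSHIFT)
#define LOAD_FRAC(x)	LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

int main(void)
{
	/* a load of 0.87 as fixed point: 0.87 * 2048 ~= 1781 */
	unsigned long raw = 1781;
	/* FIXED_1/200 ~= 0.005 rounds to the nearest displayed 1/100th */
	unsigned long rounded = raw + FIXED_1 / 200;

	printf("raw=%lu -> %lu.%02lu\n", raw,
	       LOAD_INT(rounded), LOAD_FRAC(rounded));	/* prints 0.87 */
	return 0;
}
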
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -116,6 +116,7 @@ struct fs_struct;
  * 11 bit fractions.
  */
 extern unsigned long avenrun[];		/* Load averages */
+extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift);
 
 #define FSHIFT		11		/* nr of bits of precision */
 #define FIXED_1		(1<<FSHIFT)	/* 1.0 as fixed-point */
@@ -135,8 +136,8 @@ DECLARE_PER_CPU(unsigned long, process_counts);
 extern int nr_processes(void);
 extern unsigned long nr_running(void);
 extern unsigned long nr_uninterruptible(void);
-extern unsigned long nr_active(void);
 extern unsigned long nr_iowait(void);
+extern void calc_global_load(void);
 
 extern unsigned long get_parent_ip(unsigned long addr);
@@ -838,7 +839,17 @@ struct sched_group {
 	 */
 	u32 reciprocal_cpu_power;
 
-	unsigned long cpumask[];
+	/*
+	 * The CPUs this group covers.
+	 *
+	 * NOTE: this field is variable length. (Allocated dynamically
+	 * by attaching extra space to the end of the structure,
+	 * depending on how many CPUs the kernel has booted up with)
+	 *
+	 * It can also be embedded into static data structures at build
+	 * time. (See 'struct static_sched_group' in kernel/sched.c)
+	 */
+	unsigned long cpumask[0];
 };
 
 static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
@@ -924,8 +935,17 @@ struct sched_domain {
 	char *name;
 #endif
 
-	/* span of all CPUs in this domain */
-	unsigned long span[];
+	/*
+	 * Span of all CPUs in this domain.
+	 *
+	 * NOTE: this field is variable length. (Allocated dynamically
+	 * by attaching extra space to the end of the structure,
+	 * depending on how many CPUs the kernel has booted up with)
+	 *
+	 * It can also be embedded into static data structures at build
+	 * time. (See 'struct static_sched_domain' in kernel/sched.c)
+	 */
+	unsigned long span[0];
 };
 
 static inline struct cpumask *sched_domain_span(struct sched_domain *sd)

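The comments added above describe the classic trailing-array trick. A standalone illustration (hypothetical struct, not kernel code; the [0] form relies on the GCC zero-length-array extension) of how such a structure is sized at allocation time:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct group {
	unsigned int power;
	unsigned long mask[0];	/* variable length, like sched_group::cpumask */
};

static struct group *alloc_group(size_t mask_words)
{
	/* storage for the mask is attached past the struct proper */
	size_t sz = sizeof(struct group) + mask_words * sizeof(unsigned long);
	struct group *g = malloc(sz);

	if (g)
		memset(g, 0, sz);
	return g;
}

int main(void)
{
	struct group *g = alloc_group(2);	/* room for 128 CPU bits */

	if (!g)
		return 1;
	g->mask[1] = 1UL;	/* a bit in the trailing storage */
	printf("sizeof(struct group) = %zu (mask not included)\n",
	       sizeof(struct group));
	free(g);
	return 0;
}
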
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -132,8 +132,6 @@ static inline void __remove_wait_queue(wait_queue_head_t *head,
 	list_del(&old->task_list);
 }
 
-void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
-			int nr_exclusive, int sync, void *key);
 void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
 void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key);
 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr,

--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -249,7 +249,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 		/* didn't get the lock, go to sleep: */
 		spin_unlock_mutex(&lock->wait_lock, flags);
-		__schedule();
+		preempt_enable_no_resched();
+		schedule();
+		preempt_disable();
 		spin_lock_mutex(&lock->wait_lock, flags);
 	}

This diff is collapsed.
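The collapsed diff is almost certainly kernel/sched.c, where the load-average code moves. From the declarations added to sched.h and the call sites elsewhere in this merge, the new helpers plausibly look like this reconstruction (a sketch in kernel context, not the verbatim commit; calc_load_tasks and calc_load_update are assumed bookkeeping globals):

/* avenrun[] now lives in the scheduler and is updated there */
unsigned long avenrun[3];

static atomic_long_t calc_load_tasks;	/* assumed: global active-task count */
static unsigned long calc_load_update;	/* assumed: jiffies of next update */

void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
{
	loads[0] = (avenrun[0] + offset) << shift;
	loads[1] = (avenrun[1] + offset) << shift;
	loads[2] = (avenrun[2] + offset) << shift;
}

/*
 * Called from do_timer() in place of the old calc_load(): folds the
 * accumulated active-task count into the three averages.
 */
void calc_global_load(void)
{
	unsigned long upd = calc_load_update + 10;
	long active;

	if (time_before(jiffies, upd))
		return;

	active = atomic_long_read(&calc_load_tasks);
	active = active > 0 ? active * FIXED_1 : 0;

	CALC_LOAD(avenrun[0], EXP_1, active);
	CALC_LOAD(avenrun[1], EXP_5, active);
	CALC_LOAD(avenrun[2], EXP_15, active);

	calc_load_update += LOAD_FREQ;
}
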
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1487,17 +1487,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
 	find_matching_se(&se, &pse);
 
-	while (se) {
-		BUG_ON(!pse);
+	BUG_ON(!pse);
 
-		if (wakeup_preempt_entity(se, pse) == 1) {
-			resched_task(curr);
-			break;
-		}
-
-		se = parent_entity(se);
-		pse = parent_entity(pse);
-	}
+	if (wakeup_preempt_entity(se, pse) == 1)
+		resched_task(curr);
 }
 
 static struct task_struct *pick_next_task_fair(struct rq *rq)

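The dropped loop duplicated work already done by find_matching_se(), which walks both entities up to a common cfs_rq before the comparison is made; roughly like this (a reconstruction from the same kernel era, not part of this hunk):

static void find_matching_se(struct sched_entity **se,
			     struct sched_entity **pse)
{
	/* first bring both entities to the same depth ... */
	while (depth_se(*se) > depth_se(*pse))
		*se = parent_entity(*se);
	while (depth_se(*pse) > depth_se(*se))
		*pse = parent_entity(*pse);

	/* ... then climb in lockstep until they share a cfs_rq */
	while (!is_same_group(*se, *pse)) {
		*se = parent_entity(*se);
		*pse = parent_entity(*pse);
	}
}
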
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -22,7 +22,8 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sy
 static struct task_struct *pick_next_task_idle(struct rq *rq)
 {
 	schedstat_inc(rq, sched_goidle);
-
+	/* adjust the active tasks as we might go into a long sleep */
+	calc_load_account_active(rq);
 	return rq->idle;
 }

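calc_load_account_active() itself lands in the collapsed kernel/sched.c diff. Presumably it folds this runqueue's running and uninterruptible tasks into the global sample before the CPU goes idle for a long stretch, along these lines (a sketch; calc_load_tasks and rq->calc_load_active are assumed from the same patch):

static void calc_load_account_active(struct rq *this_rq)
{
	long nr_active, delta;

	nr_active = this_rq->nr_running;
	nr_active += (long)this_rq->nr_uninterruptible;

	/* push only the change into the global counter */
	if (nr_active != this_rq->calc_load_active) {
		delta = nr_active - this_rq->calc_load_active;
		this_rq->calc_load_active = nr_active;
		atomic_long_add(delta, &calc_load_tasks);
	}
}
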
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -22,7 +22,7 @@
 
 /*
  * This read-write spinlock protects us from races in SMP while
- * playing with xtime and avenrun.
+ * playing with xtime.
  */
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);

--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1122,47 +1122,6 @@ void update_process_times(int user_tick)
 	run_posix_cpu_timers(p);
 }
 
-/*
- * Nr of active tasks - counted in fixed-point numbers
- */
-static unsigned long count_active_tasks(void)
-{
-	return nr_active() * FIXED_1;
-}
-
-/*
- * Hmm.. Changed this, as the GNU make sources (load.c) seems to
- * imply that avenrun[] is the standard name for this kind of thing.
- * Nothing else seems to be standardized: the fractional size etc
- * all seem to differ on different machines.
- *
- * Requires xtime_lock to access.
- */
-unsigned long avenrun[3];
-EXPORT_SYMBOL(avenrun);
-
-/*
- * calc_load - given tick count, update the avenrun load estimates.
- * This is called while holding a write_lock on xtime_lock.
- */
-static inline void calc_load(unsigned long ticks)
-{
-	unsigned long active_tasks; /* fixed-point */
-	static int count = LOAD_FREQ;
-
-	count -= ticks;
-	if (unlikely(count < 0)) {
-		active_tasks = count_active_tasks();
-		do {
-			CALC_LOAD(avenrun[0], EXP_1, active_tasks);
-			CALC_LOAD(avenrun[1], EXP_5, active_tasks);
-			CALC_LOAD(avenrun[2], EXP_15, active_tasks);
-			count += LOAD_FREQ;
-		} while (count < 0);
-	}
-}
-
 /*
  * This function runs timers and the timer-tq in bottom half context.
  */
@@ -1186,16 +1145,6 @@ void run_local_timers(void)
 	softlockup_tick();
 }
 
-/*
- * Called by the timer interrupt. xtime_lock must already be taken
- * by the timer IRQ!
- */
-static inline void update_times(unsigned long ticks)
-{
-	update_wall_time();
-	calc_load(ticks);
-}
-
 /*
  * The 64-bit jiffies value is not atomic - you MUST NOT read it
  * without sampling the sequence number in xtime_lock.
@@ -1205,7 +1154,8 @@ static inline void update_times(unsigned long ticks)
 void do_timer(unsigned long ticks)
 {
 	jiffies_64 += ticks;
-	update_times(ticks);
+	update_wall_time();
+	calc_global_load();
 }
 
 #ifdef __ARCH_WANT_SYS_ALARM
@@ -1406,37 +1356,17 @@ int do_sysinfo(struct sysinfo *info)
 {
 	unsigned long mem_total, sav_total;
 	unsigned int mem_unit, bitcount;
-	unsigned long seq;
+	struct timespec tp;
 
 	memset(info, 0, sizeof(struct sysinfo));
 
-	do {
-		struct timespec tp;
-		seq = read_seqbegin(&xtime_lock);
-
-		/*
-		 * This is annoying.  The below is the same thing
-		 * posix_get_clock_monotonic() does, but it wants to
-		 * take the lock which we want to cover the loads stuff
-		 * too.
-		 */
-
-		getnstimeofday(&tp);
-		tp.tv_sec += wall_to_monotonic.tv_sec;
-		tp.tv_nsec += wall_to_monotonic.tv_nsec;
-		monotonic_to_bootbased(&tp);
-		if (tp.tv_nsec - NSEC_PER_SEC >= 0) {
-			tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC;
-			tp.tv_sec++;
-		}
-		info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
+	ktime_get_ts(&tp);
+	monotonic_to_bootbased(&tp);
+	info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
 
-		info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
-		info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
-		info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
+	get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
 
-		info->procs = nr_threads;
-	} while (read_seqretry(&xtime_lock, seq));
+	info->procs = nr_threads;
 
 	si_meminfo(info);
 	si_swapinfo(info);

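calc_load() above is deleted, not lost: the same CALC_LOAD() math now runs from calc_global_load(). A userspace re-run of that math (not from the patch; EXP_1 is the kernel's 1/exp(5s/1min) fixed-point constant) showing the 1-minute average converging on a steady load of 2:

#include <stdio.h>

#define FSHIFT	11
#define FIXED_1	(1UL << FSHIFT)
#define EXP_1	1884		/* 1/exp(5sec/1min) as fixed point */

/* same shape as the kernel macro: exponential decay toward n */
#define CALC_LOAD(load, exp, n) \
	load *= exp; \
	load += n * (FIXED_1 - exp); \
	load >>= FSHIFT;

int main(void)
{
	unsigned long avenrun0 = 0;		/* 1-minute average */
	unsigned long active = 2 * FIXED_1;	/* two runnable tasks */
	int tick;

	/* twelve 5-second samples = one minute of a steady load of 2 */
	for (tick = 1; tick <= 12; tick++) {
		CALC_LOAD(avenrun0, EXP_1, active);
		printf("after %2ds: %lu.%02lu\n", tick * 5,
		       avenrun0 >> FSHIFT,
		       ((avenrun0 & (FIXED_1 - 1)) * 100) >> FSHIFT);
	}
	return 0;
}
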
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -154,7 +154,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
 	if (!list_empty(&wait->task_list))
 		list_del_init(&wait->task_list);
 	else if (waitqueue_active(q))
-		__wake_up_common(q, mode, 1, 0, key);
+		__wake_up_locked_key(q, mode, key);
 	spin_unlock_irqrestore(&q->lock, flags);
 }
 EXPORT_SYMBOL(abort_exclusive_wait);