Commit 774a694f authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'sched-core-for-linus' of...

Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (64 commits)
  sched: Fix sched::sched_stat_wait tracepoint field
  sched: Disable NEW_FAIR_SLEEPERS for now
  sched: Keep kthreads at default priority
  sched: Re-tune the scheduler latency defaults to decrease worst-case latencies
  sched: Turn off child_runs_first
  sched: Ensure that a child can't gain time over it's parent after fork()
  sched: enable SD_WAKE_IDLE
  sched: Deal with low-load in wake_affine()
  sched: Remove short cut from select_task_rq_fair()
  sched: Turn on SD_BALANCE_NEWIDLE
  sched: Clean up topology.h
  sched: Fix dynamic power-balancing crash
  sched: Remove reciprocal for cpu_power
  sched: Try to deal with low capacity, fix update_sd_power_savings_stats()
  sched: Try to deal with low capacity
  sched: Scale down cpu_power due to RT tasks
  sched: Implement dynamic cpu_power
  sched: Add smt_gain
  sched: Update the cpu_power sum during load-balance
  sched: Add SD_PREFER_SIBLING
  ...
parents 4f0ac854 e1f84508
...@@ -129,25 +129,34 @@ extern unsigned long node_remap_size[]; ...@@ -129,25 +129,34 @@ extern unsigned long node_remap_size[];
#endif #endif
/* sched_domains SD_NODE_INIT for NUMA machines */ /* sched_domains SD_NODE_INIT for NUMA machines */
#define SD_NODE_INIT (struct sched_domain) { \ #define SD_NODE_INIT (struct sched_domain) { \
.min_interval = 8, \ .min_interval = 8, \
.max_interval = 32, \ .max_interval = 32, \
.busy_factor = 32, \ .busy_factor = 32, \
.imbalance_pct = 125, \ .imbalance_pct = 125, \
.cache_nice_tries = SD_CACHE_NICE_TRIES, \ .cache_nice_tries = SD_CACHE_NICE_TRIES, \
.busy_idx = 3, \ .busy_idx = 3, \
.idle_idx = SD_IDLE_IDX, \ .idle_idx = SD_IDLE_IDX, \
.newidle_idx = SD_NEWIDLE_IDX, \ .newidle_idx = SD_NEWIDLE_IDX, \
.wake_idx = 1, \ .wake_idx = 1, \
.forkexec_idx = SD_FORKEXEC_IDX, \ .forkexec_idx = SD_FORKEXEC_IDX, \
.flags = SD_LOAD_BALANCE \ \
| SD_BALANCE_EXEC \ .flags = 1*SD_LOAD_BALANCE \
| SD_BALANCE_FORK \ | 1*SD_BALANCE_NEWIDLE \
| SD_WAKE_AFFINE \ | 1*SD_BALANCE_EXEC \
| SD_WAKE_BALANCE \ | 1*SD_BALANCE_FORK \
| SD_SERIALIZE, \ | 0*SD_WAKE_IDLE \
.last_balance = jiffies, \ | 1*SD_WAKE_AFFINE \
.balance_interval = 1, \ | 1*SD_WAKE_BALANCE \
| 0*SD_SHARE_CPUPOWER \
| 0*SD_POWERSAVINGS_BALANCE \
| 0*SD_SHARE_PKG_RESOURCES \
| 1*SD_SERIALIZE \
| 1*SD_WAKE_IDLE_FAR \
| 0*SD_PREFER_SIBLING \
, \
.last_balance = jiffies, \
.balance_interval = 1, \
} }
#ifdef CONFIG_X86_64_ACPI_NUMA #ifdef CONFIG_X86_64_ACPI_NUMA
......
...@@ -32,6 +32,7 @@ ...@@ -32,6 +32,7 @@
#include <linux/swap.h> #include <linux/swap.h>
#include <linux/bootmem.h> #include <linux/bootmem.h>
#include <linux/fs_struct.h> #include <linux/fs_struct.h>
#include <linux/hardirq.h>
#include "internal.h" #include "internal.h"
int sysctl_vfs_cache_pressure __read_mostly = 100; int sysctl_vfs_cache_pressure __read_mostly = 100;
......
...@@ -768,7 +768,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request) ...@@ -768,7 +768,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
* give it the opportunity to lock the file. * give it the opportunity to lock the file.
*/ */
if (found) if (found)
cond_resched_bkl(); cond_resched();
find_conflict: find_conflict:
for_each_lock(inode, before) { for_each_lock(inode, before) {
......
...@@ -64,6 +64,12 @@ ...@@ -64,6 +64,12 @@
#define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
#define NMI_OFFSET (1UL << NMI_SHIFT) #define NMI_OFFSET (1UL << NMI_SHIFT)
/*
 * Fallback definition of PREEMPT_ACTIVE, used only when it has not already
 * been defined by the time this point is reached. The flag is
 * PREEMPT_ACTIVE_BITS (1) wide and starts at bit NMI_SHIFT + NMI_BITS.
 * NOTE(review): assumes __IRQ_MASK(n) yields an n-bit low mask and that
 * NMI_SHIFT + NMI_BITS is the first bit above the NMI field - confirm
 * against their definitions.
 */
#ifndef PREEMPT_ACTIVE
#define PREEMPT_ACTIVE_BITS 1
#define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS)
#define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT)
#endif
#if PREEMPT_ACTIVE < (1 << (NMI_SHIFT + NMI_BITS)) #if PREEMPT_ACTIVE < (1 << (NMI_SHIFT + NMI_BITS))
#error PREEMPT_ACTIVE is too low! #error PREEMPT_ACTIVE is too low!
#endif #endif
......
...@@ -125,7 +125,7 @@ extern int _cond_resched(void); ...@@ -125,7 +125,7 @@ extern int _cond_resched(void);
#endif #endif
#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
void __might_sleep(char *file, int line); void __might_sleep(char *file, int line, int preempt_offset);
/** /**
* might_sleep - annotation for functions that can sleep * might_sleep - annotation for functions that can sleep
* *
...@@ -137,8 +137,9 @@ extern int _cond_resched(void); ...@@ -137,8 +137,9 @@ extern int _cond_resched(void);
* supposed to. * supposed to.
*/ */
# define might_sleep() \ # define might_sleep() \
do { __might_sleep(__FILE__, __LINE__); might_resched(); } while (0) do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
#else #else
/*
 * Empty stand-in for __might_sleep() in the !CONFIG_DEBUG_SPINLOCK_SLEEP
 * branch: it takes the same arguments as the debug version and does nothing,
 * so code can call __might_sleep() without its own #ifdef.
 */
static inline void __might_sleep(char *file, int line, int preempt_offset) { }
# define might_sleep() do { might_resched(); } while (0) # define might_sleep() do { might_resched(); } while (0)
#endif #endif
......
...@@ -38,6 +38,8 @@ ...@@ -38,6 +38,8 @@
#define SCHED_BATCH 3 #define SCHED_BATCH 3
/* SCHED_ISO: reserved but not implemented yet */ /* SCHED_ISO: reserved but not implemented yet */
#define SCHED_IDLE 5 #define SCHED_IDLE 5
/* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
#define SCHED_RESET_ON_FORK 0x40000000
#ifdef __KERNEL__ #ifdef __KERNEL__
...@@ -796,18 +798,19 @@ enum cpu_idle_type { ...@@ -796,18 +798,19 @@ enum cpu_idle_type {
#define SCHED_LOAD_SCALE_FUZZ SCHED_LOAD_SCALE #define SCHED_LOAD_SCALE_FUZZ SCHED_LOAD_SCALE
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
#define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. */ #define SD_LOAD_BALANCE 0x0001 /* Do load balancing on this domain. */
#define SD_BALANCE_NEWIDLE 2 /* Balance when about to become idle */ #define SD_BALANCE_NEWIDLE 0x0002 /* Balance when about to become idle */
#define SD_BALANCE_EXEC 4 /* Balance on exec */ #define SD_BALANCE_EXEC 0x0004 /* Balance on exec */
#define SD_BALANCE_FORK 8 /* Balance on fork, clone */ #define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */
#define SD_WAKE_IDLE 16 /* Wake to idle CPU on task wakeup */ #define SD_WAKE_IDLE 0x0010 /* Wake to idle CPU on task wakeup */
#define SD_WAKE_AFFINE 32 /* Wake task to waking CPU */ #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */
#define SD_WAKE_BALANCE 64 /* Perform balancing at task wakeup */ #define SD_WAKE_BALANCE 0x0040 /* Perform balancing at task wakeup */
#define SD_SHARE_CPUPOWER 128 /* Domain members share cpu power */ #define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */
#define SD_POWERSAVINGS_BALANCE 256 /* Balance for power savings */ #define SD_POWERSAVINGS_BALANCE 0x0100 /* Balance for power savings */
#define SD_SHARE_PKG_RESOURCES 512 /* Domain members share cpu pkg resources */ #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */
#define SD_SERIALIZE 1024 /* Only a single load balancing instance */ #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */
#define SD_WAKE_IDLE_FAR 2048 /* Gain latency sacrificing cache hit */ #define SD_WAKE_IDLE_FAR 0x0800 /* Gain latency sacrificing cache hit */
#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */
enum powersavings_balance_level { enum powersavings_balance_level {
POWERSAVINGS_BALANCE_NONE = 0, /* No power saving load balance */ POWERSAVINGS_BALANCE_NONE = 0, /* No power saving load balance */
...@@ -827,7 +830,7 @@ static inline int sd_balance_for_mc_power(void) ...@@ -827,7 +830,7 @@ static inline int sd_balance_for_mc_power(void)
if (sched_smt_power_savings) if (sched_smt_power_savings)
return SD_POWERSAVINGS_BALANCE; return SD_POWERSAVINGS_BALANCE;
return 0; return SD_PREFER_SIBLING;
} }
static inline int sd_balance_for_package_power(void) static inline int sd_balance_for_package_power(void)
...@@ -835,7 +838,7 @@ static inline int sd_balance_for_package_power(void) ...@@ -835,7 +838,7 @@ static inline int sd_balance_for_package_power(void)
if (sched_mc_power_savings | sched_smt_power_savings) if (sched_mc_power_savings | sched_smt_power_savings)
return SD_POWERSAVINGS_BALANCE; return SD_POWERSAVINGS_BALANCE;
return 0; return SD_PREFER_SIBLING;
} }
/* /*
...@@ -857,15 +860,9 @@ struct sched_group { ...@@ -857,15 +860,9 @@ struct sched_group {
/* /*
* CPU power of this group, SCHED_LOAD_SCALE being max power for a * CPU power of this group, SCHED_LOAD_SCALE being max power for a
* single CPU. This is read only (except for setup, hotplug CPU). * single CPU.
* Note : Never change cpu_power without recompute its reciprocal
*/
unsigned int __cpu_power;
/*
* reciprocal value of cpu_power to avoid expensive divides
* (see include/linux/reciprocal_div.h)
*/ */
u32 reciprocal_cpu_power; unsigned int cpu_power;
/* /*
* The CPUs this group covers. * The CPUs this group covers.
...@@ -918,6 +915,7 @@ struct sched_domain { ...@@ -918,6 +915,7 @@ struct sched_domain {
unsigned int newidle_idx; unsigned int newidle_idx;
unsigned int wake_idx; unsigned int wake_idx;
unsigned int forkexec_idx; unsigned int forkexec_idx;
unsigned int smt_gain;
int flags; /* See SD_* */ int flags; /* See SD_* */
enum sched_domain_level level; enum sched_domain_level level;
...@@ -1045,7 +1043,6 @@ struct sched_class { ...@@ -1045,7 +1043,6 @@ struct sched_class {
struct rq *busiest, struct sched_domain *sd, struct rq *busiest, struct sched_domain *sd,
enum cpu_idle_type idle); enum cpu_idle_type idle);
void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
int (*needs_post_schedule) (struct rq *this_rq);
void (*post_schedule) (struct rq *this_rq); void (*post_schedule) (struct rq *this_rq);
void (*task_wake_up) (struct rq *this_rq, struct task_struct *task); void (*task_wake_up) (struct rq *this_rq, struct task_struct *task);
...@@ -1110,6 +1107,8 @@ struct sched_entity { ...@@ -1110,6 +1107,8 @@ struct sched_entity {
u64 wait_max; u64 wait_max;
u64 wait_count; u64 wait_count;
u64 wait_sum; u64 wait_sum;
u64 iowait_count;
u64 iowait_sum;
u64 sleep_start; u64 sleep_start;
u64 sleep_max; u64 sleep_max;
...@@ -1234,11 +1233,19 @@ struct task_struct { ...@@ -1234,11 +1233,19 @@ struct task_struct {
unsigned did_exec:1; unsigned did_exec:1;
unsigned in_execve:1; /* Tell the LSMs that the process is doing an unsigned in_execve:1; /* Tell the LSMs that the process is doing an
* execve */ * execve */
unsigned in_iowait:1;
/* Revert to default priority/policy when forking */
unsigned sched_reset_on_fork:1;
pid_t pid; pid_t pid;
pid_t tgid; pid_t tgid;
#ifdef CONFIG_CC_STACKPROTECTOR
/* Canary value for the -fstack-protector gcc feature */ /* Canary value for the -fstack-protector gcc feature */
unsigned long stack_canary; unsigned long stack_canary;
#endif
/* /*
* pointers to (original) parent process, youngest child, younger sibling, * pointers to (original) parent process, youngest child, younger sibling,
...@@ -1840,11 +1847,12 @@ extern unsigned int sysctl_sched_min_granularity; ...@@ -1840,11 +1847,12 @@ extern unsigned int sysctl_sched_min_granularity;
extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_wakeup_granularity;
extern unsigned int sysctl_sched_shares_ratelimit; extern unsigned int sysctl_sched_shares_ratelimit;
extern unsigned int sysctl_sched_shares_thresh; extern unsigned int sysctl_sched_shares_thresh;
#ifdef CONFIG_SCHED_DEBUG
extern unsigned int sysctl_sched_child_runs_first; extern unsigned int sysctl_sched_child_runs_first;
#ifdef CONFIG_SCHED_DEBUG
extern unsigned int sysctl_sched_features; extern unsigned int sysctl_sched_features;
extern unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_migration_cost;
extern unsigned int sysctl_sched_nr_migrate; extern unsigned int sysctl_sched_nr_migrate;
extern unsigned int sysctl_sched_time_avg;
extern unsigned int sysctl_timer_migration; extern unsigned int sysctl_timer_migration;
int sched_nr_latency_handler(struct ctl_table *table, int write, int sched_nr_latency_handler(struct ctl_table *table, int write,
...@@ -2308,23 +2316,31 @@ static inline int need_resched(void) ...@@ -2308,23 +2316,31 @@ static inline int need_resched(void)
* cond_resched_softirq() will enable bhs before scheduling. * cond_resched_softirq() will enable bhs before scheduling.
*/ */
extern int _cond_resched(void); extern int _cond_resched(void);
#ifdef CONFIG_PREEMPT_BKL
static inline int cond_resched(void) #define cond_resched() ({ \
{ __might_sleep(__FILE__, __LINE__, 0); \
return 0; _cond_resched(); \
} })
extern int __cond_resched_lock(spinlock_t *lock);
#ifdef CONFIG_PREEMPT
#define PREEMPT_LOCK_OFFSET PREEMPT_OFFSET
#else #else
static inline int cond_resched(void) #define PREEMPT_LOCK_OFFSET 0
{
return _cond_resched();
}
#endif #endif
extern int cond_resched_lock(spinlock_t * lock);
extern int cond_resched_softirq(void); #define cond_resched_lock(lock) ({ \
static inline int cond_resched_bkl(void) __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \
{ __cond_resched_lock(lock); \
return _cond_resched(); })
}
extern int __cond_resched_softirq(void);
#define cond_resched_softirq() ({ \
__might_sleep(__FILE__, __LINE__, SOFTIRQ_OFFSET); \
__cond_resched_softirq(); \
})
/* /*
* Does a critical section need to be broken due to another * Does a critical section need to be broken due to another
......
...@@ -85,20 +85,29 @@ int arch_update_cpu_topology(void); ...@@ -85,20 +85,29 @@ int arch_update_cpu_topology(void);
#define ARCH_HAS_SCHED_WAKE_IDLE #define ARCH_HAS_SCHED_WAKE_IDLE
/* Common values for SMT siblings */ /* Common values for SMT siblings */
#ifndef SD_SIBLING_INIT #ifndef SD_SIBLING_INIT
#define SD_SIBLING_INIT (struct sched_domain) { \ #define SD_SIBLING_INIT (struct sched_domain) { \
.min_interval = 1, \ .min_interval = 1, \
.max_interval = 2, \ .max_interval = 2, \
.busy_factor = 64, \ .busy_factor = 64, \
.imbalance_pct = 110, \ .imbalance_pct = 110, \
.flags = SD_LOAD_BALANCE \ \
| SD_BALANCE_NEWIDLE \ .flags = 1*SD_LOAD_BALANCE \
| SD_BALANCE_FORK \ | 1*SD_BALANCE_NEWIDLE \
| SD_BALANCE_EXEC \ | 1*SD_BALANCE_EXEC \
| SD_WAKE_AFFINE \ | 1*SD_BALANCE_FORK \
| SD_WAKE_BALANCE \ | 0*SD_WAKE_IDLE \
| SD_SHARE_CPUPOWER, \ | 1*SD_WAKE_AFFINE \
.last_balance = jiffies, \ | 1*SD_WAKE_BALANCE \
.balance_interval = 1, \ | 1*SD_SHARE_CPUPOWER \
| 0*SD_POWERSAVINGS_BALANCE \
| 0*SD_SHARE_PKG_RESOURCES \
| 0*SD_SERIALIZE \
| 0*SD_WAKE_IDLE_FAR \
| 0*SD_PREFER_SIBLING \
, \
.last_balance = jiffies, \
.balance_interval = 1, \
.smt_gain = 1178, /* 15% */ \
} }
#endif #endif
#endif /* CONFIG_SCHED_SMT */ #endif /* CONFIG_SCHED_SMT */
...@@ -106,69 +115,94 @@ int arch_update_cpu_topology(void); ...@@ -106,69 +115,94 @@ int arch_update_cpu_topology(void);
#ifdef CONFIG_SCHED_MC #ifdef CONFIG_SCHED_MC
/* Common values for MC siblings. for now mostly derived from SD_CPU_INIT */ /* Common values for MC siblings. for now mostly derived from SD_CPU_INIT */
#ifndef SD_MC_INIT #ifndef SD_MC_INIT
#define SD_MC_INIT (struct sched_domain) { \ #define SD_MC_INIT (struct sched_domain) { \
.min_interval = 1, \ .min_interval = 1, \
.max_interval = 4, \ .max_interval = 4, \
.busy_factor = 64, \ .busy_factor = 64, \
.imbalance_pct = 125, \ .imbalance_pct = 125, \
.cache_nice_tries = 1, \ .cache_nice_tries = 1, \
.busy_idx = 2, \ .busy_idx = 2, \
.wake_idx = 1, \ .wake_idx = 1, \
.forkexec_idx = 1, \ .forkexec_idx = 1, \
.flags = SD_LOAD_BALANCE \ \
| SD_BALANCE_FORK \ .flags = 1*SD_LOAD_BALANCE \
| SD_BALANCE_EXEC \ | 1*SD_BALANCE_NEWIDLE \
| SD_WAKE_AFFINE \ | 1*SD_BALANCE_EXEC \
| SD_WAKE_BALANCE \ | 1*SD_BALANCE_FORK \
| SD_SHARE_PKG_RESOURCES\ | 1*SD_WAKE_IDLE \
| sd_balance_for_mc_power()\ | 1*SD_WAKE_AFFINE \
| sd_power_saving_flags(),\ | 1*SD_WAKE_BALANCE \
.last_balance = jiffies, \ | 0*SD_SHARE_CPUPOWER \
.balance_interval = 1, \ | 1*SD_SHARE_PKG_RESOURCES \
| 0*SD_SERIALIZE \
| 0*SD_WAKE_IDLE_FAR \
| sd_balance_for_mc_power() \
| sd_power_saving_flags() \
, \
.last_balance = jiffies, \
.balance_interval = 1, \
} }
#endif #endif
#endif /* CONFIG_SCHED_MC */ #endif /* CONFIG_SCHED_MC */
/* Common values for CPUs */ /* Common values for CPUs */
#ifndef SD_CPU_INIT #ifndef SD_CPU_INIT
#define SD_CPU_INIT (struct sched_domain) { \ #define SD_CPU_INIT (struct sched_domain) { \
.min_interval = 1, \ .min_interval = 1, \
.max_interval = 4, \ .max_interval = 4, \
.busy_factor = 64, \ .busy_factor = 64, \
.imbalance_pct = 125, \ .imbalance_pct = 125, \
.cache_nice_tries = 1, \ .cache_nice_tries = 1, \
.busy_idx = 2, \ .busy_idx = 2, \
.idle_idx = 1, \ .idle_idx = 1, \
.newidle_idx = 2, \ .newidle_idx = 2, \
.wake_idx = 1, \ .wake_idx = 1, \
.forkexec_idx = 1, \ .forkexec_idx = 1, \
.flags = SD_LOAD_BALANCE \ \
| SD_BALANCE_EXEC \ .flags = 1*SD_LOAD_BALANCE \
| SD_BALANCE_FORK \ | 1*SD_BALANCE_NEWIDLE \
| SD_WAKE_AFFINE \ | 1*SD_BALANCE_EXEC \
| SD_WAKE_BALANCE \ | 1*SD_BALANCE_FORK \
| sd_balance_for_package_power()\ | 1*SD_WAKE_IDLE \
| sd_power_saving_flags(),\ | 0*SD_WAKE_AFFINE \
.last_balance = jiffies, \ | 1*SD_WAKE_BALANCE \
.balance_interval = 1, \ | 0*SD_SHARE_CPUPOWER \
| 0*SD_SHARE_PKG_RESOURCES \
| 0*SD_SERIALIZE \
| 0*SD_WAKE_IDLE_FAR \
| sd_balance_for_package_power() \
| sd_power_saving_flags() \
, \
.last_balance = jiffies, \
.balance_interval = 1, \
} }
#endif #endif
/* sched_domains SD_ALLNODES_INIT for NUMA machines */ /* sched_domains SD_ALLNODES_INIT for NUMA machines */
#define SD_ALLNODES_INIT (struct sched_domain) { \ #define SD_ALLNODES_INIT (struct sched_domain) { \
.min_interval = 64, \ .min_interval = 64, \
.max_interval = 64*num_online_cpus(), \ .max_interval = 64*num_online_cpus(), \
.busy_factor = 128, \ .busy_factor = 128, \
.imbalance_pct = 133, \ .imbalance_pct = 133, \
.cache_nice_tries = 1, \ .cache_nice_tries = 1, \
.busy_idx = 3, \ .busy_idx = 3, \
.idle_idx = 3, \ .idle_idx = 3, \
.flags = SD_LOAD_BALANCE \ .flags = 1*SD_LOAD_BALANCE \
| SD_BALANCE_NEWIDLE \ | 1*SD_BALANCE_NEWIDLE \
| SD_WAKE_AFFINE \ | 0*SD_BALANCE_EXEC \
| SD_SERIALIZE, \ | 0*SD_BALANCE_FORK \
.last_balance = jiffies, \ | 0*SD_WAKE_IDLE \
.balance_interval = 64, \ | 1*SD_WAKE_AFFINE \
| 0*SD_WAKE_BALANCE \
| 0*SD_SHARE_CPUPOWER \
| 0*SD_POWERSAVINGS_BALANCE \
| 0*SD_SHARE_PKG_RESOURCES \
| 1*SD_SERIALIZE \
| 1*SD_WAKE_IDLE_FAR \
| 0*SD_PREFER_SIBLING \
, \
.last_balance = jiffies, \
.balance_interval = 64, \
} }
#ifdef CONFIG_NUMA #ifdef CONFIG_NUMA
......
...@@ -340,6 +340,101 @@ TRACE_EVENT(sched_signal_send, ...@@ -340,6 +340,101 @@ TRACE_EVENT(sched_signal_send,
__entry->sig, __entry->comm, __entry->pid) __entry->sig, __entry->comm, __entry->pid)
); );
/*
* XXX the below sched_stat tracepoints only apply to SCHED_OTHER/BATCH/IDLE;
* adding sched_stat support to SCHED_FIFO/RR would be welcome.
*/
/*
* Tracepoint for accounting wait time (time the task is runnable
* but not actually running due to scheduler contention).
*/
TRACE_EVENT(sched_stat_wait,
/* Arguments: the task being accounted and the delay to record. */
TP_PROTO(struct task_struct *tsk, u64 delay),
TP_ARGS(tsk, delay),
/* Recorded fields: the task's comm, its pid, and the delay value. */
TP_STRUCT__entry(
__array( char, comm, TASK_COMM_LEN )
__field( pid_t, pid )
__field( u64, delay )
),
/* Copy comm and pid out of tsk and store the delay as passed in. */
TP_fast_assign(
memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
__entry->pid = tsk->pid;
__entry->delay = delay;
)
/* NOTE(review): presumably feeds delay to perf as the event count - confirm against TP_perf_assign/__perf_count. */
TP_perf_assign(
__perf_count(delay);
),
/* The delay is printed with a "[ns]" (nanoseconds) unit label. */
TP_printk("task: %s:%d wait: %Lu [ns]",
__entry->comm, __entry->pid,
(unsigned long long)__entry->delay)
);
/*
* Tracepoint for accounting sleep time (time the task is not runnable,
* including iowait, see below).
*/
TRACE_EVENT(sched_stat_sleep,
/* Arguments: the task being accounted and the delay to record. */
TP_PROTO(struct task_struct *tsk, u64 delay),
TP_ARGS(tsk, delay),
/* Recorded fields: the task's comm, its pid, and the delay value. */
TP_STRUCT__entry(
__array( char, comm, TASK_COMM_LEN )
__field( pid_t, pid )
__field( u64, delay )
),
/* Copy comm and pid out of tsk and store the delay as passed in. */
TP_fast_assign(
memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
__entry->pid = tsk->pid;
__entry->delay = delay;
)
/* NOTE(review): presumably feeds delay to perf as the event count - confirm against TP_perf_assign/__perf_count. */
TP_perf_assign(
__perf_count(delay);
),
/* The delay is printed with a "[ns]" (nanoseconds) unit label. */
TP_printk("task: %s:%d sleep: %Lu [ns]",
__entry->comm, __entry->pid,
(unsigned long long)__entry->delay)
);
/*
* Tracepoint for accounting iowait time (time the task is not runnable
* due to waiting on IO to complete).
*/
TRACE_EVENT(sched_stat_iowait,
/* Arguments: the task being accounted and the delay to record. */
TP_PROTO(struct task_struct *tsk, u64 delay),
TP_ARGS(tsk, delay),
/* Recorded fields: the task's comm, its pid, and the delay value. */
TP_STRUCT__entry(
__array( char, comm, TASK_COMM_LEN )
__field( pid_t, pid )
__field( u64, delay )
),
/* Copy comm and pid out of tsk and store the delay as passed in. */
TP_fast_assign(
memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
__entry->pid = tsk->pid;
__entry->delay = delay;
)
/* NOTE(review): presumably feeds delay to perf as the event count - confirm against TP_perf_assign/__perf_count. */
TP_perf_assign(
__perf_count(delay);
),
/* The delay is printed with a "[ns]" (nanoseconds) unit label. */
TP_printk("task: %s:%d iowait: %Lu [ns]",
__entry->comm, __entry->pid,
(unsigned long long)__entry->delay)
);
#endif /* _TRACE_SCHED_H */ #endif /* _TRACE_SCHED_H */
/* This part must be outside protection */ /* This part must be outside protection */
......
...@@ -631,7 +631,6 @@ asmlinkage void __init start_kernel(void) ...@@ -631,7 +631,6 @@ asmlinkage void __init start_kernel(void)
softirq_init(); softirq_init();
timekeeping_init(); timekeeping_init();
time_init(); time_init();
sched_clock_init();
profile_init(); profile_init();
if (!irqs_disabled()) if (!irqs_disabled())
printk(KERN_CRIT "start_kernel(): bug: interrupts were " printk(KERN_CRIT "start_kernel(): bug: interrupts were "
...@@ -682,6 +681,7 @@ asmlinkage void __init start_kernel(void) ...@@ -682,6 +681,7 @@ asmlinkage void __init start_kernel(void)
numa_policy_init(); numa_policy_init();
if (late_time_init) if (late_time_init)
late_time_init(); late_time_init();
sched_clock_init();
calibrate_delay(); calibrate_delay();
pidmap_init(); pidmap_init();
anon_vma_init(); anon_vma_init();
......
...@@ -16,8 +16,6 @@ ...@@ -16,8 +16,6 @@
#include <linux/mutex.h> #include <linux/mutex.h>
#include <trace/events/sched.h> #include <trace/events/sched.h>
#define KTHREAD_NICE_LEVEL (-5)
static DEFINE_SPINLOCK(kthread_create_lock); static DEFINE_SPINLOCK(kthread_create_lock);
static LIST_HEAD(kthread_create_list); static LIST_HEAD(kthread_create_list);
struct task_struct *kthreadd_task; struct task_struct *kthreadd_task;
...@@ -145,7 +143,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), ...@@ -145,7 +143,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
* The kernel thread should not inherit these properties. * The kernel thread should not inherit these properties.
*/ */
sched_setscheduler_nocheck(create.result, SCHED_NORMAL, &param); sched_setscheduler_nocheck(create.result, SCHED_NORMAL, &param);
set_user_nice(create.result, KTHREAD_NICE_LEVEL);
set_cpus_allowed_ptr(create.result, cpu_all_mask); set_cpus_allowed_ptr(create.result, cpu_all_mask);
} }
return create.result; return create.result;
...@@ -221,7 +218,6 @@ int kthreadd(void *unused) ...@@ -221,7 +218,6 @@ int kthreadd(void *unused)
/* Setup a clean context for our children to inherit. */ /* Setup a clean context for our children to inherit. */
set_task_comm(tsk, "kthreadd"); set_task_comm(tsk, "kthreadd");
ignore_signals(tsk); ignore_signals(tsk);
set_user_nice(tsk, KTHREAD_NICE_LEVEL);
set_cpus_allowed_ptr(tsk, cpu_all_mask); set_cpus_allowed_ptr(tsk, cpu_all_mask);
set_mems_allowed(node_possible_map); set_mems_allowed(node_possible_map);
......
This diff is collapsed.
...@@ -127,21 +127,11 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) ...@@ -127,21 +127,11 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
/* /*
* If the cpu was currently mapped to a different value, we * If the cpu was currently mapped to a different value, we
* first need to unmap the old value * need to map it to the new value then remove the old value.
* Note, we must add the new value first, otherwise we risk the
* cpu being cleared from pri_active, and this cpu could be
* missed for a push or pull.
*/ */
if (likely(oldpri != CPUPRI_INVALID)) {
struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
spin_lock_irqsave(&vec->lock, flags);
vec->count--;
if (!vec->count)
clear_bit(oldpri, cp->pri_active);
cpumask_clear_cpu(cpu, vec->mask);
spin_unlock_irqrestore(&vec->lock, flags);
}
if (likely(newpri != CPUPRI_INVALID)) { if (likely(newpri != CPUPRI_INVALID)) {
struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
...@@ -154,6 +144,18 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) ...@@ -154,6 +144,18 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
spin_unlock_irqrestore(&vec->lock, flags); spin_unlock_irqrestore(&vec->lock, flags);
} }
if (likely(oldpri != CPUPRI_INVALID)) {
struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
spin_lock_irqsave(&vec->lock, flags);
vec->count--;
if (!vec->count)
clear_bit(oldpri, cp->pri_active);
cpumask_clear_cpu(cpu, vec->mask);
spin_unlock_irqrestore(&vec->lock, flags);
}
*currpri = newpri; *currpri = newpri;
} }
......
...@@ -409,6 +409,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) ...@@ -409,6 +409,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
PN(se.wait_max); PN(se.wait_max);
PN(se.wait_sum); PN(se.wait_sum);
P(se.wait_count); P(se.wait_count);
PN(se.iowait_sum);
P(se.iowait_count);
P(sched_info.bkl_count); P(sched_info.bkl_count);
P(se.nr_migrations); P(se.nr_migrations);
P(se.nr_migrations_cold); P(se.nr_migrations_cold);
...@@ -479,6 +481,8 @@ void proc_sched_set_task(struct task_struct *p) ...@@ -479,6 +481,8 @@ void proc_sched_set_task(struct task_struct *p)
p->se.wait_max = 0; p->se.wait_max = 0;
p->se.wait_sum = 0; p->se.wait_sum = 0;
p->se.wait_count = 0; p->se.wait_count = 0;
p->se.iowait_sum = 0;
p->se.iowait_count = 0;
p->se.sleep_max = 0; p->se.sleep_max = 0;
p->se.sum_sleep_runtime = 0; p->se.sum_sleep_runtime = 0;
p->se.block_max = 0; p->se.block_max = 0;
......
...@@ -24,7 +24,7 @@ ...@@ -24,7 +24,7 @@
/* /*
* Targeted preemption latency for CPU-bound tasks: * Targeted preemption latency for CPU-bound tasks:
* (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds) * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds)
* *
* NOTE: this latency value is not the same as the concept of * NOTE: this latency value is not the same as the concept of
* 'timeslice length' - timeslices in CFS are of variable length * 'timeslice length' - timeslices in CFS are of variable length
...@@ -34,13 +34,13 @@ ...@@ -34,13 +34,13 @@
* (to see the precise effective timeslice length of your workload, * (to see the precise effective timeslice length of your workload,
* run vmstat and monitor the context-switches (cs) field) * run vmstat and monitor the context-switches (cs) field)
*/ */
unsigned int sysctl_sched_latency = 20000000ULL; unsigned int sysctl_sched_latency = 5000000ULL;
/* /*
* Minimal preemption granularity for CPU-bound tasks: * Minimal preemption granularity for CPU-bound tasks:
* (default: 4 msec * (1 + ilog(ncpus)), units: nanoseconds) * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/ */
unsigned int sysctl_sched_min_granularity = 4000000ULL; unsigned int sysctl_sched_min_granularity = 1000000ULL;
/* /*
* is kept at sysctl_sched_latency / sysctl_sched_min_granularity * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
...@@ -48,10 +48,10 @@ unsigned int sysctl_sched_min_granularity = 4000000ULL; ...@@ -48,10 +48,10 @@ unsigned int sysctl_sched_min_granularity = 4000000ULL;
static unsigned int sched_nr_latency = 5; static unsigned int sched_nr_latency = 5;
/* /*
* After fork, child runs first. (default) If set to 0 then * After fork, child runs first. If set to 0 (default) then
* parent will (try to) run first. * parent will (try to) run first.
*/ */
const_debug unsigned int sysctl_sched_child_runs_first = 1; unsigned int sysctl_sched_child_runs_first __read_mostly;
/* /*
* sys_sched_yield() compat mode * sys_sched_yield() compat mode
...@@ -63,13 +63,13 @@ unsigned int __read_mostly sysctl_sched_compat_yield; ...@@ -63,13 +63,13 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
/* /*
* SCHED_OTHER wake-up granularity. * SCHED_OTHER wake-up granularity.
* (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds) * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
* *
* This option delays the preemption effects of decoupled workloads * This option delays the preemption effects of decoupled workloads
* and reduces their over-scheduling. Synchronous workloads will still * and reduces their over-scheduling. Synchronous workloads will still
* have immediate wakeup/sleep latencies. * have immediate wakeup/sleep latencies.
*/ */
unsigned int sysctl_sched_wakeup_granularity = 5000000UL; unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
const_debug unsigned int sysctl_sched_migration_cost = 500000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
...@@ -79,11 +79,6 @@ static const struct sched_class fair_sched_class; ...@@ -79,11 +79,6 @@ static const struct sched_class fair_sched_class;
* CFS operations on generic schedulable entities: * CFS operations on generic schedulable entities:
*/ */
static inline struct task_struct *task_of(struct sched_entity *se)
{
return container_of(se, struct task_struct, se);
}
#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED
/* cpu runqueue to which this cfs_rq is attached */ /* cpu runqueue to which this cfs_rq is attached */
...@@ -95,6 +90,14 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) ...@@ -95,6 +90,14 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
/* An entity is a task if it doesn't "own" a runqueue */ /* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se) (!se->my_q) #define entity_is_task(se) (!se->my_q)
/*
 * task_of - return the task_struct whose 'se' member is @se.
 *
 * Group-scheduling variant: when CONFIG_SCHED_DEBUG is set, warn once if
 * @se fails the entity_is_task() test before doing the conversion.
 */
static inline struct task_struct *task_of(struct sched_entity *se)
{
	struct task_struct *owner;

#ifdef CONFIG_SCHED_DEBUG
	WARN_ON_ONCE(!entity_is_task(se));
#endif
	owner = container_of(se, struct task_struct, se);

	return owner;
}
/* Walk up scheduling entities hierarchy */ /* Walk up scheduling entities hierarchy */
#define for_each_sched_entity(se) \ #define for_each_sched_entity(se) \
for (; se; se = se->parent) for (; se; se = se->parent)
...@@ -186,7 +189,12 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) ...@@ -186,7 +189,12 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
} }
} }
#else /* CONFIG_FAIR_GROUP_SCHED */ #else /* !CONFIG_FAIR_GROUP_SCHED */
/*
 * task_of - return the task_struct whose 'se' member is @se.
 *
 * Variant for builds without CONFIG_FAIR_GROUP_SCHED; it performs the
 * conversion with no entity-type check.
 */
static inline struct task_struct *task_of(struct sched_entity *se)
{
	struct task_struct *owner = container_of(se, struct task_struct, se);

	return owner;
}
static inline struct rq *rq_of(struct cfs_rq *cfs_rq) static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{ {
...@@ -537,6 +545,12 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) ...@@ -537,6 +545,12 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
schedstat_set(se->wait_count, se->wait_count + 1); schedstat_set(se->wait_count, se->wait_count + 1);
schedstat_set(se->wait_sum, se->wait_sum + schedstat_set(se->wait_sum, se->wait_sum +
rq_of(cfs_rq)->clock - se->wait_start); rq_of(cfs_rq)->clock - se->wait_start);
#ifdef CONFIG_SCHEDSTATS
if (entity_is_task(se)) {
trace_sched_stat_wait(task_of(se),
rq_of(cfs_rq)->clock - se->wait_start);
}
#endif
schedstat_set(se->wait_start, 0); schedstat_set(se->wait_start, 0);
} }
...@@ -628,8 +642,10 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) ...@@ -628,8 +642,10 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
se->sleep_start = 0; se->sleep_start = 0;
se->sum_sleep_runtime += delta; se->sum_sleep_runtime += delta;
if (tsk) if (tsk) {
account_scheduler_latency(tsk, delta >> 10, 1); account_scheduler_latency(tsk, delta >> 10, 1);
trace_sched_stat_sleep(tsk, delta);
}
} }
if (se->block_start) { if (se->block_start) {
u64 delta = rq_of(cfs_rq)->clock - se->block_start; u64 delta = rq_of(cfs_rq)->clock - se->block_start;
...@@ -644,6 +660,12 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) ...@@ -644,6 +660,12 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
se->sum_sleep_runtime += delta; se->sum_sleep_runtime += delta;
if (tsk) { if (tsk) {
if (tsk->in_iowait) {
se->iowait_sum += delta;
se->iowait_count++;
trace_sched_stat_iowait(tsk, delta);
}
/* /*
* Blocking time is in units of nanosecs, so shift by * Blocking time is in units of nanosecs, so shift by
* 20 to get a milliseconds-range estimation of the * 20 to get a milliseconds-range estimation of the
...@@ -705,11 +727,11 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) ...@@ -705,11 +727,11 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
vruntime -= thresh; vruntime -= thresh;
} }
/* ensure we never gain time by being placed backwards. */
vruntime = max_vruntime(se->vruntime, vruntime);
} }
/* ensure we never gain time by being placed backwards. */
vruntime = max_vruntime(se->vruntime, vruntime);
se->vruntime = vruntime; se->vruntime = vruntime;
} }
...@@ -1046,17 +1068,21 @@ static void yield_task_fair(struct rq *rq) ...@@ -1046,17 +1068,21 @@ static void yield_task_fair(struct rq *rq)
* search starts with cpus closest then further out as needed, * search starts with cpus closest then further out as needed,
* so we always favor a closer, idle cpu. * so we always favor a closer, idle cpu.
* Domains may include CPUs that are not usable for migration, * Domains may include CPUs that are not usable for migration,
* hence we need to mask them out (cpu_active_mask) * hence we need to mask them out (rq->rd->online)
* *
* Returns the CPU we should wake onto. * Returns the CPU we should wake onto.
*/ */
#if defined(ARCH_HAS_SCHED_WAKE_IDLE) #if defined(ARCH_HAS_SCHED_WAKE_IDLE)
/*
 * Test whether @cpu is set in the ->rd->online mask reached through @rq.
 *
 * Both arguments are parenthesized in the expansion so that expression
 * arguments (e.g. "base + 1") bind as the caller intended; each argument
 * is still evaluated exactly once.
 */
#define cpu_rd_active(cpu, rq)	cpumask_test_cpu((cpu), (rq)->rd->online)
static int wake_idle(int cpu, struct task_struct *p) static int wake_idle(int cpu, struct task_struct *p)
{ {
struct sched_domain *sd; struct sched_domain *sd;
int i; int i;
unsigned int chosen_wakeup_cpu; unsigned int chosen_wakeup_cpu;
int this_cpu; int this_cpu;
struct rq *task_rq = task_rq(p);
/* /*
* At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
...@@ -1089,10 +1115,10 @@ static int wake_idle(int cpu, struct task_struct *p) ...@@ -1089,10 +1115,10 @@ static int wake_idle(int cpu, struct task_struct *p)
for_each_domain(cpu, sd) { for_each_domain(cpu, sd) {
if ((sd->flags & SD_WAKE_IDLE) if ((sd->flags & SD_WAKE_IDLE)
|| ((sd->flags & SD_WAKE_IDLE_FAR) || ((sd->flags & SD_WAKE_IDLE_FAR)
&& !task_hot(p, task_rq(p)->clock, sd))) { && !task_hot(p, task_rq->clock, sd))) {
for_each_cpu_and(i, sched_domain_span(sd), for_each_cpu_and(i, sched_domain_span(sd),
&p->cpus_allowed) { &p->cpus_allowed) {
if (cpu_active(i) && idle_cpu(i)) { if (cpu_rd_active(i, task_rq) && idle_cpu(i)) {
if (i != task_cpu(p)) { if (i != task_cpu(p)) {
schedstat_inc(p, schedstat_inc(p,
se.nr_wakeups_idle); se.nr_wakeups_idle);
...@@ -1235,7 +1261,17 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, ...@@ -1235,7 +1261,17 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
tg = task_group(p); tg = task_group(p);
weight = p->se.load.weight; weight = p->se.load.weight;
balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <= /*
* In low-load situations, where prev_cpu is idle and this_cpu is idle
* due to the sync cause above having dropped tl to 0, we'll always have
* an imbalance, but there's really nothing you can do about that, so
* that's good too.
*
* Otherwise check whether the two cpus are near enough in load to allow
* this task to be woken on this_cpu.
*/
balanced = !tl ||
100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
/* /*
...@@ -1278,8 +1314,6 @@ static int select_task_rq_fair(struct task_struct *p, int sync) ...@@ -1278,8 +1314,6 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
this_rq = cpu_rq(this_cpu); this_rq = cpu_rq(this_cpu);
new_cpu = prev_cpu; new_cpu = prev_cpu;
if (prev_cpu == this_cpu)
goto out;
/* /*
* 'this_sd' is the first domain that both * 'this_sd' is the first domain that both
* this_cpu and prev_cpu are present in: * this_cpu and prev_cpu are present in:
...@@ -1721,6 +1755,8 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) ...@@ -1721,6 +1755,8 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
sched_info_queued(p); sched_info_queued(p);
update_curr(cfs_rq); update_curr(cfs_rq);
if (curr)
se->vruntime = curr->vruntime;
place_entity(cfs_rq, se, 1); place_entity(cfs_rq, se, 1);
/* 'curr' will be NULL if the child belongs to a different group */ /* 'curr' will be NULL if the child belongs to a different group */
......
SCHED_FEAT(NEW_FAIR_SLEEPERS, 1) SCHED_FEAT(NEW_FAIR_SLEEPERS, 0)
SCHED_FEAT(NORMALIZED_SLEEPER, 0) SCHED_FEAT(NORMALIZED_SLEEPER, 0)
SCHED_FEAT(ADAPTIVE_GRAN, 1) SCHED_FEAT(ADAPTIVE_GRAN, 1)
SCHED_FEAT(WAKEUP_PREEMPT, 1) SCHED_FEAT(WAKEUP_PREEMPT, 1)
......
...@@ -3,15 +3,18 @@ ...@@ -3,15 +3,18 @@
* policies) * policies)
*/ */
#ifdef CONFIG_RT_GROUP_SCHED
#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
{ {
#ifdef CONFIG_SCHED_DEBUG
WARN_ON_ONCE(!rt_entity_is_task(rt_se));
#endif
return container_of(rt_se, struct task_struct, rt); return container_of(rt_se, struct task_struct, rt);
} }
#ifdef CONFIG_RT_GROUP_SCHED
#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{ {
return rt_rq->rq; return rt_rq->rq;
...@@ -26,6 +29,11 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) ...@@ -26,6 +29,11 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
#define rt_entity_is_task(rt_se) (1) #define rt_entity_is_task(rt_se) (1)
/*
 * Convert an RT scheduling entity back to the task_struct that embeds it
 * as its 'rt' member. No rt_entity_is_task() check is made in this
 * variant.
 */
static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
{
	struct task_struct *owner = container_of(rt_se, struct task_struct, rt);

	return owner;
}
static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{ {
return container_of(rt_rq, struct rq, rt); return container_of(rt_rq, struct rq, rt);
...@@ -128,6 +136,11 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) ...@@ -128,6 +136,11 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
} }
/*
 * Report whether the runqueue's RT pushable_tasks plist holds any entry.
 *
 * Returns 1 when the list is non-empty, 0 when it is empty.
 */
static inline int has_pushable_tasks(struct rq *rq)
{
	if (plist_head_empty(&rq->rt.pushable_tasks))
		return 0;

	return 1;
}
#else #else
static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p) static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
...@@ -602,6 +615,8 @@ static void update_curr_rt(struct rq *rq) ...@@ -602,6 +615,8 @@ static void update_curr_rt(struct rq *rq)
curr->se.exec_start = rq->clock; curr->se.exec_start = rq->clock;
cpuacct_charge(curr, delta_exec); cpuacct_charge(curr, delta_exec);
sched_rt_avg_update(rq, delta_exec);
if (!rt_bandwidth_enabled()) if (!rt_bandwidth_enabled())
return; return;
...@@ -874,8 +889,6 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) ...@@ -874,8 +889,6 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p); enqueue_pushable_task(rq, p);
inc_cpu_load(rq, p->se.load.weight);
} }
static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
...@@ -886,8 +899,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) ...@@ -886,8 +899,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
dequeue_rt_entity(rt_se); dequeue_rt_entity(rt_se);
dequeue_pushable_task(rq, p); dequeue_pushable_task(rq, p);
dec_cpu_load(rq, p->se.load.weight);
} }
/* /*
...@@ -1064,6 +1075,14 @@ static struct task_struct *pick_next_task_rt(struct rq *rq) ...@@ -1064,6 +1075,14 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
if (p) if (p)
dequeue_pushable_task(rq, p); dequeue_pushable_task(rq, p);
#ifdef CONFIG_SMP
/*
* We detect this state here so that we can avoid taking the RQ
* lock again later if there is no need to push
*/
rq->post_schedule = has_pushable_tasks(rq);
#endif
return p; return p;
} }
...@@ -1161,13 +1180,6 @@ static int find_lowest_rq(struct task_struct *task) ...@@ -1161,13 +1180,6 @@ static int find_lowest_rq(struct task_struct *task)
if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
return -1; /* No targets found */ return -1; /* No targets found */
/*
* Only consider CPUs that are usable for migration.
* I guess we might want to change cpupri_find() to ignore those
* in the first place.
*/
cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);
/* /*
* At this point we have built a mask of cpus representing the * At this point we have built a mask of cpus representing the
* lowest priority tasks in the system. Now we want to elect * lowest priority tasks in the system. Now we want to elect
...@@ -1262,11 +1274,6 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) ...@@ -1262,11 +1274,6 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
return lowest_rq; return lowest_rq;
} }
/*
 * Report whether the runqueue's RT pushable_tasks plist holds any entry.
 *
 * Returns 1 when the list is non-empty, 0 when it is empty.
 */
static inline int has_pushable_tasks(struct rq *rq)
{
	if (plist_head_empty(&rq->rt.pushable_tasks))
		return 0;

	return 1;
}
static struct task_struct *pick_next_pushable_task(struct rq *rq) static struct task_struct *pick_next_pushable_task(struct rq *rq)
{ {
struct task_struct *p; struct task_struct *p;
...@@ -1466,23 +1473,9 @@ static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) ...@@ -1466,23 +1473,9 @@ static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
pull_rt_task(rq); pull_rt_task(rq);
} }
/*
 * Decide whether post_schedule_rt() needs to run for this runqueue: the
 * answer is simply whatever has_pushable_tasks() reports for it.
 *
 * Assumes rq->lock is held.
 */
static int needs_post_schedule_rt(struct rq *rq)
{
	int pending = has_pushable_tasks(rq);

	return pending;
}
static void post_schedule_rt(struct rq *rq) static void post_schedule_rt(struct rq *rq)
{ {
/*
* This is only called if needs_post_schedule_rt() indicates that
* we need to push tasks away
*/
spin_lock_irq(&rq->lock);
push_rt_tasks(rq); push_rt_tasks(rq);
spin_unlock_irq(&rq->lock);
} }
/* /*
...@@ -1758,7 +1751,6 @@ static const struct sched_class rt_sched_class = { ...@@ -1758,7 +1751,6 @@ static const struct sched_class rt_sched_class = {
.rq_online = rq_online_rt, .rq_online = rq_online_rt,
.rq_offline = rq_offline_rt, .rq_offline = rq_offline_rt,
.pre_schedule = pre_schedule_rt, .pre_schedule = pre_schedule_rt,
.needs_post_schedule = needs_post_schedule_rt,
.post_schedule = post_schedule_rt, .post_schedule = post_schedule_rt,
.task_wake_up = task_wake_up_rt, .task_wake_up = task_wake_up_rt,
.switched_from = switched_from_rt, .switched_from = switched_from_rt,
......
...@@ -245,6 +245,14 @@ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ ...@@ -245,6 +245,14 @@ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
#endif #endif
static struct ctl_table kern_table[] = { static struct ctl_table kern_table[] = {
{
.ctl_name = CTL_UNNUMBERED,
.procname = "sched_child_runs_first",
.data = &sysctl_sched_child_runs_first,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
#ifdef CONFIG_SCHED_DEBUG #ifdef CONFIG_SCHED_DEBUG
{ {
.ctl_name = CTL_UNNUMBERED, .ctl_name = CTL_UNNUMBERED,
...@@ -297,14 +305,6 @@ static struct ctl_table kern_table[] = { ...@@ -297,14 +305,6 @@ static struct ctl_table kern_table[] = {
.strategy = &sysctl_intvec, .strategy = &sysctl_intvec,
.extra1 = &zero, .extra1 = &zero,
}, },
{
.ctl_name = CTL_UNNUMBERED,
.procname = "sched_child_runs_first",
.data = &sysctl_sched_child_runs_first,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
{ {
.ctl_name = CTL_UNNUMBERED, .ctl_name = CTL_UNNUMBERED,
.procname = "sched_features", .procname = "sched_features",
...@@ -329,6 +329,14 @@ static struct ctl_table kern_table[] = { ...@@ -329,6 +329,14 @@ static struct ctl_table kern_table[] = {
.mode = 0644, .mode = 0644,
.proc_handler = &proc_dointvec, .proc_handler = &proc_dointvec,
}, },
{
.ctl_name = CTL_UNNUMBERED,
.procname = "sched_time_avg",
.data = &sysctl_sched_time_avg,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
{ {
.ctl_name = CTL_UNNUMBERED, .ctl_name = CTL_UNNUMBERED,
.procname = "timer_migration", .procname = "timer_migration",
......
...@@ -317,8 +317,6 @@ static int worker_thread(void *__cwq) ...@@ -317,8 +317,6 @@ static int worker_thread(void *__cwq)
if (cwq->wq->freezeable) if (cwq->wq->freezeable)
set_freezable(); set_freezable();
set_user_nice(current, -5);
for (;;) { for (;;) {
prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE);
if (!freezing(current) && if (!freezing(current) &&
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment