Commit b5869ce7 authored by Linus Torvalds

Merge git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched

* git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched: (140 commits)
  sched: sync wakeups preempt too
  sched: affine sync wakeups
  sched: guest CPU accounting: maintain guest state in KVM
  sched: guest CPU accounting: maintain stats in account_system_time()
  sched: guest CPU accounting: add guest-CPU /proc/<pid>/stat fields
  sched: guest CPU accounting: add guest-CPU /proc/stat field
  sched: domain sysctl fixes: add terminator comment
  sched: domain sysctl fixes: do not crash on allocation failure
  sched: domain sysctl fixes: unregister the sysctl table before domains
  sched: domain sysctl fixes: use for_each_online_cpu()
  sched: domain sysctl fixes: use kcalloc()
  Make scheduler debug file operations const
  sched: enable wake-idle on CONFIG_SCHED_MC=y
  sched: reintroduce topology.h tunings
  sched: allow the immediate migration of cache-cold tasks
  sched: debug, improve migration statistics
  sched: debug: increase width of debug line
  sched: activate task_hot() only on fair-scheduled tasks
  sched: reintroduce cache-hot affinity
  sched: speed up context-switches a bit
  ...
parents df3d80f5 9c63d9c0
......@@ -117,3 +117,70 @@ Some implementation details:
iterators of the scheduling modules are used. The balancing code got
quite a bit simpler as a result.
Group scheduler extension to CFS
================================
Normally the scheduler operates on individual tasks and strives to provide
fair CPU time to each task. Sometimes, it may be desirable to group tasks
and provide fair CPU time to each such task group. For example, it may
be desirable to first provide fair CPU time to each user on the system
and then to each task belonging to a user.
CONFIG_FAIR_GROUP_SCHED strives to achieve exactly that. It lets
SCHED_NORMAL/BATCH tasks be grouped and divides CPU time fairly among such
groups. At present, there are two (mutually exclusive) mechanisms to group
tasks for CPU bandwidth control purposes:
- Based on user id (CONFIG_FAIR_USER_SCHED)
In this option, tasks are grouped according to their user id.
- Based on "cgroup" pseudo filesystem (CONFIG_FAIR_CGROUP_SCHED)
This option lets the administrator create arbitrary groups
of tasks, using the "cgroup" pseudo filesystem. See
Documentation/cgroups.txt for more information about this
filesystem.
Only one of these grouping options can be chosen; they are mutually
exclusive. Which one a kernel was built with can be checked as shown below.
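A quick, hedged way to check the build-time choice, assuming the kernel
config was installed under /boot (the path and the sample output are
illustrative only):
# grep -E 'FAIR_(GROUP|USER|CGROUP)_SCHED' /boot/config-$(uname -r)
CONFIG_FAIR_GROUP_SCHED=y
CONFIG_FAIR_USER_SCHED=y
# CONFIG_FAIR_CGROUP_SCHED is not set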
Group scheduler tunables:
When CONFIG_FAIR_USER_SCHED is defined, a directory is created in sysfs for
each new user and a "cpu_share" file is added in that directory.
# cd /sys/kernel/uids
# cat 512/cpu_share # Display user 512's CPU share
1024
# echo 2048 > 512/cpu_share # Modify user 512's CPU share
# cat 512/cpu_share # Display user 512's CPU share
2048
#
CPU bandwidth between two users is divided in the ratio of their CPU shares.
For example, if you would like user "root" to get twice the bandwidth of user
"guest", then set the cpu_share for both users such that "root"'s
cpu_share is twice "guest"'s cpu_share, as in the sketch below.
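A minimal sketch of that 2:1 split; root's directory is 0 (uid 0), while
uid 1001 for "guest" is only an assumption for illustration:
# cd /sys/kernel/uids
# echo 2048 > 0/cpu_share       # weight for root (uid 0)
# echo 1024 > 1001/cpu_share    # weight for "guest" (uid 1001, hypothetical)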
When CONFIG_FAIR_CGROUP_SCHED is defined, a "cpu.shares" file is created
for each group created using the pseudo filesystem. The example steps
below create task groups, modify their CPU share using the "cgroup"
pseudo filesystem, and then verify the result.
# mkdir /dev/cpuctl
# mount -t cgroup -ocpu none /dev/cpuctl
# cd /dev/cpuctl
# mkdir multimedia # create "multimedia" group of tasks
# mkdir browser # create "browser" group of tasks
# #Configure the multimedia group to receive twice the CPU bandwidth
# #of the browser group
# echo 2048 > multimedia/cpu.shares
# echo 1024 > browser/cpu.shares
# firefox & # Launch firefox and move it to "browser" group
# echo <firefox_pid> > browser/tasks
# #Launch gmplayer (or your favourite movie player)
# echo <movie_player_pid> > multimedia/tasks
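To verify the setup above (a quick check; the listed pids will of course
differ per system):
# cat multimedia/cpu.shares browser/cpu.shares
2048
1024
# cat browser/tasks             # pids currently in the "browser" group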
......@@ -214,6 +214,17 @@ config X86_ES7000
endchoice
config SCHED_NO_NO_OMIT_FRAME_POINTER
bool "Single-depth WCHAN output"
default y
help
Calculate simpler /proc/<PID>/wchan values. If this option
is disabled then wchan values will recurse back to the
caller function. This provides more accurate wchan values,
at the expense of slightly more scheduling overhead.
If in doubt, say "Y".
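For reference, the value this option controls is the per-task wchan
exported through procfs; a sleeping task normally reports the name of the
kernel function it is blocked in (the file prints no trailing newline,
hence the extra echo):
# cat /proc/1/wchan; echo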
config PARAVIRT
bool "Paravirtualization support (EXPERIMENTAL)"
depends on EXPERIMENTAL
......
......@@ -624,6 +624,16 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu);
int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run);
static inline void kvm_guest_enter(void)
{
current->flags |= PF_VCPU;
}
static inline void kvm_guest_exit(void)
{
current->flags &= ~PF_VCPU;
}
static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
u32 error_code)
{
......
......@@ -2046,6 +2046,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
vcpu->guest_mode = 1;
kvm_guest_enter();
if (vcpu->requests)
if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests))
......@@ -2053,6 +2054,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
kvm_x86_ops->run(vcpu, kvm_run);
kvm_guest_exit();
vcpu->guest_mode = 0;
local_irq_enable();
......
......@@ -45,8 +45,7 @@ void pipe_wait(struct pipe_inode_info *pipe)
* Pipes are system-local resources, so sleeping on them
* is considered a noninteractive wait:
*/
prepare_to_wait(&pipe->wait, &wait,
TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE);
prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
if (pipe->inode)
mutex_unlock(&pipe->inode->i_mutex);
schedule();
......@@ -383,7 +382,7 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov,
/* Signal writers asynchronously that there is more room. */
if (do_wakeup) {
wake_up_interruptible(&pipe->wait);
wake_up_interruptible_sync(&pipe->wait);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
if (ret > 0)
......@@ -556,7 +555,7 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov,
out:
mutex_unlock(&inode->i_mutex);
if (do_wakeup) {
wake_up_interruptible(&pipe->wait);
wake_up_interruptible_sync(&pipe->wait);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}
if (ret > 0)
......@@ -650,7 +649,7 @@ pipe_release(struct inode *inode, int decr, int decw)
if (!pipe->readers && !pipe->writers) {
free_pipe_info(inode);
} else {
wake_up_interruptible(&pipe->wait);
wake_up_interruptible_sync(&pipe->wait);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
......
......@@ -370,6 +370,11 @@ static cputime_t task_stime(struct task_struct *p)
}
#endif
static cputime_t task_gtime(struct task_struct *p)
{
return p->gtime;
}
static int do_task_stat(struct task_struct *task, char *buffer, int whole)
{
unsigned long vsize, eip, esp, wchan = ~0UL;
......@@ -385,6 +390,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
unsigned long cmin_flt = 0, cmaj_flt = 0;
unsigned long min_flt = 0, maj_flt = 0;
cputime_t cutime, cstime, utime, stime;
cputime_t cgtime, gtime;
unsigned long rsslim = 0;
char tcomm[sizeof(task->comm)];
unsigned long flags;
......@@ -403,6 +409,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
sigemptyset(&sigign);
sigemptyset(&sigcatch);
cutime = cstime = utime = stime = cputime_zero;
cgtime = gtime = cputime_zero;
rcu_read_lock();
if (lock_task_sighand(task, &flags)) {
......@@ -420,6 +427,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
cmaj_flt = sig->cmaj_flt;
cutime = sig->cutime;
cstime = sig->cstime;
cgtime = sig->cgtime;
rsslim = sig->rlim[RLIMIT_RSS].rlim_cur;
/* add up live thread stats at the group level */
......@@ -430,6 +438,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
maj_flt += t->maj_flt;
utime = cputime_add(utime, task_utime(t));
stime = cputime_add(stime, task_stime(t));
gtime = cputime_add(gtime, task_gtime(t));
t = next_thread(t);
} while (t != task);
......@@ -437,6 +446,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
maj_flt += sig->maj_flt;
utime = cputime_add(utime, sig->utime);
stime = cputime_add(stime, sig->stime);
gtime = cputime_add(gtime, sig->gtime);
}
sid = signal_session(sig);
......@@ -454,6 +464,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
maj_flt = task->maj_flt;
utime = task_utime(task);
stime = task_stime(task);
gtime = task_gtime(task);
}
/* scale priority and nice values from timeslices to -20..20 */
......@@ -471,7 +482,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
res = sprintf(buffer, "%d (%s) %c %d %d %d %d %d %u %lu \
%lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \
%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu\n",
%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n",
task->pid,
tcomm,
state,
......@@ -516,7 +527,9 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
task_cpu(task),
task->rt_priority,
task->policy,
(unsigned long long)delayacct_blkio_ticks(task));
(unsigned long long)delayacct_blkio_ticks(task),
cputime_to_clock_t(gtime),
cputime_to_clock_t(cgtime));
if (mm)
mmput(mm);
return res;
......
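With the two fields appended above, per-task guest time and cumulative
children guest time become the 43rd and 44th values in /proc/<pid>/stat,
expressed in clock ticks. A rough way to read them back (hedged: a comm
field containing spaces shifts whitespace-based field counts, and both
values stay zero for a task that has never run guest/KVM vcpu code):
# awk '{ print "guest:", $43, "cguest:", $44 }' /proc/self/stat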
......@@ -304,7 +304,7 @@ static int proc_pid_schedstat(struct task_struct *task, char *buffer)
return sprintf(buffer, "%llu %llu %lu\n",
task->sched_info.cpu_time,
task->sched_info.run_delay,
task->sched_info.pcnt);
task->sched_info.pcount);
}
#endif
......
......@@ -443,6 +443,7 @@ static int show_stat(struct seq_file *p, void *v)
int i;
unsigned long jif;
cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
cputime64_t guest;
u64 sum = 0;
struct timespec boottime;
unsigned int *per_irq_sum;
......@@ -453,6 +454,7 @@ static int show_stat(struct seq_file *p, void *v)
user = nice = system = idle = iowait =
irq = softirq = steal = cputime64_zero;
guest = cputime64_zero;
getboottime(&boottime);
jif = boottime.tv_sec;
......@@ -467,6 +469,7 @@ static int show_stat(struct seq_file *p, void *v)
irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq);
softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
for (j = 0; j < NR_IRQS; j++) {
unsigned int temp = kstat_cpu(i).irqs[j];
sum += temp;
......@@ -474,7 +477,7 @@ static int show_stat(struct seq_file *p, void *v)
}
}
seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu\n",
seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
(unsigned long long)cputime64_to_clock_t(user),
(unsigned long long)cputime64_to_clock_t(nice),
(unsigned long long)cputime64_to_clock_t(system),
......@@ -482,7 +485,8 @@ static int show_stat(struct seq_file *p, void *v)
(unsigned long long)cputime64_to_clock_t(iowait),
(unsigned long long)cputime64_to_clock_t(irq),
(unsigned long long)cputime64_to_clock_t(softirq),
(unsigned long long)cputime64_to_clock_t(steal));
(unsigned long long)cputime64_to_clock_t(steal),
(unsigned long long)cputime64_to_clock_t(guest));
for_each_online_cpu(i) {
/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
......@@ -494,7 +498,9 @@ static int show_stat(struct seq_file *p, void *v)
irq = kstat_cpu(i).cpustat.irq;
softirq = kstat_cpu(i).cpustat.softirq;
steal = kstat_cpu(i).cpustat.steal;
seq_printf(p, "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu\n",
guest = kstat_cpu(i).cpustat.guest;
seq_printf(p,
"cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
i,
(unsigned long long)cputime64_to_clock_t(user),
(unsigned long long)cputime64_to_clock_t(nice),
......@@ -503,7 +509,8 @@ static int show_stat(struct seq_file *p, void *v)
(unsigned long long)cputime64_to_clock_t(iowait),
(unsigned long long)cputime64_to_clock_t(irq),
(unsigned long long)cputime64_to_clock_t(softirq),
(unsigned long long)cputime64_to_clock_t(steal));
(unsigned long long)cputime64_to_clock_t(steal),
(unsigned long long)cputime64_to_clock_t(guest));
}
seq_printf(p, "intr %llu", (unsigned long long)sum);
......
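Likewise, the system-wide guest time accumulated above appears as a ninth
value on the "cpu" lines of /proc/stat, right after steal. A quick way to
read the aggregate figure, in USER_HZ ticks:
# awk '/^cpu /{ print "guest ticks:", $10 }' /proc/stat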
......@@ -23,6 +23,7 @@ struct cpu_usage_stat {
cputime64_t idle;
cputime64_t iowait;
cputime64_t steal;
cputime64_t guest;
};
struct kernel_stat {
......
......@@ -87,6 +87,7 @@ struct sched_param {
#include <linux/timer.h>
#include <linux/hrtimer.h>
#include <linux/task_io_accounting.h>
#include <linux/kobject.h>
#include <asm/processor.h>
......@@ -136,6 +137,7 @@ extern unsigned long weighted_cpuload(const int cpu);
struct seq_file;
struct cfs_rq;
struct task_group;
#ifdef CONFIG_SCHED_DEBUG
extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m);
extern void proc_sched_set_task(struct task_struct *p);
......@@ -174,8 +176,7 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
#define EXIT_ZOMBIE 16
#define EXIT_DEAD 32
/* in tsk->state again */
#define TASK_NONINTERACTIVE 64
#define TASK_DEAD 128
#define TASK_DEAD 64
#define __set_task_state(tsk, state_value) \
do { (tsk)->state = (state_value); } while (0)
......@@ -516,6 +517,8 @@ struct signal_struct {
* in __exit_signal, except for the group leader.
*/
cputime_t utime, stime, cutime, cstime;
cputime_t gtime;
cputime_t cgtime;
unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
unsigned long inblock, oublock, cinblock, coublock;
......@@ -596,8 +599,21 @@ struct user_struct {
/* Hash table maintenance information */
struct hlist_node uidhash_node;
uid_t uid;
#ifdef CONFIG_FAIR_USER_SCHED
struct task_group *tg;
struct kset kset;
struct subsys_attribute user_attr;
struct work_struct work;
#endif
};
#ifdef CONFIG_FAIR_USER_SCHED
extern int uids_kobject_init(void);
#else
static inline int uids_kobject_init(void) { return 0; }
#endif
extern struct user_struct *find_user(uid_t);
extern struct user_struct root_user;
......@@ -609,13 +625,17 @@ struct reclaim_state;
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
struct sched_info {
/* cumulative counters */
unsigned long pcnt; /* # of times run on this cpu */
unsigned long pcount; /* # of times run on this cpu */
unsigned long long cpu_time, /* time spent on the cpu */
run_delay; /* time spent waiting on a runqueue */
/* timestamps */
unsigned long long last_arrival,/* when we last ran on a cpu */
last_queued; /* when we were last queued to run */
#ifdef CONFIG_SCHEDSTATS
/* BKL stats */
unsigned long bkl_count;
#endif
};
#endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */
......@@ -750,7 +770,7 @@ struct sched_domain {
#ifdef CONFIG_SCHEDSTATS
/* load_balance() stats */
unsigned long lb_cnt[CPU_MAX_IDLE_TYPES];
unsigned long lb_count[CPU_MAX_IDLE_TYPES];
unsigned long lb_failed[CPU_MAX_IDLE_TYPES];
unsigned long lb_balanced[CPU_MAX_IDLE_TYPES];
unsigned long lb_imbalance[CPU_MAX_IDLE_TYPES];
......@@ -760,17 +780,17 @@ struct sched_domain {
unsigned long lb_nobusyq[CPU_MAX_IDLE_TYPES];
/* Active load balancing */
unsigned long alb_cnt;
unsigned long alb_count;
unsigned long alb_failed;
unsigned long alb_pushed;
/* SD_BALANCE_EXEC stats */
unsigned long sbe_cnt;
unsigned long sbe_count;
unsigned long sbe_balanced;
unsigned long sbe_pushed;
/* SD_BALANCE_FORK stats */
unsigned long sbf_cnt;
unsigned long sbf_count;
unsigned long sbf_balanced;
unsigned long sbf_pushed;
......@@ -854,11 +874,11 @@ struct rq;
struct sched_domain;
struct sched_class {
struct sched_class *next;
const struct sched_class *next;
void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup);
void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep);
void (*yield_task) (struct rq *rq, struct task_struct *p);
void (*yield_task) (struct rq *rq);
void (*check_preempt_curr) (struct rq *rq, struct task_struct *p);
......@@ -888,31 +908,22 @@ struct load_weight {
* 4 se->block_start
* 4 se->run_node
* 4 se->sleep_start
* 4 se->sleep_start_fair
* 6 se->load.weight
* 7 se->delta_fair
* 15 se->wait_runtime
*/
struct sched_entity {
long wait_runtime;
unsigned long delta_fair_run;
unsigned long delta_fair_sleep;
unsigned long delta_exec;
s64 fair_key;
struct load_weight load; /* for load-balancing */
struct rb_node run_node;
unsigned int on_rq;
int peer_preempt;
u64 exec_start;
u64 sum_exec_runtime;
u64 vruntime;
u64 prev_sum_exec_runtime;
u64 wait_start_fair;
u64 sleep_start_fair;
#ifdef CONFIG_SCHEDSTATS
u64 wait_start;
u64 wait_max;
s64 sum_wait_runtime;
u64 sleep_start;
u64 sleep_max;
......@@ -921,9 +932,25 @@ struct sched_entity {
u64 block_start;
u64 block_max;
u64 exec_max;
unsigned long wait_runtime_overruns;
unsigned long wait_runtime_underruns;
u64 slice_max;
u64 nr_migrations;
u64 nr_migrations_cold;
u64 nr_failed_migrations_affine;
u64 nr_failed_migrations_running;
u64 nr_failed_migrations_hot;
u64 nr_forced_migrations;
u64 nr_forced2_migrations;
u64 nr_wakeups;
u64 nr_wakeups_sync;
u64 nr_wakeups_migrate;
u64 nr_wakeups_local;
u64 nr_wakeups_remote;
u64 nr_wakeups_affine;
u64 nr_wakeups_affine_attempts;
u64 nr_wakeups_passive;
u64 nr_wakeups_idle;
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
......@@ -952,7 +979,7 @@ struct task_struct {
int prio, static_prio, normal_prio;
struct list_head run_list;
struct sched_class *sched_class;
const struct sched_class *sched_class;
struct sched_entity se;
#ifdef CONFIG_PREEMPT_NOTIFIERS
......@@ -1023,6 +1050,7 @@ struct task_struct {
unsigned int rt_priority;
cputime_t utime, stime;
cputime_t gtime;
unsigned long nvcsw, nivcsw; /* context switch counts */
struct timespec start_time; /* monotonic time */
struct timespec real_start_time; /* boot based time */
......@@ -1314,6 +1342,7 @@ static inline void put_task_struct(struct task_struct *t)
#define PF_STARTING 0x00000002 /* being created */
#define PF_EXITING 0x00000004 /* getting shut down */
#define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */
#define PF_VCPU 0x00000010 /* I'm a virtual CPU */
#define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */
#define PF_SUPERPRIV 0x00000100 /* used super-user privileges */
#define PF_DUMPCORE 0x00000200 /* dumped core */
......@@ -1401,15 +1430,17 @@ static inline void idle_task_exit(void) {}
extern void sched_idle_next(void);
#ifdef CONFIG_SCHED_DEBUG
extern unsigned int sysctl_sched_latency;
extern unsigned int sysctl_sched_min_granularity;
extern unsigned int sysctl_sched_nr_latency;
extern unsigned int sysctl_sched_wakeup_granularity;
extern unsigned int sysctl_sched_batch_wakeup_granularity;
extern unsigned int sysctl_sched_stat_granularity;
extern unsigned int sysctl_sched_runtime_limit;
extern unsigned int sysctl_sched_compat_yield;
extern unsigned int sysctl_sched_child_runs_first;
extern unsigned int sysctl_sched_features;
extern unsigned int sysctl_sched_migration_cost;
#endif
extern unsigned int sysctl_sched_compat_yield;
#ifdef CONFIG_RT_MUTEXES
extern int rt_mutex_getprio(struct task_struct *p);
......@@ -1843,6 +1874,18 @@ extern int sched_mc_power_savings, sched_smt_power_savings;
extern void normalize_rt_tasks(void);
#ifdef CONFIG_FAIR_GROUP_SCHED
extern struct task_group init_task_group;
extern struct task_group *sched_create_group(void);
extern void sched_destroy_group(struct task_group *tg);
extern void sched_move_task(struct task_struct *tsk);
extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
extern unsigned long sched_group_shares(struct task_group *tg);
#endif
#ifdef CONFIG_TASK_XACCT
static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
{
......
......@@ -159,15 +159,14 @@
.imbalance_pct = 125, \
.cache_nice_tries = 1, \
.busy_idx = 2, \
.idle_idx = 0, \
.newidle_idx = 0, \
.idle_idx = 1, \
.newidle_idx = 2, \
.wake_idx = 1, \
.forkexec_idx = 1, \
.flags = SD_LOAD_BALANCE \
| SD_BALANCE_NEWIDLE \
| SD_BALANCE_EXEC \
| SD_WAKE_AFFINE \
| SD_WAKE_IDLE \
| BALANCE_FOR_PKG_POWER,\
.last_balance = jiffies, \
.balance_interval = 1, \
......
......@@ -281,6 +281,27 @@ config CPUSETS
Say N if unsure.
config FAIR_GROUP_SCHED
bool "Fair group CPU scheduler"
default y
depends on EXPERIMENTAL
help
This feature lets the CPU scheduler recognize task groups and control CPU
bandwidth allocation to such task groups.
choice
depends on FAIR_GROUP_SCHED
prompt "Basis for grouping tasks"
default FAIR_USER_SCHED
config FAIR_USER_SCHED
bool "user id"
help
This option will choose userid as the basis for grouping
tasks, thus providing equal CPU bandwidth to each user.
endchoice
config SYSFS_DEPRECATED
bool "Create deprecated sysfs files"
default y
......
......@@ -119,7 +119,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
* No locking available for sched_info (and too expensive to add one)
* Mitigate by taking snapshot of values
*/
t1 = tsk->sched_info.pcnt;
t1 = tsk->sched_info.pcount;
t2 = tsk->sched_info.run_delay;
t3 = tsk->sched_info.cpu_time;
......
......@@ -111,6 +111,7 @@ static void __exit_signal(struct task_struct *tsk)
*/
sig->utime = cputime_add(sig->utime, tsk->utime);
sig->stime = cputime_add(sig->stime, tsk->stime);
sig->gtime = cputime_add(sig->gtime, tsk->gtime);
sig->min_flt += tsk->min_flt;
sig->maj_flt += tsk->maj_flt;
sig->nvcsw += tsk->nvcsw;
......@@ -1242,6 +1243,11 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
cputime_add(p->stime,
cputime_add(sig->stime,
sig->cstime)));
psig->cgtime =
cputime_add(psig->cgtime,
cputime_add(p->gtime,
cputime_add(sig->gtime,
sig->cgtime)));
psig->cmin_flt +=
p->min_flt + sig->min_flt + sig->cmin_flt;
psig->cmaj_flt +=
......
......@@ -877,6 +877,8 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
sig->tty_old_pgrp = NULL;
sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
sig->gtime = cputime_zero;
sig->cgtime = cputime_zero;
sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
......@@ -1045,6 +1047,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->utime = cputime_zero;
p->stime = cputime_zero;
p->gtime = cputime_zero;
#ifdef CONFIG_TASK_XACCT
p->rchar = 0; /* I/O counter: bytes read */
......
......@@ -14,6 +14,7 @@
#include <linux/module.h>
#include <linux/init.h>
#include <linux/kexec.h>
#include <linux/sched.h>
#define KERNEL_ATTR_RO(_name) \
static struct subsys_attribute _name##_attr = __ATTR_RO(_name)
......@@ -116,6 +117,13 @@ static int __init ksysfs_init(void)
&notes_attr);
}
/*
* Create "/sys/kernel/uids" directory and corresponding root user's
* directory under it.
*/
if (!error)
error = uids_kobject_init();
return error;
}
......
......@@ -50,10 +50,15 @@ static void task_tick_idle(struct rq *rq, struct task_struct *curr)
{
}
static void set_curr_task_idle(struct rq *rq)
{
}
/*
* Simple, special scheduling class for the per-CPU idle tasks:
*/
static struct sched_class idle_sched_class __read_mostly = {
const struct sched_class idle_sched_class = {
/* .next is NULL */
/* no enqueue/yield_task for idle tasks */
/* dequeue is not valid, we print a debug message there: */
......@@ -66,6 +71,7 @@ static struct sched_class idle_sched_class __read_mostly = {
.load_balance = load_balance_idle,
.set_curr_task = set_curr_task_idle,
.task_tick = task_tick_idle,
/* no .task_new for idle tasks */
};
......@@ -7,7 +7,7 @@
* Update the current task's runtime statistics. Skip current tasks that
* are not in our scheduling class.
*/
static inline void update_curr_rt(struct rq *rq)
static void update_curr_rt(struct rq *rq)
{
struct task_struct *curr = rq->curr;
u64 delta_exec;
......@@ -59,9 +59,9 @@ static void requeue_task_rt(struct rq *rq, struct task_struct *p)
}
static void
yield_task_rt(struct rq *rq, struct task_struct *p)
yield_task_rt(struct rq *rq)
{
requeue_task_rt(rq, p);
requeue_task_rt(rq, rq->curr);
}
/*
......@@ -206,7 +206,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p)
if (--p->time_slice)
return;
p->time_slice = static_prio_timeslice(p->static_prio);
p->time_slice = DEF_TIMESLICE;
/*
* Requeue to the end of queue if we are not the only element
......@@ -218,7 +218,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p)
}
}
static struct sched_class rt_sched_class __read_mostly = {
static void set_curr_task_rt(struct rq *rq)
{
struct task_struct *p = rq->curr;
p->se.exec_start = rq->clock;
}
const struct sched_class rt_sched_class = {
.next = &fair_sched_class,
.enqueue_task = enqueue_task_rt,
.dequeue_task = dequeue_task_rt,
.yield_task = yield_task_rt,
......@@ -230,5 +238,6 @@ static struct sched_class rt_sched_class __read_mostly = {
.load_balance = load_balance_rt,
.set_curr_task = set_curr_task_rt,
.task_tick = task_tick_rt,
};
......@@ -16,18 +16,18 @@ static int show_schedstat(struct seq_file *seq, void *v)
struct rq *rq = cpu_rq(cpu);
#ifdef CONFIG_SMP
struct sched_domain *sd;
int dcnt = 0;
int dcount = 0;
#endif
/* runqueue-specific stats */
seq_printf(seq,
"cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %llu %llu %lu",
cpu, rq->yld_both_empty,
rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
rq->ttwu_cnt, rq->ttwu_local,
rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count,
rq->sched_switch, rq->sched_count, rq->sched_goidle,
rq->ttwu_count, rq->ttwu_local,
rq->rq_sched_info.cpu_time,
rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
seq_printf(seq, "\n");
......@@ -39,12 +39,12 @@ static int show_schedstat(struct seq_file *seq, void *v)
char mask_str[NR_CPUS];
cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
seq_printf(seq, "domain%d %s", dcnt++, mask_str);
seq_printf(seq, "domain%d %s", dcount++, mask_str);
for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
itype++) {
seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu "
"%lu",
sd->lb_cnt[itype],
sd->lb_count[itype],
sd->lb_balanced[itype],
sd->lb_failed[itype],
sd->lb_imbalance[itype],
......@@ -55,9 +55,9 @@ static int show_schedstat(struct seq_file *seq, void *v)
}
seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu"
" %lu %lu %lu\n",
sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
sd->alb_count, sd->alb_failed, sd->alb_pushed,
sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
sd->ttwu_wake_remote, sd->ttwu_move_affine,
sd->ttwu_move_balance);
}
......@@ -101,7 +101,7 @@ rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
{
if (rq) {
rq->rq_sched_info.run_delay += delta;
rq->rq_sched_info.pcnt++;
rq->rq_sched_info.pcount++;
}
}
......@@ -129,7 +129,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
# define schedstat_set(var, val) do { } while (0)
#endif
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
#ifdef CONFIG_SCHEDSTATS
/*
* Called when a process is dequeued from the active array and given
* the cpu. We should note that with the exception of interactive
......@@ -164,7 +164,7 @@ static void sched_info_arrive(struct task_struct *t)
sched_info_dequeued(t);
t->sched_info.run_delay += delta;
t->sched_info.last_arrival = now;
t->sched_info.pcnt++;
t->sched_info.pcount++;
rq_sched_info_arrive(task_rq(t), delta);
}
......@@ -233,5 +233,5 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
#else
#define sched_info_queued(t) do { } while (0)
#define sched_info_switch(t, next) do { } while (0)
#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
#endif /* CONFIG_SCHEDSTATS */
......@@ -222,14 +222,11 @@ static ctl_table kern_table[] = {
#ifdef CONFIG_SCHED_DEBUG
{
.ctl_name = CTL_UNNUMBERED,
.procname = "sched_min_granularity_ns",
.data = &sysctl_sched_min_granularity,
.procname = "sched_nr_latency",
.data = &sysctl_sched_nr_latency,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = &proc_dointvec_minmax,
.strategy = &sysctl_intvec,
.extra1 = &min_sched_granularity_ns,
.extra2 = &max_sched_granularity_ns,
.proc_handler = &proc_dointvec,
},
{
.ctl_name = CTL_UNNUMBERED,
......@@ -266,38 +263,24 @@ static ctl_table kern_table[] = {
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "sched_stat_granularity_ns",
.data = &sysctl_sched_stat_granularity,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = &proc_dointvec_minmax,
.strategy = &sysctl_intvec,
.extra1 = &min_wakeup_granularity_ns,
.extra2 = &max_wakeup_granularity_ns,
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "sched_runtime_limit_ns",
.data = &sysctl_sched_runtime_limit,
.procname = "sched_child_runs_first",
.data = &sysctl_sched_child_runs_first,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = &proc_dointvec_minmax,
.strategy = &sysctl_intvec,
.extra1 = &min_sched_granularity_ns,
.extra2 = &max_sched_granularity_ns,
.proc_handler = &proc_dointvec,
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "sched_child_runs_first",
.data = &sysctl_sched_child_runs_first,
.procname = "sched_features",
.data = &sysctl_sched_features,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "sched_features",
.data = &sysctl_sched_features,
.procname = "sched_migration_cost",
.data = &sysctl_sched_migration_cost,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = &proc_dointvec,
......
......@@ -50,12 +50,16 @@ struct user_struct root_user = {
.uid_keyring = &root_user_keyring,
.session_keyring = &root_session_keyring,
#endif
#ifdef CONFIG_FAIR_USER_SCHED
.tg = &init_task_group,
#endif
};
/*
* These routines must be called with the uidhash spinlock held!
*/
static inline void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent)
static inline void uid_hash_insert(struct user_struct *up,
struct hlist_head *hashent)
{
hlist_add_head(&up->uidhash_node, hashent);
}
......@@ -65,13 +69,14 @@ static inline void uid_hash_remove(struct user_struct *up)
hlist_del_init(&up->uidhash_node);
}
static inline struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
static inline struct user_struct *uid_hash_find(uid_t uid,
struct hlist_head *hashent)
{
struct user_struct *user;
struct hlist_node *h;
hlist_for_each_entry(user, h, hashent, uidhash_node) {
if(user->uid == uid) {
if (user->uid == uid) {
atomic_inc(&user->__count);
return user;
}
......@@ -80,6 +85,203 @@ static inline struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *ha
return NULL;
}
#ifdef CONFIG_FAIR_USER_SCHED
static struct kobject uids_kobject; /* represents /sys/kernel/uids directory */
static DEFINE_MUTEX(uids_mutex);
static void sched_destroy_user(struct user_struct *up)
{
sched_destroy_group(up->tg);
}
static int sched_create_user(struct user_struct *up)
{
int rc = 0;
up->tg = sched_create_group();
if (IS_ERR(up->tg))
rc = -ENOMEM;
return rc;
}
static void sched_switch_user(struct task_struct *p)
{
sched_move_task(p);
}
static inline void uids_mutex_lock(void)
{
mutex_lock(&uids_mutex);
}
static inline void uids_mutex_unlock(void)
{
mutex_unlock(&uids_mutex);
}
/* return cpu shares held by the user */
ssize_t cpu_shares_show(struct kset *kset, char *buffer)
{
struct user_struct *up = container_of(kset, struct user_struct, kset);
return sprintf(buffer, "%lu\n", sched_group_shares(up->tg));
}
/* modify cpu shares held by the user */
ssize_t cpu_shares_store(struct kset *kset, const char *buffer, size_t size)
{
struct user_struct *up = container_of(kset, struct user_struct, kset);
unsigned long shares;
int rc;
sscanf(buffer, "%lu", &shares);
rc = sched_group_set_shares(up->tg, shares);
return (rc ? rc : size);
}
static void user_attr_init(struct subsys_attribute *sa, char *name, int mode)
{
sa->attr.name = name;
sa->attr.mode = mode;
sa->show = cpu_shares_show;
sa->store = cpu_shares_store;
}
/* Create "/sys/kernel/uids/<uid>" directory and
* "/sys/kernel/uids/<uid>/cpu_share" file for this user.
*/
static int user_kobject_create(struct user_struct *up)
{
struct kset *kset = &up->kset;
struct kobject *kobj = &kset->kobj;
int error;
memset(kset, 0, sizeof(struct kset));
kobj->parent = &uids_kobject; /* create under /sys/kernel/uids dir */
kobject_set_name(kobj, "%d", up->uid);
kset_init(kset);
user_attr_init(&up->user_attr, "cpu_share", 0644);
error = kobject_add(kobj);
if (error)
goto done;
error = sysfs_create_file(kobj, &up->user_attr.attr);
if (error)
kobject_del(kobj);
kobject_uevent(kobj, KOBJ_ADD);
done:
return error;
}
/* create these in sysfs filesystem:
* "/sys/kernel/uids" directory
* "/sys/kernel/uids/0" directory (for root user)
* "/sys/kernel/uids/0/cpu_share" file (for root user)
*/
int __init uids_kobject_init(void)
{
int error;
/* create under /sys/kernel dir */
uids_kobject.parent = &kernel_subsys.kobj;
uids_kobject.kset = &kernel_subsys;
kobject_set_name(&uids_kobject, "uids");
kobject_init(&uids_kobject);
error = kobject_add(&uids_kobject);
if (!error)
error = user_kobject_create(&root_user);
return error;
}
/* work function to remove sysfs directory for a user and free up
* corresponding structures.
*/
static void remove_user_sysfs_dir(struct work_struct *w)
{
struct user_struct *up = container_of(w, struct user_struct, work);
struct kobject *kobj = &up->kset.kobj;
unsigned long flags;
int remove_user = 0;
/* Make uid_hash_remove() + sysfs_remove_file() + kobject_del()
* atomic.
*/
uids_mutex_lock();
local_irq_save(flags);
if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
uid_hash_remove(up);
remove_user = 1;
spin_unlock_irqrestore(&uidhash_lock, flags);
} else {
local_irq_restore(flags);
}
if (!remove_user)
goto done;
sysfs_remove_file(kobj, &up->user_attr.attr);
kobject_uevent(kobj, KOBJ_REMOVE);
kobject_del(kobj);
sched_destroy_user(up);
key_put(up->uid_keyring);
key_put(up->session_keyring);
kmem_cache_free(uid_cachep, up);
done:
uids_mutex_unlock();
}
/* IRQs are disabled and uidhash_lock is held upon function entry.
* IRQ state (as stored in flags) is restored and uidhash_lock released
* upon function exit.
*/
static inline void free_user(struct user_struct *up, unsigned long flags)
{
/* restore back the count */
atomic_inc(&up->__count);
spin_unlock_irqrestore(&uidhash_lock, flags);
INIT_WORK(&up->work, remove_user_sysfs_dir);
schedule_work(&up->work);
}
#else /* CONFIG_FAIR_USER_SCHED */
static void sched_destroy_user(struct user_struct *up) { }
static int sched_create_user(struct user_struct *up) { return 0; }
static void sched_switch_user(struct task_struct *p) { }
static inline int user_kobject_create(struct user_struct *up) { return 0; }
static inline void uids_mutex_lock(void) { }
static inline void uids_mutex_unlock(void) { }
/* IRQs are disabled and uidhash_lock is held upon function entry.
* IRQ state (as stored in flags) is restored and uidhash_lock released
* upon function exit.
*/
static inline void free_user(struct user_struct *up, unsigned long flags)
{
uid_hash_remove(up);
spin_unlock_irqrestore(&uidhash_lock, flags);
sched_destroy_user(up);
key_put(up->uid_keyring);
key_put(up->session_keyring);
kmem_cache_free(uid_cachep, up);
}
#endif /* CONFIG_FAIR_USER_SCHED */
/*
* Locate the user_struct for the passed UID. If found, take a ref on it. The
* caller must undo that ref with free_uid().
......@@ -106,15 +308,10 @@ void free_uid(struct user_struct *up)
return;
local_irq_save(flags);
if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
uid_hash_remove(up);
spin_unlock_irqrestore(&uidhash_lock, flags);
key_put(up->uid_keyring);
key_put(up->session_keyring);
kmem_cache_free(uid_cachep, up);
} else {
if (atomic_dec_and_lock(&up->__count, &uidhash_lock))
free_user(up, flags);
else
local_irq_restore(flags);
}
}
struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
......@@ -122,6 +319,11 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
struct hlist_head *hashent = uidhashentry(ns, uid);
struct user_struct *up;
/* Make uid_hash_find() + user_kobject_create() + uid_hash_insert()
* atomic.
*/
uids_mutex_lock();
spin_lock_irq(&uidhash_lock);
up = uid_hash_find(uid, hashent);
spin_unlock_irq(&uidhash_lock);
......@@ -150,6 +352,22 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
return NULL;
}
if (sched_create_user(new) < 0) {
key_put(new->uid_keyring);
key_put(new->session_keyring);
kmem_cache_free(uid_cachep, new);
return NULL;
}
if (user_kobject_create(new)) {
sched_destroy_user(new);
key_put(new->uid_keyring);
key_put(new->session_keyring);
kmem_cache_free(uid_cachep, new);
uids_mutex_unlock();
return NULL;
}
/*
* Before adding this, check whether we raced
* on adding the same user already..
......@@ -157,6 +375,11 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
spin_lock_irq(&uidhash_lock);
up = uid_hash_find(uid, hashent);
if (up) {
/* This case is not possible when CONFIG_FAIR_USER_SCHED
* is defined, since we serialize alloc_uid() using
* uids_mutex. Hence no need to call
* sched_destroy_user() or remove_user_sysfs_dir().
*/
key_put(new->uid_keyring);
key_put(new->session_keyring);
kmem_cache_free(uid_cachep, new);
......@@ -167,6 +390,9 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
spin_unlock_irq(&uidhash_lock);
}
uids_mutex_unlock();
return up;
}
......@@ -184,6 +410,7 @@ void switch_uid(struct user_struct *new_user)
atomic_dec(&old_user->processes);
switch_uid_keyring(new_user);
current->user = new_user;
sched_switch_user(current);
/*
* We need to synchronize with __sigqueue_alloc()
......
......@@ -334,7 +334,7 @@ static void unix_write_space(struct sock *sk)
read_lock(&sk->sk_callback_lock);
if (unix_writable(sk)) {
if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
wake_up_interruptible(sk->sk_sleep);
wake_up_interruptible_sync(sk->sk_sleep);
sk_wake_async(sk, 2, POLL_OUT);
}
read_unlock(&sk->sk_callback_lock);
......@@ -1639,7 +1639,7 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
if (!skb)
goto out_unlock;
wake_up_interruptible(&u->peer_wait);
wake_up_interruptible_sync(&u->peer_wait);
if (msg->msg_name)
unix_copy_addr(msg, skb->sk);
......