Commit 0b981cb9 authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler changes from Ingo Molnar:
 "Continued quest to clean up and enhance the cputime code by Frederic
  Weisbecker, in preparation for future tickless kernel features.

  Other than that, smallish changes."

Fix up trivial conflicts due to additions next to each other in arch/{x86/}Kconfig

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (24 commits)
  cputime: Make finegrained irqtime accounting generally available
  cputime: Gather time/stats accounting config options into a single menu
  ia64: Reuse system and user vtime accounting functions on task switch
  ia64: Consolidate user vtime accounting
  vtime: Consolidate system/idle context detection
  cputime: Use a proper subsystem naming for vtime related APIs
  sched: cpu_power: enable ARCH_POWER
  sched/nohz: Clean up select_nohz_load_balancer()
  sched: Fix load avg vs. cpu-hotplug
  sched: Remove __ARCH_WANT_INTERRUPTS_ON_CTXSW
  sched: Fix nohz_idle_balance()
  sched: Remove useless code in yield_to()
  sched: Add time unit suffix to sched sysctl knobs
  sched/debug: Limit sd->*_idx range on sysctl
  sched: Remove AFFINE_WAKEUPS feature flag
  s390: Remove leftover account_tick_vtime() header
  cputime: Consolidate vtime handling on context switch
  sched: Move cputime code to its own file
  cputime: Generalize CONFIG_VIRT_CPU_ACCOUNTING
  tile: Remove SD_PREFER_LOCAL leftover
  ...
parents 4cba3335 fdf9c356
......@@ -17,16 +17,6 @@ you must `#define __ARCH_WANT_UNLOCKED_CTXSW` in a header file
Unlocked context switches introduce only a very minor performance
penalty to the core scheduler implementation in the CONFIG_SMP case.
2. Interrupt status
By default, the switch_to arch function is called with interrupts
disabled. Interrupts may be enabled over the call if it is likely to
introduce a significant interrupt latency by adding the line
`#define __ARCH_WANT_INTERRUPTS_ON_CTXSW` in the same place as for
unlocked context switches. This define also implies
`__ARCH_WANT_UNLOCKED_CTXSW`. See arch/arm/include/asm/system.h for an
example.
CPU idle
========
Your cpu_idle routines need to obey the following rules:
......
......@@ -304,4 +304,13 @@ config HAVE_RCU_USER_QS
are already protected inside rcu_irq_enter/rcu_irq_exit() but
preemption or signal handling on irq exit still need to be protected.
config HAVE_VIRT_CPU_ACCOUNTING
bool
config HAVE_IRQ_TIME_ACCOUNTING
bool
help
Archs need to ensure they use a high enough resolution clock to
support irq time accounting and then call enable_sched_clock_irqtime().
source "kernel/gcov/Kconfig"
......@@ -25,6 +25,7 @@ config IA64
select HAVE_GENERIC_HARDIRQS
select HAVE_MEMBLOCK
select HAVE_MEMBLOCK_NODE_MAP
select HAVE_VIRT_CPU_ACCOUNTING
select ARCH_DISCARD_MEMBLOCK
select GENERIC_IRQ_PROBE
select GENERIC_PENDING_IRQ if SMP
......@@ -340,17 +341,6 @@ config FORCE_MAX_ZONEORDER
default "17" if HUGETLB_PAGE
default "11"
config VIRT_CPU_ACCOUNTING
bool "Deterministic task and CPU time accounting"
default n
help
Select this option to enable more accurate task and CPU time
accounting. This is done by reading a CPU counter on each
kernel entry and exit and on transitions within the kernel
between system, softirq and hardirq state, so there is a
small performance impact.
If in doubt, say N here.
config SMP
bool "Symmetric multi-processing support"
select USE_GENERIC_SMP_HELPERS
......
......@@ -30,13 +30,6 @@ extern struct task_struct *ia64_switch_to (void *next_task);
extern void ia64_save_extra (struct task_struct *task);
extern void ia64_load_extra (struct task_struct *task);
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
extern void ia64_account_on_switch (struct task_struct *prev, struct task_struct *next);
# define IA64_ACCOUNT_ON_SWITCH(p,n) ia64_account_on_switch(p,n)
#else
# define IA64_ACCOUNT_ON_SWITCH(p,n)
#endif
#ifdef CONFIG_PERFMON
DECLARE_PER_CPU(unsigned long, pfm_syst_info);
# define PERFMON_IS_SYSWIDE() (__get_cpu_var(pfm_syst_info) & 0x1)
......@@ -49,7 +42,6 @@ extern void ia64_account_on_switch (struct task_struct *prev, struct task_struct
|| PERFMON_IS_SYSWIDE())
#define __switch_to(prev,next,last) do { \
IA64_ACCOUNT_ON_SWITCH(prev, next); \
if (IA64_HAS_EXTRA_STATE(prev)) \
ia64_save_extra(prev); \
if (IA64_HAS_EXTRA_STATE(next)) \
......
......@@ -83,32 +83,36 @@ static struct clocksource *itc_clocksource;
extern cputime_t cycle_to_cputime(u64 cyc);
/*
 * Charge the user time accumulated in @tsk's thread_info to the task.
 *
 * The pending cycle count in ti->ac_utime is converted with
 * cycle_to_cputime(), passed to account_user_time() as both cputime
 * arguments, and then cleared.  Nothing is done when no user time is
 * pending.
 */
static void vtime_account_user(struct task_struct *tsk)
{
	struct thread_info *ti = task_thread_info(tsk);
	cputime_t utime;

	/* Nothing accumulated since the last flush. */
	if (!ti->ac_utime)
		return;

	utime = cycle_to_cputime(ti->ac_utime);
	account_user_time(tsk, utime, utime);
	ti->ac_utime = 0;
}
/*
* Called from the context switch with interrupts disabled, to charge all
* accumulated times to the current process, and to prepare accounting on
* the next process.
*/
void ia64_account_on_switch(struct task_struct *prev, struct task_struct *next)
void vtime_task_switch(struct task_struct *prev)
{
struct thread_info *pi = task_thread_info(prev);
struct thread_info *ni = task_thread_info(next);
cputime_t delta_stime, delta_utime;
__u64 now;
struct thread_info *ni = task_thread_info(current);
now = ia64_get_itc();
delta_stime = cycle_to_cputime(pi->ac_stime + (now - pi->ac_stamp));
if (idle_task(smp_processor_id()) != prev)
account_system_time(prev, 0, delta_stime, delta_stime);
vtime_account_system(prev);
else
account_idle_time(delta_stime);
vtime_account_idle(prev);
if (pi->ac_utime) {
delta_utime = cycle_to_cputime(pi->ac_utime);
account_user_time(prev, delta_utime, delta_utime);
}
vtime_account_user(prev);
pi->ac_stamp = ni->ac_stamp = now;
pi->ac_stamp = ni->ac_stamp;
ni->ac_stime = ni->ac_utime = 0;
}
......@@ -116,29 +120,32 @@ void ia64_account_on_switch(struct task_struct *prev, struct task_struct *next)
* Account time for a transition between system, hard irq or soft irq state.
* Note that this function is called with interrupts enabled.
*/
void account_system_vtime(struct task_struct *tsk)
static cputime_t vtime_delta(struct task_struct *tsk)
{
struct thread_info *ti = task_thread_info(tsk);
unsigned long flags;
cputime_t delta_stime;
__u64 now;
local_irq_save(flags);
now = ia64_get_itc();
delta_stime = cycle_to_cputime(ti->ac_stime + (now - ti->ac_stamp));
if (irq_count() || idle_task(smp_processor_id()) != tsk)
account_system_time(tsk, 0, delta_stime, delta_stime);
else
account_idle_time(delta_stime);
ti->ac_stime = 0;
ti->ac_stamp = now;
local_irq_restore(flags);
return delta_stime;
}
/*
 * Charge the delta returned by vtime_delta() for @tsk as system time.
 * The same value is passed as both cputime arguments.
 */
void vtime_account_system(struct task_struct *tsk)
{
	cputime_t stime;

	stime = vtime_delta(tsk);
	account_system_time(tsk, 0, stime, stime);
}
void vtime_account_idle(struct task_struct *tsk)
{
account_idle_time(vtime_delta(tsk));
}
EXPORT_SYMBOL_GPL(account_system_vtime);
/*
* Called from the timer interrupt handler to charge accumulated user time
......@@ -146,14 +153,7 @@ EXPORT_SYMBOL_GPL(account_system_vtime);
*/
void account_process_tick(struct task_struct *p, int user_tick)
{
struct thread_info *ti = task_thread_info(p);
cputime_t delta_utime;
if (ti->ac_utime) {
delta_utime = cycle_to_cputime(ti->ac_utime);
account_user_time(p, delta_utime, delta_utime);
ti->ac_utime = 0;
}
vtime_account_user(p);
}
#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
......
......@@ -197,12 +197,6 @@ struct cpu_usage {
DECLARE_PER_CPU(struct cpu_usage, cpu_usage_array);
#if defined(CONFIG_VIRT_CPU_ACCOUNTING)
#define account_process_vtime(tsk) account_process_tick(tsk, 0)
#else
#define account_process_vtime(tsk) do { } while (0)
#endif
extern void secondary_cpu_time_init(void);
DECLARE_PER_CPU(u64, decrementers_next_tb);
......
......@@ -514,9 +514,6 @@ struct task_struct *__switch_to(struct task_struct *prev,
local_irq_save(flags);
account_system_vtime(current);
account_process_vtime(current);
/*
* We can't take a PMU exception inside _switch() since there is a
* window where the kernel stack SLB and the kernel stack are out
......
......@@ -291,13 +291,12 @@ static inline u64 calculate_stolen_time(u64 stop_tb)
* Account time for a transition between system, hard irq
* or soft irq state.
*/
void account_system_vtime(struct task_struct *tsk)
static u64 vtime_delta(struct task_struct *tsk,
u64 *sys_scaled, u64 *stolen)
{
u64 now, nowscaled, delta, deltascaled;
unsigned long flags;
u64 stolen, udelta, sys_scaled, user_scaled;
u64 now, nowscaled, deltascaled;
u64 udelta, delta, user_scaled;
local_irq_save(flags);
now = mftb();
nowscaled = read_spurr(now);
get_paca()->system_time += now - get_paca()->starttime;
......@@ -305,7 +304,7 @@ void account_system_vtime(struct task_struct *tsk)
deltascaled = nowscaled - get_paca()->startspurr;
get_paca()->startspurr = nowscaled;
stolen = calculate_stolen_time(now);
*stolen = calculate_stolen_time(now);
delta = get_paca()->system_time;
get_paca()->system_time = 0;
......@@ -322,35 +321,45 @@ void account_system_vtime(struct task_struct *tsk)
* the user ticks get saved up in paca->user_time_scaled to be
* used by account_process_tick.
*/
sys_scaled = delta;
*sys_scaled = delta;
user_scaled = udelta;
if (deltascaled != delta + udelta) {
if (udelta) {
sys_scaled = deltascaled * delta / (delta + udelta);
user_scaled = deltascaled - sys_scaled;
*sys_scaled = deltascaled * delta / (delta + udelta);
user_scaled = deltascaled - *sys_scaled;
} else {
sys_scaled = deltascaled;
*sys_scaled = deltascaled;
}
}
get_paca()->user_time_scaled += user_scaled;
if (in_interrupt() || idle_task(smp_processor_id()) != tsk) {
account_system_time(tsk, 0, delta, sys_scaled);
if (stolen)
account_steal_time(stolen);
} else {
account_idle_time(delta + stolen);
}
local_irq_restore(flags);
return delta;
}
/*
 * Charge the time reported by vtime_delta() for @tsk as system time.
 *
 * vtime_delta() returns the unscaled delta and fills in the scaled
 * system time and the stolen time; any non-zero stolen time is
 * reported through account_steal_time().
 */
void vtime_account_system(struct task_struct *tsk)
{
	u64 scaled_stime;
	u64 stolen_time;
	u64 stime = vtime_delta(tsk, &scaled_stime, &stolen_time);

	account_system_time(tsk, 0, stime, scaled_stime);

	if (stolen_time != 0)
		account_steal_time(stolen_time);
}
/*
 * Charge the time reported by vtime_delta() for @tsk as idle time.
 * Stolen time is added to the idle total rather than reported on its
 * own.
 */
void vtime_account_idle(struct task_struct *tsk)
{
	u64 scaled_unused;	/* out-parameter required by vtime_delta(); not used here */
	u64 stolen_time;
	u64 idle_time = vtime_delta(tsk, &scaled_unused, &stolen_time);

	account_idle_time(idle_time + stolen_time);
}
EXPORT_SYMBOL_GPL(account_system_vtime);
/*
* Transfer the user and system times accumulated in the paca
* by the exception entry and exit code to the generic process
* user and system time records.
* Must be called with interrupts disabled.
* Assumes that account_system_vtime() has been called recently
* Assumes that vtime_account() has been called recently
* (i.e. since the last entry from usermode) so that
* get_paca()->user_time_scaled is up to date.
*/
......@@ -366,6 +375,12 @@ void account_process_tick(struct task_struct *tsk, int user_tick)
account_user_time(tsk, utime, utimescaled);
}
/*
 * Flush the cputime accumulated on behalf of @prev.
 * NOTE(review): presumably invoked from the context-switch path for the
 * outgoing task -- confirm against the caller.
 */
void vtime_task_switch(struct task_struct *prev)
{
/*
 * Order matters: account_process_tick() assumes vtime_account() has
 * been called recently so that get_paca()->user_time_scaled is up to
 * date.
 */
vtime_account(prev);
account_process_tick(prev, 0);
}
#else /* ! CONFIG_VIRT_CPU_ACCOUNTING */
#define calc_cputime_factors()
#endif
......
config PPC64
bool "64-bit kernel"
default n
select HAVE_VIRT_CPU_ACCOUNTING
help
This option selects whether a 32-bit or a 64-bit kernel
will be built.
......@@ -337,21 +338,6 @@ config PPC_MM_SLICES
default y if (!PPC_FSL_BOOK3E && PPC64 && HUGETLB_PAGE) || (PPC_STD_MMU_64 && PPC_64K_PAGES)
default n
config VIRT_CPU_ACCOUNTING
bool "Deterministic task and CPU time accounting"
depends on PPC64
default y
help
Select this option to enable more accurate task and CPU time
accounting. This is done by reading a CPU counter on each
kernel entry and exit and on transitions within the kernel
between system, softirq and hardirq state, so there is a
small performance impact. This also enables accounting of
stolen time on logically-partitioned systems running on
IBM POWER5-based machines.
If in doubt, say Y here.
config PPC_HAVE_PMU_SUPPORT
bool
......
......@@ -49,9 +49,6 @@ config GENERIC_LOCKBREAK
config PGSTE
def_bool y if KVM
config VIRT_CPU_ACCOUNTING
def_bool y
config ARCH_SUPPORTS_DEBUG_PAGEALLOC
def_bool y
......@@ -89,6 +86,8 @@ config S390
select HAVE_MEMBLOCK
select HAVE_MEMBLOCK_NODE_MAP
select HAVE_CMPXCHG_LOCAL
select HAVE_VIRT_CPU_ACCOUNTING
select VIRT_CPU_ACCOUNTING
select ARCH_DISCARD_MEMBLOCK
select BUILDTIME_EXTABLE_SORT
select ARCH_INLINE_SPIN_TRYLOCK
......
......@@ -12,6 +12,9 @@
#include <linux/spinlock.h>
#include <asm/div64.h>
#define __ARCH_HAS_VTIME_ACCOUNT
/* We want to use full resolution of the CPU timer: 2**-12 micro-seconds. */
typedef unsigned long long __nocast cputime_t;
......
......@@ -89,12 +89,8 @@ static inline void restore_access_regs(unsigned int *acrs)
prev = __switch_to(prev,next); \
} while (0)
extern void account_vtime(struct task_struct *, struct task_struct *);
extern void account_tick_vtime(struct task_struct *);
#define finish_arch_switch(prev) do { \
set_fs(current->thread.mm_segment); \
account_vtime(prev, current); \
} while (0)
#endif /* __ASM_SWITCH_TO_H */
......@@ -99,7 +99,7 @@ static int do_account_vtime(struct task_struct *tsk, int hardirq_offset)
return virt_timer_forward(user + system);
}
void account_vtime(struct task_struct *prev, struct task_struct *next)
void vtime_task_switch(struct task_struct *prev)
{
struct thread_info *ti;
......@@ -107,7 +107,7 @@ void account_vtime(struct task_struct *prev, struct task_struct *next)
ti = task_thread_info(prev);
ti->user_timer = S390_lowcore.user_timer;
ti->system_timer = S390_lowcore.system_timer;
ti = task_thread_info(next);
ti = task_thread_info(current);
S390_lowcore.user_timer = ti->user_timer;
S390_lowcore.system_timer = ti->system_timer;
}
......@@ -122,7 +122,7 @@ void account_process_tick(struct task_struct *tsk, int user_tick)
* Update process times based on virtual cpu times stored by entry.S
* to the lowcore fields user_timer, system_timer & steal_clock.
*/
void account_system_vtime(struct task_struct *tsk)
void vtime_account(struct task_struct *tsk)
{
struct thread_info *ti = task_thread_info(tsk);
u64 timer, system;
......@@ -138,7 +138,7 @@ void account_system_vtime(struct task_struct *tsk)
virt_timer_forward(system);
}
EXPORT_SYMBOL_GPL(account_system_vtime);
EXPORT_SYMBOL_GPL(vtime_account);
void __kprobes vtime_stop_cpu(void)
{
......
......@@ -69,7 +69,6 @@ static inline const struct cpumask *cpumask_of_node(int node)
| 1*SD_BALANCE_FORK \
| 0*SD_BALANCE_WAKE \
| 0*SD_WAKE_AFFINE \
| 0*SD_PREFER_LOCAL \
| 0*SD_SHARE_CPUPOWER \
| 0*SD_SHARE_PKG_RESOURCES \
| 0*SD_SERIALIZE \
......
......@@ -101,6 +101,7 @@ config X86
select GENERIC_STRNCPY_FROM_USER
select GENERIC_STRNLEN_USER
select HAVE_RCU_USER_QS if X86_64
select HAVE_IRQ_TIME_ACCOUNTING
config INSTRUCTION_DECODER
def_bool (KPROBES || PERF_EVENTS || UPROBES)
......@@ -800,17 +801,6 @@ config SCHED_MC
making when dealing with multi-core CPU chips at a cost of slightly
increased overhead in some places. If unsure say N here.
config IRQ_TIME_ACCOUNTING
bool "Fine granularity task level IRQ time accounting"
default n
---help---
Select this option to enable fine granularity task irq time
accounting. This is done by reading a timestamp on each
transition between softirq and hardirq state, so there can be a
small performance impact.
If in doubt, say N here.
source "kernel/Kconfig.preempt"
config X86_UP_APIC
......
......@@ -132,11 +132,11 @@ extern void synchronize_irq(unsigned int irq);
struct task_struct;
#if !defined(CONFIG_VIRT_CPU_ACCOUNTING) && !defined(CONFIG_IRQ_TIME_ACCOUNTING)
static inline void account_system_vtime(struct task_struct *tsk)
static inline void vtime_account(struct task_struct *tsk)
{
}
#else
extern void account_system_vtime(struct task_struct *tsk);
extern void vtime_account(struct task_struct *tsk);
#endif
#if defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU)
......@@ -162,7 +162,7 @@ extern void rcu_nmi_exit(void);
*/
#define __irq_enter() \
do { \
account_system_vtime(current); \
vtime_account(current); \
add_preempt_count(HARDIRQ_OFFSET); \
trace_hardirq_enter(); \
} while (0)
......@@ -178,7 +178,7 @@ extern void irq_enter(void);
#define __irq_exit() \
do { \
trace_hardirq_exit(); \
account_system_vtime(current); \
vtime_account(current); \
sub_preempt_count(HARDIRQ_OFFSET); \
} while (0)
......
......@@ -130,4 +130,12 @@ extern void account_process_tick(struct task_struct *, int user);
extern void account_steal_ticks(unsigned long ticks);
extern void account_idle_ticks(unsigned long ticks);
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
extern void vtime_task_switch(struct task_struct *prev);
extern void vtime_account_system(struct task_struct *tsk);
extern void vtime_account_idle(struct task_struct *tsk);
#else
static inline void vtime_task_switch(struct task_struct *prev) { }
#endif
#endif /* _LINUX_KERNEL_STAT_H */
......@@ -685,7 +685,7 @@ static inline int kvm_deassign_device(struct kvm *kvm,
static inline void kvm_guest_enter(void)
{
BUG_ON(preemptible());
account_system_vtime(current);
vtime_account(current);
current->flags |= PF_VCPU;
/* KVM does not hold any references to rcu protected data when it
* switches CPU into a guest mode. In fact switching to a guest mode
......@@ -699,7 +699,7 @@ static inline void kvm_guest_enter(void)
static inline void kvm_guest_exit(void)
{
account_system_vtime(current);
vtime_account(current);
current->flags &= ~PF_VCPU;
}
......
......@@ -273,11 +273,11 @@ extern void init_idle_bootup_task(struct task_struct *idle);
extern int runqueue_is_locked(int cpu);
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
extern void select_nohz_load_balancer(int stop_tick);
extern void nohz_balance_enter_idle(int cpu);
extern void set_cpu_sd_state_idle(void);
extern int get_nohz_timer_target(void);
#else
static inline void select_nohz_load_balancer(int stop_tick) { }
static inline void nohz_balance_enter_idle(int cpu) { }
static inline void set_cpu_sd_state_idle(void) { }
#endif
......@@ -681,11 +681,6 @@ struct signal_struct {
* (notably. ptrace) */
};
/* Context switch must be unlocked if interrupts are to be enabled */
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
# define __ARCH_WANT_UNLOCKED_CTXSW
#endif
/*
* Bits in flags field of signal_struct.
*/
......@@ -863,7 +858,6 @@ enum cpu_idle_type {
#define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */
#define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */
#define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */
#define SD_PREFER_LOCAL 0x0040 /* Prefer to keep tasks local to this domain */
#define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */
#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */
#define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */
......
......@@ -129,7 +129,6 @@ int arch_update_cpu_topology(void);
| 1*SD_BALANCE_FORK \
| 0*SD_BALANCE_WAKE \
| 1*SD_WAKE_AFFINE \
| 0*SD_PREFER_LOCAL \
| 0*SD_SHARE_CPUPOWER \
| 1*SD_SHARE_PKG_RESOURCES \
| 0*SD_SERIALIZE \
......@@ -160,7 +159,6 @@ int arch_update_cpu_topology(void);
| 1*SD_BALANCE_FORK \
| 0*SD_BALANCE_WAKE \
| 1*SD_WAKE_AFFINE \
| 0*SD_PREFER_LOCAL \
| 0*SD_SHARE_CPUPOWER \
| 0*SD_SHARE_PKG_RESOURCES \
| 0*SD_SERIALIZE \
......
......@@ -267,6 +267,106 @@ config POSIX_MQUEUE_SYSCTL
depends on SYSCTL
default y
config FHANDLE
bool "open by fhandle syscalls"
select EXPORTFS
help
If you say Y here, a user level program will be able to map
file names to handle and then later use the handle for
different file system operations. This is useful in implementing
userspace file servers, which now track files using handles instead
of names. The handle would remain the same even if file names
get renamed. Enables open_by_handle_at(2) and name_to_handle_at(2)
syscalls.
config AUDIT
bool "Auditing support"
depends on NET
help
Enable auditing infrastructure that can be used with another
kernel subsystem, such as SELinux (which requires this for
logging of avc messages output). Does not do system-call
auditing without CONFIG_AUDITSYSCALL.
config AUDITSYSCALL
bool "Enable system-call auditing support"
depends on AUDIT && (X86 || PPC || S390 || IA64 || UML || SPARC64 || SUPERH || (ARM && AEABI && !OABI_COMPAT))
default y if SECURITY_SELINUX
help
Enable low-overhead system-call auditing infrastructure that
can be used independently or with another kernel subsystem,
such as SELinux.
config AUDIT_WATCH
def_bool y
depends on AUDITSYSCALL
select FSNOTIFY
config AUDIT_TREE
def_bool y
depends on AUDITSYSCALL
select FSNOTIFY
config AUDIT_LOGINUID_IMMUTABLE
bool "Make audit loginuid immutable"
depends on AUDIT
help
The config option toggles if a task setting its loginuid requires
CAP_SYS_AUDITCONTROL or if that task should require no special permissions
but should instead only allow setting its loginuid if it was never
previously set. On systems which use systemd or a similar central
process to restart login services this should be set to true. On older
systems in which an admin would typically have to directly stop and
start processes this should be set to false. Setting this to true allows
one to drop potentially dangerous capabilities from the login tasks,
but may not be backwards compatible with older init systems.
source "kernel/irq/Kconfig"
source "kernel/time/Kconfig"
menu "CPU/Task time and stats accounting"
choice
prompt "Cputime accounting"
default TICK_CPU_ACCOUNTING if !PPC64
default VIRT_CPU_ACCOUNTING if PPC64
# Kind of a stub config for the pure tick based cputime accounting
config TICK_CPU_ACCOUNTING
bool "Simple tick based cputime accounting"
depends on !S390
help
This is the basic tick based cputime accounting that maintains
statistics about user, system and idle time spent at per-jiffy
granularity.
If unsure, say Y.
config VIRT_CPU_ACCOUNTING
bool "Deterministic task and CPU time accounting"
depends on HAVE_VIRT_CPU_ACCOUNTING
help
Select this option to enable more accurate task and CPU time
accounting. This is done by reading a CPU counter on each
kernel entry and exit and on transitions within the kernel
between system, softirq and hardirq state, so there is a
small performance impact. In the case of s390 or IBM POWER > 5,
this also enables accounting of stolen time on logically-partitioned
systems.
config IRQ_TIME_ACCOUNTING
bool "Fine granularity task level IRQ time accounting"
depends on HAVE_IRQ_TIME_ACCOUNTING
help
Select this option to enable fine granularity task irq time
accounting. This is done by reading a timestamp on each
transition between softirq and hardirq state, so there can be a
small performance impact.
If in doubt, say N here.
endchoice
config BSD_PROCESS_ACCT
bool "BSD Process Accounting"
help
......@@ -292,18 +392,6 @@ config BSD_PROCESS_ACCT_V3
for processing it. A preliminary version of these tools is available
at <http://www.gnu.org/software/acct/>.
config FHANDLE
bool "open by fhandle syscalls"
select EXPORTFS
help
If you say Y here, a user level program will be able to map
file names to handle and then later use the handle for
different file system operations. This is useful in implementing
userspace file servers, which now track files using handles instead
of names. The handle would remain the same even if file names
get renamed. Enables open_by_handle_at(2) and name_to_handle_at(2)
syscalls.
config TASKSTATS
bool "Export task/process statistics through netlink (EXPERIMENTAL)"
depends on NET
......@@ -346,50 +434,7 @@ config TASK_IO_ACCOUNTING
Say N if unsure.
config AUDIT
bool "Auditing support"
depends on NET
help
Enable auditing infrastructure that can be used with another
kernel subsystem, such as SELinux (which requires this for
logging of avc messages output). Does not do system-call
auditing without CONFIG_AUDITSYSCALL.
config AUDITSYSCALL
bool "Enable system-call auditing support"
depends on AUDIT && (X86 || PPC || S390 || IA64 || UML || SPARC64 || SUPERH || (ARM && AEABI && !OABI_COMPAT))
default y if SECURITY_SELINUX
help
Enable low-overhead system-call auditing infrastructure that
can be used independently or with another kernel subsystem,
such as SELinux.
config AUDIT_WATCH
def_bool y
depends on AUDITSYSCALL
select FSNOTIFY
config AUDIT_TREE
def_bool y
depends on AUDITSYSCALL
select FSNOTIFY
config AUDIT_LOGINUID_IMMUTABLE
bool "Make audit loginuid immutable"
depends on AUDIT
help
The config option toggles if a task setting its loginuid requires
CAP_SYS_AUDITCONTROL or if that task should require no special permissions
but should instead only allow setting its loginuid if it was never
previously set. On systems which use systemd or a similar central
process to restart login services this should be set to true. On older
systems in which an admin would typically have to directly stop and
start processes this should be set to false. Setting this to true allows
one to drop potentially dangerous capabilities from the login tasks,
but may not be backwards compatible with older init systems.
source "kernel/irq/Kconfig"
source "kernel/time/Kconfig"
endmenu # "CPU/Task time and stats accounting"
menu "RCU Subsystem"
......
......@@ -1276,11 +1276,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
p->irq_events = 0;
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
p->hardirqs_enabled = 1;
#else
p->hardirqs_enabled = 0;
#endif
p->hardirq_enable_ip = 0;
p->hardirq_enable_event = 0;
p->hardirq_disable_ip = _THIS_IP_;
......
......@@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
endif
obj-y += core.o clock.o idle_task.o fair.o rt.o stop_task.o
obj-y += core.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o
obj-$(CONFIG_SMP) += cpupri.o
obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
......
This diff is collapsed.
This diff is collapsed.
......@@ -597,7 +597,7 @@ calc_delta_fair(unsigned long delta, struct sched_entity *se)
/*
* The idea is to set a period in which each task runs once.
*
* When there are too many tasks (sysctl_sched_nr_latency) we have to stretch
* When there are too many tasks (sched_nr_latency) we have to stretch
* this period because otherwise the slices get too small.
*
* p = (nr <= nl) ? l : l*nr/nl
......@@ -2700,7 +2700,6 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
int prev_cpu = task_cpu(p);
int new_cpu = cpu;
int want_affine = 0;
int want_sd = 1;
int sync = wake_flags & WF_SYNC;
if (p->nr_cpus_allowed == 1)
......@@ -2717,27 +2716,6 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
if (!(tmp->flags & SD_LOAD_BALANCE))
continue;
/*
* If power savings logic is enabled for a domain, see if we
* are not overloaded, if so, don't balance wider.
*/
if (tmp->flags & (SD_PREFER_LOCAL)) {
unsigned long power = 0;
unsigned long nr_running = 0;
unsigned long capacity;
int i;
for_each_cpu(i, sched_domain_span(tmp)) {
power += power_of(i);
nr_running += cpu_rq(i)->cfs.nr_running;
}
capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
if (nr_running < capacity)
want_sd = 0;
}
/*
* If both cpu and prev_cpu are part of this domain,
* cpu is a valid SD_WAKE_AFFINE target.
......@@ -2745,21 +2723,15 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
affine_sd = tmp;
want_affine = 0;
}
if (!want_sd && !want_affine)
break;
}
if (!(tmp->flags & sd_flag))
continue;
if (want_sd)
if (tmp->flags & sd_flag)
sd = tmp;
}
if (affine_sd) {
if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
prev_cpu = cpu;
new_cpu = select_idle_sibling(p, prev_cpu);
......@@ -4295,7 +4267,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
goto out_balanced;
}
BUG_ON(busiest == this_rq);
BUG_ON(busiest == env.dst_rq);
schedstat_add(sd, lb_imbalance[idle], env.imbalance);
......@@ -4316,7 +4288,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
update_h_load(env.src_cpu);
more_balance:
local_irq_save(flags);
double_rq_lock(this_rq, busiest);
double_rq_lock(env.dst_rq, busiest);
/*
* cur_ld_moved - load moved in current iteration
......@@ -4324,7 +4296,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
*/
cur_ld_moved = move_tasks(&env);
ld_moved += cur_ld_moved;
double_rq_unlock(this_rq, busiest);
double_rq_unlock(env.dst_rq, busiest);
local_irq_restore(flags);
if (env.flags & LBF_NEED_BREAK) {
......@@ -4360,8 +4332,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
lb_iterations++ < max_lb_iterations) {
this_rq = cpu_rq(env.new_dst_cpu);
env.dst_rq = this_rq;
env.dst_rq = cpu_rq(env.new_dst_cpu);
env.dst_cpu = env.new_dst_cpu;
env.flags &= ~LBF_SOME_PINNED;
env.loop = 0;
......@@ -4646,7 +4617,7 @@ static void nohz_balancer_kick(int cpu)
return;
}
static inline void clear_nohz_tick_stopped(int cpu)
static inline void nohz_balance_exit_idle(int cpu)
{
if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
......@@ -4686,28 +4657,23 @@ void set_cpu_sd_state_idle(void)
}
/*
* This routine will record that this cpu is going idle with tick stopped.
* This routine will record that the cpu is going idle with tick stopped.
* This info will be used in performing idle load balancing in the future.
*/
void select_nohz_load_balancer(int stop_tick)
void nohz_balance_enter_idle(int cpu)
{
int cpu = smp_processor_id();
/*
* If this cpu is going down, then nothing needs to be done.
*/
if (!cpu_active(cpu))
return;
if (stop_tick) {
if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
return;
if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
return;
cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
atomic_inc(&nohz.nr_cpus);
set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
}
return;
cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
atomic_inc(&nohz.nr_cpus);
set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
}
static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
......@@ -4715,7 +4681,7 @@ static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
{
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_DYING:
clear_nohz_tick_stopped(smp_processor_id());
nohz_balance_exit_idle(smp_processor_id());
return NOTIFY_OK;
default:
return NOTIFY_DONE;
......@@ -4837,14 +4803,15 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
if (need_resched())
break;
raw_spin_lock_irq(&this_rq->lock);
update_rq_clock(this_rq);
update_idle_cpu_load(this_rq);
raw_spin_unlock_irq(&this_rq->lock);
rq = cpu_rq(balance_cpu);
raw_spin_lock_irq(&rq->lock);
update_rq_clock(rq);
update_idle_cpu_load(rq);
raw_spin_unlock_irq(&rq->lock);
rebalance_domains(balance_cpu, CPU_IDLE);
rq = cpu_rq(balance_cpu);
if (time_after(this_rq->next_balance, rq->next_balance))
this_rq->next_balance = rq->next_balance;
}
......@@ -4875,7 +4842,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
* busy tick after returning from idle, we will update the busy stats.
*/
set_cpu_sd_state_busy();
clear_nohz_tick_stopped(cpu);
nohz_balance_exit_idle(cpu);
/*
* None are in tickless mode and hence no need for NOHZ idle load
......
......@@ -11,14 +11,6 @@ SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
*/
SCHED_FEAT(START_DEBIT, true)
/*
* Based on load and program behaviour, see if it makes sense to place
* a newly woken task on the same cpu as the task that woke it --
* improve cache locality. Typically used with SYNC wakeups as
* generated by pipes and the like, see also SYNC_WAKEUPS.
*/
SCHED_FEAT(AFFINE_WAKEUPS, true)
/*
* Prefer to schedule the task we woke last (assuming it failed
* wakeup-preemption), since its likely going to consume data we
......@@ -42,7 +34,7 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true)
/*
* Use arch dependent cpu power functions
*/
SCHED_FEAT(ARCH_POWER, false)
SCHED_FEAT(ARCH_POWER, true)
SCHED_FEAT(HRTICK, false)
SCHED_FEAT(DOUBLE_TICK, false)
......
......@@ -1632,11 +1632,6 @@ static int push_rt_task(struct rq *rq)
if (!next_task)
return 0;
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
if (unlikely(task_running(rq, next_task)))
return 0;
#endif
retry:
if (unlikely(next_task == rq->curr)) {
WARN_ON(1);
......
......@@ -737,11 +737,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
*/
next->on_cpu = 1;
#endif
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
raw_spin_unlock_irq(&rq->lock);
#else
raw_spin_unlock(&rq->lock);
#endif
}
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
......@@ -755,9 +751,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
smp_wmb();
prev->on_cpu = 0;
#endif
#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
local_irq_enable();
#endif
}
#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
......@@ -891,6 +885,9 @@ struct cpuacct {
struct kernel_cpustat __percpu *cpustat;
};
extern struct cgroup_subsys cpuacct_subsys;
extern struct cpuacct root_cpuacct;
/* return cpu accounting group corresponding to this container */
static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
{
......@@ -917,6 +914,16 @@ extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
/*
 * Empty variant of cpuacct_charge(): takes the same arguments as the
 * extern declaration and does nothing with them.
 * NOTE(review): presumably the fallback for builds without cpu accounting
 * cgroups; the guarding #if is outside this view, so confirm.
 */
static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
#endif
#ifdef CONFIG_PARAVIRT
/*
 * steal_ticks - express an amount of stolen time in units of TICK_NSEC.
 * @steal: stolen time; it is compared against NSEC_PER_SEC and divided by
 *         TICK_NSEC, so it is presumably a nanosecond count.
 *
 * Amounts larger than NSEC_PER_SEC (flagged as the unlikely case) are
 * handed to div_u64(); everything else goes through __iter_div_u64_rem(),
 * which also writes a remainder back into the local copy of @steal that
 * is then discarded.
 *
 * Returns whatever the selected division helper returns.
 */
static inline u64 steal_ticks(u64 steal)
{
	u64 ticks;

	if (unlikely(steal > NSEC_PER_SEC))
		ticks = div_u64(steal, TICK_NSEC);
	else
		ticks = __iter_div_u64_rem(steal, TICK_NSEC, &steal);

	return ticks;
}
#endif
static inline void inc_nr_running(struct rq *rq)
{
rq->nr_running++;
......@@ -1156,3 +1163,53 @@ enum rq_nohz_flag_bits {
/* Address of the nohz_flags member of the object cpu_rq(cpu) points to. */
#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
DECLARE_PER_CPU(u64, cpu_hardirq_time);
DECLARE_PER_CPU(u64, cpu_softirq_time);
#ifndef CONFIG_64BIT
DECLARE_PER_CPU(seqcount_t, irq_time_seq);
/*
 * Open a write section on this CPU's irq_time_seq (variant built when
 * CONFIG_64BIT is not defined).
 *
 * Bumps the sequence count first, then issues a write barrier so the
 * counter update is ordered before the stores that follow. Pairs with
 * irq_time_write_end(); irq_time_read() uses the same seqcount to detect
 * a concurrent update and retry.
 */
static inline void irq_time_write_begin(void)
{
__this_cpu_inc(irq_time_seq.sequence);
/* Order the sequence increment before the upcoming data stores. */
smp_wmb();
}
/*
 * Close the write section opened by irq_time_write_begin() (variant built
 * when CONFIG_64BIT is not defined).
 *
 * Issues a write barrier first so the preceding stores are ordered before
 * the second bump of this CPU's irq_time_seq sequence count.
 */
static inline void irq_time_write_end(void)
{
/* Order the preceding data stores before the closing increment. */
smp_wmb();
__this_cpu_inc(irq_time_seq.sequence);
}
/*
 * irq_time_read - sum of the softirq and hardirq time counters of @cpu
 * (variant built when CONFIG_64BIT is not defined).
 * @cpu: CPU whose per-cpu counters are sampled.
 *
 * Both counters are sampled inside a read section of that CPU's
 * irq_time_seq; the sampling is repeated until read_seqcount_retry()
 * reports that the section was not disturbed.
 *
 * Returns cpu_softirq_time + cpu_hardirq_time for @cpu.
 */
static inline u64 irq_time_read(int cpu)
{
	unsigned seq;
	u64 soft, hard;

	do {
		seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
		soft = per_cpu(cpu_softirq_time, cpu);
		hard = per_cpu(cpu_hardirq_time, cpu);
	} while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));

	return soft + hard;
}
#else /* CONFIG_64BIT */
/*
 * CONFIG_64BIT variant: intentionally empty, no sequence counter is
 * touched here.
 * NOTE(review): presumably because a u64 counter can be read in a single
 * access on 64-bit, making the seqcount unnecessary; confirm.
 */
static inline void irq_time_write_begin(void)
{
}
/*
 * CONFIG_64BIT variant: intentionally empty counterpart of the empty
 * irq_time_write_begin() in this branch.
 */
static inline void irq_time_write_end(void)
{
}
/*
 * irq_time_read - sum of the softirq and hardirq time counters of @cpu
 * (CONFIG_64BIT variant).
 * @cpu: CPU whose per-cpu counters are read.
 *
 * Reads both counters directly, with no sequence-count protection.
 *
 * Returns cpu_softirq_time + cpu_hardirq_time for @cpu.
 */
static inline u64 irq_time_read(int cpu)
{
	u64 total = per_cpu(cpu_softirq_time, cpu);

	total += per_cpu(cpu_hardirq_time, cpu);
	return total;
}
#endif /* CONFIG_64BIT */
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
......@@ -221,7 +221,7 @@ asmlinkage void __do_softirq(void)
current->flags &= ~PF_MEMALLOC;
pending = local_softirq_pending();
account_system_vtime(current);
vtime_account(current);
__local_bh_disable((unsigned long)__builtin_return_address(0),
SOFTIRQ_OFFSET);
......@@ -272,7 +272,7 @@ asmlinkage void __do_softirq(void)
lockdep_softirq_exit();
account_system_vtime(current);
vtime_account(current);
__local_bh_enable(SOFTIRQ_OFFSET);
tsk_restore_flags(current, old_flags, PF_MEMALLOC);
}
......@@ -341,7 +341,7 @@ static inline void invoke_softirq(void)
*/
void irq_exit(void)
{
account_system_vtime(current);
vtime_account(current);
trace_hardirq_exit();
sub_preempt_count(IRQ_EXIT_OFFSET);
if (!in_interrupt() && local_softirq_pending())
......
......@@ -307,7 +307,7 @@ static struct ctl_table kern_table[] = {
.extra2 = &max_sched_tunable_scaling,
},
{
.procname = "sched_migration_cost",
.procname = "sched_migration_cost_ns",
.data = &sysctl_sched_migration_cost,
.maxlen = sizeof(unsigned int),
.mode = 0644,
......@@ -321,14 +321,14 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
{
.procname = "sched_time_avg",
.procname = "sched_time_avg_ms",
.data = &sysctl_sched_time_avg,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "sched_shares_window",
.procname = "sched_shares_window_ns",
.data = &sysctl_sched_shares_window,
.maxlen = sizeof(unsigned int),
.mode = 0644,
......
......@@ -372,7 +372,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
* the scheduler tick in nohz_restart_sched_tick.
*/
if (!ts->tick_stopped) {
select_nohz_load_balancer(1);
nohz_balance_enter_idle(cpu);
calc_load_enter_idle();
ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
......@@ -570,7 +570,6 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
{
/* Update jiffies first */
select_nohz_load_balancer(0);
tick_do_update_jiffies64(now);
update_cpu_load_nohz();
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment