Commit 6832d965 authored by Linus Torvalds

Merge branch 'timers-nohz-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull timers/nohz changes from Ingo Molnar:
 "It mostly contains fixes and full dynticks off-case optimizations, by
  Frederic Weisbecker"

* 'timers-nohz-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (24 commits)
  nohz: Include local CPU in full dynticks global kick
  nohz: Optimize full dynticks's sched hooks with static keys
  nohz: Optimize full dynticks state checks with static keys
  nohz: Rename a few state variables
  vtime: Always debug check snapshot source _before_ updating it
  vtime: Always scale generic vtime accounting results
  vtime: Optimize full dynticks accounting off case with static keys
  vtime: Describe overriden functions in dedicated arch headers
  m68k: hardirq_count() only need preempt_mask.h
  hardirq: Split preempt count mask definitions
  context_tracking: Split low level state headers
  vtime: Fix racy cputime delta update
  vtime: Remove a few unneeded generic vtime state checks
  context_tracking: User/kernel broundary cross trace events
  context_tracking: Optimize context switch off case with static keys
  context_tracking: Optimize guest APIs off case with static key
  context_tracking: Optimize main APIs off case with static key
  context_tracking: Ground setup for static key use
  context_tracking: Remove full dynticks' hacky dependency on wide context tracking
  nohz: Only enable context tracking on full dynticks CPUs
  ...
parents 228abe73 c2e7fcf5
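
The recurring pattern in this series is to guard each off-by-default hook behind a static key, so the disabled ("off") case costs only a single patched branch and the real work lives in an out-of-line slow path (see the user_enter()/user_exit() wrappers in the context_tracking.h hunk below). The following is a minimal, self-contained userspace C sketch of that call structure only; the *_sketch names are hypothetical, and a plain boolean stands in for the kernel's struct static_key / static_key_false(), which additionally patch the branch at runtime.

    #include <stdbool.h>
    #include <stdio.h>

    /* Stand-in for the kernel's static key: in the kernel this is a
     * struct static_key tested with static_key_false(), compiled to a
     * patched no-op branch while the key is disabled. */
    static bool context_tracking_enabled_sketch = false;

    /* Out-of-line slow path, only reached when tracking is enabled. */
    static void context_tracking_user_enter_slowpath(void)
    {
        printf("tracking: entering user mode\n");
    }

    /* Inline fast path: a single predictable branch when disabled. */
    static inline void user_enter_sketch(void)
    {
        if (context_tracking_enabled_sketch)
            context_tracking_user_enter_slowpath();
    }

    int main(void)
    {
        user_enter_sketch();                    /* disabled: no call, near-zero cost */
        context_tracking_enabled_sketch = true; /* e.g. a nohz_full CPU was configured */
        user_enter_sketch();                    /* enabled: slow path runs */
        return 0;
    }

In the kernel side of the diff, context_tracking_cpu_set() flips the key once per tracked CPU via static_key_slow_inc(), so CPUs outside the nohz_full set never leave the fast path.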
...@@ -3,3 +3,4 @@ generic-y += clkdev.h ...@@ -3,3 +3,4 @@ generic-y += clkdev.h
generic-y += exec.h generic-y += exec.h
generic-y += kvm_para.h generic-y += kvm_para.h
generic-y += trace_clock.h generic-y += trace_clock.h
generic-y += vtime.h
\ No newline at end of file
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
#include <linux/types.h> #include <linux/types.h>
#ifdef CONFIG_MMU #ifdef CONFIG_MMU
#include <linux/hardirq.h> #include <linux/preempt_mask.h>
#endif #endif
#include <linux/preempt.h> #include <linux/preempt.h>
#include <asm/thread_info.h> #include <asm/thread_info.h>
......
...@@ -2,3 +2,4 @@ ...@@ -2,3 +2,4 @@
generic-y += clkdev.h generic-y += clkdev.h
generic-y += rwsem.h generic-y += rwsem.h
generic-y += trace_clock.h generic-y += trace_clock.h
generic-y += vtime.h
\ No newline at end of file
...@@ -13,9 +13,6 @@ ...@@ -13,9 +13,6 @@
#include <asm/div64.h> #include <asm/div64.h>
#define __ARCH_HAS_VTIME_ACCOUNT
#define __ARCH_HAS_VTIME_TASK_SWITCH
/* We want to use full resolution of the CPU timer: 2**-12 micro-seconds. */ /* We want to use full resolution of the CPU timer: 2**-12 micro-seconds. */
typedef unsigned long long __nocast cputime_t; typedef unsigned long long __nocast cputime_t;
......
#ifndef _S390_VTIME_H
#define _S390_VTIME_H
#define __ARCH_HAS_VTIME_ACCOUNT
#define __ARCH_HAS_VTIME_TASK_SWITCH
#endif /* _S390_VTIME_H */
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#include <asm/irq_regs.h> #include <asm/irq_regs.h>
#include <asm/cputime.h> #include <asm/cputime.h>
#include <asm/vtimer.h> #include <asm/vtimer.h>
#include <asm/vtime.h>
#include <asm/irq.h> #include <asm/irq.h>
#include "entry.h" #include "entry.h"
......
...@@ -2,100 +2,110 @@ ...@@ -2,100 +2,110 @@
#define _LINUX_CONTEXT_TRACKING_H #define _LINUX_CONTEXT_TRACKING_H
#include <linux/sched.h> #include <linux/sched.h>
#include <linux/percpu.h>
#include <linux/vtime.h> #include <linux/vtime.h>
#include <linux/context_tracking_state.h>
#include <asm/ptrace.h> #include <asm/ptrace.h>
struct context_tracking {
/*
* When active is false, probes are unset in order
* to minimize overhead: TIF flags are cleared
* and calls to user_enter/exit are ignored. This
* may be further optimized using static keys.
*/
bool active;
enum ctx_state {
IN_KERNEL = 0,
IN_USER,
} state;
};
static inline void __guest_enter(void)
{
/*
* This is running in ioctl context so we can avoid
* the call to vtime_account() with its unnecessary idle check.
*/
vtime_account_system(current);
current->flags |= PF_VCPU;
}
static inline void __guest_exit(void)
{
/*
* This is running in ioctl context so we can avoid
* the call to vtime_account() with its unnecessary idle check.
*/
vtime_account_system(current);
current->flags &= ~PF_VCPU;
}
#ifdef CONFIG_CONTEXT_TRACKING #ifdef CONFIG_CONTEXT_TRACKING
DECLARE_PER_CPU(struct context_tracking, context_tracking); extern void context_tracking_cpu_set(int cpu);
extern void context_tracking_user_enter(void);
extern void context_tracking_user_exit(void);
extern void __context_tracking_task_switch(struct task_struct *prev,
struct task_struct *next);
static inline bool context_tracking_in_user(void) static inline void user_enter(void)
{ {
return __this_cpu_read(context_tracking.state) == IN_USER; if (static_key_false(&context_tracking_enabled))
} context_tracking_user_enter();
static inline bool context_tracking_active(void) }
static inline void user_exit(void)
{ {
return __this_cpu_read(context_tracking.active); if (static_key_false(&context_tracking_enabled))
context_tracking_user_exit();
} }
extern void user_enter(void);
extern void user_exit(void);
extern void guest_enter(void);
extern void guest_exit(void);
static inline enum ctx_state exception_enter(void) static inline enum ctx_state exception_enter(void)
{ {
enum ctx_state prev_ctx; enum ctx_state prev_ctx;
if (!static_key_false(&context_tracking_enabled))
return 0;
prev_ctx = this_cpu_read(context_tracking.state); prev_ctx = this_cpu_read(context_tracking.state);
user_exit(); context_tracking_user_exit();
return prev_ctx; return prev_ctx;
} }
static inline void exception_exit(enum ctx_state prev_ctx) static inline void exception_exit(enum ctx_state prev_ctx)
{ {
if (static_key_false(&context_tracking_enabled)) {
if (prev_ctx == IN_USER) if (prev_ctx == IN_USER)
user_enter(); context_tracking_user_enter();
}
} }
extern void context_tracking_task_switch(struct task_struct *prev, static inline void context_tracking_task_switch(struct task_struct *prev,
struct task_struct *next); struct task_struct *next)
{
if (static_key_false(&context_tracking_enabled))
__context_tracking_task_switch(prev, next);
}
#else #else
static inline bool context_tracking_in_user(void) { return false; }
static inline void user_enter(void) { } static inline void user_enter(void) { }
static inline void user_exit(void) { } static inline void user_exit(void) { }
static inline enum ctx_state exception_enter(void) { return 0; }
static inline void exception_exit(enum ctx_state prev_ctx) { }
static inline void context_tracking_task_switch(struct task_struct *prev,
struct task_struct *next) { }
#endif /* !CONFIG_CONTEXT_TRACKING */
#ifdef CONFIG_CONTEXT_TRACKING_FORCE
extern void context_tracking_init(void);
#else
static inline void context_tracking_init(void) { }
#endif /* CONFIG_CONTEXT_TRACKING_FORCE */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
static inline void guest_enter(void) static inline void guest_enter(void)
{ {
__guest_enter(); if (vtime_accounting_enabled())
vtime_guest_enter(current);
else
current->flags |= PF_VCPU;
} }
static inline void guest_exit(void) static inline void guest_exit(void)
{ {
__guest_exit(); if (vtime_accounting_enabled())
vtime_guest_exit(current);
else
current->flags &= ~PF_VCPU;
} }
static inline enum ctx_state exception_enter(void) { return 0; } #else
static inline void exception_exit(enum ctx_state prev_ctx) { } static inline void guest_enter(void)
static inline void context_tracking_task_switch(struct task_struct *prev, {
struct task_struct *next) { } /*
#endif /* !CONFIG_CONTEXT_TRACKING */ * This is running in ioctl context so its safe
* to assume that it's the stime pending cputime
* to flush.
*/
vtime_account_system(current);
current->flags |= PF_VCPU;
}
static inline void guest_exit(void)
{
/* Flush the guest cputime we spent on the guest */
vtime_account_system(current);
current->flags &= ~PF_VCPU;
}
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
#endif #endif
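
The exception_enter()/exception_exit() pair above saves the per-CPU context state before an exception handler runs and restores it on the way out, which is what lets a task that slept in the handler and migrated still return to the correct state. Below is a self-contained userspace C sketch of that save/restore shape only; the *_sketch names are hypothetical, and a plain global variable stands in for the per-CPU context_tracking.state.

    #include <stdio.h>

    enum ctx_state_sketch { IN_KERNEL_SKETCH = 0, IN_USER_SKETCH };

    /* Stand-in for the per-CPU context_tracking.state field. */
    static enum ctx_state_sketch cpu_state = IN_USER_SKETCH;

    static enum ctx_state_sketch exception_enter_sketch(void)
    {
        enum ctx_state_sketch prev = cpu_state;  /* remember where we came from */
        cpu_state = IN_KERNEL_SKETCH;            /* the handler runs in kernel context */
        return prev;
    }

    static void exception_exit_sketch(enum ctx_state_sketch prev)
    {
        cpu_state = prev;                        /* restore the pre-exception state */
    }

    static void page_fault_handler_sketch(void)
    {
        enum ctx_state_sketch prev = exception_enter_sketch();
        /* ... handle the fault; nested exceptions repeat the same pattern ... */
        exception_exit_sketch(prev);
    }

    int main(void)
    {
        page_fault_handler_sketch();
        printf("state after handler: %s\n",
               cpu_state == IN_USER_SKETCH ? "IN_USER" : "IN_KERNEL");
        return 0;
    }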
#ifndef _LINUX_CONTEXT_TRACKING_STATE_H
#define _LINUX_CONTEXT_TRACKING_STATE_H
#include <linux/percpu.h>
#include <linux/static_key.h>
struct context_tracking {
/*
* When active is false, probes are unset in order
* to minimize overhead: TIF flags are cleared
* and calls to user_enter/exit are ignored. This
* may be further optimized using static keys.
*/
bool active;
enum ctx_state {
IN_KERNEL = 0,
IN_USER,
} state;
};
#ifdef CONFIG_CONTEXT_TRACKING
extern struct static_key context_tracking_enabled;
DECLARE_PER_CPU(struct context_tracking, context_tracking);
static inline bool context_tracking_in_user(void)
{
return __this_cpu_read(context_tracking.state) == IN_USER;
}
static inline bool context_tracking_active(void)
{
return __this_cpu_read(context_tracking.active);
}
#else
static inline bool context_tracking_in_user(void) { return false; }
static inline bool context_tracking_active(void) { return false; }
#endif /* CONFIG_CONTEXT_TRACKING */
#endif
#ifndef LINUX_HARDIRQ_H #ifndef LINUX_HARDIRQ_H
#define LINUX_HARDIRQ_H #define LINUX_HARDIRQ_H
#include <linux/preempt.h> #include <linux/preempt_mask.h>
#include <linux/lockdep.h> #include <linux/lockdep.h>
#include <linux/ftrace_irq.h> #include <linux/ftrace_irq.h>
#include <linux/vtime.h> #include <linux/vtime.h>
#include <asm/hardirq.h>
/*
* We put the hardirq and softirq counter into the preemption
* counter. The bitmask has the following meaning:
*
* - bits 0-7 are the preemption count (max preemption depth: 256)
* - bits 8-15 are the softirq count (max # of softirqs: 256)
*
* The hardirq count can in theory reach the same as NR_IRQS.
* In reality, the number of nested IRQS is limited to the stack
* size as well. For archs with over 1000 IRQS it is not practical
* to expect that they will all nest. We give a max of 10 bits for
* hardirq nesting. An arch may choose to give less than 10 bits.
* m68k expects it to be 8.
*
* - bits 16-25 are the hardirq count (max # of nested hardirqs: 1024)
* - bit 26 is the NMI_MASK
* - bit 27 is the PREEMPT_ACTIVE flag
*
* PREEMPT_MASK: 0x000000ff
* SOFTIRQ_MASK: 0x0000ff00
* HARDIRQ_MASK: 0x03ff0000
* NMI_MASK: 0x04000000
*/
#define PREEMPT_BITS 8
#define SOFTIRQ_BITS 8
#define NMI_BITS 1
#define MAX_HARDIRQ_BITS 10
#ifndef HARDIRQ_BITS
# define HARDIRQ_BITS MAX_HARDIRQ_BITS
#endif
#if HARDIRQ_BITS > MAX_HARDIRQ_BITS
#error HARDIRQ_BITS too high!
#endif
#define PREEMPT_SHIFT 0
#define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS)
#define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS)
#define NMI_SHIFT (HARDIRQ_SHIFT + HARDIRQ_BITS)
#define __IRQ_MASK(x) ((1UL << (x))-1)
#define PREEMPT_MASK (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT)
#define SOFTIRQ_MASK (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
#define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
#define NMI_MASK (__IRQ_MASK(NMI_BITS) << NMI_SHIFT)
#define PREEMPT_OFFSET (1UL << PREEMPT_SHIFT)
#define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT)
#define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
#define NMI_OFFSET (1UL << NMI_SHIFT)
#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
#ifndef PREEMPT_ACTIVE
#define PREEMPT_ACTIVE_BITS 1
#define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS)
#define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT)
#endif
#if PREEMPT_ACTIVE < (1 << (NMI_SHIFT + NMI_BITS))
#error PREEMPT_ACTIVE is too low!
#endif
#define hardirq_count() (preempt_count() & HARDIRQ_MASK)
#define softirq_count() (preempt_count() & SOFTIRQ_MASK)
#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
| NMI_MASK))
/*
* Are we doing bottom half or hardware interrupt processing?
* Are we in a softirq context? Interrupt context?
* in_softirq - Are we currently processing softirq or have bh disabled?
* in_serving_softirq - Are we currently processing softirq?
*/
#define in_irq() (hardirq_count())
#define in_softirq() (softirq_count())
#define in_interrupt() (irq_count())
#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET)
/*
* Are we in NMI context?
*/
#define in_nmi() (preempt_count() & NMI_MASK)
#if defined(CONFIG_PREEMPT_COUNT)
# define PREEMPT_CHECK_OFFSET 1
#else
# define PREEMPT_CHECK_OFFSET 0
#endif
/*
* Are we running in atomic context? WARNING: this macro cannot
* always detect atomic context; in particular, it cannot know about
* held spinlocks in non-preemptible kernels. Thus it should not be
* used in the general case to determine whether sleeping is possible.
* Do not use in_atomic() in driver code.
*/
#define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != 0)
/*
* Check whether we were atomic before we did preempt_disable():
* (used by the scheduler, *after* releasing the kernel lock)
*/
#define in_atomic_preempt_off() \
((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET)
#ifdef CONFIG_PREEMPT_COUNT
# define preemptible() (preempt_count() == 0 && !irqs_disabled())
#else
# define preemptible() 0
#endif
#if defined(CONFIG_SMP) || defined(CONFIG_GENERIC_HARDIRQS) #if defined(CONFIG_SMP) || defined(CONFIG_GENERIC_HARDIRQS)
extern void synchronize_irq(unsigned int irq); extern void synchronize_irq(unsigned int irq);
......
#ifndef LINUX_PREEMPT_MASK_H
#define LINUX_PREEMPT_MASK_H
#include <linux/preempt.h>
#include <asm/hardirq.h>
/*
* We put the hardirq and softirq counter into the preemption
* counter. The bitmask has the following meaning:
*
* - bits 0-7 are the preemption count (max preemption depth: 256)
* - bits 8-15 are the softirq count (max # of softirqs: 256)
*
* The hardirq count can in theory reach the same as NR_IRQS.
* In reality, the number of nested IRQS is limited to the stack
* size as well. For archs with over 1000 IRQS it is not practical
* to expect that they will all nest. We give a max of 10 bits for
* hardirq nesting. An arch may choose to give less than 10 bits.
* m68k expects it to be 8.
*
* - bits 16-25 are the hardirq count (max # of nested hardirqs: 1024)
* - bit 26 is the NMI_MASK
* - bit 27 is the PREEMPT_ACTIVE flag
*
* PREEMPT_MASK: 0x000000ff
* SOFTIRQ_MASK: 0x0000ff00
* HARDIRQ_MASK: 0x03ff0000
* NMI_MASK: 0x04000000
*/
#define PREEMPT_BITS 8
#define SOFTIRQ_BITS 8
#define NMI_BITS 1
#define MAX_HARDIRQ_BITS 10
#ifndef HARDIRQ_BITS
# define HARDIRQ_BITS MAX_HARDIRQ_BITS
#endif
#if HARDIRQ_BITS > MAX_HARDIRQ_BITS
#error HARDIRQ_BITS too high!
#endif
#define PREEMPT_SHIFT 0
#define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS)
#define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS)
#define NMI_SHIFT (HARDIRQ_SHIFT + HARDIRQ_BITS)
#define __IRQ_MASK(x) ((1UL << (x))-1)
#define PREEMPT_MASK (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT)
#define SOFTIRQ_MASK (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
#define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
#define NMI_MASK (__IRQ_MASK(NMI_BITS) << NMI_SHIFT)
#define PREEMPT_OFFSET (1UL << PREEMPT_SHIFT)
#define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT)
#define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
#define NMI_OFFSET (1UL << NMI_SHIFT)
#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
#ifndef PREEMPT_ACTIVE
#define PREEMPT_ACTIVE_BITS 1
#define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS)
#define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT)
#endif
#if PREEMPT_ACTIVE < (1 << (NMI_SHIFT + NMI_BITS))
#error PREEMPT_ACTIVE is too low!
#endif
#define hardirq_count() (preempt_count() & HARDIRQ_MASK)
#define softirq_count() (preempt_count() & SOFTIRQ_MASK)
#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
| NMI_MASK))
/*
* Are we doing bottom half or hardware interrupt processing?
* Are we in a softirq context? Interrupt context?
* in_softirq - Are we currently processing softirq or have bh disabled?
* in_serving_softirq - Are we currently processing softirq?
*/
#define in_irq() (hardirq_count())
#define in_softirq() (softirq_count())
#define in_interrupt() (irq_count())
#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET)
/*
* Are we in NMI context?
*/
#define in_nmi() (preempt_count() & NMI_MASK)
#if defined(CONFIG_PREEMPT_COUNT)
# define PREEMPT_CHECK_OFFSET 1
#else
# define PREEMPT_CHECK_OFFSET 0
#endif
/*
* Are we running in atomic context? WARNING: this macro cannot
* always detect atomic context; in particular, it cannot know about
* held spinlocks in non-preemptible kernels. Thus it should not be
* used in the general case to determine whether sleeping is possible.
* Do not use in_atomic() in driver code.
*/
#define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != 0)
/*
* Check whether we were atomic before we did preempt_disable():
* (used by the scheduler, *after* releasing the kernel lock)
*/
#define in_atomic_preempt_off() \
((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET)
#ifdef CONFIG_PREEMPT_COUNT
# define preemptible() (preempt_count() == 0 && !irqs_disabled())
#else
# define preemptible() 0
#endif
#endif /* LINUX_PREEMPT_MASK_H */
...@@ -10,6 +10,8 @@ ...@@ -10,6 +10,8 @@
#include <linux/irqflags.h> #include <linux/irqflags.h>
#include <linux/percpu.h> #include <linux/percpu.h>
#include <linux/hrtimer.h> #include <linux/hrtimer.h>
#include <linux/context_tracking_state.h>
#include <linux/cpumask.h>
#ifdef CONFIG_GENERIC_CLOCKEVENTS #ifdef CONFIG_GENERIC_CLOCKEVENTS
...@@ -158,20 +160,51 @@ static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; } ...@@ -158,20 +160,51 @@ static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; }
# endif /* !CONFIG_NO_HZ_COMMON */ # endif /* !CONFIG_NO_HZ_COMMON */
#ifdef CONFIG_NO_HZ_FULL #ifdef CONFIG_NO_HZ_FULL
extern bool tick_nohz_full_running;
extern cpumask_var_t tick_nohz_full_mask;
static inline bool tick_nohz_full_enabled(void)
{
if (!static_key_false(&context_tracking_enabled))
return false;
return tick_nohz_full_running;
}
static inline bool tick_nohz_full_cpu(int cpu)
{
if (!tick_nohz_full_enabled())
return false;
return cpumask_test_cpu(cpu, tick_nohz_full_mask);
}
extern void tick_nohz_init(void); extern void tick_nohz_init(void);
extern int tick_nohz_full_cpu(int cpu); extern void __tick_nohz_full_check(void);
extern void tick_nohz_full_check(void);
extern void tick_nohz_full_kick(void); extern void tick_nohz_full_kick(void);
extern void tick_nohz_full_kick_all(void); extern void tick_nohz_full_kick_all(void);
extern void tick_nohz_task_switch(struct task_struct *tsk); extern void __tick_nohz_task_switch(struct task_struct *tsk);
#else #else
static inline void tick_nohz_init(void) { } static inline void tick_nohz_init(void) { }
static inline int tick_nohz_full_cpu(int cpu) { return 0; } static inline bool tick_nohz_full_enabled(void) { return false; }
static inline void tick_nohz_full_check(void) { } static inline bool tick_nohz_full_cpu(int cpu) { return false; }
static inline void __tick_nohz_full_check(void) { }
static inline void tick_nohz_full_kick(void) { } static inline void tick_nohz_full_kick(void) { }
static inline void tick_nohz_full_kick_all(void) { } static inline void tick_nohz_full_kick_all(void) { }
static inline void tick_nohz_task_switch(struct task_struct *tsk) { } static inline void __tick_nohz_task_switch(struct task_struct *tsk) { }
#endif #endif
static inline void tick_nohz_full_check(void)
{
if (tick_nohz_full_enabled())
__tick_nohz_full_check();
}
static inline void tick_nohz_task_switch(struct task_struct *tsk)
{
if (tick_nohz_full_enabled())
__tick_nohz_task_switch(tsk);
}
#endif #endif
#ifndef _LINUX_KERNEL_VTIME_H #ifndef _LINUX_KERNEL_VTIME_H
#define _LINUX_KERNEL_VTIME_H #define _LINUX_KERNEL_VTIME_H
#include <linux/context_tracking_state.h>
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
#include <asm/vtime.h>
#endif
struct task_struct; struct task_struct;
/*
* vtime_accounting_enabled() definitions/declarations
*/
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
static inline bool vtime_accounting_enabled(void) { return true; }
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
static inline bool vtime_accounting_enabled(void)
{
if (static_key_false(&context_tracking_enabled)) {
if (context_tracking_active())
return true;
}
return false;
}
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
static inline bool vtime_accounting_enabled(void) { return false; }
#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */
/*
* Common vtime APIs
*/
#ifdef CONFIG_VIRT_CPU_ACCOUNTING #ifdef CONFIG_VIRT_CPU_ACCOUNTING
#ifdef __ARCH_HAS_VTIME_TASK_SWITCH
extern void vtime_task_switch(struct task_struct *prev); extern void vtime_task_switch(struct task_struct *prev);
#else
extern void vtime_common_task_switch(struct task_struct *prev);
static inline void vtime_task_switch(struct task_struct *prev)
{
if (vtime_accounting_enabled())
vtime_common_task_switch(prev);
}
#endif /* __ARCH_HAS_VTIME_TASK_SWITCH */
extern void vtime_account_system(struct task_struct *tsk); extern void vtime_account_system(struct task_struct *tsk);
extern void vtime_account_idle(struct task_struct *tsk); extern void vtime_account_idle(struct task_struct *tsk);
extern void vtime_account_user(struct task_struct *tsk); extern void vtime_account_user(struct task_struct *tsk);
extern void vtime_account_irq_enter(struct task_struct *tsk);
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE #ifdef __ARCH_HAS_VTIME_ACCOUNT
static inline bool vtime_accounting_enabled(void) { return true; } extern void vtime_account_irq_enter(struct task_struct *tsk);
#endif #else
extern void vtime_common_account_irq_enter(struct task_struct *tsk);
static inline void vtime_account_irq_enter(struct task_struct *tsk)
{
if (vtime_accounting_enabled())
vtime_common_account_irq_enter(tsk);
}
#endif /* __ARCH_HAS_VTIME_ACCOUNT */
#else /* !CONFIG_VIRT_CPU_ACCOUNTING */ #else /* !CONFIG_VIRT_CPU_ACCOUNTING */
...@@ -20,14 +70,20 @@ static inline void vtime_task_switch(struct task_struct *prev) { } ...@@ -20,14 +70,20 @@ static inline void vtime_task_switch(struct task_struct *prev) { }
static inline void vtime_account_system(struct task_struct *tsk) { } static inline void vtime_account_system(struct task_struct *tsk) { }
static inline void vtime_account_user(struct task_struct *tsk) { } static inline void vtime_account_user(struct task_struct *tsk) { }
static inline void vtime_account_irq_enter(struct task_struct *tsk) { } static inline void vtime_account_irq_enter(struct task_struct *tsk) { }
static inline bool vtime_accounting_enabled(void) { return false; } #endif /* !CONFIG_VIRT_CPU_ACCOUNTING */
#endif
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
extern void arch_vtime_task_switch(struct task_struct *tsk); extern void arch_vtime_task_switch(struct task_struct *tsk);
extern void vtime_account_irq_exit(struct task_struct *tsk); extern void vtime_gen_account_irq_exit(struct task_struct *tsk);
extern bool vtime_accounting_enabled(void);
static inline void vtime_account_irq_exit(struct task_struct *tsk)
{
if (vtime_accounting_enabled())
vtime_gen_account_irq_exit(tsk);
}
extern void vtime_user_enter(struct task_struct *tsk); extern void vtime_user_enter(struct task_struct *tsk);
static inline void vtime_user_exit(struct task_struct *tsk) static inline void vtime_user_exit(struct task_struct *tsk)
{ {
vtime_account_user(tsk); vtime_account_user(tsk);
...@@ -35,7 +91,7 @@ static inline void vtime_user_exit(struct task_struct *tsk) ...@@ -35,7 +91,7 @@ static inline void vtime_user_exit(struct task_struct *tsk)
extern void vtime_guest_enter(struct task_struct *tsk); extern void vtime_guest_enter(struct task_struct *tsk);
extern void vtime_guest_exit(struct task_struct *tsk); extern void vtime_guest_exit(struct task_struct *tsk);
extern void vtime_init_idle(struct task_struct *tsk, int cpu); extern void vtime_init_idle(struct task_struct *tsk, int cpu);
#else #else /* !CONFIG_VIRT_CPU_ACCOUNTING_GEN */
static inline void vtime_account_irq_exit(struct task_struct *tsk) static inline void vtime_account_irq_exit(struct task_struct *tsk)
{ {
/* On hard|softirq exit we always account to hard|softirq cputime */ /* On hard|softirq exit we always account to hard|softirq cputime */
......
#undef TRACE_SYSTEM
#define TRACE_SYSTEM context_tracking
#if !defined(_TRACE_CONTEXT_TRACKING_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_CONTEXT_TRACKING_H
#include <linux/tracepoint.h>
DECLARE_EVENT_CLASS(context_tracking_user,
TP_PROTO(int dummy),
TP_ARGS(dummy),
TP_STRUCT__entry(
__field( int, dummy )
),
TP_fast_assign(
__entry->dummy = dummy;
),
TP_printk("%s", "")
);
/**
* user_enter - called when the kernel resumes to userspace
* @dummy: dummy arg to make trace event macro happy
*
* This event occurs when the kernel resumes to userspace after
* an exception or a syscall.
*/
DEFINE_EVENT(context_tracking_user, user_enter,
TP_PROTO(int dummy),
TP_ARGS(dummy)
);
/**
* user_exit - called when userspace enters the kernel
* @dummy: dummy arg to make trace event macro happy
*
* This event occurs when userspace enters the kernel through
* an exception or a syscall.
*/
DEFINE_EVENT(context_tracking_user, user_exit,
TP_PROTO(int dummy),
TP_ARGS(dummy)
);
#endif /* _TRACE_CONTEXT_TRACKING_H */
/* This part must be outside protection */
#include <trace/define_trace.h>
...@@ -528,13 +528,29 @@ config RCU_USER_QS ...@@ -528,13 +528,29 @@ config RCU_USER_QS
config CONTEXT_TRACKING_FORCE config CONTEXT_TRACKING_FORCE
bool "Force context tracking" bool "Force context tracking"
depends on CONTEXT_TRACKING depends on CONTEXT_TRACKING
default CONTEXT_TRACKING default y if !NO_HZ_FULL
help help
Probe on user/kernel boundaries by default in order to The major pre-requirement for full dynticks to work is to
test the features that rely on it such as userspace RCU extended support the context tracking subsystem. But there are also
quiescent states. other dependencies to provide in order to make the full
This test is there for debugging until we have a real user like the dynticks working.
full dynticks mode.
This option stands for testing when an arch implements the
context tracking backend but doesn't yet fullfill all the
requirements to make the full dynticks feature working.
Without the full dynticks, there is no way to test the support
for context tracking and the subsystems that rely on it: RCU
userspace extended quiescent state and tickless cputime
accounting. This option copes with the absence of the full
dynticks subsystem by forcing the context tracking on all
CPUs in the system.
Say Y only if you're working on the developpement of an
architecture backend for the context tracking.
Say N otherwise, this option brings an overhead that you
don't want in production.
config RCU_FANOUT config RCU_FANOUT
int "Tree-based hierarchical RCU fanout value" int "Tree-based hierarchical RCU fanout value"
......
...@@ -75,6 +75,7 @@ ...@@ -75,6 +75,7 @@
#include <linux/blkdev.h> #include <linux/blkdev.h>
#include <linux/elevator.h> #include <linux/elevator.h>
#include <linux/sched_clock.h> #include <linux/sched_clock.h>
#include <linux/context_tracking.h>
#include <asm/io.h> #include <asm/io.h>
#include <asm/bugs.h> #include <asm/bugs.h>
...@@ -545,6 +546,7 @@ asmlinkage void __init start_kernel(void) ...@@ -545,6 +546,7 @@ asmlinkage void __init start_kernel(void)
idr_init_cache(); idr_init_cache();
rcu_init(); rcu_init();
tick_nohz_init(); tick_nohz_init();
context_tracking_init();
radix_tree_init(); radix_tree_init();
/* init some links before init_ISA_irqs() */ /* init some links before init_ISA_irqs() */
early_irq_init(); early_irq_init();
......
...@@ -20,14 +20,25 @@ ...@@ -20,14 +20,25 @@
#include <linux/hardirq.h> #include <linux/hardirq.h>
#include <linux/export.h> #include <linux/export.h>
DEFINE_PER_CPU(struct context_tracking, context_tracking) = { #define CREATE_TRACE_POINTS
#ifdef CONFIG_CONTEXT_TRACKING_FORCE #include <trace/events/context_tracking.h>
.active = true,
#endif struct static_key context_tracking_enabled = STATIC_KEY_INIT_FALSE;
}; EXPORT_SYMBOL_GPL(context_tracking_enabled);
DEFINE_PER_CPU(struct context_tracking, context_tracking);
EXPORT_SYMBOL_GPL(context_tracking);
void context_tracking_cpu_set(int cpu)
{
if (!per_cpu(context_tracking.active, cpu)) {
per_cpu(context_tracking.active, cpu) = true;
static_key_slow_inc(&context_tracking_enabled);
}
}
/** /**
* user_enter - Inform the context tracking that the CPU is going to * context_tracking_user_enter - Inform the context tracking that the CPU is going to
* enter userspace mode. * enter userspace mode.
* *
* This function must be called right before we switch from the kernel * This function must be called right before we switch from the kernel
...@@ -35,7 +46,7 @@ DEFINE_PER_CPU(struct context_tracking, context_tracking) = { ...@@ -35,7 +46,7 @@ DEFINE_PER_CPU(struct context_tracking, context_tracking) = {
* to execute won't use any RCU read side critical section because this * to execute won't use any RCU read side critical section because this
* function sets RCU in extended quiescent state. * function sets RCU in extended quiescent state.
*/ */
void user_enter(void) void context_tracking_user_enter(void)
{ {
unsigned long flags; unsigned long flags;
...@@ -54,8 +65,9 @@ void user_enter(void) ...@@ -54,8 +65,9 @@ void user_enter(void)
WARN_ON_ONCE(!current->mm); WARN_ON_ONCE(!current->mm);
local_irq_save(flags); local_irq_save(flags);
if (__this_cpu_read(context_tracking.active) && if ( __this_cpu_read(context_tracking.state) != IN_USER) {
__this_cpu_read(context_tracking.state) != IN_USER) { if (__this_cpu_read(context_tracking.active)) {
trace_user_enter(0);
/* /*
* At this stage, only low level arch entry code remains and * At this stage, only low level arch entry code remains and
* then we'll run in userspace. We can assume there won't be * then we'll run in userspace. We can assume there won't be
...@@ -65,6 +77,20 @@ void user_enter(void) ...@@ -65,6 +77,20 @@ void user_enter(void)
*/ */
vtime_user_enter(current); vtime_user_enter(current);
rcu_user_enter(); rcu_user_enter();
}
/*
* Even if context tracking is disabled on this CPU, because it's outside
* the full dynticks mask for example, we still have to keep track of the
* context transitions and states to prevent inconsistency on those of
* other CPUs.
* If a task triggers an exception in userspace, sleep on the exception
* handler and then migrate to another CPU, that new CPU must know where
* the exception returns by the time we call exception_exit().
* This information can only be provided by the previous CPU when it called
* exception_enter().
* OTOH we can spare the calls to vtime and RCU when context_tracking.active
* is false because we know that CPU is not tickless.
*/
__this_cpu_write(context_tracking.state, IN_USER); __this_cpu_write(context_tracking.state, IN_USER);
} }
local_irq_restore(flags); local_irq_restore(flags);
...@@ -87,10 +113,9 @@ void user_enter(void) ...@@ -87,10 +113,9 @@ void user_enter(void)
*/ */
void __sched notrace preempt_schedule_context(void) void __sched notrace preempt_schedule_context(void)
{ {
struct thread_info *ti = current_thread_info();
enum ctx_state prev_ctx; enum ctx_state prev_ctx;
if (likely(ti->preempt_count || irqs_disabled())) if (likely(!preemptible()))
return; return;
/* /*
...@@ -112,7 +137,7 @@ EXPORT_SYMBOL_GPL(preempt_schedule_context); ...@@ -112,7 +137,7 @@ EXPORT_SYMBOL_GPL(preempt_schedule_context);
#endif /* CONFIG_PREEMPT */ #endif /* CONFIG_PREEMPT */
/** /**
* user_exit - Inform the context tracking that the CPU is * context_tracking_user_exit - Inform the context tracking that the CPU is
* exiting userspace mode and entering the kernel. * exiting userspace mode and entering the kernel.
* *
* This function must be called after we entered the kernel from userspace * This function must be called after we entered the kernel from userspace
...@@ -122,7 +147,7 @@ EXPORT_SYMBOL_GPL(preempt_schedule_context); ...@@ -122,7 +147,7 @@ EXPORT_SYMBOL_GPL(preempt_schedule_context);
* This call supports re-entrancy. This way it can be called from any exception * This call supports re-entrancy. This way it can be called from any exception
* handler without needing to know if we came from userspace or not. * handler without needing to know if we came from userspace or not.
*/ */
void user_exit(void) void context_tracking_user_exit(void)
{ {
unsigned long flags; unsigned long flags;
...@@ -131,38 +156,22 @@ void user_exit(void) ...@@ -131,38 +156,22 @@ void user_exit(void)
local_irq_save(flags); local_irq_save(flags);
if (__this_cpu_read(context_tracking.state) == IN_USER) { if (__this_cpu_read(context_tracking.state) == IN_USER) {
if (__this_cpu_read(context_tracking.active)) {
/* /*
* We are going to run code that may use RCU. Inform * We are going to run code that may use RCU. Inform
* RCU core about that (ie: we may need the tick again). * RCU core about that (ie: we may need the tick again).
*/ */
rcu_user_exit(); rcu_user_exit();
vtime_user_exit(current); vtime_user_exit(current);
trace_user_exit(0);
}
__this_cpu_write(context_tracking.state, IN_KERNEL); __this_cpu_write(context_tracking.state, IN_KERNEL);
} }
local_irq_restore(flags); local_irq_restore(flags);
} }
void guest_enter(void)
{
if (vtime_accounting_enabled())
vtime_guest_enter(current);
else
__guest_enter();
}
EXPORT_SYMBOL_GPL(guest_enter);
void guest_exit(void)
{
if (vtime_accounting_enabled())
vtime_guest_exit(current);
else
__guest_exit();
}
EXPORT_SYMBOL_GPL(guest_exit);
/** /**
* context_tracking_task_switch - context switch the syscall callbacks * __context_tracking_task_switch - context switch the syscall callbacks
* @prev: the task that is being switched out * @prev: the task that is being switched out
* @next: the task that is being switched in * @next: the task that is being switched in
* *
...@@ -174,11 +183,19 @@ EXPORT_SYMBOL_GPL(guest_exit); ...@@ -174,11 +183,19 @@ EXPORT_SYMBOL_GPL(guest_exit);
* migrate to some CPU that doesn't do the context tracking. As such the TIF * migrate to some CPU that doesn't do the context tracking. As such the TIF
* flag may not be desired there. * flag may not be desired there.
*/ */
void context_tracking_task_switch(struct task_struct *prev, void __context_tracking_task_switch(struct task_struct *prev,
struct task_struct *next) struct task_struct *next)
{ {
if (__this_cpu_read(context_tracking.active)) {
clear_tsk_thread_flag(prev, TIF_NOHZ); clear_tsk_thread_flag(prev, TIF_NOHZ);
set_tsk_thread_flag(next, TIF_NOHZ); set_tsk_thread_flag(next, TIF_NOHZ);
}
} }
#ifdef CONFIG_CONTEXT_TRACKING_FORCE
void __init context_tracking_init(void)
{
int cpu;
for_each_possible_cpu(cpu)
context_tracking_cpu_set(cpu);
}
#endif
...@@ -2527,13 +2527,11 @@ void __sched schedule_preempt_disabled(void) ...@@ -2527,13 +2527,11 @@ void __sched schedule_preempt_disabled(void)
*/ */
asmlinkage void __sched notrace preempt_schedule(void) asmlinkage void __sched notrace preempt_schedule(void)
{ {
struct thread_info *ti = current_thread_info();
/* /*
* If there is a non-zero preempt_count or interrupts are disabled, * If there is a non-zero preempt_count or interrupts are disabled,
* we do not want to preempt the current task. Just return.. * we do not want to preempt the current task. Just return..
*/ */
if (likely(ti->preempt_count || irqs_disabled())) if (likely(!preemptible()))
return; return;
do { do {
......
...@@ -378,11 +378,8 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_ ...@@ -378,11 +378,8 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_
#ifdef CONFIG_VIRT_CPU_ACCOUNTING #ifdef CONFIG_VIRT_CPU_ACCOUNTING
#ifndef __ARCH_HAS_VTIME_TASK_SWITCH #ifndef __ARCH_HAS_VTIME_TASK_SWITCH
void vtime_task_switch(struct task_struct *prev) void vtime_common_task_switch(struct task_struct *prev)
{ {
if (!vtime_accounting_enabled())
return;
if (is_idle_task(prev)) if (is_idle_task(prev))
vtime_account_idle(prev); vtime_account_idle(prev);
else else
...@@ -404,11 +401,8 @@ void vtime_task_switch(struct task_struct *prev) ...@@ -404,11 +401,8 @@ void vtime_task_switch(struct task_struct *prev)
* vtime_account(). * vtime_account().
*/ */
#ifndef __ARCH_HAS_VTIME_ACCOUNT #ifndef __ARCH_HAS_VTIME_ACCOUNT
void vtime_account_irq_enter(struct task_struct *tsk) void vtime_common_account_irq_enter(struct task_struct *tsk)
{ {
if (!vtime_accounting_enabled())
return;
if (!in_interrupt()) { if (!in_interrupt()) {
/* /*
* If we interrupted user, context_tracking_in_user() * If we interrupted user, context_tracking_in_user()
...@@ -428,7 +422,7 @@ void vtime_account_irq_enter(struct task_struct *tsk) ...@@ -428,7 +422,7 @@ void vtime_account_irq_enter(struct task_struct *tsk)
} }
vtime_account_system(tsk); vtime_account_system(tsk);
} }
EXPORT_SYMBOL_GPL(vtime_account_irq_enter); EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter);
#endif /* __ARCH_HAS_VTIME_ACCOUNT */ #endif /* __ARCH_HAS_VTIME_ACCOUNT */
#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ #endif /* CONFIG_VIRT_CPU_ACCOUNTING */
...@@ -559,12 +553,6 @@ static void cputime_adjust(struct task_cputime *curr, ...@@ -559,12 +553,6 @@ static void cputime_adjust(struct task_cputime *curr,
{ {
cputime_t rtime, stime, utime, total; cputime_t rtime, stime, utime, total;
if (vtime_accounting_enabled()) {
*ut = curr->utime;
*st = curr->stime;
return;
}
stime = curr->stime; stime = curr->stime;
total = stime + curr->utime; total = stime + curr->utime;
...@@ -664,23 +652,17 @@ static void __vtime_account_system(struct task_struct *tsk) ...@@ -664,23 +652,17 @@ static void __vtime_account_system(struct task_struct *tsk)
void vtime_account_system(struct task_struct *tsk) void vtime_account_system(struct task_struct *tsk)
{ {
if (!vtime_accounting_enabled())
return;
write_seqlock(&tsk->vtime_seqlock); write_seqlock(&tsk->vtime_seqlock);
__vtime_account_system(tsk); __vtime_account_system(tsk);
write_sequnlock(&tsk->vtime_seqlock); write_sequnlock(&tsk->vtime_seqlock);
} }
void vtime_account_irq_exit(struct task_struct *tsk) void vtime_gen_account_irq_exit(struct task_struct *tsk)
{ {
if (!vtime_accounting_enabled())
return;
write_seqlock(&tsk->vtime_seqlock); write_seqlock(&tsk->vtime_seqlock);
__vtime_account_system(tsk);
if (context_tracking_in_user()) if (context_tracking_in_user())
tsk->vtime_snap_whence = VTIME_USER; tsk->vtime_snap_whence = VTIME_USER;
__vtime_account_system(tsk);
write_sequnlock(&tsk->vtime_seqlock); write_sequnlock(&tsk->vtime_seqlock);
} }
...@@ -688,12 +670,8 @@ void vtime_account_user(struct task_struct *tsk) ...@@ -688,12 +670,8 @@ void vtime_account_user(struct task_struct *tsk)
{ {
cputime_t delta_cpu; cputime_t delta_cpu;
if (!vtime_accounting_enabled())
return;
delta_cpu = get_vtime_delta(tsk);
write_seqlock(&tsk->vtime_seqlock); write_seqlock(&tsk->vtime_seqlock);
delta_cpu = get_vtime_delta(tsk);
tsk->vtime_snap_whence = VTIME_SYS; tsk->vtime_snap_whence = VTIME_SYS;
account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
write_sequnlock(&tsk->vtime_seqlock); write_sequnlock(&tsk->vtime_seqlock);
...@@ -701,22 +679,27 @@ void vtime_account_user(struct task_struct *tsk) ...@@ -701,22 +679,27 @@ void vtime_account_user(struct task_struct *tsk)
void vtime_user_enter(struct task_struct *tsk) void vtime_user_enter(struct task_struct *tsk)
{ {
if (!vtime_accounting_enabled())
return;
write_seqlock(&tsk->vtime_seqlock); write_seqlock(&tsk->vtime_seqlock);
tsk->vtime_snap_whence = VTIME_USER;
__vtime_account_system(tsk); __vtime_account_system(tsk);
tsk->vtime_snap_whence = VTIME_USER;
write_sequnlock(&tsk->vtime_seqlock); write_sequnlock(&tsk->vtime_seqlock);
} }
void vtime_guest_enter(struct task_struct *tsk) void vtime_guest_enter(struct task_struct *tsk)
{ {
/*
* The flags must be updated under the lock with
* the vtime_snap flush and update.
* That enforces a right ordering and update sequence
* synchronization against the reader (task_gtime())
* that can thus safely catch up with a tickless delta.
*/
write_seqlock(&tsk->vtime_seqlock); write_seqlock(&tsk->vtime_seqlock);
__vtime_account_system(tsk); __vtime_account_system(tsk);
current->flags |= PF_VCPU; current->flags |= PF_VCPU;
write_sequnlock(&tsk->vtime_seqlock); write_sequnlock(&tsk->vtime_seqlock);
} }
EXPORT_SYMBOL_GPL(vtime_guest_enter);
void vtime_guest_exit(struct task_struct *tsk) void vtime_guest_exit(struct task_struct *tsk)
{ {
...@@ -725,6 +708,7 @@ void vtime_guest_exit(struct task_struct *tsk) ...@@ -725,6 +708,7 @@ void vtime_guest_exit(struct task_struct *tsk)
current->flags &= ~PF_VCPU; current->flags &= ~PF_VCPU;
write_sequnlock(&tsk->vtime_seqlock); write_sequnlock(&tsk->vtime_seqlock);
} }
EXPORT_SYMBOL_GPL(vtime_guest_exit);
void vtime_account_idle(struct task_struct *tsk) void vtime_account_idle(struct task_struct *tsk)
{ {
...@@ -733,11 +717,6 @@ void vtime_account_idle(struct task_struct *tsk) ...@@ -733,11 +717,6 @@ void vtime_account_idle(struct task_struct *tsk)
account_idle_time(delta_cpu); account_idle_time(delta_cpu);
} }
bool vtime_accounting_enabled(void)
{
return context_tracking_active();
}
void arch_vtime_task_switch(struct task_struct *prev) void arch_vtime_task_switch(struct task_struct *prev)
{ {
write_seqlock(&prev->vtime_seqlock); write_seqlock(&prev->vtime_seqlock);
......
...@@ -105,7 +105,6 @@ config NO_HZ_FULL ...@@ -105,7 +105,6 @@ config NO_HZ_FULL
select RCU_USER_QS select RCU_USER_QS
select RCU_NOCB_CPU select RCU_NOCB_CPU
select VIRT_CPU_ACCOUNTING_GEN select VIRT_CPU_ACCOUNTING_GEN
select CONTEXT_TRACKING_FORCE
select IRQ_WORK select IRQ_WORK
help help
Adaptively try to shutdown the tick whenever possible, even when Adaptively try to shutdown the tick whenever possible, even when
......
...@@ -23,6 +23,7 @@ ...@@ -23,6 +23,7 @@
#include <linux/irq_work.h> #include <linux/irq_work.h>
#include <linux/posix-timers.h> #include <linux/posix-timers.h>
#include <linux/perf_event.h> #include <linux/perf_event.h>
#include <linux/context_tracking.h>
#include <asm/irq_regs.h> #include <asm/irq_regs.h>
...@@ -148,8 +149,8 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) ...@@ -148,8 +149,8 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
} }
#ifdef CONFIG_NO_HZ_FULL #ifdef CONFIG_NO_HZ_FULL
static cpumask_var_t nohz_full_mask; cpumask_var_t tick_nohz_full_mask;
bool have_nohz_full_mask; bool tick_nohz_full_running;
static bool can_stop_full_tick(void) static bool can_stop_full_tick(void)
{ {
...@@ -182,7 +183,7 @@ static bool can_stop_full_tick(void) ...@@ -182,7 +183,7 @@ static bool can_stop_full_tick(void)
* Don't allow the user to think they can get * Don't allow the user to think they can get
* full NO_HZ with this machine. * full NO_HZ with this machine.
*/ */
WARN_ONCE(have_nohz_full_mask, WARN_ONCE(tick_nohz_full_running,
"NO_HZ FULL will not work with unstable sched clock"); "NO_HZ FULL will not work with unstable sched clock");
return false; return false;
} }
...@@ -197,7 +198,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now); ...@@ -197,7 +198,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now);
* Re-evaluate the need for the tick on the current CPU * Re-evaluate the need for the tick on the current CPU
* and restart it if necessary. * and restart it if necessary.
*/ */
void tick_nohz_full_check(void) void __tick_nohz_full_check(void)
{ {
struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
...@@ -211,7 +212,7 @@ void tick_nohz_full_check(void) ...@@ -211,7 +212,7 @@ void tick_nohz_full_check(void)
static void nohz_full_kick_work_func(struct irq_work *work) static void nohz_full_kick_work_func(struct irq_work *work)
{ {
tick_nohz_full_check(); __tick_nohz_full_check();
} }
static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
...@@ -230,7 +231,7 @@ void tick_nohz_full_kick(void) ...@@ -230,7 +231,7 @@ void tick_nohz_full_kick(void)
static void nohz_full_kick_ipi(void *info) static void nohz_full_kick_ipi(void *info)
{ {
tick_nohz_full_check(); __tick_nohz_full_check();
} }
/* /*
...@@ -239,12 +240,13 @@ static void nohz_full_kick_ipi(void *info) ...@@ -239,12 +240,13 @@ static void nohz_full_kick_ipi(void *info)
*/ */
void tick_nohz_full_kick_all(void) void tick_nohz_full_kick_all(void)
{ {
if (!have_nohz_full_mask) if (!tick_nohz_full_running)
return; return;
preempt_disable(); preempt_disable();
smp_call_function_many(nohz_full_mask, smp_call_function_many(tick_nohz_full_mask,
nohz_full_kick_ipi, NULL, false); nohz_full_kick_ipi, NULL, false);
tick_nohz_full_kick();
preempt_enable(); preempt_enable();
} }
...@@ -253,7 +255,7 @@ void tick_nohz_full_kick_all(void) ...@@ -253,7 +255,7 @@ void tick_nohz_full_kick_all(void)
* It might need the tick due to per task/process properties: * It might need the tick due to per task/process properties:
* perf events, posix cpu timers, ... * perf events, posix cpu timers, ...
*/ */
void tick_nohz_task_switch(struct task_struct *tsk) void __tick_nohz_task_switch(struct task_struct *tsk)
{ {
unsigned long flags; unsigned long flags;
...@@ -269,31 +271,23 @@ void tick_nohz_task_switch(struct task_struct *tsk) ...@@ -269,31 +271,23 @@ void tick_nohz_task_switch(struct task_struct *tsk)
local_irq_restore(flags); local_irq_restore(flags);
} }
int tick_nohz_full_cpu(int cpu)
{
if (!have_nohz_full_mask)
return 0;
return cpumask_test_cpu(cpu, nohz_full_mask);
}
/* Parse the boot-time nohz CPU list from the kernel parameters. */ /* Parse the boot-time nohz CPU list from the kernel parameters. */
static int __init tick_nohz_full_setup(char *str) static int __init tick_nohz_full_setup(char *str)
{ {
int cpu; int cpu;
alloc_bootmem_cpumask_var(&nohz_full_mask); alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
if (cpulist_parse(str, nohz_full_mask) < 0) { if (cpulist_parse(str, tick_nohz_full_mask) < 0) {
pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); pr_warning("NOHZ: Incorrect nohz_full cpumask\n");
return 1; return 1;
} }
cpu = smp_processor_id(); cpu = smp_processor_id();
if (cpumask_test_cpu(cpu, nohz_full_mask)) { if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu);
cpumask_clear_cpu(cpu, nohz_full_mask); cpumask_clear_cpu(cpu, tick_nohz_full_mask);
} }
have_nohz_full_mask = true; tick_nohz_full_running = true;
return 1; return 1;
} }
...@@ -311,7 +305,7 @@ static int tick_nohz_cpu_down_callback(struct notifier_block *nfb, ...@@ -311,7 +305,7 @@ static int tick_nohz_cpu_down_callback(struct notifier_block *nfb,
* If we handle the timekeeping duty for full dynticks CPUs, * If we handle the timekeeping duty for full dynticks CPUs,
* we can't safely shutdown that CPU. * we can't safely shutdown that CPU.
*/ */
if (have_nohz_full_mask && tick_do_timer_cpu == cpu) if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
return NOTIFY_BAD; return NOTIFY_BAD;
break; break;
} }
...@@ -330,31 +324,34 @@ static int tick_nohz_init_all(void) ...@@ -330,31 +324,34 @@ static int tick_nohz_init_all(void)
int err = -1; int err = -1;
#ifdef CONFIG_NO_HZ_FULL_ALL #ifdef CONFIG_NO_HZ_FULL_ALL
if (!alloc_cpumask_var(&nohz_full_mask, GFP_KERNEL)) { if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) {
pr_err("NO_HZ: Can't allocate full dynticks cpumask\n"); pr_err("NO_HZ: Can't allocate full dynticks cpumask\n");
return err; return err;
} }
err = 0; err = 0;
cpumask_setall(nohz_full_mask); cpumask_setall(tick_nohz_full_mask);
cpumask_clear_cpu(smp_processor_id(), nohz_full_mask); cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask);
have_nohz_full_mask = true; tick_nohz_full_running = true;
#endif #endif
return err; return err;
} }
void __init tick_nohz_init(void) void __init tick_nohz_init(void)
{ {
if (!have_nohz_full_mask) { int cpu;
if (!tick_nohz_full_running) {
if (tick_nohz_init_all() < 0) if (tick_nohz_init_all() < 0)
return; return;
} }
for_each_cpu(cpu, tick_nohz_full_mask)
context_tracking_cpu_set(cpu);
cpu_notifier(tick_nohz_cpu_down_callback, 0); cpu_notifier(tick_nohz_cpu_down_callback, 0);
cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), nohz_full_mask); cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), tick_nohz_full_mask);
pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf); pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf);
} }
#else
#define have_nohz_full_mask (0)
#endif #endif
/* /*
...@@ -732,7 +729,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) ...@@ -732,7 +729,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
return false; return false;
} }
if (have_nohz_full_mask) { if (tick_nohz_full_enabled()) {
/* /*
* Keep the tick alive to guarantee timekeeping progression * Keep the tick alive to guarantee timekeeping progression
* if there are full dynticks CPUs around * if there are full dynticks CPUs around
......