Commit 1bd4c02c authored by Andrew Morton, committed by Linus Torvalds

[PATCH] s390: no timer interrupts in idle.

From: Martin Schwidefsky <schwidefsky@de.ibm.com>

This patch adds a system control that allows switching off the jiffies timer
interrupts while a cpu sleeps in idle.  This is useful for a system running
with virtual cpus under z/VM.
parent 30c1ec2b
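
As a usage sketch (not part of the patch): on a kernel built with CONFIG_NO_IDLE_HZ the behaviour can be toggled at run time through the new /proc/sys/kernel/hz_timer file. The small, hypothetical user-space program below simply does the equivalent of writing "0" to that file; the path and semantics are taken from the commit message and the Kconfig help below.

/* Hypothetical helper: switch the HZ tick off while idle, i.e. the
 * equivalent of "echo 0 > /proc/sys/kernel/hz_timer".
 * Assumes a running kernel built with CONFIG_NO_IDLE_HZ.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/hz_timer", "w");

	if (!f) {
		perror("/proc/sys/kernel/hz_timer");
		return 1;
	}
	fputs("0\n", f);	/* 0 = HZ timer off in idle, 1 = HZ timer always on */
	fclose(f);
	return 0;
}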
......@@ -333,6 +333,25 @@ config APPLDATA_NET_SUM
This can also be compiled as a module, which will be called
appldata_net_sum.o.
config NO_IDLE_HZ
bool "No HZ timer ticks in idle"
help
Switches the regular HZ timer off when the system is going idle.
This helps z/VM to detect that the Linux system is idle. VM can
then "swap-out" this guest which reduces memory usage. It also
reduces the overhead of idle systems.
The HZ timer can be switched on/off via /proc/sys/kernel/hz_timer.
hz_timer=0 means HZ timer is disabled. hz_timer=1 means HZ
timer is active.
config NO_IDLE_HZ_INIT
bool "HZ timer in idle off by default"
depends on NO_IDLE_HZ
help
The HZ timer is switched off in idle by default. That means the
HZ timer is already disabled at boot time.
endmenu
config PCMCIA
......
......@@ -83,6 +83,7 @@ CONFIG_PFAULT=y
# CONFIG_SHARED_KERNEL is not set
# CONFIG_CMM is not set
# CONFIG_VIRT_TIMER is not set
# CONFIG_NO_IDLE_HZ is not set
# CONFIG_PCMCIA is not set
#
......
......@@ -40,7 +40,7 @@
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/irq.h>
#ifdef CONFIG_VIRT_TIMER
#if defined(CONFIG_VIRT_TIMER) || defined(CONFIG_NO_IDLE_HZ)
#include <asm/timer.h>
#endif
......@@ -75,17 +75,21 @@ void default_idle(void)
psw_t wait_psw;
unsigned long reg;
local_irq_disable();
if (need_resched()) {
local_irq_enable();
schedule();
return;
}
#ifdef CONFIG_VIRT_TIMER
#if defined(CONFIG_VIRT_TIMER) || defined(CONFIG_NO_IDLE_HZ)
/*
* hook to stop timers that should not tick while CPU is idle
*/
if (stop_timers())
if (stop_timers()) {
local_irq_enable();
return;
}
#endif
/*
......
......@@ -281,29 +281,6 @@ int stop_cpu_timer(void)
return 0;
}
void do_monitor_call(struct pt_regs *regs, long interruption_code)
{
/* disable monitor call class 0 */
__ctl_clear_bit(8, 15);
start_cpu_timer();
}
/*
* called from cpu_idle to stop any timers
* returns 1 if CPU should not be stopped
*/
int stop_timers(void)
{
if (stop_cpu_timer())
return 1;
/* enable monitor call class 0 */
__ctl_set_bit(8, 15);
return 0;
}
void set_vtimer(__u64 expires)
{
asm volatile ("SPT %0" : : "m" (expires));
......@@ -424,6 +401,139 @@ static void do_cpu_timer_interrupt(struct pt_regs *regs, __u16 error_code)
}
#endif
#ifdef CONFIG_NO_IDLE_HZ
#ifdef CONFIG_NO_IDLE_HZ_INIT
int sysctl_hz_timer = 0;
#else
int sysctl_hz_timer = 1;
#endif
/*
* Start the HZ tick on the current CPU.
* Only cpu_idle may call this function.
*/
void start_hz_timer(struct pt_regs *regs)
{
__u64 tmp;
__u32 ticks;
if (!cpu_isset(smp_processor_id(), idle_cpu_mask))
return;
/* Calculate how many ticks have passed */
asm volatile ("STCK 0(%0)" : : "a" (&tmp) : "memory", "cc");
tmp = tmp + CLK_TICKS_PER_JIFFY - S390_lowcore.jiffy_timer;
ticks = __calculate_ticks(tmp);
S390_lowcore.jiffy_timer += CLK_TICKS_PER_JIFFY * (__u64) ticks;
/* Set the clock comparator to the next tick. */
tmp = S390_lowcore.jiffy_timer + CPU_DEVIATION;
asm volatile ("SCKC %0" : : "m" (tmp));
/* Charge the ticks. */
if (ticks > 0) {
#ifdef CONFIG_SMP
/*
* Do not rely on the boot cpu to do the calls to do_timer.
* Spread it over all cpus instead.
*/
write_seqlock(&xtime_lock);
if (S390_lowcore.jiffy_timer > xtime_cc) {
__u32 xticks;
tmp = S390_lowcore.jiffy_timer - xtime_cc;
if (tmp >= 2*CLK_TICKS_PER_JIFFY) {
xticks = __calculate_ticks(tmp);
xtime_cc += (__u64) xticks*CLK_TICKS_PER_JIFFY;
} else {
xticks = 1;
xtime_cc += CLK_TICKS_PER_JIFFY;
}
while (xticks--)
do_timer(regs);
}
write_sequnlock(&xtime_lock);
while (ticks--)
update_process_times(user_mode(regs));
#else
while (ticks--)
do_timer(regs);
#endif
}
cpu_clear(smp_processor_id(), idle_cpu_mask);
}
/*
* Stop the HZ tick on the current CPU.
* Only cpu_idle may call this function.
*/
int stop_hz_timer(void)
{
__u64 timer;
if (sysctl_hz_timer != 0)
return 1;
/*
* Leave the clock comparator set up for the next timer
* tick if either rcu or a softirq is pending.
*/
if (rcu_pending(smp_processor_id()) || local_softirq_pending())
return 1;
/*
* This cpu is going really idle. Set up the clock comparator
* for the next event.
*/
cpu_set(smp_processor_id(), idle_cpu_mask);
timer = (__u64) (next_timer_interrupt() - jiffies) + jiffies_64;
timer = jiffies_timer_cc + timer * CLK_TICKS_PER_JIFFY;
asm volatile ("SCKC %0" : : "m" (timer));
return 0;
}
#endif
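
To make the conversion in stop_hz_timer() above easier to follow, here is a hypothetical, self-contained sketch of the same two arithmetic steps with invented constants (on real hardware jiffies_timer_cc and CLK_TICKS_PER_JIFFY come from the s390 TOD clock setup, and the result would be loaded with SCKC):

/* Hypothetical illustration of the arithmetic in stop_hz_timer():
 * jiffy delta to the next timer event -> absolute 64-bit jiffy count
 * -> absolute TOD clock value for the clock comparator.
 * All constants are invented for the example.
 */
#include <stdio.h>

int main(void)
{
	unsigned long long jiffies_timer_cc = 0x8000000000000ULL;	/* assumed TOD value at jiffy 0 */
	unsigned long long clk_ticks_per_jiffy = 0x3D0900ULL;		/* assumed TOD units per jiffy */
	unsigned long long jiffies_64 = 100000;				/* current 64-bit jiffy count */
	unsigned long jiffies = 100000;					/* current 32-bit jiffy count */
	unsigned long next_event = 100250;	/* what next_timer_interrupt() might return */

	/* Step 1: absolute jiffy count of the next pending timer event. */
	unsigned long long timer = (unsigned long long) (next_event - jiffies) + jiffies_64;
	/* Step 2: convert it to a TOD clock value for the clock comparator. */
	timer = jiffies_timer_cc + timer * clk_ticks_per_jiffy;

	printf("clock comparator value: %#llx\n", timer);
	return 0;
}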
#if defined(CONFIG_VIRT_TIMER) || defined(CONFIG_NO_IDLE_HZ)
void do_monitor_call(struct pt_regs *regs, long interruption_code)
{
/* disable monitor call class 0 */
__ctl_clear_bit(8, 15);
#ifdef CONFIG_VIRT_TIMER
start_cpu_timer();
#endif
#ifdef CONFIG_NO_IDLE_HZ
start_hz_timer(regs);
#endif
}
/*
* called from cpu_idle to stop any timers
* returns 1 if CPU should not be stopped
*/
int stop_timers(void)
{
#ifdef CONFIG_VIRT_TIMER
if (stop_cpu_timer())
return 1;
#endif
#ifdef CONFIG_NO_IDLE_HZ
if (stop_hz_timer())
return 1;
#endif
/* enable monitor call class 0 */
__ctl_set_bit(8, 15);
return 0;
}
#endif
/*
* Start the clock comparator and the virtual CPU timer
* on the current CPU.
......
......@@ -64,7 +64,7 @@ extern void pfault_fini(void);
extern void pfault_interrupt(struct pt_regs *regs, __u16 error_code);
static ext_int_info_t ext_int_pfault;
#endif
#ifdef CONFIG_VIRT_TIMER
#if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_VIRT_TIMER)
extern pgm_check_handler_t do_monitor_call;
#endif
......@@ -620,7 +620,7 @@ void __init trap_init(void)
#endif /* CONFIG_ARCH_S390X */
pgm_check_table[0x15] = &operand_exception;
pgm_check_table[0x1C] = &privileged_op;
#ifdef CONFIG_VIRT_TIMER
#if defined(CONFIG_VIRT_TIMER) || defined(CONFIG_NO_IDLE_HZ)
pgm_check_table[0x40] = &do_monitor_call;
#endif
if (MACHINE_IS_VM) {
......
......@@ -149,6 +149,8 @@ typedef struct task_struct task_t;
extern void sched_init(void);
extern void init_idle(task_t *idle, int cpu);
extern cpumask_t idle_cpu_mask;
extern void show_state(void);
extern void show_regs(struct pt_regs *);
......
......@@ -132,6 +132,7 @@ enum
KERN_PTY=62, /* dir: pty driver */
KERN_NGROUPS_MAX=63, /* int: NGROUPS_MAX */
KERN_SPARC_SCONS_PWROFF=64, /* int: serial console power-off halt */
KERN_HZ_TIMER=65, /* int: hz timer on or off */
};
......
......@@ -65,6 +65,8 @@ extern int del_timer(struct timer_list * timer);
extern int __mod_timer(struct timer_list *timer, unsigned long expires);
extern int mod_timer(struct timer_list *timer, unsigned long expires);
extern unsigned long next_timer_interrupt(void);
/***
* add_timer - start a timer
* @timer: the timer to be added
......
......@@ -103,6 +103,8 @@ static void rcu_do_batch(struct list_head *list)
*/
static void rcu_start_batch(long newbatch)
{
cpumask_t active;
if (rcu_batch_before(rcu_ctrlblk.maxbatch, newbatch)) {
rcu_ctrlblk.maxbatch = newbatch;
}
......@@ -111,7 +113,9 @@ static void rcu_start_batch(long newbatch)
return;
}
/* Can't change, since spin lock held. */
rcu_ctrlblk.rcu_cpu_mask = cpu_online_map;
active = idle_cpu_mask;
cpus_complement(active);
cpus_and(rcu_ctrlblk.rcu_cpu_mask, cpu_online_map, active);
}
/*
......
......@@ -2684,6 +2684,15 @@ void __init init_idle(task_t *idle, int cpu)
#endif
}
/*
* In a system that switches off the HZ timer idle_cpu_mask
* indicates which cpus entered this state. This is used
* in the rcu update to wait only for active cpus. For systems
* that do not switch off the HZ timer idle_cpu_mask should
* always be CPU_MASK_NONE.
*/
cpumask_t idle_cpu_mask = CPU_MASK_NONE;
#ifdef CONFIG_SMP
/*
* This is how migration works:
......
......@@ -108,6 +108,8 @@ extern int sysctl_ieee_emulation_warnings;
extern int sysctl_userprocess_debug;
#endif
extern int sysctl_hz_timer;
#if defined(CONFIG_PPC32) && defined(CONFIG_6xx)
extern unsigned long powersave_nap;
int proc_dol2crvec(ctl_table *table, int write, struct file *filp,
......@@ -573,6 +575,16 @@ static ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
#endif
#ifdef CONFIG_NO_IDLE_HZ
{
.ctl_name = KERN_HZ_TIMER,
.procname = "hz_timer",
.data = &sysctl_hz_timer,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
#endif
{
.ctl_name = KERN_S390_USER_DEBUG_LOGGING,
......
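
Besides procfs, the new KERN_HZ_TIMER constant also makes the knob reachable through the legacy sysctl(2) call. A hypothetical sketch, assuming a glibc that still provides the sysctl() wrapper (KERN_HZ_TIMER is defined locally in case the installed headers predate this patch):

/* Hypothetical: query the hz_timer setting via the legacy sysctl(2)
 * interface. Assumes a glibc that still ships the sysctl() wrapper.
 */
#include <stdio.h>
#include <sys/sysctl.h>

#ifndef KERN_HZ_TIMER
#define KERN_HZ_TIMER 65	/* value added to <linux/sysctl.h> by this patch */
#endif

int main(void)
{
	int name[] = { CTL_KERN, KERN_HZ_TIMER };
	int value = 0;
	size_t len = sizeof(value);

	if (sysctl(name, 2, &value, &len, NULL, 0) < 0) {
		perror("sysctl");
		return 1;
	}
	printf("hz_timer = %d\n", value);	/* 1: tick always on, 0: tick stops in idle */
	return 0;
}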
......@@ -428,6 +428,75 @@ static inline void __run_timers(tvec_base_t *base)
spin_unlock_irq(&base->lock);
}
#ifdef CONFIG_NO_IDLE_HZ
/*
* Find out when the next timer event is due to happen. This
* is used on S/390 to stop all activity when a cpu is idle.
* This function needs to be called with interrupts disabled.
*/
unsigned long next_timer_interrupt(void)
{
tvec_base_t *base;
struct list_head *list;
struct timer_list *nte;
unsigned long expires;
tvec_t *varray[4];
int i, j;
base = &__get_cpu_var(tvec_bases);
spin_lock(&base->lock);
expires = base->timer_jiffies + (LONG_MAX >> 1);
list = 0;
/* Look for timer events in tv1. */
j = base->timer_jiffies & TVR_MASK;
do {
list_for_each_entry(nte, base->tv1.vec + j, entry) {
expires = nte->expires;
if (j < (base->timer_jiffies & TVR_MASK))
list = base->tv2.vec + (INDEX(0));
goto found;
}
j = (j + 1) & TVR_MASK;
} while (j != (base->timer_jiffies & TVR_MASK));
/* Check tv2-tv5. */
varray[0] = &base->tv2;
varray[1] = &base->tv3;
varray[2] = &base->tv4;
varray[3] = &base->tv5;
for (i = 0; i < 4; i++) {
j = INDEX(i);
do {
if (list_empty(varray[i]->vec + j)) {
j = (j + 1) & TVN_MASK;
continue;
}
list_for_each_entry(nte, varray[i]->vec + j, entry)
if (time_before(nte->expires, expires))
expires = nte->expires;
if (j < (INDEX(i)) && i < 3)
list = varray[i + 1]->vec + (INDEX(i + 1));
goto found;
} while (j != (INDEX(i)));
}
found:
if (list) {
/*
* The search wrapped. We need to look at the next list
* from next tv element that would cascade into tv element
* where we found the timer element.
*/
list_for_each_entry(nte, list, entry) {
if (time_before(nte->expires, expires))
expires = nte->expires;
}
}
spin_unlock(&base->lock);
return expires;
}
#endif
/******************************************************************/
/*
......