Commit 8e44a351 authored by Wanpeng Li's avatar Wanpeng Li Committed by Greg Kroah-Hartman

sched/cputime: Accumulate vtime on top of nsec clocksource

commit 2a42eb95 upstream.

Currently the cputime source used by vtime is jiffies. When we cross
a context boundary and jiffies have changed since the last snapshot, the
pending cputime is accounted to the switching out context.

This system works ok if the ticks are not aligned across CPUs. If they
instead are aligned (ie: all fire at the same time) and the CPUs run in
userspace, the jiffies change is only observed on tick exit and therefore
the user cputime is accounted as system cputime. This is because the
CPU that maintains timekeeping fires its tick at the same time as the
others. It updates jiffies in the middle of the tick and the other CPUs
see that update on IRQ exit:

    CPU 0 (timekeeper)                  CPU 1
    -------------------              -------------
                      jiffies = N
    ...                              run in userspace for a jiffy
    tick entry                       tick entry (sees jiffies = N)
    set jiffies = N + 1
    tick exit                        tick exit (sees jiffies = N + 1)
                                                account 1 jiffy as stime

Fix this with using a nanosec clock source instead of jiffies. The
cputime is then accumulated and flushed everytime the pending delta
reaches a jiffy in order to mitigate the accounting overhead.

[ fweisbec: changelog, rebase on struct vtime, field renames, add delta
  on cputime readers, keep idle vtime as-is (low overhead accounting),
  harmonize clock sources. ]
Suggested-by: default avatarThomas Gleixner <tglx@linutronix.de>
Reported-by: default avatarLuiz Capitulino <lcapitulino@redhat.com>
Tested-by: default avatarLuiz Capitulino <lcapitulino@redhat.com>
Signed-off-by: default avatarWanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: default avatarFrederic Weisbecker <fweisbec@gmail.com>
Reviewed-by: default avatarThomas Gleixner <tglx@linutronix.de>
Acked-by: default avatarRik van Riel <riel@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Wanpeng Li <kernellwp@gmail.com>
Link: http://lkml.kernel.org/r/1498756511-11714-6-git-send-email-fweisbec@gmail.comSigned-off-by: default avatarIngo Molnar <mingo@kernel.org>
Signed-off-by: default avatarMel Gorman <mgorman@techsingularity.net>
Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
parent a1d04e8a
...@@ -236,6 +236,9 @@ struct vtime { ...@@ -236,6 +236,9 @@ struct vtime {
seqcount_t seqcount; seqcount_t seqcount;
unsigned long long starttime; unsigned long long starttime;
enum vtime_state state; enum vtime_state state;
u64 utime;
u64 stime;
u64 gtime;
}; };
struct sched_info { struct sched_info {
......
...@@ -681,18 +681,19 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) ...@@ -681,18 +681,19 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
static u64 vtime_delta(struct vtime *vtime) static u64 vtime_delta(struct vtime *vtime)
{ {
unsigned long now = READ_ONCE(jiffies); unsigned long long clock;
if (time_before(now, (unsigned long)vtime->starttime)) clock = sched_clock_cpu(smp_processor_id());
if (clock < vtime->starttime)
return 0; return 0;
return jiffies_to_nsecs(now - vtime->starttime); return clock - vtime->starttime;
} }
static u64 get_vtime_delta(struct vtime *vtime) static u64 get_vtime_delta(struct vtime *vtime)
{ {
unsigned long now = READ_ONCE(jiffies); u64 delta = vtime_delta(vtime);
u64 delta, other; u64 other;
/* /*
* Unlike tick based timing, vtime based timing never has lost * Unlike tick based timing, vtime based timing never has lost
...@@ -701,17 +702,31 @@ static u64 get_vtime_delta(struct vtime *vtime) ...@@ -701,17 +702,31 @@ static u64 get_vtime_delta(struct vtime *vtime)
* elapsed time. Limit account_other_time to prevent rounding * elapsed time. Limit account_other_time to prevent rounding
* errors from causing elapsed vtime to go negative. * errors from causing elapsed vtime to go negative.
*/ */
delta = jiffies_to_nsecs(now - vtime->starttime);
other = account_other_time(delta); other = account_other_time(delta);
WARN_ON_ONCE(vtime->state == VTIME_INACTIVE); WARN_ON_ONCE(vtime->state == VTIME_INACTIVE);
vtime->starttime = now; vtime->starttime += delta;
return delta - other; return delta - other;
} }
static void __vtime_account_system(struct task_struct *tsk) static void __vtime_account_system(struct task_struct *tsk,
struct vtime *vtime)
{ {
account_system_time(tsk, irq_count(), get_vtime_delta(&tsk->vtime)); vtime->stime += get_vtime_delta(vtime);
if (vtime->stime >= TICK_NSEC) {
account_system_time(tsk, irq_count(), vtime->stime);
vtime->stime = 0;
}
}
static void vtime_account_guest(struct task_struct *tsk,
struct vtime *vtime)
{
vtime->gtime += get_vtime_delta(vtime);
if (vtime->gtime >= TICK_NSEC) {
account_guest_time(tsk, vtime->gtime);
vtime->gtime = 0;
}
} }
void vtime_account_system(struct task_struct *tsk) void vtime_account_system(struct task_struct *tsk)
...@@ -722,7 +737,11 @@ void vtime_account_system(struct task_struct *tsk) ...@@ -722,7 +737,11 @@ void vtime_account_system(struct task_struct *tsk)
return; return;
write_seqcount_begin(&vtime->seqcount); write_seqcount_begin(&vtime->seqcount);
__vtime_account_system(tsk); /* We might have scheduled out from guest path */
if (current->flags & PF_VCPU)
vtime_account_guest(tsk, vtime);
else
__vtime_account_system(tsk, vtime);
write_seqcount_end(&vtime->seqcount); write_seqcount_end(&vtime->seqcount);
} }
...@@ -731,8 +750,7 @@ void vtime_user_enter(struct task_struct *tsk) ...@@ -731,8 +750,7 @@ void vtime_user_enter(struct task_struct *tsk)
struct vtime *vtime = &tsk->vtime; struct vtime *vtime = &tsk->vtime;
write_seqcount_begin(&vtime->seqcount); write_seqcount_begin(&vtime->seqcount);
if (vtime_delta(vtime)) __vtime_account_system(tsk, vtime);
__vtime_account_system(tsk);
vtime->state = VTIME_USER; vtime->state = VTIME_USER;
write_seqcount_end(&vtime->seqcount); write_seqcount_end(&vtime->seqcount);
} }
...@@ -742,8 +760,11 @@ void vtime_user_exit(struct task_struct *tsk) ...@@ -742,8 +760,11 @@ void vtime_user_exit(struct task_struct *tsk)
struct vtime *vtime = &tsk->vtime; struct vtime *vtime = &tsk->vtime;
write_seqcount_begin(&vtime->seqcount); write_seqcount_begin(&vtime->seqcount);
if (vtime_delta(vtime)) vtime->utime += get_vtime_delta(vtime);
account_user_time(tsk, get_vtime_delta(vtime)); if (vtime->utime >= TICK_NSEC) {
account_user_time(tsk, vtime->utime);
vtime->utime = 0;
}
vtime->state = VTIME_SYS; vtime->state = VTIME_SYS;
write_seqcount_end(&vtime->seqcount); write_seqcount_end(&vtime->seqcount);
} }
...@@ -759,8 +780,7 @@ void vtime_guest_enter(struct task_struct *tsk) ...@@ -759,8 +780,7 @@ void vtime_guest_enter(struct task_struct *tsk)
* that can thus safely catch up with a tickless delta. * that can thus safely catch up with a tickless delta.
*/ */
write_seqcount_begin(&vtime->seqcount); write_seqcount_begin(&vtime->seqcount);
if (vtime_delta(vtime)) __vtime_account_system(tsk, vtime);
__vtime_account_system(tsk);
current->flags |= PF_VCPU; current->flags |= PF_VCPU;
write_seqcount_end(&vtime->seqcount); write_seqcount_end(&vtime->seqcount);
} }
...@@ -771,7 +791,7 @@ void vtime_guest_exit(struct task_struct *tsk) ...@@ -771,7 +791,7 @@ void vtime_guest_exit(struct task_struct *tsk)
struct vtime *vtime = &tsk->vtime; struct vtime *vtime = &tsk->vtime;
write_seqcount_begin(&vtime->seqcount); write_seqcount_begin(&vtime->seqcount);
__vtime_account_system(tsk); vtime_account_guest(tsk, vtime);
current->flags &= ~PF_VCPU; current->flags &= ~PF_VCPU;
write_seqcount_end(&vtime->seqcount); write_seqcount_end(&vtime->seqcount);
} }
...@@ -794,7 +814,7 @@ void arch_vtime_task_switch(struct task_struct *prev) ...@@ -794,7 +814,7 @@ void arch_vtime_task_switch(struct task_struct *prev)
write_seqcount_begin(&vtime->seqcount); write_seqcount_begin(&vtime->seqcount);
vtime->state = VTIME_SYS; vtime->state = VTIME_SYS;
vtime->starttime = jiffies; vtime->starttime = sched_clock_cpu(smp_processor_id());
write_seqcount_end(&vtime->seqcount); write_seqcount_end(&vtime->seqcount);
} }
...@@ -806,7 +826,7 @@ void vtime_init_idle(struct task_struct *t, int cpu) ...@@ -806,7 +826,7 @@ void vtime_init_idle(struct task_struct *t, int cpu)
local_irq_save(flags); local_irq_save(flags);
write_seqcount_begin(&vtime->seqcount); write_seqcount_begin(&vtime->seqcount);
vtime->state = VTIME_SYS; vtime->state = VTIME_SYS;
vtime->starttime = jiffies; vtime->starttime = sched_clock_cpu(cpu);
write_seqcount_end(&vtime->seqcount); write_seqcount_end(&vtime->seqcount);
local_irq_restore(flags); local_irq_restore(flags);
} }
...@@ -825,7 +845,7 @@ u64 task_gtime(struct task_struct *t) ...@@ -825,7 +845,7 @@ u64 task_gtime(struct task_struct *t)
gtime = t->gtime; gtime = t->gtime;
if (vtime->state == VTIME_SYS && t->flags & PF_VCPU) if (vtime->state == VTIME_SYS && t->flags & PF_VCPU)
gtime += vtime_delta(vtime); gtime += vtime->gtime + vtime_delta(vtime);
} while (read_seqcount_retry(&vtime->seqcount, seq)); } while (read_seqcount_retry(&vtime->seqcount, seq));
...@@ -866,9 +886,9 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) ...@@ -866,9 +886,9 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
* the right place. * the right place.
*/ */
if (vtime->state == VTIME_USER || t->flags & PF_VCPU) if (vtime->state == VTIME_USER || t->flags & PF_VCPU)
*utime += delta; *utime += vtime->utime + delta;
else if (vtime->state == VTIME_SYS) else if (vtime->state == VTIME_SYS)
*stime += delta; *stime += vtime->stime + delta;
} while (read_seqcount_retry(&vtime->seqcount, seq)); } while (read_seqcount_retry(&vtime->seqcount, seq));
} }
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment