Commit 18783bfd authored by Andrew Morton's avatar Andrew Morton Committed by Ben Collins

[PATCH] improved core support for time-interpolation

From: David Mosberger <davidm@napali.hpl.hp.com>

Basically, what the patch does is provide two hooks such that platforms
(and subplatforms) can provide time-interpolation in a way that guarantees
that two causally related gettimeofday() calls will never see time going
backwards (unless there is a settimeofday() call, of course).

There is some evidence that the current scheme does work: we use it on ia64
both for cycle-counter-based interpolation and the SGI folks use it with a
chipset-based high-performance counter.


It seems like enough platforms do this sort of thing to provide _some_
support in the core, especially because it's rather tricky to guarantee
that time never goes backwards (short of a settimeofday, of course).

This patch is based on something Jes Sorensen wrote for the SGI Itanium 2
platform (which has a chipset-internal high-res clock).  I adapted it so it
can be used for cycle-counter interpolation also.  The net effect is that
"last_time_offset" can be removed completely from the kernel.

The basic idea behind the patch is simply: every time you advance xtime by
N nanoseconds, you call update_wall_time_hook(N).  Every time the time
gets set (i.e., discontinuity is OK), reset_wall_time_hook() is called.
parent ae871bc9
...@@ -51,6 +51,9 @@ ...@@ -51,6 +51,9 @@
#ifndef _LINUX_TIMEX_H #ifndef _LINUX_TIMEX_H
#define _LINUX_TIMEX_H #define _LINUX_TIMEX_H
#include <linux/config.h>
#include <linux/compiler.h>
#include <asm/param.h> #include <asm/param.h>
/* /*
...@@ -310,6 +313,105 @@ extern long pps_calcnt; /* calibration intervals */ ...@@ -310,6 +313,105 @@ extern long pps_calcnt; /* calibration intervals */
extern long pps_errcnt; /* calibration errors */ extern long pps_errcnt; /* calibration errors */
extern long pps_stbcnt; /* stability limit exceeded */ extern long pps_stbcnt; /* stability limit exceeded */
#ifdef CONFIG_TIME_INTERPOLATION
/*
 * A platform-provided time interpolator.  The currently selected
 * interpolator refines gettimeofday() between timer ticks via the
 * time_interpolator_update()/reset()/get_offset() hooks below.
 */
struct time_interpolator {
	/* cache-hot stuff first: */
	unsigned long (*get_offset) (void);	/* current offset, in nsec (used in place of last_nsec_offset) -- TODO confirm units with implementors */
	void (*update) (long);			/* xtime was advanced by this many nsec */
	void (*reset) (void);			/* time was set; discontinuity is OK */
	/* cache-cold stuff follows here: */
	struct time_interpolator *next;		/* link in ti_global registration list */
	unsigned long frequency;	/* frequency in counts/second */
	long drift;			/* drift in parts-per-million (or -1 if unknown) */
};
extern volatile unsigned long last_nsec_offset;
#ifndef __HAVE_ARCH_CMPXCHG
extern spin_lock_t last_nsec_offset_lock;
#endif
extern struct time_interpolator *time_interpolator;
extern void register_time_interpolator(struct time_interpolator *);
extern void unregister_time_interpolator(struct time_interpolator *);
/* Called with xtime WRITE-lock acquired. */
/*
 * Account for an advance of xtime by delta_nsec nanoseconds: consume
 * that much of the outstanding interpolation offset (clamping at zero
 * so two causally related gettimeofday() calls never see time go
 * backwards) and notify the active interpolator, if any.
 *
 * Called with xtime WRITE-lock acquired.
 */
static inline void
time_interpolator_update(long delta_nsec)
{
	struct time_interpolator *ti = time_interpolator;

	/*
	 * NOTE(review): a negative delta_nsec (leap-second insertion
	 * passes -NSEC_PER_SEC) converts to a huge unsigned value and
	 * clamps the offset straight to zero -- confirm this is the
	 * intended behavior for backward adjustments.
	 */
	if (last_nsec_offset > 0) {
#ifdef __HAVE_ARCH_CMPXCHG
		unsigned long new, old;

		do {
			old = last_nsec_offset;
			if (old > (unsigned long) delta_nsec)
				new = old - (unsigned long) delta_nsec;
			else
				new = 0;
		} while (cmpxchg(&last_nsec_offset, old, new) != old);
#else
		/*
		 * This really hurts, because it serializes gettimeofday(), but without an
		 * atomic single-word compare-and-exchange, there isn't all that much else
		 * we can do.
		 */
		spin_lock(&last_nsec_offset_lock);
		{
			/*
			 * The kernel's min() is strictly type-checked:
			 * cast delta_nsec to match the unsigned long
			 * last_nsec_offset, or this fails to build.
			 */
			last_nsec_offset -= min(last_nsec_offset,
						(unsigned long) delta_nsec);
		}
		spin_unlock(&last_nsec_offset_lock);
#endif
	}
	if (ti)
		(*ti->update)(delta_nsec);
}
/* Called with xtime WRITE-lock acquired. */
static inline void
time_interpolator_reset(void)
{
struct time_interpolator *ti = time_interpolator;
last_nsec_offset = 0;
if (ti)
(*ti->reset)();
}
/* Called with xtime READ-lock acquired. */
/*
 * Return the number of nanoseconds to add on top of xtime, asking the
 * active interpolator when one exists and falling back to the shared
 * last_nsec_offset otherwise.
 *
 * Called with xtime READ-lock acquired.
 */
static inline unsigned long
time_interpolator_get_offset(void)
{
	struct time_interpolator *interp = time_interpolator;

	return interp ? interp->get_offset() : last_nsec_offset;
}
#else /* !CONFIG_TIME_INTERPOLATION */
/* CONFIG_TIME_INTERPOLATION disabled: advancing the clock needs no hook. */
static inline void
time_interpolator_update(long delta_nsec)
{
}
/* CONFIG_TIME_INTERPOLATION disabled: setting the clock needs no hook. */
static inline void
time_interpolator_reset(void)
{
}
/* CONFIG_TIME_INTERPOLATION disabled: no interpolated offset exists. */
static inline unsigned long
time_interpolator_get_offset(void)
{
	return 0;
}
#endif /* !CONFIG_TIME_INTERPOLATION */
#endif /* KERNEL */ #endif /* KERNEL */
#endif /* LINUX_TIMEX_H */ #endif /* LINUX_TIMEX_H */
...@@ -35,8 +35,6 @@ ...@@ -35,8 +35,6 @@
*/ */
struct timezone sys_tz; struct timezone sys_tz;
extern unsigned long last_time_offset;
#if !defined(__alpha__) && !defined(__ia64__) #if !defined(__alpha__) && !defined(__ia64__)
/* /*
...@@ -77,9 +75,10 @@ asmlinkage long sys_stime(int * tptr) ...@@ -77,9 +75,10 @@ asmlinkage long sys_stime(int * tptr)
if (get_user(value, tptr)) if (get_user(value, tptr))
return -EFAULT; return -EFAULT;
write_seqlock_irq(&xtime_lock); write_seqlock_irq(&xtime_lock);
time_interpolator_reset();
xtime.tv_sec = value; xtime.tv_sec = value;
xtime.tv_nsec = 0; xtime.tv_nsec = 0;
last_time_offset = 0;
time_adjust = 0; /* stop active adjtime() */ time_adjust = 0; /* stop active adjtime() */
time_status |= STA_UNSYNC; time_status |= STA_UNSYNC;
time_maxerror = NTP_PHASE_LIMIT; time_maxerror = NTP_PHASE_LIMIT;
...@@ -125,7 +124,7 @@ inline static void warp_clock(void) ...@@ -125,7 +124,7 @@ inline static void warp_clock(void)
{ {
write_seqlock_irq(&xtime_lock); write_seqlock_irq(&xtime_lock);
xtime.tv_sec += sys_tz.tz_minuteswest * 60; xtime.tv_sec += sys_tz.tz_minuteswest * 60;
last_time_offset = 0; time_interpolator_update(sys_tz.tz_minuteswest * 60 * NSEC_PER_SEC);
write_sequnlock_irq(&xtime_lock); write_sequnlock_irq(&xtime_lock);
} }
...@@ -381,7 +380,6 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0 ...@@ -381,7 +380,6 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0
txc->calcnt = pps_calcnt; txc->calcnt = pps_calcnt;
txc->errcnt = pps_errcnt; txc->errcnt = pps_errcnt;
txc->stbcnt = pps_stbcnt; txc->stbcnt = pps_stbcnt;
last_time_offset = 0;
write_sequnlock_irq(&xtime_lock); write_sequnlock_irq(&xtime_lock);
do_gettimeofday(&txc->time); do_gettimeofday(&txc->time);
return(result); return(result);
......
...@@ -517,6 +517,7 @@ static void second_overflow(void) ...@@ -517,6 +517,7 @@ static void second_overflow(void)
if (xtime.tv_sec % 86400 == 0) { if (xtime.tv_sec % 86400 == 0) {
xtime.tv_sec--; xtime.tv_sec--;
wall_to_monotonic.tv_sec++; wall_to_monotonic.tv_sec++;
time_interpolator_update(-NSEC_PER_SEC);
time_state = TIME_OOP; time_state = TIME_OOP;
clock_was_set(); clock_was_set();
printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n"); printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n");
...@@ -527,6 +528,7 @@ static void second_overflow(void) ...@@ -527,6 +528,7 @@ static void second_overflow(void)
if ((xtime.tv_sec + 1) % 86400 == 0) { if ((xtime.tv_sec + 1) % 86400 == 0) {
xtime.tv_sec++; xtime.tv_sec++;
wall_to_monotonic.tv_sec--; wall_to_monotonic.tv_sec--;
time_interpolator_update(NSEC_PER_SEC);
time_state = TIME_WAIT; time_state = TIME_WAIT;
clock_was_set(); clock_was_set();
printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n"); printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n");
...@@ -605,7 +607,7 @@ static void second_overflow(void) ...@@ -605,7 +607,7 @@ static void second_overflow(void)
/* in the NTP reference this is called "hardclock()" */ /* in the NTP reference this is called "hardclock()" */
static void update_wall_time_one_tick(void) static void update_wall_time_one_tick(void)
{ {
long time_adjust_step; long time_adjust_step, delta_nsec;
if ( (time_adjust_step = time_adjust) != 0 ) { if ( (time_adjust_step = time_adjust) != 0 ) {
/* We are doing an adjtime thing. /* We are doing an adjtime thing.
...@@ -621,11 +623,11 @@ static void update_wall_time_one_tick(void) ...@@ -621,11 +623,11 @@ static void update_wall_time_one_tick(void)
time_adjust_step = tickadj; time_adjust_step = tickadj;
else if (time_adjust < -tickadj) else if (time_adjust < -tickadj)
time_adjust_step = -tickadj; time_adjust_step = -tickadj;
/* Reduce by this step the amount of time left */ /* Reduce by this step the amount of time left */
time_adjust -= time_adjust_step; time_adjust -= time_adjust_step;
} }
xtime.tv_nsec += tick_nsec + time_adjust_step * 1000; delta_nsec = tick_nsec + time_adjust_step * 1000;
/* /*
* Advance the phase, once it gets to one microsecond, then * Advance the phase, once it gets to one microsecond, then
* advance the tick more. * advance the tick more.
...@@ -634,13 +636,15 @@ static void update_wall_time_one_tick(void) ...@@ -634,13 +636,15 @@ static void update_wall_time_one_tick(void)
if (time_phase <= -FINEUSEC) { if (time_phase <= -FINEUSEC) {
long ltemp = -time_phase >> (SHIFT_SCALE - 10); long ltemp = -time_phase >> (SHIFT_SCALE - 10);
time_phase += ltemp << (SHIFT_SCALE - 10); time_phase += ltemp << (SHIFT_SCALE - 10);
xtime.tv_nsec -= ltemp; delta_nsec -= ltemp;
} }
else if (time_phase >= FINEUSEC) { else if (time_phase >= FINEUSEC) {
long ltemp = time_phase >> (SHIFT_SCALE - 10); long ltemp = time_phase >> (SHIFT_SCALE - 10);
time_phase -= ltemp << (SHIFT_SCALE - 10); time_phase -= ltemp << (SHIFT_SCALE - 10);
xtime.tv_nsec += ltemp; delta_nsec += ltemp;
} }
xtime.tv_nsec += delta_nsec;
time_interpolator_update(delta_nsec);
} }
/* /*
...@@ -660,6 +664,7 @@ static void update_wall_time(unsigned long ticks) ...@@ -660,6 +664,7 @@ static void update_wall_time(unsigned long ticks)
if (xtime.tv_nsec >= 1000000000) { if (xtime.tv_nsec >= 1000000000) {
xtime.tv_nsec -= 1000000000; xtime.tv_nsec -= 1000000000;
xtime.tv_sec++; xtime.tv_sec++;
time_interpolator_update(NSEC_PER_SEC);
second_overflow(); second_overflow();
} }
} }
...@@ -777,7 +782,6 @@ unsigned long wall_jiffies = INITIAL_JIFFIES; ...@@ -777,7 +782,6 @@ unsigned long wall_jiffies = INITIAL_JIFFIES;
#ifndef ARCH_HAVE_XTIME_LOCK #ifndef ARCH_HAVE_XTIME_LOCK
seqlock_t xtime_lock __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED; seqlock_t xtime_lock __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED;
#endif #endif
unsigned long last_time_offset;
/* /*
* This function runs timers and the timer-tq in bottom half context. * This function runs timers and the timer-tq in bottom half context.
...@@ -811,7 +815,6 @@ static inline void update_times(void) ...@@ -811,7 +815,6 @@ static inline void update_times(void)
wall_jiffies += ticks; wall_jiffies += ticks;
update_wall_time(ticks); update_wall_time(ticks);
} }
last_time_offset = 0;
calc_load(ticks); calc_load(ticks);
} }
...@@ -1221,3 +1224,80 @@ void __init init_timers(void) ...@@ -1221,3 +1224,80 @@ void __init init_timers(void)
register_cpu_notifier(&timers_nb); register_cpu_notifier(&timers_nb);
open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL);
} }
#ifdef CONFIG_TIME_INTERPOLATION

/* Pending interpolated-time offset (nsec); consumed by time_interpolator_update(). */
volatile unsigned long last_nsec_offset;

/* Currently selected (best) interpolator, or NULL; switched under xtime_lock. */
struct time_interpolator *time_interpolator;

#ifndef __HAVE_ARCH_CMPXCHG
/* Serializes last_nsec_offset updates on arches without cmpxchg. */
spinlock_t last_nsec_offset_lock = SPIN_LOCK_UNLOCKED;
#endif
/*
 * Registry of every registered interpolator; the best one is published
 * in the time_interpolator global above.
 */
static struct {
	spinlock_t lock;		/* lock protecting list */
	struct time_interpolator *list;	/* list of registered interpolators */
} ti_global = {
	.lock = SPIN_LOCK_UNLOCKED
};
static inline int
is_better_time_interpolator(struct time_interpolator *new)
{
if (!time_interpolator)
return 1;
return new->frequency > 2*time_interpolator->frequency
|| (unsigned long) new->drift < (unsigned long) time_interpolator->drift;
}
/*
 * Add "ti" to the interpolator registry and, if it beats the current
 * selection, publish it under the xtime write-lock so gettimeofday()
 * readers always see a consistent pointer.
 */
void
register_time_interpolator(struct time_interpolator *ti)
{
	spin_lock(&ti_global.lock);

	write_seqlock_irq(&xtime_lock);
	if (is_better_time_interpolator(ti))
		time_interpolator = ti;
	write_sequnlock_irq(&xtime_lock);

	/* link onto the registry list while still holding its lock: */
	ti->next = ti_global.list;
	ti_global.list = ti;

	spin_unlock(&ti_global.lock);
}
/*
 * Remove "ti" from the interpolator registry.  If it was the published
 * interpolator, re-select the best of the remaining ones (or none)
 * under the xtime write-lock.  Lock order: ti_global.lock outside
 * xtime_lock, matching register_time_interpolator().
 */
void
unregister_time_interpolator(struct time_interpolator *ti)
{
	struct time_interpolator *curr, **prev;

	spin_lock(&ti_global.lock);
	{
		/* unlink ti from the singly-linked registry list: */
		prev = &ti_global.list;
		for (curr = *prev; curr; curr = curr->next) {
			if (curr == ti) {
				*prev = curr->next;
				break;
			}
			prev = &curr->next;
		}

		write_seqlock_irq(&xtime_lock);
		{
			if (ti == time_interpolator) {
				/* we lost the best time-interpolator: */
				time_interpolator = NULL;
				/* find the next-best interpolator */
				for (curr = ti_global.list; curr; curr = curr->next)
					if (is_better_time_interpolator(curr))
						time_interpolator = curr;
			}
		}
		write_sequnlock_irq(&xtime_lock);
	}
	spin_unlock(&ti_global.lock);
}
#endif /* CONFIG_TIME_INTERPOLATION */
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment