Commit f7dd3b17 authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'x86-timers-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull timer updates from Thomas Gleixner:
 "This is the last functional update from the tip tree for 4.10. It got
  delayed due to a newly reported and anlyzed variant of BIOS bug and
  the resulting wreckage:

   - Seperation of TSC being marked realiable and the fact that the
     platform provides the TSC frequency via CPUID/MSRs and making use
     for it for GOLDMONT.

   - TSC adjust MSR validation and sanitizing:

     The TSC adjust MSR contains the offset to the hardware counter. The
     sum of the adjust MSR and the counter is the TSC value which is
     read via RDTSC.

     On at least two machines from different vendors the BIOS sets the
     TSC adjust MSR to negative values. This happens on cold and warm
     boot. While on cold boot the offset is a few milliseconds, on warm
     boot it basically compensates the power on time of the system. The
     BIOSes are not even using the adjust MSR to set all CPUs in the
     package to the same offset. The offsets are different which renders
     the TSC unusable,

     What's worse is that the TSC deadline timer has a HW feature^Wbug.
     It malfunctions when the TSC adjust value is negative or greater
     equal 0x80000000 resulting in silent boot failures, hard lockups or
     non firing timers. This looks like some hardware internal 32/64bit
     issue with a sign extension problem. Intel has been silent so far
     on the issue.

     The update contains sanity checks and keeps the adjust register
     within working limits and in sync on the package.

     As it looks like this disease is spreading via BIOS crapware, we
     need to address this urgently as the boot failures are hard to
     debug for users"

* 'x86-timers-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/tsc: Limit the adjust value further
  x86/tsc: Annotate printouts as firmware bug
  x86/tsc: Force TSC_ADJUST register to value >= zero
  x86/tsc: Validate TSC_ADJUST after resume
  x86/tsc: Validate cpumask pointer before accessing it
  x86/tsc: Fix broken CONFIG_X86_TSC=n build
  x86/tsc: Try to adjust TSC if sync test fails
  x86/tsc: Prepare warp test for TSC adjustment
  x86/tsc: Move sync cleanup to a safe place
  x86/tsc: Sync test only for the first cpu in a package
  x86/tsc: Verify TSC_ADJUST from idle
  x86/tsc: Store and check TSC ADJUST MSR
  x86/tsc: Detect random warps
  x86/tsc: Use X86_FEATURE_TSC_ADJUST in detect_art()
  x86/tsc: Finalize the split of the TSC_RELIABLE flag
  x86/tsc: Set TSC_KNOWN_FREQ and TSC_RELIABLE flags on Intel Atom SoCs
  x86/tsc: Mark Intel ATOM_GOLDMONT TSC reliable
  x86/tsc: Mark TSC frequency determined by CPUID as known
  x86/tsc: Add X86_FEATURE_TSC_KNOWN_FREQ flag
parents 1bbb05f5 8c9b9d87
......@@ -105,6 +105,7 @@
#define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */
#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */
#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */
#define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */
/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */
......
......@@ -45,8 +45,17 @@ extern int tsc_clocksource_reliable;
* Boot-time check whether the TSCs are synchronized across
* all CPUs/cores:
*/
#ifdef CONFIG_X86_TSC
extern bool tsc_store_and_check_tsc_adjust(bool bootcpu);
extern void tsc_verify_tsc_adjust(bool resume);
extern void check_tsc_sync_source(int cpu);
extern void check_tsc_sync_target(void);
#else
static inline bool tsc_store_and_check_tsc_adjust(bool bootcpu) { return false; }
static inline void tsc_verify_tsc_adjust(bool resume) { }
static inline void check_tsc_sync_source(int cpu) { }
static inline void check_tsc_sync_target(void) { }
#endif
extern int notsc_setup(char *);
extern void tsc_save_sched_clock_state(void);
......
......@@ -75,7 +75,7 @@ apm-y := apm_32.o
obj-$(CONFIG_APM) += apm.o
obj-$(CONFIG_SMP) += smp.o
obj-$(CONFIG_SMP) += smpboot.o
obj-$(CONFIG_SMP) += tsc_sync.o
obj-$(CONFIG_X86_TSC) += tsc_sync.o
obj-$(CONFIG_SMP) += setup_percpu.o
obj-$(CONFIG_X86_MPPARSE) += mpparse.o
obj-y += apic/
......
......@@ -235,6 +235,7 @@ static inline void play_dead(void)
void arch_cpu_idle_enter(void)
{
tsc_verify_tsc_adjust(false);
local_touch_nmi();
}
......
......@@ -702,6 +702,20 @@ unsigned long native_calibrate_tsc(void)
}
}
/*
* TSC frequency determined by CPUID is a "hardware reported"
* frequency and is the most accurate one so far we have. This
* is considered a known frequency.
*/
setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
/*
* For Atom SoCs TSC is the only reliable clocksource.
* Mark TSC reliable so no watchdog on it.
*/
if (boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT)
setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
return crystal_khz * ebx_numerator / eax_denominator;
}
......@@ -1043,18 +1057,20 @@ static void detect_art(void)
if (boot_cpu_data.cpuid_level < ART_CPUID_LEAF)
return;
cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator,
&art_to_tsc_numerator, unused, unused+1);
/* Don't enable ART in a VM, non-stop TSC required */
/* Don't enable ART in a VM, non-stop TSC and TSC_ADJUST required */
if (boot_cpu_has(X86_FEATURE_HYPERVISOR) ||
!boot_cpu_has(X86_FEATURE_NONSTOP_TSC) ||
art_to_tsc_denominator < ART_MIN_DENOMINATOR)
!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
return;
if (rdmsrl_safe(MSR_IA32_TSC_ADJUST, &art_to_tsc_offset))
cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator,
&art_to_tsc_numerator, unused, unused+1);
if (art_to_tsc_denominator < ART_MIN_DENOMINATOR)
return;
rdmsrl(MSR_IA32_TSC_ADJUST, art_to_tsc_offset);
/* Make this sticky over multiple CPU init calls */
setup_force_cpu_cap(X86_FEATURE_ART);
}
......@@ -1064,6 +1080,11 @@ static void detect_art(void)
static struct clocksource clocksource_tsc;
static void tsc_resume(struct clocksource *cs)
{
tsc_verify_tsc_adjust(true);
}
/*
* We used to compare the TSC to the cycle_last value in the clocksource
* structure to avoid a nasty time-warp. This can be observed in a
......@@ -1096,6 +1117,7 @@ static struct clocksource clocksource_tsc = {
.flags = CLOCK_SOURCE_IS_CONTINUOUS |
CLOCK_SOURCE_MUST_VERIFY,
.archdata = { .vclock_mode = VCLOCK_TSC },
.resume = tsc_resume,
};
void mark_tsc_unstable(char *reason)
......@@ -1283,10 +1305,10 @@ static int __init init_tsc_clocksource(void)
clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP;
/*
* Trust the results of the earlier calibration on systems
* exporting a reliable TSC.
* When TSC frequency is known (retrieved via MSR or CPUID), we skip
* the refined calibration and directly register it as a clocksource.
*/
if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) {
clocksource_register_khz(&clocksource_tsc, tsc_khz);
return 0;
}
......@@ -1363,6 +1385,8 @@ void __init tsc_init(void)
if (unsynchronized_tsc())
mark_tsc_unstable("TSCs unsynchronized");
else
tsc_store_and_check_tsc_adjust(true);
check_system_tsc_reliable();
......
......@@ -100,5 +100,24 @@ unsigned long cpu_khz_from_msr(void)
#ifdef CONFIG_X86_LOCAL_APIC
lapic_timer_frequency = (freq * 1000) / HZ;
#endif
/*
* TSC frequency determined by MSR is always considered "known"
* because it is reported by HW.
* Another fact is that on MSR capable platforms, PIT/HPET is
* generally not available so calibration won't work at all.
*/
setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
/*
* Unfortunately there is no way for hardware to tell whether the
* TSC is reliable. We were told by silicon design team that TSC
* on Atom SoCs are always "reliable". TSC is also the only
* reliable clocksource on these SoCs (HPET is either not present
* or not functional) so mark TSC reliable which removes the
* requirement for a watchdog clocksource.
*/
setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
return res;
}
......@@ -14,18 +14,166 @@
* ( The serial nature of the boot logic and the CPU hotplug lock
* protects against more than 2 CPUs entering this code. )
*/
#include <linux/topology.h>
#include <linux/spinlock.h>
#include <linux/kernel.h>
#include <linux/smp.h>
#include <linux/nmi.h>
#include <asm/tsc.h>
struct tsc_adjust {
s64 bootval;
s64 adjusted;
unsigned long nextcheck;
bool warned;
};
static DEFINE_PER_CPU(struct tsc_adjust, tsc_adjust);
void tsc_verify_tsc_adjust(bool resume)
{
struct tsc_adjust *adj = this_cpu_ptr(&tsc_adjust);
s64 curval;
if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
return;
/* Rate limit the MSR check */
if (!resume && time_before(jiffies, adj->nextcheck))
return;
adj->nextcheck = jiffies + HZ;
rdmsrl(MSR_IA32_TSC_ADJUST, curval);
if (adj->adjusted == curval)
return;
/* Restore the original value */
wrmsrl(MSR_IA32_TSC_ADJUST, adj->adjusted);
if (!adj->warned || resume) {
pr_warn(FW_BUG "TSC ADJUST differs: CPU%u %lld --> %lld. Restoring\n",
smp_processor_id(), adj->adjusted, curval);
adj->warned = true;
}
}
static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval,
unsigned int cpu, bool bootcpu)
{
/*
* First online CPU in a package stores the boot value in the
* adjustment value. This value might change later via the sync
* mechanism. If that fails we still can yell about boot values not
* being consistent.
*
* On the boot cpu we just force set the ADJUST value to 0 if it's
* non zero. We don't do that on non boot cpus because physical
* hotplug should have set the ADJUST register to a value > 0 so
* the TSC is in sync with the already running cpus.
*
* But we always force positive ADJUST values. Otherwise the TSC
* deadline timer creates an interrupt storm. We also have to
* prevent values > 0x7FFFFFFF as those wreckage the timer as well.
*/
if ((bootcpu && bootval != 0) || (!bootcpu && bootval < 0) ||
(bootval > 0x7FFFFFFF)) {
pr_warn(FW_BUG "TSC ADJUST: CPU%u: %lld force to 0\n", cpu,
bootval);
wrmsrl(MSR_IA32_TSC_ADJUST, 0);
bootval = 0;
}
cur->adjusted = bootval;
}
#ifndef CONFIG_SMP
bool __init tsc_store_and_check_tsc_adjust(bool bootcpu)
{
struct tsc_adjust *cur = this_cpu_ptr(&tsc_adjust);
s64 bootval;
if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
return false;
rdmsrl(MSR_IA32_TSC_ADJUST, bootval);
cur->bootval = bootval;
cur->nextcheck = jiffies + HZ;
tsc_sanitize_first_cpu(cur, bootval, smp_processor_id(), bootcpu);
return false;
}
#else /* !CONFIG_SMP */
/*
* Store and check the TSC ADJUST MSR if available
*/
bool tsc_store_and_check_tsc_adjust(bool bootcpu)
{
struct tsc_adjust *ref, *cur = this_cpu_ptr(&tsc_adjust);
unsigned int refcpu, cpu = smp_processor_id();
struct cpumask *mask;
s64 bootval;
if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
return false;
rdmsrl(MSR_IA32_TSC_ADJUST, bootval);
cur->bootval = bootval;
cur->nextcheck = jiffies + HZ;
cur->warned = false;
/*
* Check whether this CPU is the first in a package to come up. In
* this case do not check the boot value against another package
* because the new package might have been physically hotplugged,
* where TSC_ADJUST is expected to be different. When called on the
* boot CPU topology_core_cpumask() might not be available yet.
*/
mask = topology_core_cpumask(cpu);
refcpu = mask ? cpumask_any_but(mask, cpu) : nr_cpu_ids;
if (refcpu >= nr_cpu_ids) {
tsc_sanitize_first_cpu(cur, bootval, smp_processor_id(),
bootcpu);
return false;
}
ref = per_cpu_ptr(&tsc_adjust, refcpu);
/*
* Compare the boot value and complain if it differs in the
* package.
*/
if (bootval != ref->bootval) {
pr_warn(FW_BUG "TSC ADJUST differs: Reference CPU%u: %lld CPU%u: %lld\n",
refcpu, ref->bootval, cpu, bootval);
}
/*
* The TSC_ADJUST values in a package must be the same. If the boot
* value on this newly upcoming CPU differs from the adjustment
* value of the already online CPU in this package, set it to that
* adjusted value.
*/
if (bootval != ref->adjusted) {
pr_warn("TSC ADJUST synchronize: Reference CPU%u: %lld CPU%u: %lld\n",
refcpu, ref->adjusted, cpu, bootval);
cur->adjusted = ref->adjusted;
wrmsrl(MSR_IA32_TSC_ADJUST, ref->adjusted);
}
/*
* We have the TSCs forced to be in sync on this package. Skip sync
* test:
*/
return true;
}
/*
* Entry/exit counters that make sure that both CPUs
* run the measurement code at once:
*/
static atomic_t start_count;
static atomic_t stop_count;
static atomic_t skip_test;
static atomic_t test_runs;
/*
* We use a raw spinlock in this exceptional case, because
......@@ -37,15 +185,16 @@ static arch_spinlock_t sync_lock = __ARCH_SPIN_LOCK_UNLOCKED;
static cycles_t last_tsc;
static cycles_t max_warp;
static int nr_warps;
static int random_warps;
/*
* TSC-warp measurement loop running on both CPUs. This is not called
* if there is no TSC.
*/
static void check_tsc_warp(unsigned int timeout)
static cycles_t check_tsc_warp(unsigned int timeout)
{
cycles_t start, now, prev, end;
int i;
cycles_t start, now, prev, end, cur_max_warp = 0;
int i, cur_warps = 0;
start = rdtsc_ordered();
/*
......@@ -85,13 +234,22 @@ static void check_tsc_warp(unsigned int timeout)
if (unlikely(prev > now)) {
arch_spin_lock(&sync_lock);
max_warp = max(max_warp, prev - now);
cur_max_warp = max_warp;
/*
* Check whether this bounces back and forth. Only
* one CPU should observe time going backwards.
*/
if (cur_warps != nr_warps)
random_warps++;
nr_warps++;
cur_warps = nr_warps;
arch_spin_unlock(&sync_lock);
}
}
WARN(!(now-start),
"Warning: zero tsc calibration delta: %Ld [max: %Ld]\n",
now-start, end-start);
return cur_max_warp;
}
/*
......@@ -136,15 +294,26 @@ void check_tsc_sync_source(int cpu)
}
/*
* Reset it - in case this is a second bootup:
* Set the maximum number of test runs to
* 1 if the CPU does not provide the TSC_ADJUST MSR
* 3 if the MSR is available, so the target can try to adjust
*/
atomic_set(&stop_count, 0);
if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
atomic_set(&test_runs, 1);
else
atomic_set(&test_runs, 3);
retry:
/*
* Wait for the target to arrive:
* Wait for the target to start or to skip the test:
*/
while (atomic_read(&start_count) != cpus-1)
while (atomic_read(&start_count) != cpus - 1) {
if (atomic_read(&skip_test) > 0) {
atomic_set(&skip_test, 0);
return;
}
cpu_relax();
}
/*
* Trigger the target to continue into the measurement too:
*/
......@@ -155,21 +324,35 @@ void check_tsc_sync_source(int cpu)
while (atomic_read(&stop_count) != cpus-1)
cpu_relax();
if (nr_warps) {
/*
* If the test was successful set the number of runs to zero and
* stop. If not, decrement the number of runs an check if we can
* retry. In case of random warps no retry is attempted.
*/
if (!nr_warps) {
atomic_set(&test_runs, 0);
pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n",
smp_processor_id(), cpu);
} else if (atomic_dec_and_test(&test_runs) || random_warps) {
/* Force it to 0 if random warps brought us here */
atomic_set(&test_runs, 0);
pr_warning("TSC synchronization [CPU#%d -> CPU#%d]:\n",
smp_processor_id(), cpu);
pr_warning("Measured %Ld cycles TSC warp between CPUs, "
"turning off TSC clock.\n", max_warp);
if (random_warps)
pr_warning("TSC warped randomly between CPUs\n");
mark_tsc_unstable("check_tsc_sync_source failed");
} else {
pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n",
smp_processor_id(), cpu);
}
/*
* Reset it - just in case we boot another CPU later:
*/
atomic_set(&start_count, 0);
random_warps = 0;
nr_warps = 0;
max_warp = 0;
last_tsc = 0;
......@@ -178,6 +361,12 @@ void check_tsc_sync_source(int cpu)
* Let the target continue with the bootup:
*/
atomic_inc(&stop_count);
/*
* Retry, if there is a chance to do so.
*/
if (atomic_read(&test_runs) > 0)
goto retry;
}
/*
......@@ -185,12 +374,25 @@ void check_tsc_sync_source(int cpu)
*/
void check_tsc_sync_target(void)
{
struct tsc_adjust *cur = this_cpu_ptr(&tsc_adjust);
unsigned int cpu = smp_processor_id();
cycles_t cur_max_warp, gbl_max_warp;
int cpus = 2;
/* Also aborts if there is no TSC. */
if (unsynchronized_tsc() || tsc_clocksource_reliable)
return;
/*
* Store, verify and sanitize the TSC adjust register. If
* successful skip the test.
*/
if (tsc_store_and_check_tsc_adjust(false)) {
atomic_inc(&skip_test);
return;
}
retry:
/*
* Register this CPU's participation and wait for the
* source CPU to start the measurement:
......@@ -199,7 +401,12 @@ void check_tsc_sync_target(void)
while (atomic_read(&start_count) != cpus)
cpu_relax();
check_tsc_warp(loop_timeout(smp_processor_id()));
cur_max_warp = check_tsc_warp(loop_timeout(cpu));
/*
* Store the maximum observed warp value for a potential retry:
*/
gbl_max_warp = max_warp;
/*
* Ok, we are done:
......@@ -211,4 +418,61 @@ void check_tsc_sync_target(void)
*/
while (atomic_read(&stop_count) != cpus)
cpu_relax();
/*
* Reset it for the next sync test:
*/
atomic_set(&stop_count, 0);
/*
* Check the number of remaining test runs. If not zero, the test
* failed and a retry with adjusted TSC is possible. If zero the
* test was either successful or failed terminally.
*/
if (!atomic_read(&test_runs))
return;
/*
* If the warp value of this CPU is 0, then the other CPU
* observed time going backwards so this TSC was ahead and
* needs to move backwards.
*/
if (!cur_max_warp)
cur_max_warp = -gbl_max_warp;
/*
* Add the result to the previous adjustment value.
*
* The adjustement value is slightly off by the overhead of the
* sync mechanism (observed values are ~200 TSC cycles), but this
* really depends on CPU, node distance and frequency. So
* compensating for this is hard to get right. Experiments show
* that the warp is not longer detectable when the observed warp
* value is used. In the worst case the adjustment needs to go
* through a 3rd run for fine tuning.
*/
cur->adjusted += cur_max_warp;
/*
* TSC deadline timer stops working or creates an interrupt storm
* with adjust values < 0 and > x07ffffff.
*
* To allow adjust values > 0x7FFFFFFF we need to disable the
* deadline timer and use the local APIC timer, but that requires
* more intrusive changes and we do not have any useful information
* from Intel about the underlying HW wreckage yet.
*/
if (cur->adjusted < 0)
cur->adjusted = 0;
if (cur->adjusted > 0x7FFFFFFF)
cur->adjusted = 0x7FFFFFFF;
pr_warn("TSC ADJUST compensate: CPU%u observed %lld warp. Adjust: %lld\n",
cpu, cur_max_warp, cur->adjusted);
wrmsrl(MSR_IA32_TSC_ADJUST, cur->adjusted);
goto retry;
}
#endif /* CONFIG_SMP */
......@@ -49,8 +49,13 @@ static unsigned long __init mfld_calibrate_tsc(void)
fast_calibrate = ratio * fsb;
pr_debug("read penwell tsc %lu khz\n", fast_calibrate);
lapic_timer_frequency = fsb * 1000 / HZ;
/* mark tsc clocksource as reliable */
set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE);
/*
* TSC on Intel Atom SoCs is reliable and of known frequency.
* See tsc_msr.c for details.
*/
setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
return fast_calibrate;
}
......
......@@ -78,8 +78,12 @@ static unsigned long __init tangier_calibrate_tsc(void)
pr_debug("Setting lapic_timer_frequency = %d\n",
lapic_timer_frequency);
/* mark tsc clocksource as reliable */
set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE);
/*
* TSC on Intel Atom SoCs is reliable and of known frequency.
* See tsc_msr.c for details.
*/
setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
return fast_calibrate;
}
......
......@@ -252,6 +252,7 @@ static void notrace __restore_processor_state(struct saved_context *ctxt)
fix_processor_context();
do_fpu_end();
tsc_verify_tsc_adjust(true);
x86_platform.restore_sched_clock_state();
mtrr_bp_restore();
perf_restore_debug_store();
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment