Commit 9dbbe3cf authored by Linus Torvalds

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm changes from Paolo Bonzini:
 "Remove from guest code the handling of task migration during a pvclock
  read; instead use the correct protocol in KVM.

  This removes the need for task migration notifiers in core scheduler
  code"

[ The scheduler people really hated the migration notifiers, so this was
  kind of required  - Linus ]

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  x86: pvclock: Really remove the sched notifier for cross-cpu migrations
  kvm: x86: fix kvmclock update protocol
parents fb45f493 73459e2a
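
For context on what "the correct protocol" refers to: kvmclock publishes one small per-vCPU record that the guest reads to convert its TSC into nanoseconds, and the record's version field acts like a seqcount, odd while the host is rewriting the record and even once the contents are consistent. The sketch below abridges that record with plain stdint types purely for illustration; the authoritative layout lives in arch/x86/include/asm/pvclock-abi.h.

#include <stdint.h>

/* Abridged sketch of the guest-visible per-vCPU record (illustration only;
 * see arch/x86/include/asm/pvclock-abi.h for the real definition). */
struct pvclock_vcpu_time_info_sketch {
	uint32_t version;            /* odd => host update in progress */
	uint32_t pad0;
	uint64_t tsc_timestamp;      /* host TSC value at the last update */
	uint64_t system_time;        /* guest nanoseconds at tsc_timestamp */
	uint32_t tsc_to_system_mul;  /* scale factor: TSC delta -> ns */
	int8_t   tsc_shift;
	uint8_t  flags;              /* e.g. PVCLOCK_TSC_STABLE_BIT */
	uint8_t  pad[2];
} __attribute__((__packed__));

Everything in the diffs below revolves around keeping that version invariant honest on both sides of the host/guest boundary.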
arch/x86/include/asm/pvclock.h

@@ -95,7 +95,6 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
 
 struct pvclock_vsyscall_time_info {
 	struct pvclock_vcpu_time_info pvti;
-	u32 migrate_count;
 } __attribute__((__aligned__(SMP_CACHE_BYTES)));
 
 #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info)
arch/x86/kernel/pvclock.c

@@ -141,46 +141,7 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
 	set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
 }
 
-static struct pvclock_vsyscall_time_info *pvclock_vdso_info;
-
-static struct pvclock_vsyscall_time_info *
-pvclock_get_vsyscall_user_time_info(int cpu)
-{
-	if (!pvclock_vdso_info) {
-		BUG();
-		return NULL;
-	}
-
-	return &pvclock_vdso_info[cpu];
-}
-
-struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu)
-{
-	return &pvclock_get_vsyscall_user_time_info(cpu)->pvti;
-}
-
 #ifdef CONFIG_X86_64
-static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l,
-			void *v)
-{
-	struct task_migration_notifier *mn = v;
-	struct pvclock_vsyscall_time_info *pvti;
-
-	pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu);
-
-	/* this is NULL when pvclock vsyscall is not initialized */
-	if (unlikely(pvti == NULL))
-		return NOTIFY_DONE;
-
-	pvti->migrate_count++;
-
-	return NOTIFY_DONE;
-}
-
-static struct notifier_block pvclock_migrate = {
-	.notifier_call = pvclock_task_migrate,
-};
-
 /*
  * Initialize the generic pvclock vsyscall state.  This will allocate
  * a/some page(s) for the per-vcpu pvclock information, set up a
@@ -194,17 +155,12 @@ int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
 
 	WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE);
 
-	pvclock_vdso_info = i;
-
 	for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) {
 		__set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx,
 			     __pa(i) + (idx*PAGE_SIZE),
 			     PAGE_KERNEL_VVAR);
 	}
 
-	register_task_migration_notifier(&pvclock_migrate);
-
 	return 0;
 }
 #endif
arch/x86/kvm/x86.c

@@ -1669,12 +1669,28 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 		&guest_hv_clock, sizeof(guest_hv_clock))))
 		return 0;
 
-	/*
-	 * The interface expects us to write an even number signaling that the
-	 * update is finished. Since the guest won't see the intermediate
-	 * state, we just increase by 2 at the end.
+	/* This VCPU is paused, but it's legal for a guest to read another
+	 * VCPU's kvmclock, so we really have to follow the specification where
+	 * it says that version is odd if data is being modified, and even after
+	 * it is consistent.
+	 *
+	 * Version field updates must be kept separate.  This is because
+	 * kvm_write_guest_cached might use a "rep movs" instruction, and
+	 * writes within a string instruction are weakly ordered.  So there
+	 * are three writes overall.
+	 *
+	 * As a small optimization, only write the version field in the first
+	 * and third write.  The vcpu->pv_time cache is still valid, because the
+	 * version field is the first in the struct.
 	 */
-	vcpu->hv_clock.version = guest_hv_clock.version + 2;
+	BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
+
+	vcpu->hv_clock.version = guest_hv_clock.version + 1;
+	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+				&vcpu->hv_clock,
+				sizeof(vcpu->hv_clock.version));
+
+	smp_wmb();
 
 	/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
 	pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
@@ -1695,6 +1711,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
 				&vcpu->hv_clock,
 				sizeof(vcpu->hv_clock));
+
+	smp_wmb();
+
+	vcpu->hv_clock.version++;
+	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+				&vcpu->hv_clock,
+				sizeof(vcpu->hv_clock.version));
 
 	return 0;
 }
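
The comment in the hunk above is the heart of the fix: because a guest on another vCPU may read the record at any moment, and because kvm_write_guest_cached() might use a string move whose stores are weakly ordered, the host must publish the update as three separately ordered writes. Below is a minimal userspace illustration of that ordering, assuming a directly shared mapping instead of KVM's guest-cache helpers; struct shared_clock and clock_publish() are hypothetical names for this sketch, not KVM APIs, and the C11 fences stand in for smp_wmb().

#include <stdatomic.h>
#include <stdint.h>

/* Hypothetical stand-in for the shared pvclock record. */
struct shared_clock {
	uint32_t version;       /* odd while an update is in flight */
	uint64_t tsc_timestamp;
	uint64_t system_time;
};

/* Publish a new snapshot with the same odd/data/even sequence that the
 * kvm_guest_time_update() hunk performs with three guest writes. */
static void clock_publish(volatile struct shared_clock *dst,
			  const struct shared_clock *src)
{
	dst->version += 1;                           /* 1st write: go odd */
	atomic_thread_fence(memory_order_release);   /* version before data */

	dst->tsc_timestamp = src->tsc_timestamp;     /* 2nd write: payload */
	dst->system_time = src->system_time;
	atomic_thread_fence(memory_order_release);   /* data before version */

	dst->version += 1;                           /* 3rd write: even again */
}

Only the middle write touches the payload, so any reader that observes an even, unchanged version has necessarily seen a complete snapshot.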
arch/x86/vdso/vclock_gettime.c

@@ -82,15 +82,18 @@ static notrace cycle_t vread_pvclock(int *mode)
 	cycle_t ret;
 	u64 last;
 	u32 version;
-	u32 migrate_count;
 	u8 flags;
 	unsigned cpu, cpu1;
 
 	/*
-	 * When looping to get a consistent (time-info, tsc) pair, we
-	 * also need to deal with the possibility we can switch vcpus,
-	 * so make sure we always re-fetch time-info for the current vcpu.
+	 * Note: hypervisor must guarantee that:
+	 * 1. cpu ID number maps 1:1 to per-CPU pvclock time info.
+	 * 2. that per-CPU pvclock time info is updated if the
+	 *    underlying CPU changes.
+	 * 3. that version is increased whenever underlying CPU
+	 *    changes.
+	 *
 	 */
 	do {
 		cpu = __getcpu() & VGETCPU_CPU_MASK;
@@ -99,27 +102,20 @@ static notrace cycle_t vread_pvclock(int *mode)
 		 * __getcpu() calls (Gleb).
 		 */
 
-		/* Make sure migrate_count will change if we leave the VCPU. */
-		do {
-			pvti = get_pvti(cpu);
-			migrate_count = pvti->migrate_count;
-
-			cpu1 = cpu;
-			cpu = __getcpu() & VGETCPU_CPU_MASK;
-		} while (unlikely(cpu != cpu1));
+		pvti = get_pvti(cpu);
 
 		version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
 
 		/*
 		 * Test we're still on the cpu as well as the version.
-		 * - We must read TSC of pvti's VCPU.
-		 * - KVM doesn't follow the versioning protocol, so data could
-		 *   change before version if we left the VCPU.
+		 * We could have been migrated just after the first
+		 * vgetcpu but before fetching the version, so we
+		 * wouldn't notice a version change.
 		 */
-		smp_rmb();
-	} while (unlikely((pvti->pvti.version & 1) ||
-			  pvti->pvti.version != version ||
-			  pvti->migrate_count != migrate_count));
+		cpu1 = __getcpu() & VGETCPU_CPU_MASK;
+	} while (unlikely(cpu != cpu1 ||
+			  (pvti->pvti.version & 1) ||
+			  pvti->pvti.version != version));
 
 	if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
 		*mode = VCLOCK_NONE;
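
The vDSO loop above is the consuming side of the same protocol: snapshot the version, read the payload, then confirm the version is still even and unchanged (plus, in the real code, that the task is still on the CPU whose record was read). Here is a matching userspace reader for the hypothetical record from the previous sketch, again with C11 fences standing in for the kernel's barriers:

#include <stdatomic.h>
#include <stdint.h>

struct shared_clock {
	uint32_t version;       /* odd while an update is in flight */
	uint64_t tsc_timestamp;
	uint64_t system_time;
};

/* Retry until an even, unchanged version brackets the reads, i.e. the
 * snapshot was not being rewritten by the publisher while we copied it. */
static uint64_t clock_read(const volatile struct shared_clock *src)
{
	uint32_t v1, v2;
	uint64_t snapshot;

	do {
		v1 = src->version;
		atomic_thread_fence(memory_order_acquire);  /* version before data */
		snapshot = src->system_time;
		atomic_thread_fence(memory_order_acquire);  /* data before re-check */
		v2 = src->version;
	} while ((v1 & 1) || v1 != v2);

	return snapshot;
}

With the host now honouring the odd/even rule, this version check is enough on its own, which is why the migrate_count side channel and the scheduler notifier that fed it can be deleted below.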
include/linux/sched.h

@@ -175,14 +175,6 @@ extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load);
 extern void calc_global_load(unsigned long ticks);
 extern void update_cpu_load_nohz(void);
 
-/* Notifier for when a task gets migrated to a new CPU */
-struct task_migration_notifier {
-	struct task_struct *task;
-	int from_cpu;
-	int to_cpu;
-};
-extern void register_task_migration_notifier(struct notifier_block *n);
-
 extern unsigned long get_parent_ip(unsigned long addr);
 
 extern void dump_cpu_task(int cpu);
kernel/sched/core.c

@@ -1016,13 +1016,6 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 		rq_clock_skip_update(rq, true);
 }
 
-static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);
-
-void register_task_migration_notifier(struct notifier_block *n)
-{
-	atomic_notifier_chain_register(&task_migration_notifier, n);
-}
-
 #ifdef CONFIG_SMP
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 {
@@ -1053,18 +1046,10 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 	trace_sched_migrate_task(p, new_cpu);
 
 	if (task_cpu(p) != new_cpu) {
-		struct task_migration_notifier tmn;
-
 		if (p->sched_class->migrate_task_rq)
 			p->sched_class->migrate_task_rq(p, new_cpu);
 		p->se.nr_migrations++;
 		perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
-
-		tmn.task = p;
-		tmn.from_cpu = task_cpu(p);
-		tmn.to_cpu = new_cpu;
-
-		atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn);
 	}
 
 	__set_task_cpu(p, new_cpu);