Merge tag 'powerpc-4.17-4' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux

Pull powerpc fixes from Michael Ellerman: "A bunch of fixes, mostly for existing code and going to stable. Our memory hot-unplug path wasn't flushing the cache before removing memory. That is a problem now that we are doing memory hotplug on bare metal. Three fixes for the NPU code that supports devices connected via NVLink (ie. GPUs). The main one tweaks the TLB flush algorithm to avoid soft lockups for large flushes. A fix for our memory error handling where we would loop infinitely, returning back to the bad access and hard lockup the CPU. Fixes for the OPAL RTC driver, which wasn't handling some error cases correctly. A fix for a hardlockup in the powernv cpufreq driver. And finally two fixes to our smp_send_stop(), required due to a recent change to use it on shutdown. Thanks to: Alistair Popple, Balbir Singh, Laurentiu Tudor, Mahesh Salgaonkar, Mark Hairgrove, Nicholas Piggin, Rashmica Gupta, Shilpasri G Bhat" * tag 'powerpc-4.17-4' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux: powerpc/kvm/booke: Fix altivec related build break powerpc: Fix deadlock with multiple calls to smp_send_stop cpufreq: powernv: Fix hardlockup due to synchronous smp_call in timer interrupt powerpc: Fix smp_send_stop NMI IPI handling rtc: opal: Fix OPAL RTC driver OPAL_BUSY loops powerpc/mce: Fix a bug where mce loops on memory UE. powerpc/powernv/npu: Do a PID GPU TLB flush when invalidating a large address range powerpc/powernv/npu: Prevent overwriting of pnv_npu2_init_contex() callback parameters powerpc/powernv/npu: Add lock to prevent race in concurrent context init/destroy powerpc/powernv/memtrace: Let the arch hotunplug code flush cache powerpc/mm: Flush cache on memory hot(un)plug

Merge tag 'powerpc-4.17-4' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux
Pull powerpc fixes from Michael Ellerman: "A bunch of fixes, mostly for existing code and going to stable. Our memory hot-unplug path wasn't flushing the cache before removing memory. That is a problem now that we are doing memory hotplug on bare metal. Three fixes for the NPU code that supports devices connected via NVLink (ie. GPUs). The main one tweaks the TLB flush algorithm to avoid soft lockups for large flushes. A fix for our memory error handling where we would loop infinitely, returning back to the bad access and hard lockup the CPU. Fixes for the OPAL RTC driver, which wasn't handling some error cases correctly. A fix for a hardlockup in the powernv cpufreq driver. And finally two fixes to our smp_send_stop(), required due to a recent change to use it on shutdown. Thanks to: Alistair Popple, Balbir Singh, Laurentiu Tudor, Mahesh Salgaonkar, Mark Hairgrove, Nicholas Piggin, Rashmica Gupta, Shilpasri G Bhat" * tag 'powerpc-4.17-4' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux: powerpc/kvm/booke: Fix altivec related build break powerpc: Fix deadlock with multiple calls to smp_send_stop cpufreq: powernv: Fix hardlockup due to synchronous smp_call in timer interrupt powerpc: Fix smp_send_stop NMI IPI handling rtc: opal: Fix OPAL RTC driver OPAL_BUSY loops powerpc/mce: Fix a bug where mce loops on memory UE. powerpc/powernv/npu: Do a PID GPU TLB flush when invalidating a large address range powerpc/powernv/npu: Prevent overwriting of pnv_npu2_init_contex() callback parameters powerpc/powernv/npu: Add lock to prevent race in concurrent context init/destroy powerpc/powernv/memtrace: Let the arch hotunplug code flush cache powerpc/mm: Flush cache on memory hot(un)plug
0d95cfa9 · Linus Torvalds · 46dc111d · b2d7ecbe · 0d95cfa9 · 0d95cfa9
Commit 0d95cfa9 authored Apr 28, 2018 by Linus Torvalds
10 changed files
--- a/arch/powerpc/include/asm/powernv.h
+++ b/arch/powerpc/include/asm/powernv.h
@@ -15,7 +15,7 @@
 extern void powernv_set_nmmu_ptcr(unsigned long ptcr);
 extern struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
 			unsigned long flags,
-			struct npu_context *(*cb)(struct npu_context *, void *),
+			void (*cb)(struct npu_context *, void *),
 			void *priv);
 extern void pnv_npu2_destroy_context(struct npu_context *context,
 				struct pci_dev *gpdev);

--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -441,7 +441,6 @@ static int mce_handle_ierror(struct pt_regs *regs,
 					if (pfn != ULONG_MAX) {
 						*phys_addr =
 							(pfn << PAGE_SHIFT);
-						handled = 1;
 					}
 				}
 			}
@@ -532,9 +531,7 @@ static int mce_handle_derror(struct pt_regs *regs,
 			 * kernel/exception-64s.h
 			 */
 			if (get_paca()->in_mce < MAX_MCE_DEPTH)
-				if (!mce_find_instr_ea_and_pfn(regs, addr,
-								phys_addr))
-					handled = 1;
+				mce_find_instr_ea_and_pfn(regs, addr, phys_addr);
 		}
 		found = 1;
 	}
@@ -572,7 +569,7 @@ static long mce_handle_error(struct pt_regs *regs,
 		const struct mce_ierror_table itable[])
 {
 	struct mce_error_info mce_err = { 0 };
-	uint64_t addr, phys_addr;
+	uint64_t addr, phys_addr = ULONG_MAX;
 	uint64_t srr1 = regs->msr;
 	long handled;


--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -566,10 +566,35 @@ void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *))
 #endif

 #ifdef CONFIG_NMI_IPI
-static void stop_this_cpu(struct pt_regs *regs)
-#else
+static void nmi_stop_this_cpu(struct pt_regs *regs)
+{
+	/*
+	 * This is a special case because it never returns, so the NMI IPI
+	 * handling would never mark it as done, which makes any later
+	 * smp_send_nmi_ipi() call spin forever. Mark it done now.
+	 *
+	 * IRQs are already hard disabled by the smp_handle_nmi_ipi.
+	 */
+	nmi_ipi_lock();
+	nmi_ipi_busy_count--;
+	nmi_ipi_unlock();
+
+	/* Remove this CPU */
+	set_cpu_online(smp_processor_id(), false);
+
+	spin_begin();
+	while (1)
+		spin_cpu_relax();
+}
+
+void smp_send_stop(void)
+{
+	smp_send_nmi_ipi(NMI_IPI_ALL_OTHERS, nmi_stop_this_cpu, 1000000);
+}
+
+#else /* CONFIG_NMI_IPI */
+
 static void stop_this_cpu(void *dummy)
-#endif
 {
 	/* Remove this CPU */
 	set_cpu_online(smp_processor_id(), false);
@@ -582,12 +607,22 @@ static void stop_this_cpu(void *dummy)

 void smp_send_stop(void)
 {
-#ifdef CONFIG_NMI_IPI
-	smp_send_nmi_ipi(NMI_IPI_ALL_OTHERS, stop_this_cpu, 1000000);
-#else
+	static bool stopped = false;
+
+	/*
+	 * Prevent waiting on csd lock from a previous smp_send_stop.
+	 * This is racy, but in general callers try to do the right
+	 * thing and only fire off one smp_send_stop (e.g., see
+	 * kernel/panic.c)
+	 */
+	if (stopped)
+		return;
+
+	stopped = true;
+
 	smp_call_function(stop_this_cpu, NULL, 0);
-#endif
 }
+#endif /* CONFIG_NMI_IPI */

 struct thread_info *current_set[NR_CPUS];


--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -305,6 +305,13 @@ void kvmppc_core_queue_fpunavail(struct kvm_vcpu *vcpu)
 	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_FP_UNAVAIL);
 }

+#ifdef CONFIG_ALTIVEC
+void kvmppc_core_queue_vec_unavail(struct kvm_vcpu *vcpu)
+{
+	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_ALTIVEC_UNAVAIL);
+}
+#endif
+
 void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu)
 {
 	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DECREMENTER);

--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -133,6 +133,7 @@ int __meminit arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *
 			start, start + size, rc);
 		return -EFAULT;
 	}
+	flush_inval_dcache_range(start, start + size);

 	return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
 }
@@ -159,6 +160,7 @@ int __meminit arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap

 	/* Remove htab bolted mappings for this section of memory */
 	start = (unsigned long)__va(start);
+	flush_inval_dcache_range(start, start + size);
 	ret = remove_section_mapping(start, start + size);

 	/* Ensure all vmalloc mappings are flushed in case they also

--- a/arch/powerpc/platforms/powernv/memtrace.c
+++ b/arch/powerpc/platforms/powernv/memtrace.c
@@ -82,19 +82,6 @@ static const struct file_operations memtrace_fops = {
 	.open	= simple_open,
 };

-static void flush_memory_region(u64 base, u64 size)
-{
-	unsigned long line_size = ppc64_caches.l1d.size;
-	u64 end = base + size;
-	u64 addr;
-
-	base = round_down(base, line_size);
-	end = round_up(end, line_size);
-
-	for (addr = base; addr < end; addr += line_size)
-		asm volatile("dcbf 0,%0" : "=r" (addr) :: "memory");
-}
-
 static int check_memblock_online(struct memory_block *mem, void *arg)
 {
 	if (mem->state != MEM_ONLINE)
@@ -132,10 +119,6 @@ static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages)
 	walk_memory_range(start_pfn, end_pfn, (void *)MEM_OFFLINE,
 			  change_memblock_state);

-	/* RCU grace period? */
-	flush_memory_region((u64)__va(start_pfn << PAGE_SHIFT),
-			    nr_pages << PAGE_SHIFT);
-
 	lock_device_hotplug();
 	remove_memory(nid, start_pfn << PAGE_SHIFT, nr_pages << PAGE_SHIFT);
 	unlock_device_hotplug();

--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -33,6 +33,19 @@

 #define npu_to_phb(x) container_of(x, struct pnv_phb, npu)

+/*
+ * spinlock to protect initialisation of an npu_context for a particular
+ * mm_struct.
+ */
+static DEFINE_SPINLOCK(npu_context_lock);
+
+/*
+ * When an address shootdown range exceeds this threshold we invalidate the
+ * entire TLB on the GPU for the given PID rather than each specific address in
+ * the range.
+ */
+#define ATSD_THRESHOLD (2*1024*1024)
+
 /*
 * Other types of TCE cache invalidation are not functional in the
 * hardware.
@@ -401,7 +414,7 @@ struct npu_context {
 	bool nmmu_flush;

 	/* Callback to stop translation requests on a given GPU */
-	struct npu_context *(*release_cb)(struct npu_context *, void *);
+	void (*release_cb)(struct npu_context *context, void *priv);

 	/*
 	 * Private pointer passed to the above callback for usage by
@@ -671,11 +684,19 @@ static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn,
 	struct npu_context *npu_context = mn_to_npu_context(mn);
 	unsigned long address;

-	for (address = start; address < end; address += PAGE_SIZE)
-		mmio_invalidate(npu_context, 1, address, false);
+	if (end - start > ATSD_THRESHOLD) {
+		/*
+		 * Just invalidate the entire PID if the address range is too
+		 * large.
+		 */
+		mmio_invalidate(npu_context, 0, 0, true);
+	} else {
+		for (address = start; address < end; address += PAGE_SIZE)
+			mmio_invalidate(npu_context, 1, address, false);

-	/* Do the flush only on the final addess == end */
-	mmio_invalidate(npu_context, 1, address, true);
+		/* Do the flush only on the final addess == end */
+		mmio_invalidate(npu_context, 1, address, true);
+	}
 }

 static const struct mmu_notifier_ops nv_nmmu_notifier_ops = {
@@ -696,11 +717,12 @@ static const struct mmu_notifier_ops nv_nmmu_notifier_ops = {
 * Returns an error if there no contexts are currently available or a
 * npu_context which should be passed to pnv_npu2_handle_fault().
 *
- * mmap_sem must be held in write mode.
+ * mmap_sem must be held in write mode and must not be called from interrupt
+ * context.
 */
 struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
 			unsigned long flags,
-			struct npu_context *(*cb)(struct npu_context *, void *),
+			void (*cb)(struct npu_context *, void *),
 			void *priv)
 {
 	int rc;
@@ -743,7 +765,9 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
 	/*
 	 * Setup the NPU context table for a particular GPU. These need to be
 	 * per-GPU as we need the tables to filter ATSDs when there are no
-	 * active contexts on a particular GPU.
+	 * active contexts on a particular GPU. It is safe for these to be
+	 * called concurrently with destroy as the OPAL call takes appropriate
+	 * locks and refcounts on init/destroy.
 	 */
 	rc = opal_npu_init_context(nphb->opal_id, mm->context.id, flags,
 				PCI_DEVID(gpdev->bus->number, gpdev->devfn));
@@ -754,8 +778,29 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
 	 * We store the npu pci device so we can more easily get at the
 	 * associated npus.
 	 */
+	spin_lock(&npu_context_lock);
 	npu_context = mm->context.npu_context;
+	if (npu_context) {
+		if (npu_context->release_cb != cb ||
+			npu_context->priv != priv) {
+			spin_unlock(&npu_context_lock);
+			opal_npu_destroy_context(nphb->opal_id, mm->context.id,
+						PCI_DEVID(gpdev->bus->number,
+							gpdev->devfn));
+			return ERR_PTR(-EINVAL);
+		}
+
+		WARN_ON(!kref_get_unless_zero(&npu_context->kref));
+	}
+	spin_unlock(&npu_context_lock);
+
 	if (!npu_context) {
+		/*
+		 * We can set up these fields without holding the
+		 * npu_context_lock as the npu_context hasn't been returned to
+		 * the caller meaning it can't be destroyed. Parallel allocation
+		 * is protected against by mmap_sem.
+		 */
 		rc = -ENOMEM;
 		npu_context = kzalloc(sizeof(struct npu_context), GFP_KERNEL);
 		if (npu_context) {
@@ -774,8 +819,6 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
 		}

 		mm->context.npu_context = npu_context;
-	} else {
-		WARN_ON(!kref_get_unless_zero(&npu_context->kref));
 	}

 	npu_context->release_cb = cb;
@@ -814,15 +857,16 @@ static void pnv_npu2_release_context(struct kref *kref)
 		mm_context_remove_copro(npu_context->mm);

 	npu_context->mm->context.npu_context = NULL;
-	mmu_notifier_unregister(&npu_context->mn,
-				npu_context->mm);
-
-	kfree(npu_context);
 }

+/*
+ * Destroy a context on the given GPU. May free the npu_context if it is no
+ * longer active on any GPUs. Must not be called from interrupt context.
+ */
 void pnv_npu2_destroy_context(struct npu_context *npu_context,
 			struct pci_dev *gpdev)
 {
+	int removed;
 	struct pnv_phb *nphb;
 	struct npu *npu;
 	struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
@@ -844,7 +888,21 @@ void pnv_npu2_destroy_context(struct npu_context *npu_context,
 	WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], NULL);
 	opal_npu_destroy_context(nphb->opal_id, npu_context->mm->context.id,
 				PCI_DEVID(gpdev->bus->number, gpdev->devfn));
-	kref_put(&npu_context->kref, pnv_npu2_release_context);
+	spin_lock(&npu_context_lock);
+	removed = kref_put(&npu_context->kref, pnv_npu2_release_context);
+	spin_unlock(&npu_context_lock);
+
+	/*
+	 * We need to do this outside of pnv_npu2_release_context so that it is
+	 * outside the spinlock as mmu_notifier_destroy uses SRCU.
+	 */
+	if (removed) {
+		mmu_notifier_unregister(&npu_context->mn,
+					npu_context->mm);
+
+		kfree(npu_context);
+	}
+
 }
 EXPORT_SYMBOL(pnv_npu2_destroy_context);


--- a/arch/powerpc/platforms/powernv/opal-rtc.c
+++ b/arch/powerpc/platforms/powernv/opal-rtc.c
@@ -48,10 +48,12 @@ unsigned long __init opal_get_boot_time(void)

 	while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
 		rc = opal_rtc_read(&__y_m_d, &__h_m_s_ms);
-		if (rc == OPAL_BUSY_EVENT)
+		if (rc == OPAL_BUSY_EVENT) {
+			mdelay(OPAL_BUSY_DELAY_MS);
 			opal_poll_events(NULL);
-		else if (rc == OPAL_BUSY)
-			mdelay(10);
+		} else if (rc == OPAL_BUSY) {
+			mdelay(OPAL_BUSY_DELAY_MS);
+		}
 	}
 	if (rc != OPAL_SUCCESS)
 		return 0;

--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -679,6 +679,16 @@ void gpstate_timer_handler(struct timer_list *t)

 	if (!spin_trylock(&gpstates->gpstate_lock))
 		return;
+	/*
+	 * If the timer has migrated to the different cpu then bring
+	 * it back to one of the policy->cpus
+	 */
+	if (!cpumask_test_cpu(raw_smp_processor_id(), policy->cpus)) {
+		gpstates->timer.expires = jiffies + msecs_to_jiffies(1);
+		add_timer_on(&gpstates->timer, cpumask_first(policy->cpus));
+		spin_unlock(&gpstates->gpstate_lock);
+		return;
+	}

 	/*
 	 * If PMCR was last updated was using fast_swtich then
@@ -718,10 +728,8 @@ void gpstate_timer_handler(struct timer_list *t)
 	if (gpstate_idx != gpstates->last_lpstate_idx)
 		queue_gpstate_timer(gpstates);

+	set_pstate(&freq_data);
 	spin_unlock(&gpstates->gpstate_lock);
-
-	/* Timer may get migrated to a different cpu on cpu hot unplug */
-	smp_call_function_any(policy->cpus, set_pstate, &freq_data, 1);
 }

 /*

--- a/drivers/rtc/rtc-opal.c
+++ b/drivers/rtc/rtc-opal.c
@@ -57,7 +57,7 @@ static void tm_to_opal(struct rtc_time *tm, u32 *y_m_d, u64 *h_m_s_ms)

 static int opal_get_rtc_time(struct device *dev, struct rtc_time *tm)
 {
-	long rc = OPAL_BUSY;
+	s64 rc = OPAL_BUSY;
 	int retries = 10;
 	u32 y_m_d;
 	u64 h_m_s_ms;
@@ -66,13 +66,17 @@ static int opal_get_rtc_time(struct device *dev, struct rtc_time *tm)

 	while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
 		rc = opal_rtc_read(&__y_m_d, &__h_m_s_ms);
-		if (rc == OPAL_BUSY_EVENT)
+		if (rc == OPAL_BUSY_EVENT) {
+			msleep(OPAL_BUSY_DELAY_MS);
 			opal_poll_events(NULL);
-		else if (retries-- && (rc == OPAL_HARDWARE
-				       || rc == OPAL_INTERNAL_ERROR))
-			msleep(10);
-		else if (rc != OPAL_BUSY && rc != OPAL_BUSY_EVENT)
-			break;
+		} else if (rc == OPAL_BUSY) {
+			msleep(OPAL_BUSY_DELAY_MS);
+		} else if (rc == OPAL_HARDWARE || rc == OPAL_INTERNAL_ERROR) {
+			if (retries--) {
+				msleep(10); /* Wait 10ms before retry */
+				rc = OPAL_BUSY; /* go around again */
+			}
+		}
 	}

 	if (rc != OPAL_SUCCESS)
@@ -87,21 +91,26 @@ static int opal_get_rtc_time(struct device *dev, struct rtc_time *tm)

 static int opal_set_rtc_time(struct device *dev, struct rtc_time *tm)
 {
-	long rc = OPAL_BUSY;
+	s64 rc = OPAL_BUSY;
 	int retries = 10;
 	u32 y_m_d = 0;
 	u64 h_m_s_ms = 0;

 	tm_to_opal(tm, &y_m_d, &h_m_s_ms);
+
 	while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
 		rc = opal_rtc_write(y_m_d, h_m_s_ms);
-		if (rc == OPAL_BUSY_EVENT)
+		if (rc == OPAL_BUSY_EVENT) {
+			msleep(OPAL_BUSY_DELAY_MS);
 			opal_poll_events(NULL);
-		else if (retries-- && (rc == OPAL_HARDWARE
-				       || rc == OPAL_INTERNAL_ERROR))
-			msleep(10);
-		else if (rc != OPAL_BUSY && rc != OPAL_BUSY_EVENT)
-			break;
+		} else if (rc == OPAL_BUSY) {
+			msleep(OPAL_BUSY_DELAY_MS);
+		} else if (rc == OPAL_HARDWARE || rc == OPAL_INTERNAL_ERROR) {
+			if (retries--) {
+				msleep(10); /* Wait 10ms before retry */
+				rc = OPAL_BUSY; /* go around again */
+			}
+		}
 	}

 	return rc == OPAL_SUCCESS ? 0 : -EIO;