Commit cada8caa authored by Jane Chu, committed by Greg Kroah-Hartman

sparc64: Measure receiver forward progress to avoid send mondo timeout


[ Upstream commit 9d53caec ]

A large sun4v SPARC system may have moments of intensive xcall activities,
usually caused by unmapping many pages on many CPUs concurrently. This can
flood receivers with CPU mondo interrupts for an extended period, causing
some unlucky senders to hit send-mondo timeout. This problem gets worse
as cpu count increases because sometimes mappings must be invalidated on
all CPUs, and sometimes all CPUs may gang up on a single CPU.

But a busy system is not a broken system. In the above scenario, as long
as the receiver is making forward progress processing mondo interrupts,
the sender should continue to retry.

This patch implements the receiver's forward progress meter by introducing
a per cpu counter 'cpu_mondo_counter[cpu]' where 'cpu' is in the range
of 0..NR_CPUS-1. The receiver increments its counter as soon as it receives
a mondo and the sender tracks the receiver's counter. If the receiver has
stopped making forward progress when the retry limit is reached, the sender
declares a send-mondo timeout and panics; otherwise, the sender keeps
retrying while the receiver continues to make forward progress.

In addition, it's been observed that PCIe hotplug events generate Correctable
Errors that are handled by the hypervisor and then the OS. The hypervisor 'borrows'
a guest cpu strand briefly to provide the service. If the cpu strand is
simultaneously the only cpu targeted by a mondo, it may not be available
for the mondo within 20 msec, causing a SUN4V mondo timeout. It appears that 1 second
is the agreed wait time between the hypervisor and the guest OS, so this patch makes
that adjustment.

Orabug: 25476541
Orabug: 26417466
Signed-off-by: Jane Chu <jane.chu@oracle.com>
Reviewed-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Anthony Yznaga <anthony.yznaga@oracle.com>
Reviewed-by: Rob Gardner <rob.gardner@oracle.com>
Reviewed-by: Thomas Tai <thomas.tai@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
parent 7c37101c
...@@ -54,6 +54,7 @@ extern struct trap_per_cpu trap_block[NR_CPUS]; ...@@ -54,6 +54,7 @@ extern struct trap_per_cpu trap_block[NR_CPUS];
void init_cur_cpu_trap(struct thread_info *); void init_cur_cpu_trap(struct thread_info *);
void setup_tba(void); void setup_tba(void);
extern int ncpus_probed; extern int ncpus_probed;
extern u64 cpu_mondo_counter[NR_CPUS];
unsigned long real_hard_smp_processor_id(void); unsigned long real_hard_smp_processor_id(void);
......
...@@ -617,22 +617,48 @@ static void cheetah_xcall_deliver(struct trap_per_cpu *tb, int cnt) ...@@ -617,22 +617,48 @@ static void cheetah_xcall_deliver(struct trap_per_cpu *tb, int cnt)
} }
} }
/* Multi-cpu list version. */ #define CPU_MONDO_COUNTER(cpuid) (cpu_mondo_counter[cpuid])
#define MONDO_USEC_WAIT_MIN 2
#define MONDO_USEC_WAIT_MAX 100
#define MONDO_RETRY_LIMIT 500000
/* Multi-cpu list version.
*
* Deliver xcalls to 'cnt' number of cpus in 'cpu_list'.
* Sometimes not all cpus receive the mondo, requiring us to re-send
* the mondo until all cpus have received, or cpus are truly stuck
* unable to receive mondo, and we timeout.
* Occasionally a target cpu strand is borrowed briefly by hypervisor to
* perform guest service, such as PCIe error handling. Consider the
* service time, 1 second overall wait is reasonable for 1 cpu.
* Here two wait times between mondo checks are defined: 2 usec for
* a quick single-cpu turnaround and up to 100 usec for a large cpu count.
* Delivering mondos to a large number of cpus could take longer, so we
* adjust the retry count as long as target cpus are making forward progress.
*/
static void hypervisor_xcall_deliver(struct trap_per_cpu *tb, int cnt) static void hypervisor_xcall_deliver(struct trap_per_cpu *tb, int cnt)
{ {
int retries, this_cpu, prev_sent, i, saw_cpu_error; int this_cpu, tot_cpus, prev_sent, i, rem;
int usec_wait, retries, tot_retries;
u16 first_cpu = 0xffff;
unsigned long xc_rcvd = 0;
unsigned long status; unsigned long status;
int ecpuerror_id = 0;
int enocpu_id = 0;
u16 *cpu_list; u16 *cpu_list;
u16 cpu;
this_cpu = smp_processor_id(); this_cpu = smp_processor_id();
cpu_list = __va(tb->cpu_list_pa); cpu_list = __va(tb->cpu_list_pa);
usec_wait = cnt * MONDO_USEC_WAIT_MIN;
saw_cpu_error = 0; if (usec_wait > MONDO_USEC_WAIT_MAX)
retries = 0; usec_wait = MONDO_USEC_WAIT_MAX;
retries = tot_retries = 0;
tot_cpus = cnt;
prev_sent = 0; prev_sent = 0;
do { do {
int forward_progress, n_sent; int n_sent, mondo_delivered, target_cpu_busy;
status = sun4v_cpu_mondo_send(cnt, status = sun4v_cpu_mondo_send(cnt,
tb->cpu_list_pa, tb->cpu_list_pa,
...@@ -640,94 +666,113 @@ static void hypervisor_xcall_deliver(struct trap_per_cpu *tb, int cnt) ...@@ -640,94 +666,113 @@ static void hypervisor_xcall_deliver(struct trap_per_cpu *tb, int cnt)
/* HV_EOK means all cpus received the xcall, we're done. */ /* HV_EOK means all cpus received the xcall, we're done. */
if (likely(status == HV_EOK)) if (likely(status == HV_EOK))
break; goto xcall_done;
/* If not these non-fatal errors, panic */
if (unlikely((status != HV_EWOULDBLOCK) &&
(status != HV_ECPUERROR) &&
(status != HV_ENOCPU)))
goto fatal_errors;
/* First, see if we made any forward progress. /* First, see if we made any forward progress.
*
* Go through the cpu_list, count the target cpus that have
* received our mondo (n_sent), and those that did not (rem).
* Re-pack cpu_list with the cpus that remain to be retried at the
* front - this simplifies tracking the truly stalled cpus.
* *
* The hypervisor indicates successful sends by setting * The hypervisor indicates successful sends by setting
* cpu list entries to the value 0xffff. * cpu list entries to the value 0xffff.
*
* EWOULDBLOCK means some target cpus did not receive the
* mondo and retry usually helps.
*
* ECPUERROR means at least one target cpu is in error state,
* it's usually safe to skip the faulty cpu and retry.
*
* ENOCPU means one of the target cpus doesn't belong to the
* domain, perhaps because it was offlined; this is unexpected but
* not fatal, and it's okay to skip the offlined cpu.
*/ */
rem = 0;
n_sent = 0; n_sent = 0;
for (i = 0; i < cnt; i++) { for (i = 0; i < cnt; i++) {
if (likely(cpu_list[i] == 0xffff)) cpu = cpu_list[i];
if (likely(cpu == 0xffff)) {
n_sent++; n_sent++;
} else if ((status == HV_ECPUERROR) &&
(sun4v_cpu_state(cpu) == HV_CPU_STATE_ERROR)) {
ecpuerror_id = cpu + 1;
} else if (status == HV_ENOCPU && !cpu_online(cpu)) {
enocpu_id = cpu + 1;
} else {
cpu_list[rem++] = cpu;
}
} }
forward_progress = 0; /* No cpu remained, we're done. */
if (n_sent > prev_sent) if (rem == 0)
forward_progress = 1; break;
prev_sent = n_sent; /* Otherwise, update the cpu count for retry. */
cnt = rem;
/* If we get a HV_ECPUERROR, then one or more of the cpus /* Record the overall number of mondos received by the
* in the list are in error state. Use the cpu_state() * first of the remaining cpus.
* hypervisor call to find out which cpus are in error state.
*/ */
if (unlikely(status == HV_ECPUERROR)) { if (first_cpu != cpu_list[0]) {
for (i = 0; i < cnt; i++) { first_cpu = cpu_list[0];
long err; xc_rcvd = CPU_MONDO_COUNTER(first_cpu);
u16 cpu; }
cpu = cpu_list[i]; /* Was any mondo delivered successfully? */
if (cpu == 0xffff) mondo_delivered = (n_sent > prev_sent);
continue; prev_sent = n_sent;
err = sun4v_cpu_state(cpu); /* or, was any target cpu busy processing other mondos? */
if (err == HV_CPU_STATE_ERROR) { target_cpu_busy = (xc_rcvd < CPU_MONDO_COUNTER(first_cpu));
saw_cpu_error = (cpu + 1); xc_rcvd = CPU_MONDO_COUNTER(first_cpu);
cpu_list[i] = 0xffff;
}
}
} else if (unlikely(status != HV_EWOULDBLOCK))
goto fatal_mondo_error;
/* Don't bother rewriting the CPU list, just leave the /* Retry count is for no progress. If we're making progress,
* 0xffff and non-0xffff entries in there and the * reset the retry count.
* hypervisor will do the right thing.
*
* Only advance timeout state if we didn't make any
* forward progress.
*/ */
if (unlikely(!forward_progress)) { if (likely(mondo_delivered || target_cpu_busy)) {
if (unlikely(++retries > 10000)) tot_retries += retries;
retries = 0;
} else if (unlikely(retries > MONDO_RETRY_LIMIT)) {
goto fatal_mondo_timeout; goto fatal_mondo_timeout;
}
/* Delay a little bit to let other cpus catch up /* Delay a little bit to let other cpus catch up on
* on their cpu mondo queue work. * their cpu mondo queue work.
*/ */
udelay(2 * cnt); if (!mondo_delivered)
} udelay(usec_wait);
} while (1);
if (unlikely(saw_cpu_error)) retries++;
goto fatal_mondo_cpu_error; } while (1);
xcall_done:
if (unlikely(ecpuerror_id > 0)) {
pr_crit("CPU[%d]: SUN4V mondo cpu error, target cpu(%d) was in error state\n",
this_cpu, ecpuerror_id - 1);
} else if (unlikely(enocpu_id > 0)) {
pr_crit("CPU[%d]: SUN4V mondo cpu error, target cpu(%d) does not belong to the domain\n",
this_cpu, enocpu_id - 1);
}
return; return;
fatal_mondo_cpu_error: fatal_errors:
printk(KERN_CRIT "CPU[%d]: SUN4V mondo cpu error, some target cpus " /* fatal errors include bad alignment, etc */
"(including %d) were in error state\n", pr_crit("CPU[%d]: Args were cnt(%d) cpulist_pa(%lx) mondo_block_pa(%lx)\n",
this_cpu, saw_cpu_error - 1); this_cpu, tot_cpus, tb->cpu_list_pa, tb->cpu_mondo_block_pa);
return; panic("Unexpected SUN4V mondo error %lu\n", status);
fatal_mondo_timeout: fatal_mondo_timeout:
printk(KERN_CRIT "CPU[%d]: SUN4V mondo timeout, no forward " /* some cpus being non-responsive to the cpu mondo */
" progress after %d retries.\n", pr_crit("CPU[%d]: SUN4V mondo timeout, cpu(%d) made no forward progress after %d retries. Total target cpus(%d).\n",
this_cpu, retries); this_cpu, first_cpu, (tot_retries + retries), tot_cpus);
goto dump_cpu_list_and_out; panic("SUN4V mondo timeout panic\n");
fatal_mondo_error:
printk(KERN_CRIT "CPU[%d]: Unexpected SUN4V mondo error %lu\n",
this_cpu, status);
printk(KERN_CRIT "CPU[%d]: Args were cnt(%d) cpulist_pa(%lx) "
"mondo_block_pa(%lx)\n",
this_cpu, cnt, tb->cpu_list_pa, tb->cpu_mondo_block_pa);
dump_cpu_list_and_out:
printk(KERN_CRIT "CPU[%d]: CPU list [ ", this_cpu);
for (i = 0; i < cnt; i++)
printk("%u ", cpu_list[i]);
printk("]\n");
} }
static void (*xcall_deliver_impl)(struct trap_per_cpu *, int); static void (*xcall_deliver_impl)(struct trap_per_cpu *, int);
......
...@@ -26,6 +26,21 @@ sun4v_cpu_mondo: ...@@ -26,6 +26,21 @@ sun4v_cpu_mondo:
ldxa [%g0] ASI_SCRATCHPAD, %g4 ldxa [%g0] ASI_SCRATCHPAD, %g4
sub %g4, TRAP_PER_CPU_FAULT_INFO, %g4 sub %g4, TRAP_PER_CPU_FAULT_INFO, %g4
/* Get smp_processor_id() into %g3 */
sethi %hi(trap_block), %g5
or %g5, %lo(trap_block), %g5
sub %g4, %g5, %g3
srlx %g3, TRAP_BLOCK_SZ_SHIFT, %g3
/* Increment cpu_mondo_counter[smp_processor_id()] */
sethi %hi(cpu_mondo_counter), %g5
or %g5, %lo(cpu_mondo_counter), %g5
sllx %g3, 3, %g3
add %g5, %g3, %g5
ldx [%g5], %g3
add %g3, 1, %g3
stx %g3, [%g5]
/* Get CPU mondo queue base phys address into %g7. */ /* Get CPU mondo queue base phys address into %g7. */
ldx [%g4 + TRAP_PER_CPU_CPU_MONDO_PA], %g7 ldx [%g4 + TRAP_PER_CPU_CPU_MONDO_PA], %g7
......
...@@ -2659,6 +2659,7 @@ void do_getpsr(struct pt_regs *regs) ...@@ -2659,6 +2659,7 @@ void do_getpsr(struct pt_regs *regs)
} }
} }
u64 cpu_mondo_counter[NR_CPUS] = {0};
struct trap_per_cpu trap_block[NR_CPUS]; struct trap_per_cpu trap_block[NR_CPUS];
EXPORT_SYMBOL(trap_block); EXPORT_SYMBOL(trap_block);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment