• Barry Song's avatar
    irqchip/gic-v3: Use dsb(ishst) to order writes with ICC_SGI1R_EL1 accesses · 80e4e1f4
    Barry Song authored
    A dsb(ishst) barrier should be enough to order previous writes with
    the system register generating the SGI, as we only need to guarantee
    the visibility of data to other CPUs in the inner shareable domain
    before we send the SGI.
    
    A micro-benchmark was written to measure the performance impact on a
    Kunpeng 920 machine with 2 sockets; each socket has 2 dies and each
    die has 24 CPUs, so the system has 2 * 2 * 24 = 96 CPUs in total.
    The benchmark shows a ~2% performance improvement.
    
    The code of the benchmark module:
    
     #include <linux/module.h>
     #include <linux/timekeeping.h>
    
     /*
      * Seven volatile scratch variables, each declared ____cacheline_aligned.
      * ipi_latency_init() stores to all of them immediately before every IPI;
      * presumably these are the "previous writes" whose ordering against the
      * SGI the commit message discusses.
      * NOTE(review): the alignment presumably places each variable on its own
      * cache line -- confirm against the ____cacheline_aligned definition.
      */
     volatile int data0 ____cacheline_aligned;
     volatile int data1 ____cacheline_aligned;
     volatile int data2 ____cacheline_aligned;
     volatile int data3 ____cacheline_aligned;
     volatile int data4 ____cacheline_aligned;
     volatile int data5 ____cacheline_aligned;
     volatile int data6 ____cacheline_aligned;
    
     /*
      * Callback passed to smp_call_function_single() by ipi_latency_init().
      * The body is intentionally empty and @val is unused, so the callback
      * itself adds no work to the measured loop.
      */
     static void ipi_latency_func(void *val)
     {
     }
    
     /*
      * Module init: run 1000 rounds of smp_call_function_single() against CPU
      * numbers 0..95, time the whole loop with ktime_get(), and print the
      * elapsed delta together with the CPU the loop was started on.
      *
      * Always returns 0.
      */
     static int __init ipi_latency_init(void)
     {
     	ktime_t stime, etime, delta;
     	int cpu, i;
     	/* CPU this code starts on; only used in the report printed below. */
     	int start = smp_processor_id();
    
     	stime = ktime_get();
     	for ( i = 0; i < 1000; i++)
     		/*
     		 * 96 is hard-coded to match the CPU count of the test machine
     		 * described in the commit message. The range also covers the
     		 * CPU the loop itself runs on.
     		 */
     		for (cpu = 0; cpu < 96; cpu++) {
     			/* Store to all seven scratch variables before each call. */
     			data0 = data1 = data2 = data3 = data4 = data5 = data6 = cpu;
     			/*
     			 * Last argument is the "wait" flag; 1 presumably makes the
     			 * call synchronous (verify against the
     			 * smp_call_function_single() documentation).
     			 */
     			smp_call_function_single(cpu, ipi_latency_func, NULL, 1);
     		}
     	etime = ktime_get();
    
     	delta = ktime_sub(etime, stime);
    
     	/* delta is printed as a raw ktime_t value (presumably nanoseconds). */
     	printk("%s ipi from cpu%d to cpu0-95 delta of 1000times:%lld\n",
     			__func__, start, delta);
    
     	return 0;
     }
     module_init(ipi_latency_init);
    
     /*
      * Module exit hook. Empty: ipi_latency_init() leaves nothing behind that
      * would need to be torn down.
      */
     static void ipi_latency_exit(void)
     {
     }
     module_exit(ipi_latency_exit);
    
     MODULE_DESCRIPTION("IPI benchmark");
     MODULE_LICENSE("GPL");
    
    Run the commands below 10 times on both the vanilla kernel and the
    kernel with this patch:
     # taskset -c 0 insmod test.ko
     # rmmod test
    
    The result on vanilla:
     ipi_latency_init ipi from cpu0 to cpu0-95 delta of 1000times:126757449
     ipi_latency_init ipi from cpu0 to cpu0-95 delta of 1000times:126784249
     ipi_latency_init ipi from cpu0 to cpu0-95 delta of 1000times:126177703
     ipi_latency_init ipi from cpu0 to cpu0-95 delta of 1000times:127022281
     ipi_latency_init ipi from cpu0 to cpu0-95 delta of 1000times:126184883
     ipi_latency_init ipi from cpu0 to cpu0-95 delta of 1000times:127374585
     ipi_latency_init ipi from cpu0 to cpu0-95 delta of 1000times:125778089
     ipi_latency_init ipi from cpu0 to cpu0-95 delta of 1000times:126974441
     ipi_latency_init ipi from cpu0 to cpu0-95 delta of 1000times:127357625
     ipi_latency_init ipi from cpu0 to cpu0-95 delta of 1000times:126228184
    
    The result on the kernel with this patch:
     ipi_latency_init ipi from cpu0 to cpu0-95 delta of 1000times:124467401
     ipi_latency_init ipi from cpu0 to cpu0-95 delta of 1000times:123474209
     ipi_latency_init ipi from cpu0 to cpu0-95 delta of 1000times:123558497
     ipi_latency_init ipi from cpu0 to cpu0-95 delta of 1000times:122993951
     ipi_latency_init ipi from cpu0 to cpu0-95 delta of 1000times:122984223
     ipi_latency_init ipi from cpu0 to cpu0-95 delta of 1000times:123323609
     ipi_latency_init ipi from cpu0 to cpu0-95 delta of 1000times:124507583
     ipi_latency_init ipi from cpu0 to cpu0-95 delta of 1000times:123386963
     ipi_latency_init ipi from cpu0 to cpu0-95 delta of 1000times:123340664
     ipi_latency_init ipi from cpu0 to cpu0-95 delta of 1000times:123285324
    Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
    [maz: tidied up commit message]
    Signed-off-by: Marc Zyngier <maz@kernel.org>
    Link: https://lore.kernel.org/r/20220220061910.6155-1-21cnbao@gmail.com
    80e4e1f4
irq-gic-v3.c 58.1 KB