Merge branch 'sfc-optimize-rxqs-count-and-affinities'

Íñigo Huguet says: ==================== sfc: optimize RXQs count and affinities In sfc driver one RX queue per physical core was allocated by default. Later on, IRQ affinities were set spreading the IRQs in all NUMA local CPUs. However, with that default configuration it result in a non very optimal configuration in many modern systems. Specifically, in systems with hyper threading and 2 NUMA nodes, affinities are set in a way that IRQs are handled by all logical cores of one same NUMA node. Handling IRQs from both hyper threading siblings has no benefit, and setting affinities to one queue per physical core is neither a very good idea because there is a performance penalty for moving data across nodes (I was able to check it with some XDP tests using pktgen). This patches reduce the default number of channels to one per physical core in the local NUMA node. Then, they set IRQ affinities to CPUs in the local NUMA node only. This way we save hardware resources since channels are limited resources. We also leave more room for XDP_TX channels without hitting driver's limit of 32 channels per interface. Running performance tests using iperf with a SFC9140 device showed no performance penalty for reducing the number of channels. RX XDP tests showed that performance can go down to less than half if the IRQ is handled by a CPU in a different NUMA node, which doesn't happen with the new defaults from this patches. ==================== Link: https://lore.kernel.org/r/20220228132254.25787-1-ihuguet@redhat.comSigned-off-by: Jakub Kicinski <kuba@kernel.org>

Merge branch 'sfc-optimize-rxqs-count-and-affinities'
Íñigo Huguet says: ==================== sfc: optimize RXQs count and affinities In sfc driver one RX queue per physical core was allocated by default. Later on, IRQ affinities were set spreading the IRQs in all NUMA local CPUs. However, with that default configuration it result in a non very optimal configuration in many modern systems. Specifically, in systems with hyper threading and 2 NUMA nodes, affinities are set in a way that IRQs are handled by all logical cores of one same NUMA node. Handling IRQs from both hyper threading siblings has no benefit, and setting affinities to one queue per physical core is neither a very good idea because there is a performance penalty for moving data across nodes (I was able to check it with some XDP tests using pktgen). This patches reduce the default number of channels to one per physical core in the local NUMA node. Then, they set IRQ affinities to CPUs in the local NUMA node only. This way we save hardware resources since channels are limited resources. We also leave more room for XDP_TX channels without hitting driver's limit of 32 channels per interface. Running performance tests using iperf with a SFC9140 device showed no performance penalty for reducing the number of channels. RX XDP tests showed that performance can go down to less than half if the IRQ is handled by a CPU in a different NUMA node, which doesn't happen with the new defaults from this patches. ==================== Link: https://lore.kernel.org/r/20220228132254.25787-1-ihuguet@redhat.comSigned-off-by: Jakub Kicinski <kuba@kernel.org>
422ce836 · Jakub Kicinski · ef739f1d · 09a99ab1 · 422ce836
Commit 422ce836 authored Mar 01, 2022 by Jakub Kicinski
Hide whitespace changes
Inline Side-by-side

Showing with 44 additions and 19 deletions

drivers/net/ethernet/sfc/efx_channels.c drivers/net/ethernet/sfc/efx_channels.c +44 -19

No files found.
--- a/drivers/net/ethernet/sfc/efx_channels.c
+++ b/drivers/net/ethernet/sfc/efx_channels.c
@@ -78,31 +78,48 @@ static const struct efx_channel_type efx_default_channel_type = {
 * INTERRUPTS
 *************/

-static unsigned int efx_wanted_parallelism(struct efx_nic *efx)
+static unsigned int count_online_cores(struct efx_nic *efx, bool local_node)
 {
-	cpumask_var_t thread_mask;
+	cpumask_var_t filter_mask;
 	unsigned int count;
 	int cpu;

+	if (unlikely(!zalloc_cpumask_var(&filter_mask, GFP_KERNEL))) {
+		netif_warn(efx, probe, efx->net_dev,
+			   "RSS disabled due to allocation failure\n");
+		return 1;
+	}
+
+	cpumask_copy(filter_mask, cpu_online_mask);
+	if (local_node) {
+		int numa_node = pcibus_to_node(efx->pci_dev->bus);
+
+		cpumask_and(filter_mask, filter_mask, cpumask_of_node(numa_node));
+	}
+
+	count = 0;
+	for_each_cpu(cpu, filter_mask) {
+		++count;
+		cpumask_andnot(filter_mask, filter_mask, topology_sibling_cpumask(cpu));
+	}
+
+	free_cpumask_var(filter_mask);
+
+	return count;
+}
+
+static unsigned int efx_wanted_parallelism(struct efx_nic *efx)
+{
+	unsigned int count;
+
 	if (rss_cpus) {
 		count = rss_cpus;
 	} else {
-		if (unlikely(!zalloc_cpumask_var(&thread_mask, GFP_KERNEL))) {
-			netif_warn(efx, probe, efx->net_dev,
-				   "RSS disabled due to allocation failure\n");
-			return 1;
-		}
+		count = count_online_cores(efx, true);

-		count = 0;
-		for_each_online_cpu(cpu) {
-			if (!cpumask_test_cpu(cpu, thread_mask)) {
-				++count;
-				cpumask_or(thread_mask, thread_mask,
-					   topology_sibling_cpumask(cpu));
-			}
-		}
-
-		free_cpumask_var(thread_mask);
+		/* If no online CPUs in local node, fallback to any online CPUs */
+		if (count == 0)
+			count = count_online_cores(efx, false);
 	}

 	if (count > EFX_MAX_RX_QUEUES) {
@@ -369,12 +386,20 @@ int efx_probe_interrupts(struct efx_nic *efx)
 #if defined(CONFIG_SMP)
 void efx_set_interrupt_affinity(struct efx_nic *efx)
 {
+	int numa_node = pcibus_to_node(efx->pci_dev->bus);
+	const struct cpumask *numa_mask = cpumask_of_node(numa_node);
 	struct efx_channel *channel;
 	unsigned int cpu;

+	/* If no online CPUs in local node, fallback to any online CPU */
+	if (cpumask_first_and(cpu_online_mask, numa_mask) >= nr_cpu_ids)
+		numa_mask = cpu_online_mask;
+
+	cpu = -1;
 	efx_for_each_channel(channel, efx) {
-		cpu = cpumask_local_spread(channel->channel,
-					   pcibus_to_node(efx->pci_dev->bus));
+		cpu = cpumask_next_and(cpu, cpu_online_mask, numa_mask);
+		if (cpu >= nr_cpu_ids)
+			cpu = cpumask_first_and(cpu_online_mask, numa_mask);
 		irq_set_affinity_hint(channel->irq, cpumask_of(cpu));
 	}
 }