Commit 1f656ff3 authored by K. Y. Srinivasan, committed by Greg Kroah-Hartman

Drivers: hv: vmbus: Implement NUMA aware CPU affinity for channels

Channels/sub-channels can be affinitized to VCPUs in the guest. Implement
this affinity in a way that is NUMA aware. The current algorithm distributes
the primary channels uniformly across all available CPUs. The new algorithm
is NUMA aware: primary channels are distributed across the available NUMA
nodes, while the sub-channels of a primary channel are distributed amongst
the CPUs within the NUMA node assigned to the primary channel.
Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
parent 9c6e64ad
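The scheme the message describes is hierarchical: a primary channel is first pinned to a NUMA node, and every sub-channel of that primary is then spread over the CPUs of that node. Below is a minimal user-space C sketch of the two-level policy; NUM_NODES, CPUS_PER_NODE and struct chan are illustrative stand-ins for the real topology and for struct vmbus_channel, not the driver's code.

/* Hedged user-space sketch of the two-level (node, then CPU) policy.
 * NUM_NODES, CPUS_PER_NODE and struct chan are assumptions for the
 * example; the real code uses struct vmbus_channel and the cpumask API.
 */
#include <stdio.h>

#define NUM_NODES     2
#define CPUS_PER_NODE 4

struct chan {
	int numa_node;   /* node owning this channel's CPU pool */
	int target_cpu;  /* CPU the channel interrupt is bound to */
};

static int next_node;                   /* mirrors next_numa_node_id */
static int next_cpu_in_node[NUM_NODES]; /* mirrors alloced_cpus_in_node */

static void assign(struct chan *c, const struct chan *primary)
{
	if (primary) {
		/* sub-channel: inherit the primary's node */
		c->numa_node = primary->numa_node;
	} else {
		/* primary channel: claim the next node, round-robin */
		c->numa_node = next_node;
		next_node = (next_node + 1) % NUM_NODES;
	}
	/* round-robin over the CPUs of the chosen node */
	int n = c->numa_node;
	c->target_cpu = n * CPUS_PER_NODE + next_cpu_in_node[n];
	next_cpu_in_node[n] = (next_cpu_in_node[n] + 1) % CPUS_PER_NODE;
}

int main(void)
{
	struct chan p = { 0 }, sc1 = { 0 }, sc2 = { 0 };

	assign(&p, NULL);  /* primary lands on node 0, cpu 0 */
	assign(&sc1, &p);  /* sub-channels stay within the primary's node */
	assign(&sc2, &p);
	printf("primary: node %d cpu %d\n", p.numa_node, p.target_cpu);
	printf("sub1:    node %d cpu %d\n", sc1.numa_node, sc1.target_cpu);
	printf("sub2:    node %d cpu %d\n", sc2.numa_node, sc2.target_cpu);
	return 0;
}

Keeping a primary channel and its sub-channels on one node keeps their interrupt handling node-local, which is the locality the commit is after.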
drivers/hv/channel_mgmt.c
@@ -370,25 +370,27 @@ static const struct hv_vmbus_device_id hp_devs[] = {
 /*
  * We use this state to statically distribute the channel interrupt load.
  */
-static u32 next_vp;
+static int next_numa_node_id;
 
 /*
  * Starting with Win8, we can statically distribute the incoming
- * channel interrupt load by binding a channel to VCPU. We
- * implement here a simple round robin scheme for distributing
- * the interrupt load.
- * We will bind channels that are not performance critical to cpu 0 and
- * performance critical channels (IDE, SCSI and Network) will be uniformly
- * distributed across all available CPUs.
+ * channel interrupt load by binding a channel to VCPU.
+ * We do this in a hierarchical fashion:
+ * First distribute the primary channels across available NUMA nodes
+ * and then distribute the subchannels amongst the CPUs in the NUMA
+ * node assigned to the primary channel.
+ *
+ * For pre-win8 hosts or non-performance critical channels we assign the
+ * first CPU in the first NUMA node.
  */
 static void init_vp_index(struct vmbus_channel *channel, const uuid_le *type_guid)
 {
 	u32 cur_cpu;
 	int i;
 	bool perf_chn = false;
-	u32 max_cpus = num_online_cpus();
-	struct vmbus_channel *primary = channel->primary_channel, *prev;
-	unsigned long flags;
+	struct vmbus_channel *primary = channel->primary_channel;
+	int next_node;
+	struct cpumask available_mask;
 
 	for (i = IDE; i < MAX_PERF_CHN; i++) {
 		if (!memcmp(type_guid->b, hp_devs[i].guid,
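The next_numa_node_id counter introduced above is consumed by a wrap-and-skip scan in the hunk that follows: the counter wraps at nr_node_ids, and nodes whose CPU mask is empty (possible on machines with memory-only NUMA nodes) are skipped. A user-space sketch of that scan, with a hypothetical per-node CPU table standing in for cpumask_of_node():

/* Sketch of the wrap-and-skip node scan; cpus_on_node is an assumption. */
#include <stdbool.h>
#include <stdio.h>

#define NR_NODES 4

/* hypothetical per-node online-CPU counts; node 1 has no CPUs */
static const int cpus_on_node[NR_NODES] = { 4, 0, 4, 4 };

static int next_numa_node_id;

/* mirrors the while (true) scan in init_vp_index() below */
static int pick_next_node(void)
{
	int next_node;

	while (true) {
		next_node = next_numa_node_id++;
		if (next_node == NR_NODES)
			next_node = next_numa_node_id = 0;
		if (cpus_on_node[next_node] == 0)
			continue;   /* skip CPU-less nodes */
		break;
	}
	return next_node;
}

int main(void)
{
	/* prints nodes 0, 2, 3, 0: the CPU-less node 1 is never chosen */
	for (int i = 0; i < 4; i++)
		printf("primary %d -> node %d\n", i, pick_next_node());
	return 0;
}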
@@ -405,36 +407,48 @@ static void init_vp_index(struct vmbus_channel *channel, const uuid_le *type_gui
 		 * Also if the channel is not a performance critical
 		 * channel, bind it to cpu 0.
 		 */
+		channel->numa_node = 0;
+		cpumask_set_cpu(0, &channel->alloced_cpus_in_node);
 		channel->target_cpu = 0;
 		channel->target_vp = hv_context.vp_index[0];
 		return;
 	}
 
 	/*
-	 * Primary channels are distributed evenly across all vcpus we have.
-	 * When the host asks us to create subchannels it usually makes us
-	 * num_cpus-1 offers and we are supposed to distribute the work evenly
-	 * among the channel itself and all its subchannels. Make sure they are
-	 * all assigned to different vcpus.
+	 * We distribute primary channels evenly across all the available
+	 * NUMA nodes and within the assigned NUMA node we will assign the
+	 * first available CPU to the primary channel.
+	 * The sub-channels will be assigned to the CPUs available in the
+	 * NUMA node evenly.
 	 */
-	if (!primary)
-		cur_cpu = (++next_vp % max_cpus);
-	else {
+	if (!primary) {
+		while (true) {
+			next_node = next_numa_node_id++;
+			if (next_node == nr_node_ids)
+				next_node = next_numa_node_id = 0;
+			if (cpumask_empty(cpumask_of_node(next_node)))
+				continue;
+			break;
+		}
+		channel->numa_node = next_node;
+		primary = channel;
+	}
+
+	if (cpumask_weight(&primary->alloced_cpus_in_node) ==
+	    cpumask_weight(cpumask_of_node(primary->numa_node))) {
 		/*
-		 * Let's assign the first subchannel of a channel to the
-		 * primary->target_cpu+1 and all the subsequent channels to
-		 * the prev->target_cpu+1.
+		 * We have cycled through all the CPUs in the node;
+		 * reset the alloced map.
 		 */
-		spin_lock_irqsave(&primary->lock, flags);
-		if (primary->num_sc == 1)
-			cur_cpu = (primary->target_cpu + 1) % max_cpus;
-		else {
-			prev = list_prev_entry(channel, sc_list);
-			cur_cpu = (prev->target_cpu + 1) % max_cpus;
-		}
-		spin_unlock_irqrestore(&primary->lock, flags);
+		cpumask_clear(&primary->alloced_cpus_in_node);
 	}
 
+	cpumask_xor(&available_mask, &primary->alloced_cpus_in_node,
+		    cpumask_of_node(primary->numa_node));
+
+	cur_cpu = cpumask_next(-1, &available_mask);
+	cpumask_set_cpu(cur_cpu, &primary->alloced_cpus_in_node);
+
 	channel->target_cpu = cur_cpu;
 	channel->target_vp = hv_context.vp_index[cur_cpu];
 }
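The CPU-selection step above leans on a small cpumask identity: because alloced_cpus_in_node is always a subset of the node's CPU mask (the weight check resets it before it overflows), XOR-ing the two masks yields exactly the node CPUs not yet handed out, and cpumask_next(-1, mask) then returns the first of them. A sketch of the same arithmetic with a 64-bit word standing in for struct cpumask (an assumption; the kernel type scales to any NR_CPUS):

/* The node mask and alloced mask below are illustrative, not kernel state. */
#include <stdint.h>
#include <stdio.h>

/* first set bit, i.e. a stand-in for cpumask_next(-1, mask) */
static int first_cpu(uint64_t mask)
{
	for (int cpu = 0; cpu < 64; cpu++)
		if (mask & (1ULL << cpu))
			return cpu;
	return -1; /* empty mask */
}

int main(void)
{
	uint64_t node_mask = 0x0f;  /* node owns CPUs 0-3 */
	uint64_t alloced   = 0x03;  /* CPUs 0 and 1 already taken */

	/* alloced is a subset of node_mask, so XOR acts as set difference */
	uint64_t available = alloced ^ node_mask;
	int cur_cpu = first_cpu(available);

	alloced |= 1ULL << cur_cpu;  /* mirrors cpumask_set_cpu() */
	printf("next target CPU: %d\n", cur_cpu); /* prints 2 */
	return 0;
}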
include/linux/hyperv.h
@@ -696,6 +696,11 @@ struct vmbus_channel {
 	u32 target_vp;
 	/* The corresponding CPUID in the guest */
 	u32 target_cpu;
+	/*
+	 * State to manage the CPU affiliation of channels.
+	 */
+	struct cpumask alloced_cpus_in_node;
+	int numa_node;
 	/*
 	 * Support for sub-channels. For high performance devices,
 	 * it will be useful to have multiple sub-channels to support
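Both fields added here are maintained on the primary channel: alloced_cpus_in_node records which CPUs of numa_node have already been given to that primary or its sub-channels, and the weight comparison in the channel_mgmt.c hunk clears it once every CPU in the node has been used. A plain-bitmask sketch of that fill-and-reset bookkeeping (the popcount helper stands in for cpumask_weight(); the three-CPU node is an assumption):

#include <stdint.h>
#include <stdio.h>

/* stand-in for cpumask_weight(): number of set bits */
static int weight(uint64_t m)
{
	int w = 0;
	for (; m; m &= m - 1)
		w++;
	return w;
}

int main(void)
{
	uint64_t node_mask = 0x07; /* node owns CPUs 0-2 */
	uint64_t alloced = 0;

	/* place five channels on a three-CPU node */
	for (int i = 0; i < 5; i++) {
		if (weight(alloced) == weight(node_mask))
			alloced = 0;     /* all CPUs used: reset the map */
		uint64_t available = alloced ^ node_mask;
		int cpu = 0;
		while (!(available & (1ULL << cpu)))
			cpu++;           /* first available CPU in the node */
		alloced |= 1ULL << cpu;
		printf("channel %d -> cpu %d\n", i, cpu); /* 0,1,2,0,1 */
	}
	return 0;
}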