Commit 6b2591c2 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'hyperv-next-signed' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux

Pull hyper-v updates from Wei Liu:

 - a series from Andrea to support channel reassignment

 - a series from Vitaly to clean up Vmbus message handling

 - a series from Michael to clean up and augment hyperv-tlfs.h

 - patches from Andy to clean up GUID usage in Hyper-V code

 - a few other misc patches

* tag 'hyperv-next-signed' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux: (29 commits)
  Drivers: hv: vmbus: Resolve more races involving init_vp_index()
  Drivers: hv: vmbus: Resolve race between init_vp_index() and CPU hotplug
  vmbus: Replace zero-length array with flexible-array
  Driver: hv: vmbus: drop a no long applicable comment
  hyper-v: Switch to use UUID types directly
  hyper-v: Replace open-coded variant of %*phN specifier
  hyper-v: Supply GUID pointer to printf() like functions
  hyper-v: Use UUID API for exporting the GUID (part 2)
  asm-generic/hyperv: Add definitions for Get/SetVpRegister hypercalls
  x86/hyperv: Split hyperv-tlfs.h into arch dependent and independent files
  x86/hyperv: Remove HV_PROCESSOR_POWER_STATE #defines
  KVM: x86: hyperv: Remove duplicate definitions of Reference TSC Page
  drivers: hv: remove redundant assignment to pointer primary_channel
  scsi: storvsc: Re-init stor_chns when a channel interrupt is re-assigned
  Drivers: hv: vmbus: Introduce the CHANNELMSG_MODIFYCHANNEL message type
  Drivers: hv: vmbus: Synchronize init_vp_index() vs. CPU hotplug
  Drivers: hv: vmbus: Remove the unused HV_LOCALIZED channel affinity logic
  PCI: hv: Prepare hv_compose_msi_msg() for the VMBus-channel-interrupt-to-vCPU reassignment functionality
  Drivers: hv: vmbus: Use a spin lock for synchronizing channel scheduling vs. channel removal
  hv_utils: Always execute the fcopy and vss callbacks in a tasklet
  ...
parents f1e45535 afaa33da
...@@ -7924,6 +7924,7 @@ F: drivers/pci/controller/pci-hyperv.c ...@@ -7924,6 +7924,7 @@ F: drivers/pci/controller/pci-hyperv.c
F: drivers/scsi/storvsc_drv.c F: drivers/scsi/storvsc_drv.c
F: drivers/uio/uio_hv_generic.c F: drivers/uio/uio_hv_generic.c
F: drivers/video/fbdev/hyperv_fb.c F: drivers/video/fbdev/hyperv_fb.c
F: include/asm-generic/hyperv-tlfs.h
F: include/asm-generic/mshyperv.h F: include/asm-generic/mshyperv.h
F: include/clocksource/hyperv_timer.h F: include/clocksource/hyperv_timer.h
F: include/linux/hyperv.h F: include/linux/hyperv.h
......
This diff is collapsed.
...@@ -866,7 +866,7 @@ struct kvm_hv { ...@@ -866,7 +866,7 @@ struct kvm_hv {
u64 hv_crash_param[HV_X64_MSR_CRASH_PARAMS]; u64 hv_crash_param[HV_X64_MSR_CRASH_PARAMS];
u64 hv_crash_ctl; u64 hv_crash_ctl;
HV_REFERENCE_TSC_PAGE tsc_ref; struct ms_hyperv_tsc_page tsc_ref;
struct idr conn_to_evt; struct idr conn_to_evt;
......
...@@ -900,7 +900,7 @@ static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu, ...@@ -900,7 +900,7 @@ static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu,
* These two equivalencies are implemented in this function. * These two equivalencies are implemented in this function.
*/ */
static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock, static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock,
HV_REFERENCE_TSC_PAGE *tsc_ref) struct ms_hyperv_tsc_page *tsc_ref)
{ {
u64 max_mul; u64 max_mul;
...@@ -941,7 +941,7 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, ...@@ -941,7 +941,7 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm,
u64 gfn; u64 gfn;
BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence)); BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence));
BUILD_BUG_ON(offsetof(HV_REFERENCE_TSC_PAGE, tsc_sequence) != 0); BUILD_BUG_ON(offsetof(struct ms_hyperv_tsc_page, tsc_sequence) != 0);
if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)) if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
return; return;
......
...@@ -289,6 +289,34 @@ int vmbus_send_tl_connect_request(const guid_t *shv_guest_servie_id, ...@@ -289,6 +289,34 @@ int vmbus_send_tl_connect_request(const guid_t *shv_guest_servie_id,
} }
EXPORT_SYMBOL_GPL(vmbus_send_tl_connect_request); EXPORT_SYMBOL_GPL(vmbus_send_tl_connect_request);
/*
 * vmbus_send_modifychannel - Set/change the vCPU (@target_vp) that the
 * channel identified by @child_relid will interrupt.
 *
 * CHANNELMSG_MODIFYCHANNEL messages are asynchronous. Also, Hyper-V does not
 * ACK such messages. IOW we can't know when the host will stop interrupting
 * the "old" vCPU and start interrupting the "new" vCPU for the given channel.
 *
 * The CHANNELMSG_MODIFYCHANNEL message type is supported since VMBus version
 * VERSION_WIN10_V4_1.
 *
 * Returns the value returned by vmbus_post_msg() for the posted message.
 */
int vmbus_send_modifychannel(u32 child_relid, u32 target_vp)
{
	struct vmbus_channel_modifychannel msg;
	int status;

	/* Start from an all-zero message before filling in the fields. */
	memset(&msg, 0, sizeof(msg));
	msg.header.msgtype = CHANNELMSG_MODIFYCHANNEL;
	msg.child_relid = child_relid;
	msg.target_vp = target_vp;

	status = vmbus_post_msg(&msg, sizeof(msg), true);

	/* Trace the request together with the result of posting it. */
	trace_vmbus_send_modifychannel(&msg, status);

	return status;
}
EXPORT_SYMBOL_GPL(vmbus_send_modifychannel);
/* /*
* create_gpadl_header - Creates a gpadl for the specified buffer * create_gpadl_header - Creates a gpadl for the specified buffer
*/ */
...@@ -594,35 +622,31 @@ int vmbus_teardown_gpadl(struct vmbus_channel *channel, u32 gpadl_handle) ...@@ -594,35 +622,31 @@ int vmbus_teardown_gpadl(struct vmbus_channel *channel, u32 gpadl_handle)
} }
EXPORT_SYMBOL_GPL(vmbus_teardown_gpadl); EXPORT_SYMBOL_GPL(vmbus_teardown_gpadl);
static void reset_channel_cb(void *arg)
{
struct vmbus_channel *channel = arg;
channel->onchannel_callback = NULL;
}
void vmbus_reset_channel_cb(struct vmbus_channel *channel) void vmbus_reset_channel_cb(struct vmbus_channel *channel)
{ {
unsigned long flags;
/* /*
* vmbus_on_event(), running in the per-channel tasklet, can race * vmbus_on_event(), running in the per-channel tasklet, can race
* with vmbus_close_internal() in the case of SMP guest, e.g., when * with vmbus_close_internal() in the case of SMP guest, e.g., when
* the former is accessing channel->inbound.ring_buffer, the latter * the former is accessing channel->inbound.ring_buffer, the latter
* could be freeing the ring_buffer pages, so here we must stop it * could be freeing the ring_buffer pages, so here we must stop it
* first. * first.
*
* vmbus_chan_sched() might call the netvsc driver callback function
* that ends up scheduling NAPI work that accesses the ring buffer.
* At this point, we have to ensure that any such work is completed
* and that the channel ring buffer is no longer being accessed, cf.
* the calls to napi_disable() in netvsc_device_remove().
*/ */
tasklet_disable(&channel->callback_event); tasklet_disable(&channel->callback_event);
channel->sc_creation_callback = NULL; /* See the inline comments in vmbus_chan_sched(). */
spin_lock_irqsave(&channel->sched_lock, flags);
channel->onchannel_callback = NULL;
spin_unlock_irqrestore(&channel->sched_lock, flags);
/* Stop the callback asap */ channel->sc_creation_callback = NULL;
if (channel->target_cpu != get_cpu()) {
put_cpu();
smp_call_function_single(channel->target_cpu, reset_channel_cb,
channel, true);
} else {
reset_channel_cb(channel);
put_cpu();
}
/* Re-enable tasklet for use on re-open */ /* Re-enable tasklet for use on re-open */
tasklet_enable(&channel->callback_event); tasklet_enable(&channel->callback_event);
......
This diff is collapsed.
...@@ -69,7 +69,6 @@ MODULE_PARM_DESC(max_version, ...@@ -69,7 +69,6 @@ MODULE_PARM_DESC(max_version,
int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, u32 version) int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, u32 version)
{ {
int ret = 0; int ret = 0;
unsigned int cur_cpu;
struct vmbus_channel_initiate_contact *msg; struct vmbus_channel_initiate_contact *msg;
unsigned long flags; unsigned long flags;
...@@ -102,24 +101,7 @@ int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, u32 version) ...@@ -102,24 +101,7 @@ int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, u32 version)
msg->monitor_page1 = virt_to_phys(vmbus_connection.monitor_pages[0]); msg->monitor_page1 = virt_to_phys(vmbus_connection.monitor_pages[0]);
msg->monitor_page2 = virt_to_phys(vmbus_connection.monitor_pages[1]); msg->monitor_page2 = virt_to_phys(vmbus_connection.monitor_pages[1]);
/* msg->target_vcpu = hv_cpu_number_to_vp_number(VMBUS_CONNECT_CPU);
* We want all channel messages to be delivered on CPU 0.
* This has been the behavior pre-win8. This is not
* perf issue and having all channel messages delivered on CPU 0
* would be ok.
* For post win8 hosts, we support receiving channel messagges on
* all the CPUs. This is needed for kexec to work correctly where
* the CPU attempting to connect may not be CPU 0.
*/
if (version >= VERSION_WIN8_1) {
cur_cpu = get_cpu();
msg->target_vcpu = hv_cpu_number_to_vp_number(cur_cpu);
vmbus_connection.connect_cpu = cur_cpu;
put_cpu();
} else {
msg->target_vcpu = 0;
vmbus_connection.connect_cpu = 0;
}
/* /*
* Add to list before we send the request since we may * Add to list before we send the request since we may
...@@ -266,6 +248,14 @@ int vmbus_connect(void) ...@@ -266,6 +248,14 @@ int vmbus_connect(void)
pr_info("Vmbus version:%d.%d\n", pr_info("Vmbus version:%d.%d\n",
version >> 16, version & 0xFFFF); version >> 16, version & 0xFFFF);
vmbus_connection.channels = kcalloc(MAX_CHANNEL_RELIDS,
sizeof(struct vmbus_channel *),
GFP_KERNEL);
if (vmbus_connection.channels == NULL) {
ret = -ENOMEM;
goto cleanup;
}
kfree(msginfo); kfree(msginfo);
return 0; return 0;
...@@ -313,33 +303,9 @@ void vmbus_disconnect(void) ...@@ -313,33 +303,9 @@ void vmbus_disconnect(void)
*/ */
struct vmbus_channel *relid2channel(u32 relid) struct vmbus_channel *relid2channel(u32 relid)
{ {
struct vmbus_channel *channel; if (WARN_ON(relid >= MAX_CHANNEL_RELIDS))
struct vmbus_channel *found_channel = NULL; return NULL;
struct list_head *cur, *tmp; return READ_ONCE(vmbus_connection.channels[relid]);
struct vmbus_channel *cur_sc;
BUG_ON(!mutex_is_locked(&vmbus_connection.channel_mutex));
list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
if (channel->offermsg.child_relid == relid) {
found_channel = channel;
break;
} else if (!list_empty(&channel->sc_list)) {
/*
* Deal with sub-channels.
*/
list_for_each_safe(cur, tmp, &channel->sc_list) {
cur_sc = list_entry(cur, struct vmbus_channel,
sc_list);
if (cur_sc->offermsg.child_relid == relid) {
found_channel = cur_sc;
break;
}
}
}
}
return found_channel;
} }
/* /*
......
...@@ -117,8 +117,6 @@ int hv_synic_alloc(void) ...@@ -117,8 +117,6 @@ int hv_synic_alloc(void)
pr_err("Unable to allocate post msg page\n"); pr_err("Unable to allocate post msg page\n");
goto err; goto err;
} }
INIT_LIST_HEAD(&hv_cpu->chan_list);
} }
return 0; return 0;
...@@ -245,11 +243,19 @@ int hv_synic_cleanup(unsigned int cpu) ...@@ -245,11 +243,19 @@ int hv_synic_cleanup(unsigned int cpu)
bool channel_found = false; bool channel_found = false;
unsigned long flags; unsigned long flags;
/*
* Hyper-V does not provide a way to change the connect CPU once
* it is set; we must prevent the connect CPU from going offline.
*/
if (cpu == VMBUS_CONNECT_CPU)
return -EBUSY;
/* /*
* Search for channels which are bound to the CPU we're about to * Search for channels which are bound to the CPU we're about to
* cleanup. In case we find one and vmbus is still connected we need to * cleanup. In case we find one and vmbus is still connected, we
* fail, this will effectively prevent CPU offlining. There is no way * fail; this will effectively prevent CPU offlining.
* we can re-bind channels to different CPUs for now. *
* TODO: Re-bind the channels to different CPUs.
*/ */
mutex_lock(&vmbus_connection.channel_mutex); mutex_lock(&vmbus_connection.channel_mutex);
list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
......
...@@ -71,7 +71,7 @@ static void fcopy_poll_wrapper(void *channel) ...@@ -71,7 +71,7 @@ static void fcopy_poll_wrapper(void *channel)
{ {
/* Transaction is finished, reset the state here to avoid races. */ /* Transaction is finished, reset the state here to avoid races. */
fcopy_transaction.state = HVUTIL_READY; fcopy_transaction.state = HVUTIL_READY;
hv_fcopy_onchannelcallback(channel); tasklet_schedule(&((struct vmbus_channel *)channel)->callback_event);
} }
static void fcopy_timeout_func(struct work_struct *dummy) static void fcopy_timeout_func(struct work_struct *dummy)
......
...@@ -80,7 +80,7 @@ static void vss_poll_wrapper(void *channel) ...@@ -80,7 +80,7 @@ static void vss_poll_wrapper(void *channel)
{ {
/* Transaction is finished, reset the state here to avoid races. */ /* Transaction is finished, reset the state here to avoid races. */
vss_transaction.state = HVUTIL_READY; vss_transaction.state = HVUTIL_READY;
hv_vss_onchannelcallback(channel); tasklet_schedule(&((struct vmbus_channel *)channel)->callback_event);
} }
/* /*
......
...@@ -44,10 +44,8 @@ TRACE_EVENT(vmbus_onoffer, ...@@ -44,10 +44,8 @@ TRACE_EVENT(vmbus_onoffer,
__entry->monitorid = offer->monitorid; __entry->monitorid = offer->monitorid;
__entry->is_ddc_int = offer->is_dedicated_interrupt; __entry->is_ddc_int = offer->is_dedicated_interrupt;
__entry->connection_id = offer->connection_id; __entry->connection_id = offer->connection_id;
memcpy(__entry->if_type, export_guid(__entry->if_type, &offer->offer.if_type);
&offer->offer.if_type.b, 16); export_guid(__entry->if_instance, &offer->offer.if_instance);
memcpy(__entry->if_instance,
&offer->offer.if_instance.b, 16);
__entry->chn_flags = offer->offer.chn_flags; __entry->chn_flags = offer->offer.chn_flags;
__entry->mmio_mb = offer->offer.mmio_megabytes; __entry->mmio_mb = offer->offer.mmio_megabytes;
__entry->sub_idx = offer->offer.sub_channel_index; __entry->sub_idx = offer->offer.sub_channel_index;
...@@ -296,6 +294,25 @@ TRACE_EVENT(vmbus_send_tl_connect_request, ...@@ -296,6 +294,25 @@ TRACE_EVENT(vmbus_send_tl_connect_request,
) )
); );
/*
 * Tracepoint for vmbus_send_modifychannel(): records the child_relid and
 * target_vp fields of the message together with the caller-supplied ret,
 * and prints all three.
 */
TRACE_EVENT(vmbus_send_modifychannel,
TP_PROTO(const struct vmbus_channel_modifychannel *msg,
int ret),
TP_ARGS(msg, ret),
TP_STRUCT__entry(
__field(u32, child_relid)
__field(u32, target_vp)
__field(int, ret)
),
TP_fast_assign(
__entry->child_relid = msg->child_relid;
__entry->target_vp = msg->target_vp;
__entry->ret = ret;
),
TP_printk("binding child_relid 0x%x to target_vp 0x%x, ret %d",
__entry->child_relid, __entry->target_vp, __entry->ret
)
);
DECLARE_EVENT_CLASS(vmbus_channel, DECLARE_EVENT_CLASS(vmbus_channel,
TP_PROTO(const struct vmbus_channel *channel), TP_PROTO(const struct vmbus_channel *channel),
TP_ARGS(channel), TP_ARGS(channel),
......
...@@ -132,12 +132,6 @@ struct hv_per_cpu_context { ...@@ -132,12 +132,6 @@ struct hv_per_cpu_context {
* basis. * basis.
*/ */
struct tasklet_struct msg_dpc; struct tasklet_struct msg_dpc;
/*
* To optimize the mapping of relid to channel, maintain
* per-cpu list of the channels based on their CPU affinity.
*/
struct list_head chan_list;
}; };
struct hv_context { struct hv_context {
...@@ -202,6 +196,8 @@ int hv_ringbuffer_read(struct vmbus_channel *channel, ...@@ -202,6 +196,8 @@ int hv_ringbuffer_read(struct vmbus_channel *channel,
/* TODO: Need to make this configurable */ /* TODO: Need to make this configurable */
#define MAX_NUM_CHANNELS_SUPPORTED 256 #define MAX_NUM_CHANNELS_SUPPORTED 256
#define MAX_CHANNEL_RELIDS \
max(MAX_NUM_CHANNELS_SUPPORTED, HV_EVENT_FLAGS_COUNT)
enum vmbus_connect_state { enum vmbus_connect_state {
DISCONNECTED, DISCONNECTED,
...@@ -212,12 +208,13 @@ enum vmbus_connect_state { ...@@ -212,12 +208,13 @@ enum vmbus_connect_state {
#define MAX_SIZE_CHANNEL_MESSAGE HV_MESSAGE_PAYLOAD_BYTE_COUNT #define MAX_SIZE_CHANNEL_MESSAGE HV_MESSAGE_PAYLOAD_BYTE_COUNT
struct vmbus_connection { /*
/* * The CPU that Hyper-V will interrupt for VMBUS messages, such as
* CPU on which the initial host contact was made. * CHANNELMSG_OFFERCHANNEL and CHANNELMSG_RESCIND_CHANNELOFFER.
*/ */
int connect_cpu; #define VMBUS_CONNECT_CPU 0
struct vmbus_connection {
u32 msg_conn_id; u32 msg_conn_id;
atomic_t offer_in_progress; atomic_t offer_in_progress;
...@@ -250,6 +247,9 @@ struct vmbus_connection { ...@@ -250,6 +247,9 @@ struct vmbus_connection {
struct list_head chn_list; struct list_head chn_list;
struct mutex channel_mutex; struct mutex channel_mutex;
/* Array of channels */
struct vmbus_channel **channels;
/* /*
* An offer message is handled first on the work_queue, and then * An offer message is handled first on the work_queue, and then
* is further handled on handle_primary_chan_wq or * is further handled on handle_primary_chan_wq or
...@@ -317,6 +317,7 @@ struct vmbus_channel_message_table_entry { ...@@ -317,6 +317,7 @@ struct vmbus_channel_message_table_entry {
enum vmbus_channel_message_type message_type; enum vmbus_channel_message_type message_type;
enum vmbus_message_handler_type handler_type; enum vmbus_message_handler_type handler_type;
void (*message_handler)(struct vmbus_channel_message_header *msg); void (*message_handler)(struct vmbus_channel_message_header *msg);
u32 min_payload_len;
}; };
extern const struct vmbus_channel_message_table_entry extern const struct vmbus_channel_message_table_entry
...@@ -336,6 +337,9 @@ int vmbus_add_channel_kobj(struct hv_device *device_obj, ...@@ -336,6 +337,9 @@ int vmbus_add_channel_kobj(struct hv_device *device_obj,
void vmbus_remove_channel_attr_group(struct vmbus_channel *channel); void vmbus_remove_channel_attr_group(struct vmbus_channel *channel);
void vmbus_channel_map_relid(struct vmbus_channel *channel);
void vmbus_channel_unmap_relid(struct vmbus_channel *channel);
struct vmbus_channel *relid2channel(u32 relid); struct vmbus_channel *relid2channel(u32 relid);
void vmbus_free_channels(void); void vmbus_free_channels(void);
...@@ -374,12 +378,7 @@ static inline void hv_poll_channel(struct vmbus_channel *channel, ...@@ -374,12 +378,7 @@ static inline void hv_poll_channel(struct vmbus_channel *channel,
{ {
if (!channel) if (!channel)
return; return;
if (in_interrupt() && (channel->target_cpu == smp_processor_id())) {
cb(channel); cb(channel);
return;
}
smp_call_function_single(channel->target_cpu, cb, channel, true);
} }
enum hvutil_device_state { enum hvutil_device_state {
...@@ -396,6 +395,54 @@ enum delay { ...@@ -396,6 +395,54 @@ enum delay {
MESSAGE_DELAY = 1, MESSAGE_DELAY = 1,
}; };
extern const struct vmbus_device vmbus_devs[];
static inline bool hv_is_perf_channel(struct vmbus_channel *channel)
{
return vmbus_devs[channel->device_id].perf_device;
}
/*
 * Return true if @cpu is the target CPU of any perf channel on
 * vmbus_connection.chn_list, or of any channel on such a channel's sc_list.
 *
 * The caller must hold vmbus_connection.channel_mutex.
 */
static inline bool hv_is_alloced_cpu(unsigned int cpu)
{
	struct vmbus_channel *primary, *sub;

	lockdep_assert_held(&vmbus_connection.channel_mutex);

	/*
	 * List additions/deletions as well as updates of the target CPUs are
	 * protected by channel_mutex.
	 */
	list_for_each_entry(primary, &vmbus_connection.chn_list, listentry) {
		/* Only perf channels (and their sc_list entries) count. */
		if (hv_is_perf_channel(primary)) {
			if (primary->target_cpu == cpu)
				return true;

			list_for_each_entry(sub, &primary->sc_list, sc_list)
				if (sub->target_cpu == cpu)
					return true;
		}
	}

	return false;
}
/* Set @cpu in the hv_numa_map mask of the NUMA node that @cpu belongs to. */
static inline void hv_set_alloced_cpu(unsigned int cpu)
{
	int node = cpu_to_node(cpu);

	cpumask_set_cpu(cpu, &hv_context.hv_numa_map[node]);
}
/*
 * Clear @cpu from the hv_numa_map mask of its NUMA node, unless
 * hv_is_alloced_cpu() reports that the CPU is still in use.
 */
static inline void hv_clear_alloced_cpu(unsigned int cpu)
{
	if (!hv_is_alloced_cpu(cpu)) {
		int node = cpu_to_node(cpu);

		cpumask_clear_cpu(cpu, &hv_context.hv_numa_map[node]);
	}
}
/*
 * Account for a target CPU change from @old_cpu to @new_cpu: mark @new_cpu
 * first, then release @old_cpu (hv_clear_alloced_cpu() leaves it set if it
 * is still in use).
 */
static inline void hv_update_alloced_cpus(unsigned int old_cpu, unsigned int new_cpu)
{
	hv_set_alloced_cpu(new_cpu);
	hv_clear_alloced_cpu(old_cpu);
}
#ifdef CONFIG_HYPERV_TESTING #ifdef CONFIG_HYPERV_TESTING
int hv_debug_add_dev_dir(struct hv_device *dev); int hv_debug_add_dev_dir(struct hv_device *dev);
......
This diff is collapsed.
...@@ -636,9 +636,12 @@ void netvsc_device_remove(struct hv_device *device) ...@@ -636,9 +636,12 @@ void netvsc_device_remove(struct hv_device *device)
RCU_INIT_POINTER(net_device_ctx->nvdev, NULL); RCU_INIT_POINTER(net_device_ctx->nvdev, NULL);
/* And disassociate NAPI context from device */ /* Disable NAPI and disassociate its context from the device. */
for (i = 0; i < net_device->num_chn; i++) for (i = 0; i < net_device->num_chn; i++) {
/* See also vmbus_reset_channel_cb(). */
napi_disable(&net_device->chan_table[i].napi);
netif_napi_del(&net_device->chan_table[i].napi); netif_napi_del(&net_device->chan_table[i].napi);
}
/* /*
* At this point, no one should be accessing net_device * At this point, no one should be accessing net_device
......
...@@ -1356,11 +1356,11 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) ...@@ -1356,11 +1356,11 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
{ {
struct irq_cfg *cfg = irqd_cfg(data); struct irq_cfg *cfg = irqd_cfg(data);
struct hv_pcibus_device *hbus; struct hv_pcibus_device *hbus;
struct vmbus_channel *channel;
struct hv_pci_dev *hpdev; struct hv_pci_dev *hpdev;
struct pci_bus *pbus; struct pci_bus *pbus;
struct pci_dev *pdev; struct pci_dev *pdev;
struct cpumask *dest; struct cpumask *dest;
unsigned long flags;
struct compose_comp_ctxt comp; struct compose_comp_ctxt comp;
struct tran_int_desc *int_desc; struct tran_int_desc *int_desc;
struct { struct {
...@@ -1378,6 +1378,7 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) ...@@ -1378,6 +1378,7 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
dest = irq_data_get_effective_affinity_mask(data); dest = irq_data_get_effective_affinity_mask(data);
pbus = pdev->bus; pbus = pdev->bus;
hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata); hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);
channel = hbus->hdev->channel;
hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn)); hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
if (!hpdev) if (!hpdev)
goto return_null_message; goto return_null_message;
...@@ -1435,43 +1436,52 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) ...@@ -1435,43 +1436,52 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
goto free_int_desc; goto free_int_desc;
} }
/*
* Prevents hv_pci_onchannelcallback() from running concurrently
* in the tasklet.
*/
tasklet_disable(&channel->callback_event);
/* /*
* Since this function is called with IRQ locks held, can't * Since this function is called with IRQ locks held, can't
* do normal wait for completion; instead poll. * do normal wait for completion; instead poll.
*/ */
while (!try_wait_for_completion(&comp.comp_pkt.host_event)) { while (!try_wait_for_completion(&comp.comp_pkt.host_event)) {
unsigned long flags;
/* 0xFFFF means an invalid PCI VENDOR ID. */ /* 0xFFFF means an invalid PCI VENDOR ID. */
if (hv_pcifront_get_vendor_id(hpdev) == 0xFFFF) { if (hv_pcifront_get_vendor_id(hpdev) == 0xFFFF) {
dev_err_once(&hbus->hdev->device, dev_err_once(&hbus->hdev->device,
"the device has gone\n"); "the device has gone\n");
goto free_int_desc; goto enable_tasklet;
} }
/* /*
* When the higher level interrupt code calls us with * Make sure that the ring buffer data structure doesn't get
* interrupt disabled, we must poll the channel by calling * freed while we dereference the ring buffer pointer. Test
* the channel callback directly when channel->target_cpu is * for the channel's onchannel_callback being NULL within a
* the current CPU. When the higher level interrupt code * sched_lock critical section. See also the inline comments
* calls us with interrupt enabled, let's add the * in vmbus_reset_channel_cb().
* local_irq_save()/restore() to avoid race:
* hv_pci_onchannelcallback() can also run in tasklet.
*/ */
local_irq_save(flags); spin_lock_irqsave(&channel->sched_lock, flags);
if (unlikely(channel->onchannel_callback == NULL)) {
if (hbus->hdev->channel->target_cpu == smp_processor_id()) spin_unlock_irqrestore(&channel->sched_lock, flags);
goto enable_tasklet;
}
hv_pci_onchannelcallback(hbus); hv_pci_onchannelcallback(hbus);
spin_unlock_irqrestore(&channel->sched_lock, flags);
local_irq_restore(flags);
if (hpdev->state == hv_pcichild_ejecting) { if (hpdev->state == hv_pcichild_ejecting) {
dev_err_once(&hbus->hdev->device, dev_err_once(&hbus->hdev->device,
"the device is being ejected\n"); "the device is being ejected\n");
goto free_int_desc; goto enable_tasklet;
} }
udelay(100); udelay(100);
} }
tasklet_enable(&channel->callback_event);
if (comp.comp_pkt.completion_status < 0) { if (comp.comp_pkt.completion_status < 0) {
dev_err(&hbus->hdev->device, dev_err(&hbus->hdev->device,
"Request for interrupt failed: 0x%x", "Request for interrupt failed: 0x%x",
...@@ -1495,6 +1505,8 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) ...@@ -1495,6 +1505,8 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
put_pcichild(hpdev); put_pcichild(hpdev);
return; return;
enable_tasklet:
tasklet_enable(&channel->callback_event);
free_int_desc: free_int_desc:
kfree(int_desc); kfree(int_desc);
drop_reference: drop_reference:
......
...@@ -621,6 +621,64 @@ static inline struct storvsc_device *get_in_stor_device( ...@@ -621,6 +621,64 @@ static inline struct storvsc_device *get_in_stor_device(
} }
/*
 * Update the storvsc device's stor_chns array and alloced_cpus mask after
 * @channel's target CPU moves from @old to @new.
 *
 * The owning hv_device is taken from the primary channel when @channel has
 * one, otherwise from @channel itself. Nothing is done if no outgoing
 * storvsc device can be obtained for it.
 */
static void storvsc_change_target_cpu(struct vmbus_channel *channel, u32 old,
				      u32 new)
{
	struct vmbus_channel *replacement = NULL;
	struct storvsc_device *stor_device;
	struct vmbus_channel *sc;
	struct hv_device *device;
	unsigned long flags;
	int cpu;

	if (channel->primary_channel)
		device = channel->primary_channel->device_obj;
	else
		device = channel->device_obj;

	stor_device = get_out_stor_device(device);
	if (!stor_device)
		return;

	/* See storvsc_do_io() -> get_og_chn(). */
	spin_lock_irqsave(&device->channel->lock, flags);

	/*
	 * Look for another channel of this device (the primary first, then
	 * the sc_list entries) that still targets the "old" CPU.
	 */
	if (device->channel != channel && device->channel->target_cpu == old) {
		replacement = device->channel;
	} else {
		list_for_each_entry(sc, &device->channel->sc_list, sc_list) {
			if (sc != channel && sc->target_cpu == old) {
				replacement = sc;
				break;
			}
		}
	}

	/*
	 * If one exists, it takes over the "old" slot; otherwise the "old"
	 * CPU is dropped from the alloced_cpus mask.
	 */
	if (replacement)
		WRITE_ONCE(stor_device->stor_chns[old], replacement);
	else
		cpumask_clear_cpu(old, &stor_device->alloced_cpus);

	/* "Flush" the stor_chns array. */
	for_each_possible_cpu(cpu) {
		if (!stor_device->stor_chns[cpu])
			continue;
		if (cpumask_test_cpu(cpu, &stor_device->alloced_cpus))
			continue;
		WRITE_ONCE(stor_device->stor_chns[cpu], NULL);
	}

	/* Finally bind the "new" CPU slot to @channel. */
	WRITE_ONCE(stor_device->stor_chns[new], channel);
	cpumask_set_cpu(new, &stor_device->alloced_cpus);

	spin_unlock_irqrestore(&device->channel->lock, flags);
}
static void handle_sc_creation(struct vmbus_channel *new_sc) static void handle_sc_creation(struct vmbus_channel *new_sc)
{ {
struct hv_device *device = new_sc->primary_channel->device_obj; struct hv_device *device = new_sc->primary_channel->device_obj;
...@@ -648,6 +706,8 @@ static void handle_sc_creation(struct vmbus_channel *new_sc) ...@@ -648,6 +706,8 @@ static void handle_sc_creation(struct vmbus_channel *new_sc)
return; return;
} }
new_sc->change_target_cpu_callback = storvsc_change_target_cpu;
/* Add the sub-channel to the array of available channels. */ /* Add the sub-channel to the array of available channels. */
stor_device->stor_chns[new_sc->target_cpu] = new_sc; stor_device->stor_chns[new_sc->target_cpu] = new_sc;
cpumask_set_cpu(new_sc->target_cpu, &stor_device->alloced_cpus); cpumask_set_cpu(new_sc->target_cpu, &stor_device->alloced_cpus);
...@@ -876,6 +936,8 @@ static int storvsc_channel_init(struct hv_device *device, bool is_fc) ...@@ -876,6 +936,8 @@ static int storvsc_channel_init(struct hv_device *device, bool is_fc)
if (stor_device->stor_chns == NULL) if (stor_device->stor_chns == NULL)
return -ENOMEM; return -ENOMEM;
device->channel->change_target_cpu_callback = storvsc_change_target_cpu;
stor_device->stor_chns[device->channel->target_cpu] = device->channel; stor_device->stor_chns[device->channel->target_cpu] = device->channel;
cpumask_set_cpu(device->channel->target_cpu, cpumask_set_cpu(device->channel->target_cpu,
&stor_device->alloced_cpus); &stor_device->alloced_cpus);
...@@ -1248,8 +1310,10 @@ static struct vmbus_channel *get_og_chn(struct storvsc_device *stor_device, ...@@ -1248,8 +1310,10 @@ static struct vmbus_channel *get_og_chn(struct storvsc_device *stor_device,
const struct cpumask *node_mask; const struct cpumask *node_mask;
int num_channels, tgt_cpu; int num_channels, tgt_cpu;
if (stor_device->num_sc == 0) if (stor_device->num_sc == 0) {
stor_device->stor_chns[q_num] = stor_device->device->channel;
return stor_device->device->channel; return stor_device->device->channel;
}
/* /*
* Our channel array is sparsely populated and we * Our channel array is sparsely populated and we
...@@ -1258,7 +1322,6 @@ static struct vmbus_channel *get_og_chn(struct storvsc_device *stor_device, ...@@ -1258,7 +1322,6 @@ static struct vmbus_channel *get_og_chn(struct storvsc_device *stor_device,
* The strategy is simple: * The strategy is simple:
* I. Ensure NUMA locality * I. Ensure NUMA locality
* II. Distribute evenly (best effort) * II. Distribute evenly (best effort)
* III. Mapping is persistent.
*/ */
node_mask = cpumask_of_node(cpu_to_node(q_num)); node_mask = cpumask_of_node(cpu_to_node(q_num));
...@@ -1268,8 +1331,10 @@ static struct vmbus_channel *get_og_chn(struct storvsc_device *stor_device, ...@@ -1268,8 +1331,10 @@ static struct vmbus_channel *get_og_chn(struct storvsc_device *stor_device,
if (cpumask_test_cpu(tgt_cpu, node_mask)) if (cpumask_test_cpu(tgt_cpu, node_mask))
num_channels++; num_channels++;
} }
if (num_channels == 0) if (num_channels == 0) {
stor_device->stor_chns[q_num] = stor_device->device->channel;
return stor_device->device->channel; return stor_device->device->channel;
}
hash_qnum = q_num; hash_qnum = q_num;
while (hash_qnum >= num_channels) while (hash_qnum >= num_channels)
...@@ -1295,6 +1360,7 @@ static int storvsc_do_io(struct hv_device *device, ...@@ -1295,6 +1360,7 @@ static int storvsc_do_io(struct hv_device *device,
struct storvsc_device *stor_device; struct storvsc_device *stor_device;
struct vstor_packet *vstor_packet; struct vstor_packet *vstor_packet;
struct vmbus_channel *outgoing_channel, *channel; struct vmbus_channel *outgoing_channel, *channel;
unsigned long flags;
int ret = 0; int ret = 0;
const struct cpumask *node_mask; const struct cpumask *node_mask;
int tgt_cpu; int tgt_cpu;
...@@ -1308,10 +1374,11 @@ static int storvsc_do_io(struct hv_device *device, ...@@ -1308,10 +1374,11 @@ static int storvsc_do_io(struct hv_device *device,
request->device = device; request->device = device;
/* /*
* Select an an appropriate channel to send the request out. * Select an appropriate channel to send the request out.
*/ */
if (stor_device->stor_chns[q_num] != NULL) { /* See storvsc_change_target_cpu(). */
outgoing_channel = stor_device->stor_chns[q_num]; outgoing_channel = READ_ONCE(stor_device->stor_chns[q_num]);
if (outgoing_channel != NULL) {
if (outgoing_channel->target_cpu == q_num) { if (outgoing_channel->target_cpu == q_num) {
/* /*
* Ideally, we want to pick a different channel if * Ideally, we want to pick a different channel if
...@@ -1324,7 +1391,10 @@ static int storvsc_do_io(struct hv_device *device, ...@@ -1324,7 +1391,10 @@ static int storvsc_do_io(struct hv_device *device,
continue; continue;
if (tgt_cpu == q_num) if (tgt_cpu == q_num)
continue; continue;
channel = stor_device->stor_chns[tgt_cpu]; channel = READ_ONCE(
stor_device->stor_chns[tgt_cpu]);
if (channel == NULL)
continue;
if (hv_get_avail_to_write_percent( if (hv_get_avail_to_write_percent(
&channel->outbound) &channel->outbound)
> ring_avail_percent_lowater) { > ring_avail_percent_lowater) {
...@@ -1350,7 +1420,10 @@ static int storvsc_do_io(struct hv_device *device, ...@@ -1350,7 +1420,10 @@ static int storvsc_do_io(struct hv_device *device,
for_each_cpu(tgt_cpu, &stor_device->alloced_cpus) { for_each_cpu(tgt_cpu, &stor_device->alloced_cpus) {
if (cpumask_test_cpu(tgt_cpu, node_mask)) if (cpumask_test_cpu(tgt_cpu, node_mask))
continue; continue;
channel = stor_device->stor_chns[tgt_cpu]; channel = READ_ONCE(
stor_device->stor_chns[tgt_cpu]);
if (channel == NULL)
continue;
if (hv_get_avail_to_write_percent( if (hv_get_avail_to_write_percent(
&channel->outbound) &channel->outbound)
> ring_avail_percent_lowater) { > ring_avail_percent_lowater) {
...@@ -1360,7 +1433,14 @@ static int storvsc_do_io(struct hv_device *device, ...@@ -1360,7 +1433,14 @@ static int storvsc_do_io(struct hv_device *device,
} }
} }
} else { } else {
spin_lock_irqsave(&device->channel->lock, flags);
outgoing_channel = stor_device->stor_chns[q_num];
if (outgoing_channel != NULL) {
spin_unlock_irqrestore(&device->channel->lock, flags);
goto found_channel;
}
outgoing_channel = get_og_chn(stor_device, q_num); outgoing_channel = get_og_chn(stor_device, q_num);
spin_unlock_irqrestore(&device->channel->lock, flags);
} }
found_channel: found_channel:
......
This diff is collapsed.
...@@ -117,7 +117,7 @@ struct hv_ring_buffer { ...@@ -117,7 +117,7 @@ struct hv_ring_buffer {
* Ring data starts here + RingDataStartOffset * Ring data starts here + RingDataStartOffset
* !!! DO NOT place any fields below this !!! * !!! DO NOT place any fields below this !!!
*/ */
u8 buffer[0]; u8 buffer[];
} __packed; } __packed;
struct hv_ring_buffer_info { struct hv_ring_buffer_info {
...@@ -313,7 +313,7 @@ struct vmadd_remove_transfer_page_set { ...@@ -313,7 +313,7 @@ struct vmadd_remove_transfer_page_set {
struct gpa_range { struct gpa_range {
u32 byte_count; u32 byte_count;
u32 byte_offset; u32 byte_offset;
u64 pfn_array[0]; u64 pfn_array[];
}; };
/* /*
...@@ -425,7 +425,7 @@ enum vmbus_channel_message_type { ...@@ -425,7 +425,7 @@ enum vmbus_channel_message_type {
CHANNELMSG_19 = 19, CHANNELMSG_19 = 19,
CHANNELMSG_20 = 20, CHANNELMSG_20 = 20,
CHANNELMSG_TL_CONNECT_REQUEST = 21, CHANNELMSG_TL_CONNECT_REQUEST = 21,
CHANNELMSG_22 = 22, CHANNELMSG_MODIFYCHANNEL = 22,
CHANNELMSG_TL_CONNECT_RESULT = 23, CHANNELMSG_TL_CONNECT_RESULT = 23,
CHANNELMSG_COUNT CHANNELMSG_COUNT
}; };
...@@ -563,7 +563,7 @@ struct vmbus_channel_gpadl_header { ...@@ -563,7 +563,7 @@ struct vmbus_channel_gpadl_header {
u32 gpadl; u32 gpadl;
u16 range_buflen; u16 range_buflen;
u16 rangecount; u16 rangecount;
struct gpa_range range[0]; struct gpa_range range[];
} __packed; } __packed;
/* This is the followup packet that contains more PFNs. */ /* This is the followup packet that contains more PFNs. */
...@@ -571,7 +571,7 @@ struct vmbus_channel_gpadl_body { ...@@ -571,7 +571,7 @@ struct vmbus_channel_gpadl_body {
struct vmbus_channel_message_header header; struct vmbus_channel_message_header header;
u32 msgnumber; u32 msgnumber;
u32 gpadl; u32 gpadl;
u64 pfn[0]; u64 pfn[];
} __packed; } __packed;
struct vmbus_channel_gpadl_created { struct vmbus_channel_gpadl_created {
...@@ -620,6 +620,13 @@ struct vmbus_channel_tl_connect_request { ...@@ -620,6 +620,13 @@ struct vmbus_channel_tl_connect_request {
guid_t host_service_id; guid_t host_service_id;
} __packed; } __packed;
/* Modify Channel parameters, cf. vmbus_send_modifychannel() */
struct vmbus_channel_modifychannel {
struct vmbus_channel_message_header header;
u32 child_relid;
u32 target_vp;
} __packed;
struct vmbus_channel_version_response { struct vmbus_channel_version_response {
struct vmbus_channel_message_header header; struct vmbus_channel_message_header header;
u8 version_supported; u8 version_supported;
...@@ -672,7 +679,7 @@ struct vmbus_channel_msginfo { ...@@ -672,7 +679,7 @@ struct vmbus_channel_msginfo {
* The channel message that goes out on the "wire". * The channel message that goes out on the "wire".
* It will contain at minimum the VMBUS_CHANNEL_MESSAGE_HEADER header * It will contain at minimum the VMBUS_CHANNEL_MESSAGE_HEADER header
*/ */
unsigned char msg[0]; unsigned char msg[];
}; };
struct vmbus_close_msg { struct vmbus_close_msg {
...@@ -689,11 +696,6 @@ union hv_connection_id { ...@@ -689,11 +696,6 @@ union hv_connection_id {
} u; } u;
}; };
enum hv_numa_policy {
HV_BALANCED = 0,
HV_LOCALIZED,
};
enum vmbus_device_type { enum vmbus_device_type {
HV_IDE = 0, HV_IDE = 0,
HV_SCSI, HV_SCSI,
...@@ -771,6 +773,15 @@ struct vmbus_channel { ...@@ -771,6 +773,15 @@ struct vmbus_channel {
void (*onchannel_callback)(void *context); void (*onchannel_callback)(void *context);
void *channel_callback_context; void *channel_callback_context;
void (*change_target_cpu_callback)(struct vmbus_channel *channel,
u32 old, u32 new);
/*
* Synchronize channel scheduling and channel removal; see the inline
* comments in vmbus_chan_sched() and vmbus_reset_channel_cb().
*/
spinlock_t sched_lock;
/* /*
* A channel can be marked for one of three modes of reading: * A channel can be marked for one of three modes of reading:
* BATCHED - callback called from tasklet and should read * BATCHED - callback called from tasklet and should read
...@@ -802,10 +813,6 @@ struct vmbus_channel { ...@@ -802,10 +813,6 @@ struct vmbus_channel {
u32 target_vp; u32 target_vp;
/* The corresponding CPUID in the guest */ /* The corresponding CPUID in the guest */
u32 target_cpu; u32 target_cpu;
/*
* State to manage the CPU affiliation of channels.
*/
struct cpumask alloced_cpus_in_node;
int numa_node; int numa_node;
/* /*
* Support for sub-channels. For high performance devices, * Support for sub-channels. For high performance devices,
...@@ -854,11 +861,6 @@ struct vmbus_channel { ...@@ -854,11 +861,6 @@ struct vmbus_channel {
* Support per-channel state for use by vmbus drivers. * Support per-channel state for use by vmbus drivers.
*/ */
void *per_channel_state; void *per_channel_state;
/*
* To support per-cpu lookup mapping of relid to channel,
* link up channels based on their CPU affinity.
*/
struct list_head percpu_list;
/* /*
* Defer freeing channel until after all CPUs have * Defer freeing channel until after all CPUs have
...@@ -897,19 +899,14 @@ struct vmbus_channel { ...@@ -897,19 +899,14 @@ struct vmbus_channel {
*/ */
bool low_latency; bool low_latency;
bool probe_done;
/* /*
* NUMA distribution policy: * Cache the device ID here for easy access; this is useful, in
* We support two policies: * particular, in situations where the channel's device_obj has
* 1) Balanced: Here all performance critical channels are * not been allocated/initialized yet.
* distributed evenly amongst all the NUMA nodes.
* This policy will be the default policy.
* 2) Localized: All channels of a given instance of a
* performance critical service will be assigned CPUs
* within a selected NUMA node.
*/ */
enum hv_numa_policy affinity_policy; u16 device_id;
bool probe_done;
/* /*
* We must offload the handling of the primary/sub channels * We must offload the handling of the primary/sub channels
...@@ -964,12 +961,6 @@ static inline bool is_sub_channel(const struct vmbus_channel *c) ...@@ -964,12 +961,6 @@ static inline bool is_sub_channel(const struct vmbus_channel *c)
return c->offermsg.offer.sub_channel_index != 0; return c->offermsg.offer.sub_channel_index != 0;
} }
static inline void set_channel_affinity_state(struct vmbus_channel *c,
enum hv_numa_policy policy)
{
c->affinity_policy = policy;
}
static inline void set_channel_read_mode(struct vmbus_channel *c, static inline void set_channel_read_mode(struct vmbus_channel *c,
enum hv_callback_mode mode) enum hv_callback_mode mode)
{ {
...@@ -1017,7 +1008,7 @@ static inline void clear_low_latency_mode(struct vmbus_channel *c) ...@@ -1017,7 +1008,7 @@ static inline void clear_low_latency_mode(struct vmbus_channel *c)
c->low_latency = false; c->low_latency = false;
} }
void vmbus_onmessage(void *context); void vmbus_onmessage(struct vmbus_channel_message_header *hdr);
int vmbus_request_offers(void); int vmbus_request_offers(void);
...@@ -1531,6 +1522,7 @@ extern __u32 vmbus_proto_version; ...@@ -1531,6 +1522,7 @@ extern __u32 vmbus_proto_version;
int vmbus_send_tl_connect_request(const guid_t *shv_guest_servie_id, int vmbus_send_tl_connect_request(const guid_t *shv_guest_servie_id,
const guid_t *shv_host_servie_id); const guid_t *shv_host_servie_id);
int vmbus_send_modifychannel(u32 child_relid, u32 target_vp);
void vmbus_set_event(struct vmbus_channel *channel); void vmbus_set_event(struct vmbus_channel *channel);
/* Get the start of the ring buffer. */ /* Get the start of the ring buffer. */
......
...@@ -434,7 +434,7 @@ struct virtio_device_id { ...@@ -434,7 +434,7 @@ struct virtio_device_id {
* For Hyper-V devices we use the device guid as the id. * For Hyper-V devices we use the device guid as the id.
*/ */
struct hv_vmbus_device_id { struct hv_vmbus_device_id {
uuid_le guid; guid_t guid;
kernel_ulong_t driver_data; /* Data private to the driver */ kernel_ulong_t driver_data; /* Data private to the driver */
}; };
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment