diff options
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/hv/channel.c | 58 | ||||
-rw-r--r-- | drivers/hv/channel_mgmt.c | 439 | ||||
-rw-r--r-- | drivers/hv/connection.c | 58 | ||||
-rw-r--r-- | drivers/hv/hv.c | 16 | ||||
-rw-r--r-- | drivers/hv/hv_fcopy.c | 2 | ||||
-rw-r--r-- | drivers/hv/hv_snapshot.c | 2 | ||||
-rw-r--r-- | drivers/hv/hv_trace.h | 25 | ||||
-rw-r--r-- | drivers/hv/hyperv_vmbus.h | 81 | ||||
-rw-r--r-- | drivers/hv/vmbus_drv.c | 314 | ||||
-rw-r--r-- | drivers/net/hyperv/netvsc.c | 7 | ||||
-rw-r--r-- | drivers/pci/controller/pci-hyperv.c | 44 | ||||
-rw-r--r-- | drivers/scsi/storvsc_drv.c | 96 |
12 files changed, 745 insertions, 397 deletions
diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index 23f358cb7f49..90070b337c10 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -290,6 +290,34 @@ int vmbus_send_tl_connect_request(const guid_t *shv_guest_servie_id, EXPORT_SYMBOL_GPL(vmbus_send_tl_connect_request); /* + * Set/change the vCPU (@target_vp) the channel (@child_relid) will interrupt. + * + * CHANNELMSG_MODIFYCHANNEL messages are aynchronous. Also, Hyper-V does not + * ACK such messages. IOW we can't know when the host will stop interrupting + * the "old" vCPU and start interrupting the "new" vCPU for the given channel. + * + * The CHANNELMSG_MODIFYCHANNEL message type is supported since VMBus version + * VERSION_WIN10_V4_1. + */ +int vmbus_send_modifychannel(u32 child_relid, u32 target_vp) +{ + struct vmbus_channel_modifychannel conn_msg; + int ret; + + memset(&conn_msg, 0, sizeof(conn_msg)); + conn_msg.header.msgtype = CHANNELMSG_MODIFYCHANNEL; + conn_msg.child_relid = child_relid; + conn_msg.target_vp = target_vp; + + ret = vmbus_post_msg(&conn_msg, sizeof(conn_msg), true); + + trace_vmbus_send_modifychannel(&conn_msg, ret); + + return ret; +} +EXPORT_SYMBOL_GPL(vmbus_send_modifychannel); + +/* * create_gpadl_header - Creates a gpadl for the specified buffer */ static int create_gpadl_header(void *kbuffer, u32 size, @@ -594,35 +622,31 @@ post_msg_err: } EXPORT_SYMBOL_GPL(vmbus_teardown_gpadl); -static void reset_channel_cb(void *arg) -{ - struct vmbus_channel *channel = arg; - - channel->onchannel_callback = NULL; -} - void vmbus_reset_channel_cb(struct vmbus_channel *channel) { + unsigned long flags; + /* * vmbus_on_event(), running in the per-channel tasklet, can race * with vmbus_close_internal() in the case of SMP guest, e.g., when * the former is accessing channel->inbound.ring_buffer, the latter * could be freeing the ring_buffer pages, so here we must stop it * first. + * + * vmbus_chan_sched() might call the netvsc driver callback function + * that ends up scheduling NAPI work that accesses the ring buffer. + * At this point, we have to ensure that any such work is completed + * and that the channel ring buffer is no longer being accessed, cf. + * the calls to napi_disable() in netvsc_device_remove(). */ tasklet_disable(&channel->callback_event); - channel->sc_creation_callback = NULL; + /* See the inline comments in vmbus_chan_sched(). */ + spin_lock_irqsave(&channel->sched_lock, flags); + channel->onchannel_callback = NULL; + spin_unlock_irqrestore(&channel->sched_lock, flags); - /* Stop the callback asap */ - if (channel->target_cpu != get_cpu()) { - put_cpu(); - smp_call_function_single(channel->target_cpu, reset_channel_cb, - channel, true); - } else { - reset_channel_cb(channel); - put_cpu(); - } + channel->sc_creation_callback = NULL; /* Re-enable tasklet for use on re-open */ tasklet_enable(&channel->callback_event); diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 501c43c5851d..417a95e5094d 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -18,14 +18,15 @@ #include <linux/module.h> #include <linux/completion.h> #include <linux/delay.h> +#include <linux/cpu.h> #include <linux/hyperv.h> #include <asm/mshyperv.h> #include "hyperv_vmbus.h" -static void init_vp_index(struct vmbus_channel *channel, u16 dev_type); +static void init_vp_index(struct vmbus_channel *channel); -static const struct vmbus_device vmbus_devs[] = { +const struct vmbus_device vmbus_devs[] = { /* IDE */ { .dev_type = HV_IDE, HV_IDE_GUID, @@ -315,11 +316,11 @@ static struct vmbus_channel *alloc_channel(void) if (!channel) return NULL; + spin_lock_init(&channel->sched_lock); spin_lock_init(&channel->lock); init_completion(&channel->rescind_event); INIT_LIST_HEAD(&channel->sc_list); - INIT_LIST_HEAD(&channel->percpu_list); tasklet_init(&channel->callback_event, vmbus_on_event, (unsigned long)channel); @@ -340,23 +341,49 @@ static void free_channel(struct vmbus_channel *channel) kobject_put(&channel->kobj); } -static void percpu_channel_enq(void *arg) +void vmbus_channel_map_relid(struct vmbus_channel *channel) { - struct vmbus_channel *channel = arg; - struct hv_per_cpu_context *hv_cpu - = this_cpu_ptr(hv_context.cpu_context); - - list_add_tail_rcu(&channel->percpu_list, &hv_cpu->chan_list); + if (WARN_ON(channel->offermsg.child_relid >= MAX_CHANNEL_RELIDS)) + return; + /* + * The mapping of the channel's relid is visible from the CPUs that + * execute vmbus_chan_sched() by the time that vmbus_chan_sched() will + * execute: + * + * (a) In the "normal (i.e., not resuming from hibernation)" path, + * the full barrier in smp_store_mb() guarantees that the store + * is propagated to all CPUs before the add_channel_work work + * is queued. In turn, add_channel_work is queued before the + * channel's ring buffer is allocated/initialized and the + * OPENCHANNEL message for the channel is sent in vmbus_open(). + * Hyper-V won't start sending the interrupts for the channel + * before the OPENCHANNEL message is acked. The memory barrier + * in vmbus_chan_sched() -> sync_test_and_clear_bit() ensures + * that vmbus_chan_sched() must find the channel's relid in + * recv_int_page before retrieving the channel pointer from the + * array of channels. + * + * (b) In the "resuming from hibernation" path, the smp_store_mb() + * guarantees that the store is propagated to all CPUs before + * the VMBus connection is marked as ready for the resume event + * (cf. check_ready_for_resume_event()). The interrupt handler + * of the VMBus driver and vmbus_chan_sched() can not run before + * vmbus_bus_resume() has completed execution (cf. resume_noirq). + */ + smp_store_mb( + vmbus_connection.channels[channel->offermsg.child_relid], + channel); } -static void percpu_channel_deq(void *arg) +void vmbus_channel_unmap_relid(struct vmbus_channel *channel) { - struct vmbus_channel *channel = arg; - - list_del_rcu(&channel->percpu_list); + if (WARN_ON(channel->offermsg.child_relid >= MAX_CHANNEL_RELIDS)) + return; + WRITE_ONCE( + vmbus_connection.channels[channel->offermsg.child_relid], + NULL); } - static void vmbus_release_relid(u32 relid) { struct vmbus_channel_relid_released msg; @@ -373,39 +400,43 @@ static void vmbus_release_relid(u32 relid) void hv_process_channel_removal(struct vmbus_channel *channel) { - struct vmbus_channel *primary_channel; unsigned long flags; - BUG_ON(!mutex_is_locked(&vmbus_connection.channel_mutex)); + lockdep_assert_held(&vmbus_connection.channel_mutex); BUG_ON(!channel->rescind); - if (channel->target_cpu != get_cpu()) { - put_cpu(); - smp_call_function_single(channel->target_cpu, - percpu_channel_deq, channel, true); - } else { - percpu_channel_deq(channel); - put_cpu(); - } + /* + * hv_process_channel_removal() could find INVALID_RELID only for + * hv_sock channels. See the inline comments in vmbus_onoffer(). + */ + WARN_ON(channel->offermsg.child_relid == INVALID_RELID && + !is_hvsock_channel(channel)); + + /* + * Upon suspend, an in-use hv_sock channel is removed from the array of + * channels and the relid is invalidated. After hibernation, when the + * user-space appplication destroys the channel, it's unnecessary and + * unsafe to remove the channel from the array of channels. See also + * the inline comments before the call of vmbus_release_relid() below. + */ + if (channel->offermsg.child_relid != INVALID_RELID) + vmbus_channel_unmap_relid(channel); if (channel->primary_channel == NULL) { list_del(&channel->listentry); - - primary_channel = channel; } else { - primary_channel = channel->primary_channel; + struct vmbus_channel *primary_channel = channel->primary_channel; spin_lock_irqsave(&primary_channel->lock, flags); list_del(&channel->sc_list); spin_unlock_irqrestore(&primary_channel->lock, flags); } /* - * We need to free the bit for init_vp_index() to work in the case - * of sub-channel, when we reload drivers like hv_netvsc. + * If this is a "perf" channel, updates the hv_numa_map[] masks so that + * init_vp_index() can (re-)use the CPU. */ - if (channel->affinity_policy == HV_LOCALIZED) - cpumask_clear_cpu(channel->target_cpu, - &primary_channel->alloced_cpus_in_node); + if (hv_is_perf_channel(channel)) + hv_clear_alloced_cpu(channel->target_cpu); /* * Upon suspend, an in-use hv_sock channel is marked as "rescinded" and @@ -440,23 +471,8 @@ static void vmbus_add_channel_work(struct work_struct *work) container_of(work, struct vmbus_channel, add_channel_work); struct vmbus_channel *primary_channel = newchannel->primary_channel; unsigned long flags; - u16 dev_type; int ret; - dev_type = hv_get_dev_type(newchannel); - - init_vp_index(newchannel, dev_type); - - if (newchannel->target_cpu != get_cpu()) { - put_cpu(); - smp_call_function_single(newchannel->target_cpu, - percpu_channel_enq, - newchannel, true); - } else { - percpu_channel_enq(newchannel); - put_cpu(); - } - /* * This state is used to indicate a successful open * so that when we do close the channel normally, we @@ -488,7 +504,7 @@ static void vmbus_add_channel_work(struct work_struct *work) if (!newchannel->device_obj) goto err_deq_chan; - newchannel->device_obj->device_id = dev_type; + newchannel->device_obj->device_id = newchannel->device_id; /* * Add the new device to the bus. This will kick off device-driver * binding which eventually invokes the device driver's AddDevice() @@ -523,17 +539,10 @@ err_deq_chan: spin_unlock_irqrestore(&primary_channel->lock, flags); } - mutex_unlock(&vmbus_connection.channel_mutex); + /* vmbus_process_offer() has mapped the channel. */ + vmbus_channel_unmap_relid(newchannel); - if (newchannel->target_cpu != get_cpu()) { - put_cpu(); - smp_call_function_single(newchannel->target_cpu, - percpu_channel_deq, - newchannel, true); - } else { - percpu_channel_deq(newchannel); - put_cpu(); - } + mutex_unlock(&vmbus_connection.channel_mutex); vmbus_release_relid(newchannel->offermsg.child_relid); @@ -551,8 +560,35 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) unsigned long flags; bool fnew = true; + /* + * Synchronize vmbus_process_offer() and CPU hotplugging: + * + * CPU1 CPU2 + * + * [vmbus_process_offer()] [Hot removal of the CPU] + * + * CPU_READ_LOCK CPUS_WRITE_LOCK + * LOAD cpu_online_mask SEARCH chn_list + * STORE target_cpu LOAD target_cpu + * INSERT chn_list STORE cpu_online_mask + * CPUS_READ_UNLOCK CPUS_WRITE_UNLOCK + * + * Forbids: CPU1's LOAD from *not* seing CPU2's STORE && + * CPU2's SEARCH from *not* seeing CPU1's INSERT + * + * Forbids: CPU2's SEARCH from seeing CPU1's INSERT && + * CPU2's LOAD from *not* seing CPU1's STORE + */ + cpus_read_lock(); + + /* + * Serializes the modifications of the chn_list list as well as + * the accesses to next_numa_node_id in init_vp_index(). + */ mutex_lock(&vmbus_connection.channel_mutex); + init_vp_index(newchannel); + /* Remember the channels that should be cleaned up upon suspend. */ if (is_hvsock_channel(newchannel) || is_sub_channel(newchannel)) atomic_inc(&vmbus_connection.nr_chan_close_on_suspend); @@ -599,7 +635,10 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) spin_unlock_irqrestore(&channel->lock, flags); } + vmbus_channel_map_relid(newchannel); + mutex_unlock(&vmbus_connection.channel_mutex); + cpus_read_unlock(); /* * vmbus_process_offer() mustn't call channel->sc_creation_callback() @@ -632,73 +671,61 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) * We use this state to statically distribute the channel interrupt load. */ static int next_numa_node_id; -/* - * init_vp_index() accesses global variables like next_numa_node_id, and - * it can run concurrently for primary channels and sub-channels: see - * vmbus_process_offer(), so we need the lock to protect the global - * variables. - */ -static DEFINE_SPINLOCK(bind_channel_to_cpu_lock); /* * Starting with Win8, we can statically distribute the incoming * channel interrupt load by binding a channel to VCPU. - * We distribute the interrupt loads to one or more NUMA nodes based on - * the channel's affinity_policy. * * For pre-win8 hosts or non-performance critical channels we assign the - * first CPU in the first NUMA node. + * VMBUS_CONNECT_CPU. + * + * Starting with win8, performance critical channels will be distributed + * evenly among all the available NUMA nodes. Once the node is assigned, + * we will assign the CPU based on a simple round robin scheme. */ -static void init_vp_index(struct vmbus_channel *channel, u16 dev_type) +static void init_vp_index(struct vmbus_channel *channel) { - u32 cur_cpu; - bool perf_chn = vmbus_devs[dev_type].perf_device; - struct vmbus_channel *primary = channel->primary_channel; - int next_node; + bool perf_chn = hv_is_perf_channel(channel); cpumask_var_t available_mask; struct cpumask *alloced_mask; + u32 target_cpu; + int numa_node; if ((vmbus_proto_version == VERSION_WS2008) || (vmbus_proto_version == VERSION_WIN7) || (!perf_chn) || !alloc_cpumask_var(&available_mask, GFP_KERNEL)) { /* * Prior to win8, all channel interrupts are - * delivered on cpu 0. + * delivered on VMBUS_CONNECT_CPU. * Also if the channel is not a performance critical - * channel, bind it to cpu 0. - * In case alloc_cpumask_var() fails, bind it to cpu 0. + * channel, bind it to VMBUS_CONNECT_CPU. + * In case alloc_cpumask_var() fails, bind it to + * VMBUS_CONNECT_CPU. */ - channel->numa_node = 0; - channel->target_cpu = 0; - channel->target_vp = hv_cpu_number_to_vp_number(0); + channel->numa_node = cpu_to_node(VMBUS_CONNECT_CPU); + channel->target_cpu = VMBUS_CONNECT_CPU; + channel->target_vp = + hv_cpu_number_to_vp_number(VMBUS_CONNECT_CPU); + if (perf_chn) + hv_set_alloced_cpu(VMBUS_CONNECT_CPU); return; } - spin_lock(&bind_channel_to_cpu_lock); - - /* - * Based on the channel affinity policy, we will assign the NUMA - * nodes. - */ - - if ((channel->affinity_policy == HV_BALANCED) || (!primary)) { - while (true) { - next_node = next_numa_node_id++; - if (next_node == nr_node_ids) { - next_node = next_numa_node_id = 0; - continue; - } - if (cpumask_empty(cpumask_of_node(next_node))) - continue; - break; + while (true) { + numa_node = next_numa_node_id++; + if (numa_node == nr_node_ids) { + next_numa_node_id = 0; + continue; } - channel->numa_node = next_node; - primary = channel; + if (cpumask_empty(cpumask_of_node(numa_node))) + continue; + break; } - alloced_mask = &hv_context.hv_numa_map[primary->numa_node]; + channel->numa_node = numa_node; + alloced_mask = &hv_context.hv_numa_map[numa_node]; if (cpumask_weight(alloced_mask) == - cpumask_weight(cpumask_of_node(primary->numa_node))) { + cpumask_weight(cpumask_of_node(numa_node))) { /* * We have cycled through all the CPUs in the node; * reset the alloced map. @@ -706,59 +733,13 @@ static void init_vp_index(struct vmbus_channel *channel, u16 dev_type) cpumask_clear(alloced_mask); } - cpumask_xor(available_mask, alloced_mask, - cpumask_of_node(primary->numa_node)); - - cur_cpu = -1; - - if (primary->affinity_policy == HV_LOCALIZED) { - /* - * Normally Hyper-V host doesn't create more subchannels - * than there are VCPUs on the node but it is possible when not - * all present VCPUs on the node are initialized by guest. - * Clear the alloced_cpus_in_node to start over. - */ - if (cpumask_equal(&primary->alloced_cpus_in_node, - cpumask_of_node(primary->numa_node))) - cpumask_clear(&primary->alloced_cpus_in_node); - } - - while (true) { - cur_cpu = cpumask_next(cur_cpu, available_mask); - if (cur_cpu >= nr_cpu_ids) { - cur_cpu = -1; - cpumask_copy(available_mask, - cpumask_of_node(primary->numa_node)); - continue; - } - - if (primary->affinity_policy == HV_LOCALIZED) { - /* - * NOTE: in the case of sub-channel, we clear the - * sub-channel related bit(s) in - * primary->alloced_cpus_in_node in - * hv_process_channel_removal(), so when we - * reload drivers like hv_netvsc in SMP guest, here - * we're able to re-allocate - * bit from primary->alloced_cpus_in_node. - */ - if (!cpumask_test_cpu(cur_cpu, - &primary->alloced_cpus_in_node)) { - cpumask_set_cpu(cur_cpu, - &primary->alloced_cpus_in_node); - cpumask_set_cpu(cur_cpu, alloced_mask); - break; - } - } else { - cpumask_set_cpu(cur_cpu, alloced_mask); - break; - } - } + cpumask_xor(available_mask, alloced_mask, cpumask_of_node(numa_node)); - channel->target_cpu = cur_cpu; - channel->target_vp = hv_cpu_number_to_vp_number(cur_cpu); + target_cpu = cpumask_first(available_mask); + cpumask_set_cpu(target_cpu, alloced_mask); - spin_unlock(&bind_channel_to_cpu_lock); + channel->target_cpu = target_cpu; + channel->target_vp = hv_cpu_number_to_vp_number(target_cpu); free_cpumask_var(available_mask); } @@ -890,6 +871,7 @@ static void vmbus_setup_channel_state(struct vmbus_channel *channel, sizeof(struct vmbus_channel_offer_channel)); channel->monitor_grp = (u8)offer->monitorid / 32; channel->monitor_bit = (u8)offer->monitorid % 32; + channel->device_id = hv_get_dev_type(channel); } /* @@ -940,8 +922,6 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) oldchannel = find_primary_channel_by_offer(offer); if (oldchannel != NULL) { - atomic_dec(&vmbus_connection.offer_in_progress); - /* * We're resuming from hibernation: all the sub-channel and * hv_sock channels we had before the hibernation should have @@ -949,36 +929,65 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) * primary channel that we had before the hibernation. */ + /* + * { Initially: channel relid = INVALID_RELID, + * channels[valid_relid] = NULL } + * + * CPU1 CPU2 + * + * [vmbus_onoffer()] [vmbus_device_release()] + * + * LOCK channel_mutex LOCK channel_mutex + * STORE channel relid = valid_relid LOAD r1 = channel relid + * MAP_RELID channel if (r1 != INVALID_RELID) + * UNLOCK channel_mutex UNMAP_RELID channel + * UNLOCK channel_mutex + * + * Forbids: r1 == valid_relid && + * channels[valid_relid] == channel + * + * Note. r1 can be INVALID_RELID only for an hv_sock channel. + * None of the hv_sock channels which were present before the + * suspend are re-offered upon the resume. See the WARN_ON() + * in hv_process_channel_removal(). + */ + mutex_lock(&vmbus_connection.channel_mutex); + + atomic_dec(&vmbus_connection.offer_in_progress); + WARN_ON(oldchannel->offermsg.child_relid != INVALID_RELID); /* Fix up the relid. */ oldchannel->offermsg.child_relid = offer->child_relid; offer_sz = sizeof(*offer); - if (memcmp(offer, &oldchannel->offermsg, offer_sz) == 0) { - check_ready_for_resume_event(); - return; + if (memcmp(offer, &oldchannel->offermsg, offer_sz) != 0) { + /* + * This is not an error, since the host can also change + * the other field(s) of the offer, e.g. on WS RS5 + * (Build 17763), the offer->connection_id of the + * Mellanox VF vmbus device can change when the host + * reoffers the device upon resume. + */ + pr_debug("vmbus offer changed: relid=%d\n", + offer->child_relid); + + print_hex_dump_debug("Old vmbus offer: ", + DUMP_PREFIX_OFFSET, 16, 4, + &oldchannel->offermsg, offer_sz, + false); + print_hex_dump_debug("New vmbus offer: ", + DUMP_PREFIX_OFFSET, 16, 4, + offer, offer_sz, false); + + /* Fix up the old channel. */ + vmbus_setup_channel_state(oldchannel, offer); } - /* - * This is not an error, since the host can also change the - * other field(s) of the offer, e.g. on WS RS5 (Build 17763), - * the offer->connection_id of the Mellanox VF vmbus device - * can change when the host reoffers the device upon resume. - */ - pr_debug("vmbus offer changed: relid=%d\n", - offer->child_relid); - - print_hex_dump_debug("Old vmbus offer: ", DUMP_PREFIX_OFFSET, - 16, 4, &oldchannel->offermsg, offer_sz, - false); - print_hex_dump_debug("New vmbus offer: ", DUMP_PREFIX_OFFSET, - 16, 4, offer, offer_sz, false); - - /* Fix up the old channel. */ - vmbus_setup_channel_state(oldchannel, offer); - + /* Add the channel back to the array of channels. */ + vmbus_channel_map_relid(oldchannel); check_ready_for_resume_event(); + mutex_unlock(&vmbus_connection.channel_mutex); return; } @@ -1028,11 +1037,22 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) * offer comes in first and then the rescind. * Since we process these events in work elements, * and with preemption, we may end up processing - * the events out of order. Given that we handle these - * work elements on the same CPU, this is possible only - * in the case of preemption. In any case wait here - * until the offer processing has moved beyond the - * point where the channel is discoverable. + * the events out of order. We rely on the synchronization + * provided by offer_in_progress and by channel_mutex for + * ordering these events: + * + * { Initially: offer_in_progress = 1 } + * + * CPU1 CPU2 + * + * [vmbus_onoffer()] [vmbus_onoffer_rescind()] + * + * LOCK channel_mutex WAIT_ON offer_in_progress == 0 + * DECREMENT offer_in_progress LOCK channel_mutex + * STORE channels[] LOAD channels[] + * UNLOCK channel_mutex UNLOCK channel_mutex + * + * Forbids: CPU2's LOAD from *not* seeing CPU1's STORE */ while (atomic_read(&vmbus_connection.offer_in_progress) != 0) { @@ -1332,30 +1352,36 @@ static void vmbus_onversion_response( /* Channel message dispatch table */ const struct vmbus_channel_message_table_entry channel_message_table[CHANNELMSG_COUNT] = { - { CHANNELMSG_INVALID, 0, NULL }, - { CHANNELMSG_OFFERCHANNEL, 0, vmbus_onoffer }, - { CHANNELMSG_RESCIND_CHANNELOFFER, 0, vmbus_onoffer_rescind }, - { CHANNELMSG_REQUESTOFFERS, 0, NULL }, - { CHANNELMSG_ALLOFFERS_DELIVERED, 1, vmbus_onoffers_delivered }, - { CHANNELMSG_OPENCHANNEL, 0, NULL }, - { CHANNELMSG_OPENCHANNEL_RESULT, 1, vmbus_onopen_result }, - { CHANNELMSG_CLOSECHANNEL, 0, NULL }, - { CHANNELMSG_GPADL_HEADER, 0, NULL }, - { CHANNELMSG_GPADL_BODY, 0, NULL }, - { CHANNELMSG_GPADL_CREATED, 1, vmbus_ongpadl_created }, - { CHANNELMSG_GPADL_TEARDOWN, 0, NULL }, - { CHANNELMSG_GPADL_TORNDOWN, 1, vmbus_ongpadl_torndown }, - { CHANNELMSG_RELID_RELEASED, 0, NULL }, - { CHANNELMSG_INITIATE_CONTACT, 0, NULL }, - { CHANNELMSG_VERSION_RESPONSE, 1, vmbus_onversion_response }, - { CHANNELMSG_UNLOAD, 0, NULL }, - { CHANNELMSG_UNLOAD_RESPONSE, 1, vmbus_unload_response }, - { CHANNELMSG_18, 0, NULL }, - { CHANNELMSG_19, 0, NULL }, - { CHANNELMSG_20, 0, NULL }, - { CHANNELMSG_TL_CONNECT_REQUEST, 0, NULL }, - { CHANNELMSG_22, 0, NULL }, - { CHANNELMSG_TL_CONNECT_RESULT, 0, NULL }, + { CHANNELMSG_INVALID, 0, NULL, 0}, + { CHANNELMSG_OFFERCHANNEL, 0, vmbus_onoffer, + sizeof(struct vmbus_channel_offer_channel)}, + { CHANNELMSG_RESCIND_CHANNELOFFER, 0, vmbus_onoffer_rescind, + sizeof(struct vmbus_channel_rescind_offer) }, + { CHANNELMSG_REQUESTOFFERS, 0, NULL, 0}, + { CHANNELMSG_ALLOFFERS_DELIVERED, 1, vmbus_onoffers_delivered, 0}, + { CHANNELMSG_OPENCHANNEL, 0, NULL, 0}, + { CHANNELMSG_OPENCHANNEL_RESULT, 1, vmbus_onopen_result, + sizeof(struct vmbus_channel_open_result)}, + { CHANNELMSG_CLOSECHANNEL, 0, NULL, 0}, + { CHANNELMSG_GPADL_HEADER, 0, NULL, 0}, + { CHANNELMSG_GPADL_BODY, 0, NULL, 0}, + { CHANNELMSG_GPADL_CREATED, 1, vmbus_ongpadl_created, + sizeof(struct vmbus_channel_gpadl_created)}, + { CHANNELMSG_GPADL_TEARDOWN, 0, NULL, 0}, + { CHANNELMSG_GPADL_TORNDOWN, 1, vmbus_ongpadl_torndown, + sizeof(struct vmbus_channel_gpadl_torndown) }, + { CHANNELMSG_RELID_RELEASED, 0, NULL, 0}, + { CHANNELMSG_INITIATE_CONTACT, 0, NULL, 0}, + { CHANNELMSG_VERSION_RESPONSE, 1, vmbus_onversion_response, + sizeof(struct vmbus_channel_version_response)}, + { CHANNELMSG_UNLOAD, 0, NULL, 0}, + { CHANNELMSG_UNLOAD_RESPONSE, 1, vmbus_unload_response, 0}, + { CHANNELMSG_18, 0, NULL, 0}, + { CHANNELMSG_19, 0, NULL, 0}, + { CHANNELMSG_20, 0, NULL, 0}, + { CHANNELMSG_TL_CONNECT_REQUEST, 0, NULL, 0}, + { CHANNELMSG_MODIFYCHANNEL, 0, NULL, 0}, + { CHANNELMSG_TL_CONNECT_RESULT, 0, NULL, 0}, }; /* @@ -1363,13 +1389,8 @@ channel_message_table[CHANNELMSG_COUNT] = { * * This is invoked in the vmbus worker thread context. */ -void vmbus_onmessage(void *context) +void vmbus_onmessage(struct vmbus_channel_message_header *hdr) { - struct hv_message *msg = context; - struct vmbus_channel_message_header *hdr; - - hdr = (struct vmbus_channel_message_header *)msg->u.payload; - trace_vmbus_on_message(hdr); /* diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c index 74e77de89b4f..11170d9a2e1a 100644 --- a/drivers/hv/connection.c +++ b/drivers/hv/connection.c @@ -69,7 +69,6 @@ MODULE_PARM_DESC(max_version, int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, u32 version) { int ret = 0; - unsigned int cur_cpu; struct vmbus_channel_initiate_contact *msg; unsigned long flags; @@ -102,24 +101,7 @@ int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, u32 version) msg->monitor_page1 = virt_to_phys(vmbus_connection.monitor_pages[0]); msg->monitor_page2 = virt_to_phys(vmbus_connection.monitor_pages[1]); - /* - * We want all channel messages to be delivered on CPU 0. - * This has been the behavior pre-win8. This is not - * perf issue and having all channel messages delivered on CPU 0 - * would be ok. - * For post win8 hosts, we support receiving channel messagges on - * all the CPUs. This is needed for kexec to work correctly where - * the CPU attempting to connect may not be CPU 0. - */ - if (version >= VERSION_WIN8_1) { - cur_cpu = get_cpu(); - msg->target_vcpu = hv_cpu_number_to_vp_number(cur_cpu); - vmbus_connection.connect_cpu = cur_cpu; - put_cpu(); - } else { - msg->target_vcpu = 0; - vmbus_connection.connect_cpu = 0; - } + msg->target_vcpu = hv_cpu_number_to_vp_number(VMBUS_CONNECT_CPU); /* * Add to list before we send the request since we may @@ -266,6 +248,14 @@ int vmbus_connect(void) pr_info("Vmbus version:%d.%d\n", version >> 16, version & 0xFFFF); + vmbus_connection.channels = kcalloc(MAX_CHANNEL_RELIDS, + sizeof(struct vmbus_channel *), + GFP_KERNEL); + if (vmbus_connection.channels == NULL) { + ret = -ENOMEM; + goto cleanup; + } + kfree(msginfo); return 0; @@ -313,33 +303,9 @@ void vmbus_disconnect(void) */ struct vmbus_channel *relid2channel(u32 relid) { - struct vmbus_channel *channel; - struct vmbus_channel *found_channel = NULL; - struct list_head *cur, *tmp; - struct vmbus_channel *cur_sc; - - BUG_ON(!mutex_is_locked(&vmbus_connection.channel_mutex)); - - list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { - if (channel->offermsg.child_relid == relid) { - found_channel = channel; - break; - } else if (!list_empty(&channel->sc_list)) { - /* - * Deal with sub-channels. - */ - list_for_each_safe(cur, tmp, &channel->sc_list) { - cur_sc = list_entry(cur, struct vmbus_channel, - sc_list); - if (cur_sc->offermsg.child_relid == relid) { - found_channel = cur_sc; - break; - } - } - } - } - - return found_channel; + if (WARN_ON(relid >= MAX_CHANNEL_RELIDS)) + return NULL; + return READ_ONCE(vmbus_connection.channels[relid]); } /* diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index 533c8b82b344..857290dcfd95 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c @@ -117,8 +117,6 @@ int hv_synic_alloc(void) pr_err("Unable to allocate post msg page\n"); goto err; } - - INIT_LIST_HEAD(&hv_cpu->chan_list); } return 0; @@ -246,10 +244,18 @@ int hv_synic_cleanup(unsigned int cpu) unsigned long flags; /* + * Hyper-V does not provide a way to change the connect CPU once + * it is set; we must prevent the connect CPU from going offline. + */ + if (cpu == VMBUS_CONNECT_CPU) + return -EBUSY; + + /* * Search for channels which are bound to the CPU we're about to - * cleanup. In case we find one and vmbus is still connected we need to - * fail, this will effectively prevent CPU offlining. There is no way - * we can re-bind channels to different CPUs for now. + * cleanup. In case we find one and vmbus is still connected, we + * fail; this will effectively prevent CPU offlining. + * + * TODO: Re-bind the channels to different CPUs. */ mutex_lock(&vmbus_connection.channel_mutex); list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { diff --git a/drivers/hv/hv_fcopy.c b/drivers/hv/hv_fcopy.c index bb9ba3f7c794..5040d7e0cd9e 100644 --- a/drivers/hv/hv_fcopy.c +++ b/drivers/hv/hv_fcopy.c @@ -71,7 +71,7 @@ static void fcopy_poll_wrapper(void *channel) { /* Transaction is finished, reset the state here to avoid races. */ fcopy_transaction.state = HVUTIL_READY; - hv_fcopy_onchannelcallback(channel); + tasklet_schedule(&((struct vmbus_channel *)channel)->callback_event); } static void fcopy_timeout_func(struct work_struct *dummy) diff --git a/drivers/hv/hv_snapshot.c b/drivers/hv/hv_snapshot.c index 1c75b38f0d6d..783779e4cc1a 100644 --- a/drivers/hv/hv_snapshot.c +++ b/drivers/hv/hv_snapshot.c @@ -80,7 +80,7 @@ static void vss_poll_wrapper(void *channel) { /* Transaction is finished, reset the state here to avoid races. */ vss_transaction.state = HVUTIL_READY; - hv_vss_onchannelcallback(channel); + tasklet_schedule(&((struct vmbus_channel *)channel)->callback_event); } /* diff --git a/drivers/hv/hv_trace.h b/drivers/hv/hv_trace.h index f9d14db980cb..6063bb21bb13 100644 --- a/drivers/hv/hv_trace.h +++ b/drivers/hv/hv_trace.h @@ -44,10 +44,8 @@ TRACE_EVENT(vmbus_onoffer, __entry->monitorid = offer->monitorid; __entry->is_ddc_int = offer->is_dedicated_interrupt; __entry->connection_id = offer->connection_id; - memcpy(__entry->if_type, - &offer->offer.if_type.b, 16); - memcpy(__entry->if_instance, - &offer->offer.if_instance.b, 16); + export_guid(__entry->if_type, &offer->offer.if_type); + export_guid(__entry->if_instance, &offer->offer.if_instance); __entry->chn_flags = offer->offer.chn_flags; __entry->mmio_mb = offer->offer.mmio_megabytes; __entry->sub_idx = offer->offer.sub_channel_index; @@ -296,6 +294,25 @@ TRACE_EVENT(vmbus_send_tl_connect_request, ) ); +TRACE_EVENT(vmbus_send_modifychannel, + TP_PROTO(const struct vmbus_channel_modifychannel *msg, + int ret), + TP_ARGS(msg, ret), + TP_STRUCT__entry( + __field(u32, child_relid) + __field(u32, target_vp) + __field(int, ret) + ), + TP_fast_assign( + __entry->child_relid = msg->child_relid; + __entry->target_vp = msg->target_vp; + __entry->ret = ret; + ), + TP_printk("binding child_relid 0x%x to target_vp 0x%x, ret %d", + __entry->child_relid, __entry->target_vp, __entry->ret + ) + ); + DECLARE_EVENT_CLASS(vmbus_channel, TP_PROTO(const struct vmbus_channel *channel), TP_ARGS(channel), diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 70b30e223a57..40e2b9f91163 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -132,12 +132,6 @@ struct hv_per_cpu_context { * basis. */ struct tasklet_struct msg_dpc; - - /* - * To optimize the mapping of relid to channel, maintain - * per-cpu list of the channels based on their CPU affinity. - */ - struct list_head chan_list; }; struct hv_context { @@ -202,6 +196,8 @@ int hv_ringbuffer_read(struct vmbus_channel *channel, /* TODO: Need to make this configurable */ #define MAX_NUM_CHANNELS_SUPPORTED 256 +#define MAX_CHANNEL_RELIDS \ + max(MAX_NUM_CHANNELS_SUPPORTED, HV_EVENT_FLAGS_COUNT) enum vmbus_connect_state { DISCONNECTED, @@ -212,12 +208,13 @@ enum vmbus_connect_state { #define MAX_SIZE_CHANNEL_MESSAGE HV_MESSAGE_PAYLOAD_BYTE_COUNT -struct vmbus_connection { - /* - * CPU on which the initial host contact was made. - */ - int connect_cpu; +/* + * The CPU that Hyper-V will interrupt for VMBUS messages, such as + * CHANNELMSG_OFFERCHANNEL and CHANNELMSG_RESCIND_CHANNELOFFER. + */ +#define VMBUS_CONNECT_CPU 0 +struct vmbus_connection { u32 msg_conn_id; atomic_t offer_in_progress; @@ -250,6 +247,9 @@ struct vmbus_connection { struct list_head chn_list; struct mutex channel_mutex; + /* Array of channels */ + struct vmbus_channel **channels; + /* * An offer message is handled first on the work_queue, and then * is further handled on handle_primary_chan_wq or @@ -317,6 +317,7 @@ struct vmbus_channel_message_table_entry { enum vmbus_channel_message_type message_type; enum vmbus_message_handler_type handler_type; void (*message_handler)(struct vmbus_channel_message_header *msg); + u32 min_payload_len; }; extern const struct vmbus_channel_message_table_entry @@ -336,6 +337,9 @@ int vmbus_add_channel_kobj(struct hv_device *device_obj, void vmbus_remove_channel_attr_group(struct vmbus_channel *channel); +void vmbus_channel_map_relid(struct vmbus_channel *channel); +void vmbus_channel_unmap_relid(struct vmbus_channel *channel); + struct vmbus_channel *relid2channel(u32 relid); void vmbus_free_channels(void); @@ -374,12 +378,7 @@ static inline void hv_poll_channel(struct vmbus_channel *channel, { if (!channel) return; - - if (in_interrupt() && (channel->target_cpu == smp_processor_id())) { - cb(channel); - return; - } - smp_call_function_single(channel->target_cpu, cb, channel, true); + cb(channel); } enum hvutil_device_state { @@ -396,6 +395,54 @@ enum delay { MESSAGE_DELAY = 1, }; +extern const struct vmbus_device vmbus_devs[]; + +static inline bool hv_is_perf_channel(struct vmbus_channel *channel) +{ + return vmbus_devs[channel->device_id].perf_device; +} + +static inline bool hv_is_alloced_cpu(unsigned int cpu) +{ + struct vmbus_channel *channel, *sc; + + lockdep_assert_held(&vmbus_connection.channel_mutex); + /* + * List additions/deletions as well as updates of the target CPUs are + * protected by channel_mutex. + */ + list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { + if (!hv_is_perf_channel(channel)) + continue; + if (channel->target_cpu == cpu) + return true; + list_for_each_entry(sc, &channel->sc_list, sc_list) { + if (sc->target_cpu == cpu) + return true; + } + } + return false; +} + +static inline void hv_set_alloced_cpu(unsigned int cpu) +{ + cpumask_set_cpu(cpu, &hv_context.hv_numa_map[cpu_to_node(cpu)]); +} + +static inline void hv_clear_alloced_cpu(unsigned int cpu) +{ + if (hv_is_alloced_cpu(cpu)) + return; + cpumask_clear_cpu(cpu, &hv_context.hv_numa_map[cpu_to_node(cpu)]); +} + +static inline void hv_update_alloced_cpus(unsigned int old_cpu, + unsigned int new_cpu) +{ + hv_set_alloced_cpu(new_cpu); + hv_clear_alloced_cpu(old_cpu); +} + #ifdef CONFIG_HYPERV_TESTING int hv_debug_add_dev_dir(struct hv_device *dev); diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index e06c6b9555cf..9147ee9d5f7d 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -117,14 +117,6 @@ static int vmbus_exists(void) return 0; } -#define VMBUS_ALIAS_LEN ((sizeof((struct hv_vmbus_device_id *)0)->guid) * 2) -static void print_alias_name(struct hv_device *hv_dev, char *alias_name) -{ - int i; - for (i = 0; i < VMBUS_ALIAS_LEN; i += 2) - sprintf(&alias_name[i], "%02x", hv_dev->dev_type.b[i/2]); -} - static u8 channel_monitor_group(const struct vmbus_channel *channel) { return (u8)channel->offermsg.monitorid / 32; @@ -201,7 +193,7 @@ static ssize_t class_id_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; return sprintf(buf, "{%pUl}\n", - hv_dev->channel->offermsg.offer.if_type.b); + &hv_dev->channel->offermsg.offer.if_type); } static DEVICE_ATTR_RO(class_id); @@ -213,7 +205,7 @@ static ssize_t device_id_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; return sprintf(buf, "{%pUl}\n", - hv_dev->channel->offermsg.offer.if_instance.b); + &hv_dev->channel->offermsg.offer.if_instance); } static DEVICE_ATTR_RO(device_id); @@ -221,10 +213,8 @@ static ssize_t modalias_show(struct device *dev, struct device_attribute *dev_attr, char *buf) { struct hv_device *hv_dev = device_to_hv_device(dev); - char alias_name[VMBUS_ALIAS_LEN + 1]; - print_alias_name(hv_dev, alias_name); - return sprintf(buf, "vmbus:%s\n", alias_name); + return sprintf(buf, "vmbus:%*phN\n", UUID_SIZE, &hv_dev->dev_type); } static DEVICE_ATTR_RO(modalias); @@ -693,12 +683,9 @@ __ATTRIBUTE_GROUPS(vmbus_dev); static int vmbus_uevent(struct device *device, struct kobj_uevent_env *env) { struct hv_device *dev = device_to_hv_device(device); - int ret; - char alias_name[VMBUS_ALIAS_LEN + 1]; + const char *format = "MODALIAS=vmbus:%*phN"; - print_alias_name(dev, alias_name); - ret = add_uevent_var(env, "MODALIAS=vmbus:%s", alias_name); - return ret; + return add_uevent_var(env, format, UUID_SIZE, &dev->dev_type); } static const struct hv_vmbus_device_id * @@ -1033,7 +1020,10 @@ static struct bus_type hv_bus = { struct onmessage_work_context { struct work_struct work; - struct hv_message msg; + struct { + struct hv_message_header header; + u8 payload[]; + } msg; }; static void vmbus_onmessage_work(struct work_struct *work) @@ -1046,7 +1036,8 @@ static void vmbus_onmessage_work(struct work_struct *work) ctx = container_of(work, struct onmessage_work_context, work); - vmbus_onmessage(&ctx->msg); + vmbus_onmessage((struct vmbus_channel_message_header *) + &ctx->msg.payload); kfree(ctx); } @@ -1061,6 +1052,13 @@ void vmbus_on_msg_dpc(unsigned long data) struct onmessage_work_context *ctx; u32 message_type = msg->header.message_type; + /* + * 'enum vmbus_channel_message_type' is supposed to always be 'u32' as + * it is being used in 'struct vmbus_channel_message_header' definition + * which is supposed to match hypervisor ABI. + */ + BUILD_BUG_ON(sizeof(enum vmbus_channel_message_type) != sizeof(u32)); + if (message_type == HVMSG_NONE) /* no msg */ return; @@ -1074,41 +1072,88 @@ void vmbus_on_msg_dpc(unsigned long data) goto msg_handled; } + if (msg->header.payload_size > HV_MESSAGE_PAYLOAD_BYTE_COUNT) { + WARN_ONCE(1, "payload size is too large (%d)\n", + msg->header.payload_size); + goto msg_handled; + } + entry = &channel_message_table[hdr->msgtype]; if (!entry->message_handler) goto msg_handled; + if (msg->header.payload_size < entry->min_payload_len) { + WARN_ONCE(1, "message too short: msgtype=%d len=%d\n", + hdr->msgtype, msg->header.payload_size); + goto msg_handled; + } + if (entry->handler_type == VMHT_BLOCKING) { - ctx = kmalloc(sizeof(*ctx), GFP_ATOMIC); + ctx = kmalloc(sizeof(*ctx) + msg->header.payload_size, + GFP_ATOMIC); if (ctx == NULL) return; INIT_WORK(&ctx->work, vmbus_onmessage_work); - memcpy(&ctx->msg, msg, sizeof(*msg)); + memcpy(&ctx->msg, msg, sizeof(msg->header) + + msg->header.payload_size); /* * The host can generate a rescind message while we * may still be handling the original offer. We deal with - * this condition by ensuring the processing is done on the - * same CPU. + * this condition by relying on the synchronization provided + * by offer_in_progress and by channel_mutex. See also the + * inline comments in vmbus_onoffer_rescind(). */ switch (hdr->msgtype) { case CHANNELMSG_RESCIND_CHANNELOFFER: /* * If we are handling the rescind message; * schedule the work on the global work queue. + * + * The OFFER message and the RESCIND message should + * not be handled by the same serialized work queue, + * because the OFFER handler may call vmbus_open(), + * which tries to open the channel by sending an + * OPEN_CHANNEL message to the host and waits for + * the host's response; however, if the host has + * rescinded the channel before it receives the + * OPEN_CHANNEL message, the host just silently + * ignores the OPEN_CHANNEL message; as a result, + * the guest's OFFER handler hangs for ever, if we + * handle the RESCIND message in the same serialized + * work queue: the RESCIND handler can not start to + * run before the OFFER handler finishes. */ - schedule_work_on(vmbus_connection.connect_cpu, - &ctx->work); + schedule_work(&ctx->work); break; case CHANNELMSG_OFFERCHANNEL: + /* + * The host sends the offer message of a given channel + * before sending the rescind message of the same + * channel. These messages are sent to the guest's + * connect CPU; the guest then starts processing them + * in the tasklet handler on this CPU: + * + * VMBUS_CONNECT_CPU + * + * [vmbus_on_msg_dpc()] + * atomic_inc() // CHANNELMSG_OFFERCHANNEL + * queue_work() + * ... + * [vmbus_on_msg_dpc()] + * schedule_work() // CHANNELMSG_RESCIND_CHANNELOFFER + * + * We rely on the memory-ordering properties of the + * queue_work() and schedule_work() primitives, which + * guarantee that the atomic increment will be visible + * to the CPUs which will execute the offer & rescind + * works by the time these works will start execution. + */ atomic_inc(&vmbus_connection.offer_in_progress); - queue_work_on(vmbus_connection.connect_cpu, - vmbus_connection.work_queue, - &ctx->work); - break; + fallthrough; default: queue_work(vmbus_connection.work_queue, &ctx->work); @@ -1133,10 +1178,11 @@ static void vmbus_force_channel_rescinded(struct vmbus_channel *channel) WARN_ON(!is_hvsock_channel(channel)); /* - * sizeof(*ctx) is small and the allocation should really not fail, + * Allocation size is small and the allocation should really not fail, * otherwise the state of the hv_sock connections ends up in limbo. */ - ctx = kzalloc(sizeof(*ctx), GFP_KERNEL | __GFP_NOFAIL); + ctx = kzalloc(sizeof(*ctx) + sizeof(*rescind), + GFP_KERNEL | __GFP_NOFAIL); /* * So far, these are not really used by Linux. Just set them to the @@ -1146,31 +1192,17 @@ static void vmbus_force_channel_rescinded(struct vmbus_channel *channel) ctx->msg.header.payload_size = sizeof(*rescind); /* These values are actually used by Linux. */ - rescind = (struct vmbus_channel_rescind_offer *)ctx->msg.u.payload; + rescind = (struct vmbus_channel_rescind_offer *)ctx->msg.payload; rescind->header.msgtype = CHANNELMSG_RESCIND_CHANNELOFFER; rescind->child_relid = channel->offermsg.child_relid; INIT_WORK(&ctx->work, vmbus_onmessage_work); - queue_work_on(vmbus_connection.connect_cpu, - vmbus_connection.work_queue, - &ctx->work); + queue_work(vmbus_connection.work_queue, &ctx->work); } #endif /* CONFIG_PM_SLEEP */ /* - * Direct callback for channels using other deferred processing - */ -static void vmbus_channel_isr(struct vmbus_channel *channel) -{ - void (*callback_fn)(void *); - - callback_fn = READ_ONCE(channel->onchannel_callback); - if (likely(callback_fn != NULL)) - (*callback_fn)(channel->channel_callback_context); -} - -/* * Schedule all channels with events pending */ static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu) @@ -1200,6 +1232,7 @@ static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu) return; for_each_set_bit(relid, recv_int_page, maxbits) { + void (*callback_fn)(void *context); struct vmbus_channel *channel; if (!sync_test_and_clear_bit(relid, recv_int_page)) @@ -1209,33 +1242,54 @@ static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu) if (relid == 0) continue; + /* + * Pairs with the kfree_rcu() in vmbus_chan_release(). + * Guarantees that the channel data structure doesn't + * get freed while the channel pointer below is being + * dereferenced. + */ rcu_read_lock(); /* Find channel based on relid */ - list_for_each_entry_rcu(channel, &hv_cpu->chan_list, percpu_list) { - if (channel->offermsg.child_relid != relid) - continue; + channel = relid2channel(relid); + if (channel == NULL) + goto sched_unlock_rcu; - if (channel->rescind) - continue; + if (channel->rescind) + goto sched_unlock_rcu; - trace_vmbus_chan_sched(channel); + /* + * Make sure that the ring buffer data structure doesn't get + * freed while we dereference the ring buffer pointer. Test + * for the channel's onchannel_callback being NULL within a + * sched_lock critical section. See also the inline comments + * in vmbus_reset_channel_cb(). + */ + spin_lock(&channel->sched_lock); - ++channel->interrupts; + callback_fn = channel->onchannel_callback; + if (unlikely(callback_fn == NULL)) + goto sched_unlock; - switch (channel->callback_mode) { - case HV_CALL_ISR: - vmbus_channel_isr(channel); - break; + trace_vmbus_chan_sched(channel); - case HV_CALL_BATCHED: - hv_begin_read(&channel->inbound); - /* fallthrough */ - case HV_CALL_DIRECT: - tasklet_schedule(&channel->callback_event); - } + ++channel->interrupts; + + switch (channel->callback_mode) { + case HV_CALL_ISR: + (*callback_fn)(channel->channel_callback_context); + break; + + case HV_CALL_BATCHED: + hv_begin_read(&channel->inbound); + fallthrough; + case HV_CALL_DIRECT: + tasklet_schedule(&channel->callback_event); } +sched_unlock: + spin_unlock(&channel->sched_lock); +sched_unlock_rcu: rcu_read_unlock(); } } @@ -1364,7 +1418,6 @@ static int vmbus_bus_init(void) { int ret; - /* Hypervisor initialization...setup hypercall page..etc */ ret = hv_init(); if (ret != 0) { pr_err("Unable to initialize the hypervisor - 0x%x\n", ret); @@ -1553,8 +1606,24 @@ static ssize_t vmbus_chan_attr_show(struct kobject *kobj, return attribute->show(chan, buf); } +static ssize_t vmbus_chan_attr_store(struct kobject *kobj, + struct attribute *attr, const char *buf, + size_t count) +{ + const struct vmbus_chan_attribute *attribute + = container_of(attr, struct vmbus_chan_attribute, attr); + struct vmbus_channel *chan + = container_of(kobj, struct vmbus_channel, kobj); + + if (!attribute->store) + return -EIO; + + return attribute->store(chan, buf, count); +} + static const struct sysfs_ops vmbus_chan_sysfs_ops = { .show = vmbus_chan_attr_show, + .store = vmbus_chan_attr_store, }; static ssize_t out_mask_show(struct vmbus_channel *channel, char *buf) @@ -1625,11 +1694,110 @@ static ssize_t write_avail_show(struct vmbus_channel *channel, char *buf) } static VMBUS_CHAN_ATTR_RO(write_avail); -static ssize_t show_target_cpu(struct vmbus_channel *channel, char *buf) +static ssize_t target_cpu_show(struct vmbus_channel *channel, char *buf) { return sprintf(buf, "%u\n", channel->target_cpu); } -static VMBUS_CHAN_ATTR(cpu, S_IRUGO, show_target_cpu, NULL); +static ssize_t target_cpu_store(struct vmbus_channel *channel, + const char *buf, size_t count) +{ + u32 target_cpu, origin_cpu; + ssize_t ret = count; + + if (vmbus_proto_version < VERSION_WIN10_V4_1) + return -EIO; + + if (sscanf(buf, "%uu", &target_cpu) != 1) + return -EIO; + + /* Validate target_cpu for the cpumask_test_cpu() operation below. */ + if (target_cpu >= nr_cpumask_bits) + return -EINVAL; + + /* No CPUs should come up or down during this. */ + cpus_read_lock(); + + if (!cpumask_test_cpu(target_cpu, cpu_online_mask)) { + cpus_read_unlock(); + return -EINVAL; + } + + /* + * Synchronizes target_cpu_store() and channel closure: + * + * { Initially: state = CHANNEL_OPENED } + * + * CPU1 CPU2 + * + * [target_cpu_store()] [vmbus_disconnect_ring()] + * + * LOCK channel_mutex LOCK channel_mutex + * LOAD r1 = state LOAD r2 = state + * IF (r1 == CHANNEL_OPENED) IF (r2 == CHANNEL_OPENED) + * SEND MODIFYCHANNEL STORE state = CHANNEL_OPEN + * [...] SEND CLOSECHANNEL + * UNLOCK channel_mutex UNLOCK channel_mutex + * + * Forbids: r1 == r2 == CHANNEL_OPENED (i.e., CPU1's LOCK precedes + * CPU2's LOCK) && CPU2's SEND precedes CPU1's SEND + * + * Note. The host processes the channel messages "sequentially", in + * the order in which they are received on a per-partition basis. + */ + mutex_lock(&vmbus_connection.channel_mutex); + + /* + * Hyper-V will ignore MODIFYCHANNEL messages for "non-open" channels; + * avoid sending the message and fail here for such channels. + */ + if (channel->state != CHANNEL_OPENED_STATE) { + ret = -EIO; + goto cpu_store_unlock; + } + + origin_cpu = channel->target_cpu; + if (target_cpu == origin_cpu) + goto cpu_store_unlock; + + if (vmbus_send_modifychannel(channel->offermsg.child_relid, + hv_cpu_number_to_vp_number(target_cpu))) { + ret = -EIO; + goto cpu_store_unlock; + } + + /* + * Warning. At this point, there is *no* guarantee that the host will + * have successfully processed the vmbus_send_modifychannel() request. + * See the header comment of vmbus_send_modifychannel() for more info. + * + * Lags in the processing of the above vmbus_send_modifychannel() can + * result in missed interrupts if the "old" target CPU is taken offline + * before Hyper-V starts sending interrupts to the "new" target CPU. + * But apart from this offlining scenario, the code tolerates such + * lags. It will function correctly even if a channel interrupt comes + * in on a CPU that is different from the channel target_cpu value. + */ + + channel->target_cpu = target_cpu; + channel->target_vp = hv_cpu_number_to_vp_number(target_cpu); + channel->numa_node = cpu_to_node(target_cpu); + + /* See init_vp_index(). */ + if (hv_is_perf_channel(channel)) + hv_update_alloced_cpus(origin_cpu, target_cpu); + + /* Currently set only for storvsc channels. */ + if (channel->change_target_cpu_callback) { + (*channel->change_target_cpu_callback)(channel, + origin_cpu, target_cpu); + } + +cpu_store_unlock: + mutex_unlock(&vmbus_connection.channel_mutex); + cpus_read_unlock(); + return ret; +} +static VMBUS_CHAN_ATTR(cpu, 0644, target_cpu_show, target_cpu_store); static ssize_t channel_pending_show(struct vmbus_channel *channel, char *buf) @@ -1830,7 +1998,7 @@ int vmbus_device_register(struct hv_device *child_device_obj) int ret; dev_set_name(&child_device_obj->device, "%pUl", - child_device_obj->channel->offermsg.offer.if_instance.b); + &child_device_obj->channel->offermsg.offer.if_instance); child_device_obj->device.bus = &hv_bus; child_device_obj->device.parent = &hv_acpi_dev->dev; @@ -2221,9 +2389,12 @@ static int vmbus_bus_suspend(struct device *dev) list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { /* - * Invalidate the field. Upon resume, vmbus_onoffer() will fix - * up the field, and the other fields (if necessary). + * Remove the channel from the array of channels and invalidate + * the channel's relid. Upon resume, vmbus_onoffer() will fix + * up the relid (and other fields, if necessary) and add the + * channel back to the array. */ + vmbus_channel_unmap_relid(channel); channel->offermsg.child_relid = INVALID_RELID; if (is_hvsock_channel(channel)) { @@ -2470,6 +2641,7 @@ static void __exit vmbus_exit(void) hv_debug_rm_all_dir(); vmbus_free_channels(); + kfree(vmbus_connection.channels); if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) { kmsg_dump_unregister(&hv_kmsg_dumper); diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index ca68aa1df801..41f5cf0bb997 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -636,9 +636,12 @@ void netvsc_device_remove(struct hv_device *device) RCU_INIT_POINTER(net_device_ctx->nvdev, NULL); - /* And disassociate NAPI context from device */ - for (i = 0; i < net_device->num_chn; i++) + /* Disable NAPI and disassociate its context from the device. */ + for (i = 0; i < net_device->num_chn; i++) { + /* See also vmbus_reset_channel_cb(). */ + napi_disable(&net_device->chan_table[i].napi); netif_napi_del(&net_device->chan_table[i].napi); + } /* * At this point, no one should be accessing net_device diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index e15022ff63e3..222ff5639ebe 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -1356,11 +1356,11 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) { struct irq_cfg *cfg = irqd_cfg(data); struct hv_pcibus_device *hbus; + struct vmbus_channel *channel; struct hv_pci_dev *hpdev; struct pci_bus *pbus; struct pci_dev *pdev; struct cpumask *dest; - unsigned long flags; struct compose_comp_ctxt comp; struct tran_int_desc *int_desc; struct { @@ -1378,6 +1378,7 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) dest = irq_data_get_effective_affinity_mask(data); pbus = pdev->bus; hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata); + channel = hbus->hdev->channel; hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn)); if (!hpdev) goto return_null_message; @@ -1436,42 +1437,51 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) } /* + * Prevents hv_pci_onchannelcallback() from running concurrently + * in the tasklet. + */ + tasklet_disable(&channel->callback_event); + + /* * Since this function is called with IRQ locks held, can't * do normal wait for completion; instead poll. */ while (!try_wait_for_completion(&comp.comp_pkt.host_event)) { + unsigned long flags; + /* 0xFFFF means an invalid PCI VENDOR ID. */ if (hv_pcifront_get_vendor_id(hpdev) == 0xFFFF) { dev_err_once(&hbus->hdev->device, "the device has gone\n"); - goto free_int_desc; + goto enable_tasklet; } /* - * When the higher level interrupt code calls us with - * interrupt disabled, we must poll the channel by calling - * the channel callback directly when channel->target_cpu is - * the current CPU. When the higher level interrupt code - * calls us with interrupt enabled, let's add the - * local_irq_save()/restore() to avoid race: - * hv_pci_onchannelcallback() can also run in tasklet. + * Make sure that the ring buffer data structure doesn't get + * freed while we dereference the ring buffer pointer. Test + * for the channel's onchannel_callback being NULL within a + * sched_lock critical section. See also the inline comments + * in vmbus_reset_channel_cb(). */ - local_irq_save(flags); - - if (hbus->hdev->channel->target_cpu == smp_processor_id()) - hv_pci_onchannelcallback(hbus); - - local_irq_restore(flags); + spin_lock_irqsave(&channel->sched_lock, flags); + if (unlikely(channel->onchannel_callback == NULL)) { + spin_unlock_irqrestore(&channel->sched_lock, flags); + goto enable_tasklet; + } + hv_pci_onchannelcallback(hbus); + spin_unlock_irqrestore(&channel->sched_lock, flags); if (hpdev->state == hv_pcichild_ejecting) { dev_err_once(&hbus->hdev->device, "the device is being ejected\n"); - goto free_int_desc; + goto enable_tasklet; } udelay(100); } + tasklet_enable(&channel->callback_event); + if (comp.comp_pkt.completion_status < 0) { dev_err(&hbus->hdev->device, "Request for interrupt failed: 0x%x", @@ -1495,6 +1505,8 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) put_pcichild(hpdev); return; +enable_tasklet: + tasklet_enable(&channel->callback_event); free_int_desc: kfree(int_desc); drop_reference: diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index fb41636519ee..072ed8728657 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -621,6 +621,64 @@ get_in_err: } +static void storvsc_change_target_cpu(struct vmbus_channel *channel, u32 old, + u32 new) +{ + struct storvsc_device *stor_device; + struct vmbus_channel *cur_chn; + bool old_is_alloced = false; + struct hv_device *device; + unsigned long flags; + int cpu; + + device = channel->primary_channel ? + channel->primary_channel->device_obj + : channel->device_obj; + stor_device = get_out_stor_device(device); + if (!stor_device) + return; + + /* See storvsc_do_io() -> get_og_chn(). */ + spin_lock_irqsave(&device->channel->lock, flags); + + /* + * Determines if the storvsc device has other channels assigned to + * the "old" CPU to update the alloced_cpus mask and the stor_chns + * array. + */ + if (device->channel != channel && device->channel->target_cpu == old) { + cur_chn = device->channel; + old_is_alloced = true; + goto old_is_alloced; + } + list_for_each_entry(cur_chn, &device->channel->sc_list, sc_list) { + if (cur_chn == channel) + continue; + if (cur_chn->target_cpu == old) { + old_is_alloced = true; + goto old_is_alloced; + } + } + +old_is_alloced: + if (old_is_alloced) + WRITE_ONCE(stor_device->stor_chns[old], cur_chn); + else + cpumask_clear_cpu(old, &stor_device->alloced_cpus); + + /* "Flush" the stor_chns array. */ + for_each_possible_cpu(cpu) { + if (stor_device->stor_chns[cpu] && !cpumask_test_cpu( + cpu, &stor_device->alloced_cpus)) + WRITE_ONCE(stor_device->stor_chns[cpu], NULL); + } + + WRITE_ONCE(stor_device->stor_chns[new], channel); + cpumask_set_cpu(new, &stor_device->alloced_cpus); + + spin_unlock_irqrestore(&device->channel->lock, flags); +} + static void handle_sc_creation(struct vmbus_channel *new_sc) { struct hv_device *device = new_sc->primary_channel->device_obj; @@ -648,6 +706,8 @@ static void handle_sc_creation(struct vmbus_channel *new_sc) return; } + new_sc->change_target_cpu_callback = storvsc_change_target_cpu; + /* Add the sub-channel to the array of available channels. */ stor_device->stor_chns[new_sc->target_cpu] = new_sc; cpumask_set_cpu(new_sc->target_cpu, &stor_device->alloced_cpus); @@ -876,6 +936,8 @@ static int storvsc_channel_init(struct hv_device *device, bool is_fc) if (stor_device->stor_chns == NULL) return -ENOMEM; + device->channel->change_target_cpu_callback = storvsc_change_target_cpu; + stor_device->stor_chns[device->channel->target_cpu] = device->channel; cpumask_set_cpu(device->channel->target_cpu, &stor_device->alloced_cpus); @@ -1248,8 +1310,10 @@ static struct vmbus_channel *get_og_chn(struct storvsc_device *stor_device, const struct cpumask *node_mask; int num_channels, tgt_cpu; - if (stor_device->num_sc == 0) + if (stor_device->num_sc == 0) { + stor_device->stor_chns[q_num] = stor_device->device->channel; return stor_device->device->channel; + } /* * Our channel array is sparsley populated and we @@ -1258,7 +1322,6 @@ static struct vmbus_channel *get_og_chn(struct storvsc_device *stor_device, * The strategy is simple: * I. Ensure NUMA locality * II. Distribute evenly (best effort) - * III. Mapping is persistent. */ node_mask = cpumask_of_node(cpu_to_node(q_num)); @@ -1268,8 +1331,10 @@ static struct vmbus_channel *get_og_chn(struct storvsc_device *stor_device, if (cpumask_test_cpu(tgt_cpu, node_mask)) num_channels++; } - if (num_channels == 0) + if (num_channels == 0) { + stor_device->stor_chns[q_num] = stor_device->device->channel; return stor_device->device->channel; + } hash_qnum = q_num; while (hash_qnum >= num_channels) @@ -1295,6 +1360,7 @@ static int storvsc_do_io(struct hv_device *device, struct storvsc_device *stor_device; struct vstor_packet *vstor_packet; struct vmbus_channel *outgoing_channel, *channel; + unsigned long flags; int ret = 0; const struct cpumask *node_mask; int tgt_cpu; @@ -1308,10 +1374,11 @@ static int storvsc_do_io(struct hv_device *device, request->device = device; /* - * Select an an appropriate channel to send the request out. + * Select an appropriate channel to send the request out. */ - if (stor_device->stor_chns[q_num] != NULL) { - outgoing_channel = stor_device->stor_chns[q_num]; + /* See storvsc_change_target_cpu(). */ + outgoing_channel = READ_ONCE(stor_device->stor_chns[q_num]); + if (outgoing_channel != NULL) { if (outgoing_channel->target_cpu == q_num) { /* * Ideally, we want to pick a different channel if @@ -1324,7 +1391,10 @@ static int storvsc_do_io(struct hv_device *device, continue; if (tgt_cpu == q_num) continue; - channel = stor_device->stor_chns[tgt_cpu]; + channel = READ_ONCE( + stor_device->stor_chns[tgt_cpu]); + if (channel == NULL) + continue; if (hv_get_avail_to_write_percent( &channel->outbound) > ring_avail_percent_lowater) { @@ -1350,7 +1420,10 @@ static int storvsc_do_io(struct hv_device *device, for_each_cpu(tgt_cpu, &stor_device->alloced_cpus) { if (cpumask_test_cpu(tgt_cpu, node_mask)) continue; - channel = stor_device->stor_chns[tgt_cpu]; + channel = READ_ONCE( + stor_device->stor_chns[tgt_cpu]); + if (channel == NULL) + continue; if (hv_get_avail_to_write_percent( &channel->outbound) > ring_avail_percent_lowater) { @@ -1360,7 +1433,14 @@ static int storvsc_do_io(struct hv_device *device, } } } else { + spin_lock_irqsave(&device->channel->lock, flags); + outgoing_channel = stor_device->stor_chns[q_num]; + if (outgoing_channel != NULL) { + spin_unlock_irqrestore(&device->channel->lock, flags); + goto found_channel; + } outgoing_channel = get_og_chn(stor_device, q_num); + spin_unlock_irqrestore(&device->channel->lock, flags); } found_channel: |