diff options
Diffstat (limited to 'drivers/hv/vmbus_drv.c')
-rw-r--r-- | drivers/hv/vmbus_drv.c | 265 |
1 files changed, 265 insertions, 0 deletions
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index ebd35fc35290..391f0b225c9a 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -24,12 +24,14 @@ #include <linux/sched/task_stack.h> #include <asm/mshyperv.h> +#include <linux/delay.h> #include <linux/notifier.h> #include <linux/ptrace.h> #include <linux/screen_info.h> #include <linux/kdebug.h> #include <linux/efi.h> #include <linux/random.h> +#include <linux/syscore_ops.h> #include <clocksource/hyperv_timer.h> #include "hyperv_vmbus.h" @@ -910,6 +912,43 @@ static void vmbus_shutdown(struct device *child_device) drv->shutdown(dev); } +/* + * vmbus_suspend - Suspend a vmbus device via the suspend callback of its driver + */ +static int vmbus_suspend(struct device *child_device) +{ + struct hv_driver *drv; + struct hv_device *dev = device_to_hv_device(child_device); + + /* Nothing to suspend if no driver is bound to the device yet */ + if (!child_device->driver) + return 0; + + drv = drv_to_hv_drv(child_device->driver); + if (!drv->suspend) + return -EOPNOTSUPP; + + return drv->suspend(dev); +} + +/* + * vmbus_resume - Resume a vmbus device via the resume callback of its driver + */ +static int vmbus_resume(struct device *child_device) +{ + struct hv_driver *drv; + struct hv_device *dev = device_to_hv_device(child_device); + + /* Nothing to resume if no driver is bound to the device yet */ + if (!child_device->driver) + return 0; + + drv = drv_to_hv_drv(child_device->driver); + if (!drv->resume) + return -EOPNOTSUPP; + + return drv->resume(dev); +} /* * vmbus_device_release - Final callback release of the vmbus child device @@ -925,6 +964,14 @@ static void vmbus_device_release(struct device *device) kfree(hv_dev); } +/* + * Note: we must use SET_NOIRQ_SYSTEM_SLEEP_PM_OPS rather than + * SET_SYSTEM_SLEEP_PM_OPS: see the comment before vmbus_bus_pm. 
+ */ +static const struct dev_pm_ops vmbus_pm = { + SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(vmbus_suspend, vmbus_resume) +}; + /* The one and only one */ static struct bus_type hv_bus = { .name = "vmbus", @@ -935,6 +982,7 @@ static struct bus_type hv_bus = { .uevent = vmbus_uevent, .dev_groups = vmbus_dev_groups, .drv_groups = vmbus_drv_groups, + .pm = &vmbus_pm, }; struct onmessage_work_context { @@ -1022,6 +1070,41 @@ msg_handled: vmbus_signal_eom(msg, message_type); } +/* + * Fake RESCIND_CHANNEL messages to clean up hv_sock channels by force for + * hibernation, because hv_sock connections cannot persist across hibernation. + */ +static void vmbus_force_channel_rescinded(struct vmbus_channel *channel) +{ + struct onmessage_work_context *ctx; + struct vmbus_channel_rescind_offer *rescind; + + WARN_ON(!is_hvsock_channel(channel)); + + /* + * sizeof(*ctx) is small and the allocation should really not fail, + * otherwise the state of the hv_sock connections ends up in limbo. + */ + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL | __GFP_NOFAIL); + + /* + * So far, these are not really used by Linux. Just set them to + * reasonable values conforming to the definitions of the fields. + */ + ctx->msg.header.message_type = 1; + ctx->msg.header.payload_size = sizeof(*rescind); + + /* These values are actually used by Linux. 
+ */ + rescind = (struct vmbus_channel_rescind_offer *)ctx->msg.u.payload; + rescind->header.msgtype = CHANNELMSG_RESCIND_CHANNELOFFER; + rescind->child_relid = channel->offermsg.child_relid; + + INIT_WORK(&ctx->work, vmbus_onmessage_work); + + queue_work_on(vmbus_connection.connect_cpu, + vmbus_connection.work_queue, + &ctx->work); +} /* * Direct callback for channels using other deferred processing @@ -2042,6 +2125,129 @@ acpi_walk_err: return ret_val; } +static int vmbus_bus_suspend(struct device *dev) +{ + struct vmbus_channel *channel, *sc; + unsigned long flags; + + while (atomic_read(&vmbus_connection.offer_in_progress) != 0) { + /* + * We wait here until the completion of any channel + * offers that are currently in progress. + */ + msleep(1); + } + + mutex_lock(&vmbus_connection.channel_mutex); + list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { + if (!is_hvsock_channel(channel)) + continue; + + vmbus_force_channel_rescinded(channel); + } + mutex_unlock(&vmbus_connection.channel_mutex); + + /* + * Wait until all the sub-channels and hv_sock channels have been + * cleaned up. Sub-channels should be destroyed upon suspend, otherwise + * they would conflict with the new sub-channels that will be created + * in the resume path. hv_sock channels should also be destroyed, but + * a hv_sock channel of an established hv_sock connection cannot be + * really destroyed since it may still be referenced by the userspace + * application, so we just force the hv_sock channel to be rescinded + * by vmbus_force_channel_rescinded(), and the userspace application + * will thoroughly destroy the channel after hibernation. + * + * Note: the counter nr_chan_close_on_suspend may never go above 0 if + * the VM has no sub-channels and no hv_sock channels, e.g. a 1-vCPU VM. 
+ */ + if (atomic_read(&vmbus_connection.nr_chan_close_on_suspend) > 0) + wait_for_completion(&vmbus_connection.ready_for_suspend_event); + + WARN_ON(atomic_read(&vmbus_connection.nr_chan_fixup_on_resume) != 0); + + mutex_lock(&vmbus_connection.channel_mutex); + + list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { + /* + * Invalidate child_relid. Upon resume, vmbus_onoffer() will + * fix up the field, and the other fields (if necessary). + */ + channel->offermsg.child_relid = INVALID_RELID; + + if (is_hvsock_channel(channel)) { + if (!channel->rescind) { + pr_err("hv_sock channel not rescinded!\n"); + WARN_ON_ONCE(1); + } + continue; + } + + spin_lock_irqsave(&channel->lock, flags); + list_for_each_entry(sc, &channel->sc_list, sc_list) { + pr_err("Sub-channel not deleted!\n"); + WARN_ON_ONCE(1); + } + spin_unlock_irqrestore(&channel->lock, flags); + + atomic_inc(&vmbus_connection.nr_chan_fixup_on_resume); + } + + mutex_unlock(&vmbus_connection.channel_mutex); + + vmbus_initiate_unload(false); + + vmbus_connection.conn_state = DISCONNECTED; + + /* Reset the event for the next resume. */ + reinit_completion(&vmbus_connection.ready_for_resume_event); + + return 0; +} + +static int vmbus_bus_resume(struct device *dev) +{ + struct vmbus_channel_msginfo *msginfo; + size_t msgsize; + int ret; + + /* + * We only use the 'vmbus_proto_version', which was in use before + * hibernation, to re-negotiate with the host. 
+ */ + if (vmbus_proto_version == VERSION_INVAL || + vmbus_proto_version == 0) { + pr_err("Invalid proto version = 0x%x\n", vmbus_proto_version); + return -EINVAL; + } + + msgsize = sizeof(*msginfo) + + sizeof(struct vmbus_channel_initiate_contact); + + msginfo = kzalloc(msgsize, GFP_KERNEL); + + if (msginfo == NULL) + return -ENOMEM; + + ret = vmbus_negotiate_version(msginfo, vmbus_proto_version); + + kfree(msginfo); + + if (ret != 0) + return ret; + + WARN_ON(atomic_read(&vmbus_connection.nr_chan_fixup_on_resume) == 0); + + vmbus_request_offers(); + + wait_for_completion(&vmbus_connection.ready_for_resume_event); + + /* Reset the event for the next suspend. */ + reinit_completion(&vmbus_connection.ready_for_suspend_event); + + return 0; +} + static const struct acpi_device_id vmbus_acpi_device_ids[] = { {"VMBUS", 0}, {"VMBus", 0}, @@ -2049,6 +2255,19 @@ static const struct acpi_device_id vmbus_acpi_device_ids[] = { }; MODULE_DEVICE_TABLE(acpi, vmbus_acpi_device_ids); +/* + * Note: we must use SET_NOIRQ_SYSTEM_SLEEP_PM_OPS rather than + * SET_SYSTEM_SLEEP_PM_OPS, otherwise NIC SR-IOV cannot work, because the + * "pci_dev_pm_ops" uses the "noirq" callbacks: in the resume path, the + * pci "noirq" restore callback runs before "non-noirq" callbacks (see + * resume_target_kernel() -> dpm_resume_start(), and hibernation_restore() -> + * dpm_resume_end()). This means vmbus_bus_resume() and the pci-hyperv's + * resume callback must also run via the "noirq" callbacks. 
+ */ +static const struct dev_pm_ops vmbus_bus_pm = { + SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(vmbus_bus_suspend, vmbus_bus_resume) +}; + static struct acpi_driver vmbus_acpi_driver = { .name = "vmbus", .ids = vmbus_acpi_device_ids, @@ -2056,6 +2275,7 @@ static struct acpi_driver vmbus_acpi_driver = { .add = vmbus_acpi_add, .remove = vmbus_acpi_remove, }, + .drv.pm = &vmbus_bus_pm, }; static void hv_kexec_handler(void) @@ -2086,6 +2306,47 @@ static void hv_crash_handler(struct pt_regs *regs) hyperv_cleanup(); }; +static int hv_synic_suspend(void) +{ + /* + * When we reach here, all the non-boot CPUs have been offlined, and + * the stimers on them have been unbound in hv_synic_cleanup() -> + * hv_stimer_cleanup() -> clockevents_unbind_device(). + * + * hv_synic_suspend() only runs on CPU0 with interrupts disabled. Here + * we do not unbind the stimer on CPU0 because: 1) it's unnecessary + * because the interrupts remain disabled between syscore_suspend() + * and syscore_resume(): see create_image() and resume_target_kernel(); + * 2) the stimer on CPU0 is automatically disabled later by + * syscore_suspend() -> timekeeping_suspend() -> tick_suspend() -> ... + * -> clockevents_shutdown() -> ... -> hv_ce_shutdown(); 3) a warning + * would be triggered if we call clockevents_unbind_device(), which + * may sleep, in an interrupts-disabled context. So, we intentionally + * don't call hv_stimer_cleanup(0) here. + */ + + hv_synic_disable_regs(0); + + return 0; +} + +static void hv_synic_resume(void) +{ + hv_synic_enable_regs(0); + + /* + * Note: we don't need to call hv_stimer_init(0), because the timer + * on CPU0 is not unbound in hv_synic_suspend(), and the timer is + * automatically re-enabled in timekeeping_resume(). + */ +} + +/* The callbacks run only on CPU0, with irqs_disabled. 
+ */ +static struct syscore_ops hv_synic_syscore_ops = { + .suspend = hv_synic_suspend, + .resume = hv_synic_resume, +}; + static int __init hv_acpi_init(void) { int ret, t; @@ -2116,6 +2377,8 @@ static int __init hv_acpi_init(void) hv_setup_kexec_handler(hv_kexec_handler); hv_setup_crash_handler(hv_crash_handler); + register_syscore_ops(&hv_synic_syscore_ops); + return 0; cleanup: @@ -2128,6 +2391,8 @@ static void __exit vmbus_exit(void) { int cpu; + unregister_syscore_ops(&hv_synic_syscore_ops); + hv_remove_kexec_handler(); hv_remove_crash_handler(); vmbus_connection.conn_state = DISCONNECTED; |