summaryrefslogtreecommitdiffstats
path: root/drivers/hv
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/hv')
-rw-r--r--drivers/hv/Makefile2
-rw-r--r--drivers/hv/channel.c154
-rw-r--r--drivers/hv/channel_mgmt.c373
-rw-r--r--drivers/hv/connection.c53
-rw-r--r--drivers/hv/hv.c34
-rw-r--r--drivers/hv/hv_balloon.c145
-rw-r--r--drivers/hv/hv_fcopy.c287
-rw-r--r--drivers/hv/hv_kvp.c192
-rw-r--r--drivers/hv/hv_snapshot.c168
-rw-r--r--drivers/hv/hv_util.c13
-rw-r--r--drivers/hv/hv_utils_transport.c276
-rw-r--r--drivers/hv/hv_utils_transport.h51
-rw-r--r--drivers/hv/hyperv_vmbus.h62
-rw-r--r--drivers/hv/vmbus_drv.c153
14 files changed, 1290 insertions, 673 deletions
diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile
index 5e4dfa4cfe22..39c9b2c08d33 100644
--- a/drivers/hv/Makefile
+++ b/drivers/hv/Makefile
@@ -5,4 +5,4 @@ obj-$(CONFIG_HYPERV_BALLOON) += hv_balloon.o
hv_vmbus-y := vmbus_drv.o \
hv.o connection.o channel.o \
channel_mgmt.o ring_buffer.o
-hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_fcopy.o
+hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_fcopy.o hv_utils_transport.o
diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c
index 2978f5ee8d2a..603ce97e9027 100644
--- a/drivers/hv/channel.c
+++ b/drivers/hv/channel.c
@@ -71,7 +71,9 @@ int vmbus_open(struct vmbus_channel *newchannel, u32 send_ringbuffer_size,
struct vmbus_channel_msginfo *open_info = NULL;
void *in, *out;
unsigned long flags;
- int ret, t, err = 0;
+ int ret, err = 0;
+ unsigned long t;
+ struct page *page;
spin_lock_irqsave(&newchannel->lock, flags);
if (newchannel->state == CHANNEL_OPEN_STATE) {
@@ -86,12 +88,22 @@ int vmbus_open(struct vmbus_channel *newchannel, u32 send_ringbuffer_size,
newchannel->channel_callback_context = context;
/* Allocate the ring buffer */
- out = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO,
- get_order(send_ringbuffer_size + recv_ringbuffer_size));
-
- if (!out)
- return -ENOMEM;
-
+ page = alloc_pages_node(cpu_to_node(newchannel->target_cpu),
+ GFP_KERNEL|__GFP_ZERO,
+ get_order(send_ringbuffer_size +
+ recv_ringbuffer_size));
+
+ if (!page)
+ out = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO,
+ get_order(send_ringbuffer_size +
+ recv_ringbuffer_size));
+ else
+ out = (void *)page_address(page);
+
+ if (!out) {
+ err = -ENOMEM;
+ goto error0;
+ }
in = (void *)((unsigned long)out + send_ringbuffer_size);
@@ -135,7 +147,7 @@ int vmbus_open(struct vmbus_channel *newchannel, u32 send_ringbuffer_size,
GFP_KERNEL);
if (!open_info) {
err = -ENOMEM;
- goto error0;
+ goto error_gpadl;
}
init_completion(&open_info->waitevent);
@@ -151,7 +163,7 @@ int vmbus_open(struct vmbus_channel *newchannel, u32 send_ringbuffer_size,
if (userdatalen > MAX_USER_DEFINED_BYTES) {
err = -EINVAL;
- goto error0;
+ goto error_gpadl;
}
if (userdatalen)
@@ -176,29 +188,32 @@ int vmbus_open(struct vmbus_channel *newchannel, u32 send_ringbuffer_size,
goto error1;
}
-
- if (open_info->response.open_result.status)
- err = open_info->response.open_result.status;
-
spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
list_del(&open_info->msglistentry);
spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
- if (err == 0)
- newchannel->state = CHANNEL_OPENED_STATE;
+ if (open_info->response.open_result.status) {
+ err = -EAGAIN;
+ goto error_gpadl;
+ }
+ newchannel->state = CHANNEL_OPENED_STATE;
kfree(open_info);
- return err;
+ return 0;
error1:
spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
list_del(&open_info->msglistentry);
spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
+error_gpadl:
+ vmbus_teardown_gpadl(newchannel, newchannel->ringbuffer_gpadlhandle);
+
error0:
free_pages((unsigned long)out,
get_order(send_ringbuffer_size + recv_ringbuffer_size));
kfree(open_info);
+ newchannel->state = CHANNEL_OPEN_STATE;
return err;
}
EXPORT_SYMBOL_GPL(vmbus_open);
@@ -534,6 +549,12 @@ static int vmbus_close_internal(struct vmbus_channel *channel)
free_pages((unsigned long)channel->ringbuffer_pages,
get_order(channel->ringbuffer_pagecount * PAGE_SIZE));
+ /*
+ * If the channel has been rescinded; process device removal.
+ */
+ if (channel->rescind)
+ hv_process_channel_removal(channel,
+ channel->offermsg.child_relid);
return ret;
}
@@ -569,23 +590,9 @@ void vmbus_close(struct vmbus_channel *channel)
}
EXPORT_SYMBOL_GPL(vmbus_close);
-/**
- * vmbus_sendpacket() - Send the specified buffer on the given channel
- * @channel: Pointer to vmbus_channel structure.
- * @buffer: Pointer to the buffer you want to receive the data into.
- * @bufferlen: Maximum size of what the the buffer will hold
- * @requestid: Identifier of the request
- * @type: Type of packet that is being send e.g. negotiate, time
- * packet etc.
- *
- * Sends data in @buffer directly to hyper-v via the vmbus
- * This will send the data unparsed to hyper-v.
- *
- * Mainly used by Hyper-V drivers.
- */
-int vmbus_sendpacket(struct vmbus_channel *channel, void *buffer,
+int vmbus_sendpacket_ctl(struct vmbus_channel *channel, void *buffer,
u32 bufferlen, u64 requestid,
- enum vmbus_packet_type type, u32 flags)
+ enum vmbus_packet_type type, u32 flags, bool kick_q)
{
struct vmpacket_descriptor desc;
u32 packetlen = sizeof(struct vmpacket_descriptor) + bufferlen;
@@ -613,21 +620,61 @@ int vmbus_sendpacket(struct vmbus_channel *channel, void *buffer,
ret = hv_ringbuffer_write(&channel->outbound, bufferlist, 3, &signal);
- if (ret == 0 && signal)
+ /*
+ * Signalling the host is conditional on many factors:
+ * 1. The ring state changed from being empty to non-empty.
+ * This is tracked by the variable "signal".
+ * 2. The variable kick_q tracks if more data will be placed
+ * on the ring. We will not signal if more data is
+ * to be placed.
+ *
+ * If we cannot write to the ring-buffer; signal the host
+ * even if we may not have written anything. This is a rare
+ * enough condition that it should not matter.
+ */
+ if (((ret == 0) && kick_q && signal) || (ret))
vmbus_setevent(channel);
return ret;
}
+EXPORT_SYMBOL(vmbus_sendpacket_ctl);
+
+/**
+ * vmbus_sendpacket() - Send the specified buffer on the given channel
+ * @channel: Pointer to vmbus_channel structure.
+ * @buffer: Pointer to the buffer you want to receive the data into.
+ * @bufferlen: Maximum size of what the the buffer will hold
+ * @requestid: Identifier of the request
+ * @type: Type of packet that is being send e.g. negotiate, time
+ * packet etc.
+ *
+ * Sends data in @buffer directly to hyper-v via the vmbus
+ * This will send the data unparsed to hyper-v.
+ *
+ * Mainly used by Hyper-V drivers.
+ */
+int vmbus_sendpacket(struct vmbus_channel *channel, void *buffer,
+ u32 bufferlen, u64 requestid,
+ enum vmbus_packet_type type, u32 flags)
+{
+ return vmbus_sendpacket_ctl(channel, buffer, bufferlen, requestid,
+ type, flags, true);
+}
EXPORT_SYMBOL(vmbus_sendpacket);
/*
- * vmbus_sendpacket_pagebuffer - Send a range of single-page buffer
- * packets using a GPADL Direct packet type.
+ * vmbus_sendpacket_pagebuffer_ctl - Send a range of single-page buffer
+ * packets using a GPADL Direct packet type. This interface allows you
+ * to control notifying the host. This will be useful for sending
+ * batched data. Also the sender can control the send flags
+ * explicitly.
*/
-int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel,
+int vmbus_sendpacket_pagebuffer_ctl(struct vmbus_channel *channel,
struct hv_page_buffer pagebuffers[],
u32 pagecount, void *buffer, u32 bufferlen,
- u64 requestid)
+ u64 requestid,
+ u32 flags,
+ bool kick_q)
{
int ret;
int i;
@@ -655,7 +702,7 @@ int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel,
/* Setup the descriptor */
desc.type = VM_PKT_DATA_USING_GPA_DIRECT;
- desc.flags = VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED;
+ desc.flags = flags;
desc.dataoffset8 = descsize >> 3; /* in 8-bytes grandularity */
desc.length8 = (u16)(packetlen_aligned >> 3);
desc.transactionid = requestid;
@@ -676,11 +723,40 @@ int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel,
ret = hv_ringbuffer_write(&channel->outbound, bufferlist, 3, &signal);
- if (ret == 0 && signal)
+ /*
+ * Signalling the host is conditional on many factors:
+ * 1. The ring state changed from being empty to non-empty.
+ * This is tracked by the variable "signal".
+ * 2. The variable kick_q tracks if more data will be placed
+ * on the ring. We will not signal if more data is
+ * to be placed.
+ *
+ * If we cannot write to the ring-buffer; signal the host
+ * even if we may not have written anything. This is a rare
+ * enough condition that it should not matter.
+ */
+ if (((ret == 0) && kick_q && signal) || (ret))
vmbus_setevent(channel);
return ret;
}
+EXPORT_SYMBOL_GPL(vmbus_sendpacket_pagebuffer_ctl);
+
+/*
+ * vmbus_sendpacket_pagebuffer - Send a range of single-page buffer
+ * packets using a GPADL Direct packet type.
+ */
+int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel,
+ struct hv_page_buffer pagebuffers[],
+ u32 pagecount, void *buffer, u32 bufferlen,
+ u64 requestid)
+{
+ u32 flags = VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED;
+ return vmbus_sendpacket_pagebuffer_ctl(channel, pagebuffers, pagecount,
+ buffer, bufferlen, requestid,
+ flags, true);
+
+}
EXPORT_SYMBOL_GPL(vmbus_sendpacket_pagebuffer);
/*
diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index 3736f71bdec5..4506a6623618 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -32,11 +32,8 @@
#include "hyperv_vmbus.h"
-struct vmbus_channel_message_table_entry {
- enum vmbus_channel_message_type message_type;
- void (*message_handler)(struct vmbus_channel_message_header *msg);
-};
-
+static void init_vp_index(struct vmbus_channel *channel,
+ const uuid_le *type_guid);
/**
* vmbus_prep_negotiate_resp() - Create default response for Hyper-V Negotiate message
@@ -139,54 +136,29 @@ EXPORT_SYMBOL_GPL(vmbus_prep_negotiate_resp);
*/
static struct vmbus_channel *alloc_channel(void)
{
+ static atomic_t chan_num = ATOMIC_INIT(0);
struct vmbus_channel *channel;
channel = kzalloc(sizeof(*channel), GFP_ATOMIC);
if (!channel)
return NULL;
+ channel->id = atomic_inc_return(&chan_num);
spin_lock_init(&channel->inbound_lock);
spin_lock_init(&channel->lock);
INIT_LIST_HEAD(&channel->sc_list);
INIT_LIST_HEAD(&channel->percpu_list);
- channel->controlwq = create_workqueue("hv_vmbus_ctl");
- if (!channel->controlwq) {
- kfree(channel);
- return NULL;
- }
-
return channel;
}
/*
- * release_hannel - Release the vmbus channel object itself
- */
-static void release_channel(struct work_struct *work)
-{
- struct vmbus_channel *channel = container_of(work,
- struct vmbus_channel,
- work);
-
- destroy_workqueue(channel->controlwq);
-
- kfree(channel);
-}
-
-/*
* free_channel - Release the resources used by the vmbus channel object
*/
static void free_channel(struct vmbus_channel *channel)
{
-
- /*
- * We have to release the channel's workqueue/thread in the vmbus's
- * workqueue/thread context
- * ie we can't destroy ourselves.
- */
- INIT_WORK(&channel->work, release_channel);
- queue_work(vmbus_connection.work_queue, &channel->work);
+ kfree(channel);
}
static void percpu_channel_enq(void *arg)
@@ -204,33 +176,21 @@ static void percpu_channel_deq(void *arg)
list_del(&channel->percpu_list);
}
-/*
- * vmbus_process_rescind_offer -
- * Rescind the offer by initiating a device removal
- */
-static void vmbus_process_rescind_offer(struct work_struct *work)
+
+void hv_process_channel_removal(struct vmbus_channel *channel, u32 relid)
{
- struct vmbus_channel *channel = container_of(work,
- struct vmbus_channel,
- work);
+ struct vmbus_channel_relid_released msg;
unsigned long flags;
struct vmbus_channel *primary_channel;
- struct vmbus_channel_relid_released msg;
- struct device *dev;
-
- if (channel->device_obj) {
- dev = get_device(&channel->device_obj->device);
- if (dev) {
- vmbus_device_unregister(channel->device_obj);
- put_device(dev);
- }
- }
memset(&msg, 0, sizeof(struct vmbus_channel_relid_released));
- msg.child_relid = channel->offermsg.child_relid;
+ msg.child_relid = relid;
msg.header.msgtype = CHANNELMSG_RELID_RELEASED;
vmbus_post_msg(&msg, sizeof(struct vmbus_channel_relid_released));
+ if (channel == NULL)
+ return;
+
if (channel->target_cpu != get_cpu()) {
put_cpu();
smp_call_function_single(channel->target_cpu,
@@ -248,6 +208,7 @@ static void vmbus_process_rescind_offer(struct work_struct *work)
primary_channel = channel->primary_channel;
spin_lock_irqsave(&primary_channel->lock, flags);
list_del(&channel->sc_list);
+ primary_channel->num_sc--;
spin_unlock_irqrestore(&primary_channel->lock, flags);
}
free_channel(channel);
@@ -255,12 +216,16 @@ static void vmbus_process_rescind_offer(struct work_struct *work)
void vmbus_free_channels(void)
{
- struct vmbus_channel *channel;
+ struct vmbus_channel *channel, *tmp;
+
+ list_for_each_entry_safe(channel, tmp, &vmbus_connection.chn_list,
+ listentry) {
+ /* if we don't set rescind to true, vmbus_close_internal()
+ * won't invoke hv_process_channel_removal().
+ */
+ channel->rescind = true;
- list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
vmbus_device_unregister(channel->device_obj);
- kfree(channel->device_obj);
- free_channel(channel);
}
}
@@ -268,15 +233,10 @@ void vmbus_free_channels(void)
* vmbus_process_offer - Process the offer by creating a channel/device
* associated with this offer
*/
-static void vmbus_process_offer(struct work_struct *work)
+static void vmbus_process_offer(struct vmbus_channel *newchannel)
{
- struct vmbus_channel *newchannel = container_of(work,
- struct vmbus_channel,
- work);
struct vmbus_channel *channel;
bool fnew = true;
- bool enq = false;
- int ret;
unsigned long flags;
/* Make sure this is a new offer */
@@ -292,25 +252,12 @@ static void vmbus_process_offer(struct work_struct *work)
}
}
- if (fnew) {
+ if (fnew)
list_add_tail(&newchannel->listentry,
&vmbus_connection.chn_list);
- enq = true;
- }
spin_unlock_irqrestore(&vmbus_connection.channel_lock, flags);
- if (enq) {
- if (newchannel->target_cpu != get_cpu()) {
- put_cpu();
- smp_call_function_single(newchannel->target_cpu,
- percpu_channel_enq,
- newchannel, true);
- } else {
- percpu_channel_enq(newchannel);
- put_cpu();
- }
- }
if (!fnew) {
/*
* Check to see if this is a sub-channel.
@@ -322,26 +269,22 @@ static void vmbus_process_offer(struct work_struct *work)
newchannel->primary_channel = channel;
spin_lock_irqsave(&channel->lock, flags);
list_add_tail(&newchannel->sc_list, &channel->sc_list);
+ channel->num_sc++;
spin_unlock_irqrestore(&channel->lock, flags);
+ } else
+ goto err_free_chan;
+ }
- if (newchannel->target_cpu != get_cpu()) {
- put_cpu();
- smp_call_function_single(newchannel->target_cpu,
- percpu_channel_enq,
- newchannel, true);
- } else {
- percpu_channel_enq(newchannel);
- put_cpu();
- }
-
- newchannel->state = CHANNEL_OPEN_STATE;
- if (channel->sc_creation_callback != NULL)
- channel->sc_creation_callback(newchannel);
-
- goto done_init_rescind;
- }
+ init_vp_index(newchannel, &newchannel->offermsg.offer.if_type);
- goto err_free_chan;
+ if (newchannel->target_cpu != get_cpu()) {
+ put_cpu();
+ smp_call_function_single(newchannel->target_cpu,
+ percpu_channel_enq,
+ newchannel, true);
+ } else {
+ percpu_channel_enq(newchannel);
+ put_cpu();
}
/*
@@ -351,6 +294,12 @@ static void vmbus_process_offer(struct work_struct *work)
*/
newchannel->state = CHANNEL_OPEN_STATE;
+ if (!fnew) {
+ if (channel->sc_creation_callback != NULL)
+ channel->sc_creation_callback(newchannel);
+ return;
+ }
+
/*
* Start the process of binding this offer to the driver
* We need to set the DeviceObject field before calling
@@ -361,33 +310,35 @@ static void vmbus_process_offer(struct work_struct *work)
&newchannel->offermsg.offer.if_instance,
newchannel);
if (!newchannel->device_obj)
- goto err_free_chan;
+ goto err_deq_chan;
/*
* Add the new device to the bus. This will kick off device-driver
* binding which eventually invokes the device driver's AddDevice()
* method.
*/
- ret = vmbus_device_register(newchannel->device_obj);
- if (ret != 0) {
+ if (vmbus_device_register(newchannel->device_obj) != 0) {
pr_err("unable to add child device object (relid %d)\n",
- newchannel->offermsg.child_relid);
-
- spin_lock_irqsave(&vmbus_connection.channel_lock, flags);
- list_del(&newchannel->listentry);
- spin_unlock_irqrestore(&vmbus_connection.channel_lock, flags);
+ newchannel->offermsg.child_relid);
kfree(newchannel->device_obj);
- goto err_free_chan;
+ goto err_deq_chan;
}
-done_init_rescind:
- spin_lock_irqsave(&newchannel->lock, flags);
- /* The next possible work is rescind handling */
- INIT_WORK(&newchannel->work, vmbus_process_rescind_offer);
- /* Check if rescind offer was already received */
- if (newchannel->rescind)
- queue_work(newchannel->controlwq, &newchannel->work);
- spin_unlock_irqrestore(&newchannel->lock, flags);
return;
+
+err_deq_chan:
+ spin_lock_irqsave(&vmbus_connection.channel_lock, flags);
+ list_del(&newchannel->listentry);
+ spin_unlock_irqrestore(&vmbus_connection.channel_lock, flags);
+
+ if (newchannel->target_cpu != get_cpu()) {
+ put_cpu();
+ smp_call_function_single(newchannel->target_cpu,
+ percpu_channel_deq, newchannel, true);
+ } else {
+ percpu_channel_deq(newchannel);
+ put_cpu();
+ }
+
err_free_chan:
free_channel(newchannel);
}
@@ -411,29 +362,35 @@ static const struct hv_vmbus_device_id hp_devs[] = {
{ HV_SCSI_GUID, },
/* Network */
{ HV_NIC_GUID, },
+ /* NetworkDirect Guest RDMA */
+ { HV_ND_GUID, },
};
/*
* We use this state to statically distribute the channel interrupt load.
*/
-static u32 next_vp;
+static int next_numa_node_id;
/*
* Starting with Win8, we can statically distribute the incoming
- * channel interrupt load by binding a channel to VCPU. We
- * implement here a simple round robin scheme for distributing
- * the interrupt load.
- * We will bind channels that are not performance critical to cpu 0 and
- * performance critical channels (IDE, SCSI and Network) will be uniformly
- * distributed across all available CPUs.
+ * channel interrupt load by binding a channel to VCPU.
+ * We do this in a hierarchical fashion:
+ * First distribute the primary channels across available NUMA nodes
+ * and then distribute the subchannels amongst the CPUs in the NUMA
+ * node assigned to the primary channel.
+ *
+ * For pre-win8 hosts or non-performance critical channels we assign the
+ * first CPU in the first NUMA node.
*/
static void init_vp_index(struct vmbus_channel *channel, const uuid_le *type_guid)
{
u32 cur_cpu;
int i;
bool perf_chn = false;
- u32 max_cpus = num_online_cpus();
+ struct vmbus_channel *primary = channel->primary_channel;
+ int next_node;
+ struct cpumask available_mask;
for (i = IDE; i < MAX_PERF_CHN; i++) {
if (!memcmp(type_guid->b, hp_devs[i].guid,
@@ -450,16 +407,77 @@ static void init_vp_index(struct vmbus_channel *channel, const uuid_le *type_gui
* Also if the channel is not a performance critical
* channel, bind it to cpu 0.
*/
+ channel->numa_node = 0;
+ cpumask_set_cpu(0, &channel->alloced_cpus_in_node);
channel->target_cpu = 0;
- channel->target_vp = 0;
+ channel->target_vp = hv_context.vp_index[0];
return;
}
- cur_cpu = (++next_vp % max_cpus);
+
+ /*
+ * We distribute primary channels evenly across all the available
+ * NUMA nodes and within the assigned NUMA node we will assign the
+ * first available CPU to the primary channel.
+ * The sub-channels will be assigned to the CPUs available in the
+ * NUMA node evenly.
+ */
+ if (!primary) {
+ while (true) {
+ next_node = next_numa_node_id++;
+ if (next_node == nr_node_ids)
+ next_node = next_numa_node_id = 0;
+ if (cpumask_empty(cpumask_of_node(next_node)))
+ continue;
+ break;
+ }
+ channel->numa_node = next_node;
+ primary = channel;
+ }
+
+ if (cpumask_weight(&primary->alloced_cpus_in_node) ==
+ cpumask_weight(cpumask_of_node(primary->numa_node))) {
+ /*
+ * We have cycled through all the CPUs in the node;
+ * reset the alloced map.
+ */
+ cpumask_clear(&primary->alloced_cpus_in_node);
+ }
+
+ cpumask_xor(&available_mask, &primary->alloced_cpus_in_node,
+ cpumask_of_node(primary->numa_node));
+
+ cur_cpu = cpumask_next(-1, &available_mask);
+ cpumask_set_cpu(cur_cpu, &primary->alloced_cpus_in_node);
+
channel->target_cpu = cur_cpu;
channel->target_vp = hv_context.vp_index[cur_cpu];
}
/*
+ * vmbus_unload_response - Handler for the unload response.
+ */
+static void vmbus_unload_response(struct vmbus_channel_message_header *hdr)
+{
+ /*
+ * This is a global event; just wakeup the waiting thread.
+ * Once we successfully unload, we can cleanup the monitor state.
+ */
+ complete(&vmbus_connection.unload_event);
+}
+
+void vmbus_initiate_unload(void)
+{
+ struct vmbus_channel_message_header hdr;
+
+ init_completion(&vmbus_connection.unload_event);
+ memset(&hdr, 0, sizeof(struct vmbus_channel_message_header));
+ hdr.msgtype = CHANNELMSG_UNLOAD;
+ vmbus_post_msg(&hdr, sizeof(struct vmbus_channel_message_header));
+
+ wait_for_completion(&vmbus_connection.unload_event);
+}
+
+/*
* vmbus_onoffer - Handler for channel offers from vmbus in parent partition.
*
*/
@@ -504,15 +522,12 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr)
offer->connection_id;
}
- init_vp_index(newchannel, &offer->offer.if_type);
-
memcpy(&newchannel->offermsg, offer,
sizeof(struct vmbus_channel_offer_channel));
newchannel->monitor_grp = (u8)offer->monitorid / 32;
newchannel->monitor_bit = (u8)offer->monitorid % 32;
- INIT_WORK(&newchannel->work, vmbus_process_offer);
- queue_work(newchannel->controlwq, &newchannel->work);
+ vmbus_process_offer(newchannel);
}
/*
@@ -525,28 +540,34 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr)
struct vmbus_channel_rescind_offer *rescind;
struct vmbus_channel *channel;
unsigned long flags;
+ struct device *dev;
rescind = (struct vmbus_channel_rescind_offer *)hdr;
channel = relid2channel(rescind->child_relid);
- if (channel == NULL)
- /* Just return here, no channel found */
+ if (channel == NULL) {
+ hv_process_channel_removal(NULL, rescind->child_relid);
return;
+ }
spin_lock_irqsave(&channel->lock, flags);
channel->rescind = true;
- /*
- * channel->work.func != vmbus_process_rescind_offer means we are still
- * processing offer request and the rescind offer processing should be
- * postponed. It will be done at the very end of vmbus_process_offer()
- * as rescind flag is being checked there.
- */
- if (channel->work.func == vmbus_process_rescind_offer)
- /* work is initialized for vmbus_process_rescind_offer() from
- * vmbus_process_offer() where the channel got created */
- queue_work(channel->controlwq, &channel->work);
-
spin_unlock_irqrestore(&channel->lock, flags);
+
+ if (channel->device_obj) {
+ /*
+ * We will have to unregister this device from the
+ * driver core.
+ */
+ dev = get_device(&channel->device_obj->device);
+ if (dev) {
+ vmbus_device_unregister(channel->device_obj);
+ put_device(dev);
+ }
+ } else {
+ hv_process_channel_removal(channel,
+ channel->offermsg.child_relid);
+ }
}
/*
@@ -731,25 +752,26 @@ static void vmbus_onversion_response(
}
/* Channel message dispatch table */
-static struct vmbus_channel_message_table_entry
+struct vmbus_channel_message_table_entry
channel_message_table[CHANNELMSG_COUNT] = {
- {CHANNELMSG_INVALID, NULL},
- {CHANNELMSG_OFFERCHANNEL, vmbus_onoffer},
- {CHANNELMSG_RESCIND_CHANNELOFFER, vmbus_onoffer_rescind},
- {CHANNELMSG_REQUESTOFFERS, NULL},
- {CHANNELMSG_ALLOFFERS_DELIVERED, vmbus_onoffers_delivered},
- {CHANNELMSG_OPENCHANNEL, NULL},
- {CHANNELMSG_OPENCHANNEL_RESULT, vmbus_onopen_result},
- {CHANNELMSG_CLOSECHANNEL, NULL},
- {CHANNELMSG_GPADL_HEADER, NULL},
- {CHANNELMSG_GPADL_BODY, NULL},
- {CHANNELMSG_GPADL_CREATED, vmbus_ongpadl_created},
- {CHANNELMSG_GPADL_TEARDOWN, NULL},
- {CHANNELMSG_GPADL_TORNDOWN, vmbus_ongpadl_torndown},
- {CHANNELMSG_RELID_RELEASED, NULL},
- {CHANNELMSG_INITIATE_CONTACT, NULL},
- {CHANNELMSG_VERSION_RESPONSE, vmbus_onversion_response},
- {CHANNELMSG_UNLOAD, NULL},
+ {CHANNELMSG_INVALID, 0, NULL},
+ {CHANNELMSG_OFFERCHANNEL, 0, vmbus_onoffer},
+ {CHANNELMSG_RESCIND_CHANNELOFFER, 0, vmbus_onoffer_rescind},
+ {CHANNELMSG_REQUESTOFFERS, 0, NULL},
+ {CHANNELMSG_ALLOFFERS_DELIVERED, 1, vmbus_onoffers_delivered},
+ {CHANNELMSG_OPENCHANNEL, 0, NULL},
+ {CHANNELMSG_OPENCHANNEL_RESULT, 1, vmbus_onopen_result},
+ {CHANNELMSG_CLOSECHANNEL, 0, NULL},
+ {CHANNELMSG_GPADL_HEADER, 0, NULL},
+ {CHANNELMSG_GPADL_BODY, 0, NULL},
+ {CHANNELMSG_GPADL_CREATED, 1, vmbus_ongpadl_created},
+ {CHANNELMSG_GPADL_TEARDOWN, 0, NULL},
+ {CHANNELMSG_GPADL_TORNDOWN, 1, vmbus_ongpadl_torndown},
+ {CHANNELMSG_RELID_RELEASED, 0, NULL},
+ {CHANNELMSG_INITIATE_CONTACT, 0, NULL},
+ {CHANNELMSG_VERSION_RESPONSE, 1, vmbus_onversion_response},
+ {CHANNELMSG_UNLOAD, 0, NULL},
+ {CHANNELMSG_UNLOAD_RESPONSE, 1, vmbus_unload_response},
};
/*
@@ -787,7 +809,7 @@ int vmbus_request_offers(void)
{
struct vmbus_channel_message_header *msg;
struct vmbus_channel_msginfo *msginfo;
- int ret, t;
+ int ret;
msginfo = kmalloc(sizeof(*msginfo) +
sizeof(struct vmbus_channel_message_header),
@@ -795,8 +817,6 @@ int vmbus_request_offers(void)
if (!msginfo)
return -ENOMEM;
- init_completion(&msginfo->waitevent);
-
msg = (struct vmbus_channel_message_header *)msginfo->msg;
msg->msgtype = CHANNELMSG_REQUESTOFFERS;
@@ -810,14 +830,6 @@ int vmbus_request_offers(void)
goto cleanup;
}
- t = wait_for_completion_timeout(&msginfo->waitevent, 5*HZ);
- if (t == 0) {
- ret = -ETIMEDOUT;
- goto cleanup;
- }
-
-
-
cleanup:
kfree(msginfo);
@@ -826,9 +838,8 @@ cleanup:
/*
* Retrieve the (sub) channel on which to send an outgoing request.
- * When a primary channel has multiple sub-channels, we choose a
- * channel whose VCPU binding is closest to the VCPU on which
- * this call is being made.
+ * When a primary channel has multiple sub-channels, we try to
+ * distribute the load equally amongst all available channels.
*/
struct vmbus_channel *vmbus_get_outgoing_channel(struct vmbus_channel *primary)
{
@@ -836,11 +847,19 @@ struct vmbus_channel *vmbus_get_outgoing_channel(struct vmbus_channel *primary)
int cur_cpu;
struct vmbus_channel *cur_channel;
struct vmbus_channel *outgoing_channel = primary;
- int cpu_distance, new_cpu_distance;
+ int next_channel;
+ int i = 1;
if (list_empty(&primary->sc_list))
return outgoing_channel;
+ next_channel = primary->next_oc++;
+
+ if (next_channel > (primary->num_sc)) {
+ primary->next_oc = 0;
+ return outgoing_channel;
+ }
+
cur_cpu = hv_context.vp_index[get_cpu()];
put_cpu();
list_for_each_safe(cur, tmp, &primary->sc_list) {
@@ -851,18 +870,10 @@ struct vmbus_channel *vmbus_get_outgoing_channel(struct vmbus_channel *primary)
if (cur_channel->target_vp == cur_cpu)
return cur_channel;
- cpu_distance = ((outgoing_channel->target_vp > cur_cpu) ?
- (outgoing_channel->target_vp - cur_cpu) :
- (cur_cpu - outgoing_channel->target_vp));
-
- new_cpu_distance = ((cur_channel->target_vp > cur_cpu) ?
- (cur_channel->target_vp - cur_cpu) :
- (cur_cpu - cur_channel->target_vp));
-
- if (cpu_distance < new_cpu_distance)
- continue;
+ if (i == next_channel)
+ return cur_channel;
- outgoing_channel = cur_channel;
+ i++;
}
return outgoing_channel;
diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
index a63a795300b9..4fc2e8836e60 100644
--- a/drivers/hv/connection.c
+++ b/drivers/hv/connection.c
@@ -58,6 +58,9 @@ static __u32 vmbus_get_next_version(__u32 current_version)
case (VERSION_WIN8_1):
return VERSION_WIN8;
+ case (VERSION_WIN10):
+ return VERSION_WIN8_1;
+
case (VERSION_WS2008):
default:
return VERSION_INVAL;
@@ -80,7 +83,7 @@ static int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo,
msg->interrupt_page = virt_to_phys(vmbus_connection.int_page);
msg->monitor_page1 = virt_to_phys(vmbus_connection.monitor_pages[0]);
msg->monitor_page2 = virt_to_phys(vmbus_connection.monitor_pages[1]);
- if (version == VERSION_WIN8_1) {
+ if (version >= VERSION_WIN8_1) {
msg->target_vcpu = hv_context.vp_index[get_cpu()];
put_cpu();
}
@@ -216,10 +219,26 @@ int vmbus_connect(void)
cleanup:
pr_err("Unable to connect to host\n");
+
vmbus_connection.conn_state = DISCONNECTED;
+ vmbus_disconnect();
+
+ kfree(msginfo);
+
+ return ret;
+}
+
+void vmbus_disconnect(void)
+{
+ /*
+ * First send the unload request to the host.
+ */
+ vmbus_initiate_unload();
- if (vmbus_connection.work_queue)
+ if (vmbus_connection.work_queue) {
+ drain_workqueue(vmbus_connection.work_queue);
destroy_workqueue(vmbus_connection.work_queue);
+ }
if (vmbus_connection.int_page) {
free_pages((unsigned long)vmbus_connection.int_page, 0);
@@ -230,10 +249,6 @@ cleanup:
free_pages((unsigned long)vmbus_connection.monitor_pages[1], 0);
vmbus_connection.monitor_pages[0] = NULL;
vmbus_connection.monitor_pages[1] = NULL;
-
- kfree(msginfo);
-
- return ret;
}
/*
@@ -311,10 +326,8 @@ static void process_chn_event(u32 relid)
*/
channel = pcpu_relid2channel(relid);
- if (!channel) {
- pr_err("channel not found for relid - %u\n", relid);
+ if (!channel)
return;
- }
/*
* A channel once created is persistent even when there
@@ -349,10 +362,7 @@ static void process_chn_event(u32 relid)
else
bytes_to_read = 0;
} while (read_state && (bytes_to_read != 0));
- } else {
- pr_err("no channel callback for relid - %u\n", relid);
}
-
}
/*
@@ -369,8 +379,7 @@ void vmbus_on_event(unsigned long data)
int cpu = smp_processor_id();
union hv_synic_event_flags *event;
- if ((vmbus_proto_version == VERSION_WS2008) ||
- (vmbus_proto_version == VERSION_WIN7)) {
+ if (vmbus_proto_version < VERSION_WIN8) {
maxdword = MAX_NUM_CHANNELS_SUPPORTED >> 5;
recv_int_page = vmbus_connection.recv_int_page;
} else {
@@ -420,6 +429,7 @@ int vmbus_post_msg(void *buffer, size_t buflen)
union hv_connection_id conn_id;
int ret = 0;
int retries = 0;
+ u32 msec = 1;
conn_id.asu32 = 0;
conn_id.u.id = VMBUS_MESSAGE_CONNECTION_ID;
@@ -429,13 +439,20 @@ int vmbus_post_msg(void *buffer, size_t buflen)
* insufficient resources. Retry the operation a couple of
* times before giving up.
*/
- while (retries < 10) {
+ while (retries < 20) {
ret = hv_post_message(conn_id, 1, buffer, buflen);
switch (ret) {
+ case HV_STATUS_INVALID_CONNECTION_ID:
+ /*
+ * We could get this if we send messages too
+ * frequently.
+ */
+ ret = -EAGAIN;
+ break;
+ case HV_STATUS_INSUFFICIENT_MEMORY:
case HV_STATUS_INSUFFICIENT_BUFFERS:
ret = -ENOMEM;
- case -ENOMEM:
break;
case HV_STATUS_SUCCESS:
return ret;
@@ -445,7 +462,9 @@ int vmbus_post_msg(void *buffer, size_t buflen)
}
retries++;
- msleep(100);
+ msleep(msec);
+ if (msec < 2048)
+ msec *= 2;
}
return ret;
}
diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
index 50e51a51ff8b..d3943bceecc3 100644
--- a/drivers/hv/hv.c
+++ b/drivers/hv/hv.c
@@ -312,7 +312,11 @@ static void hv_init_clockevent_device(struct clock_event_device *dev, int cpu)
dev->features = CLOCK_EVT_FEAT_ONESHOT;
dev->cpumask = cpumask_of(cpu);
dev->rating = 1000;
- dev->owner = THIS_MODULE;
+ /*
+ * Avoid settint dev->owner = THIS_MODULE deliberately as doing so will
+ * result in clockevents_config_and_register() taking additional
+ * references to the hv_vmbus module making it impossible to unload.
+ */
dev->set_mode = hv_ce_setmode;
dev->set_next_event = hv_ce_set_next_event;
@@ -470,6 +474,20 @@ void hv_synic_init(void *arg)
}
/*
+ * hv_synic_clockevents_cleanup - Cleanup clockevent devices
+ */
+void hv_synic_clockevents_cleanup(void)
+{
+ int cpu;
+
+ if (!(ms_hyperv.features & HV_X64_MSR_SYNTIMER_AVAILABLE))
+ return;
+
+ for_each_online_cpu(cpu)
+ clockevents_unbind_device(hv_context.clk_evt[cpu], cpu);
+}
+
+/*
* hv_synic_cleanup - Cleanup routine for hv_synic_init().
*/
void hv_synic_cleanup(void *arg)
@@ -477,11 +495,17 @@ void hv_synic_cleanup(void *arg)
union hv_synic_sint shared_sint;
union hv_synic_simp simp;
union hv_synic_siefp siefp;
+ union hv_synic_scontrol sctrl;
int cpu = smp_processor_id();
if (!hv_context.synic_initialized)
return;
+ /* Turn off clockevent device */
+ if (ms_hyperv.features & HV_X64_MSR_SYNTIMER_AVAILABLE)
+ hv_ce_setmode(CLOCK_EVT_MODE_SHUTDOWN,
+ hv_context.clk_evt[cpu]);
+
rdmsrl(HV_X64_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64);
shared_sint.masked = 1;
@@ -502,6 +526,10 @@ void hv_synic_cleanup(void *arg)
wrmsrl(HV_X64_MSR_SIEFP, siefp.as_uint64);
- free_page((unsigned long)hv_context.synic_message_page[cpu]);
- free_page((unsigned long)hv_context.synic_event_page[cpu]);
+ /* Disable the global synic bit */
+ rdmsrl(HV_X64_MSR_SCONTROL, sctrl.as_uint64);
+ sctrl.enable = 0;
+ wrmsrl(HV_X64_MSR_SCONTROL, sctrl.as_uint64);
+
+ hv_synic_free_cpu(cpu);
}
diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
index ff169386b2c7..8a725cd69ad7 100644
--- a/drivers/hv/hv_balloon.c
+++ b/drivers/hv/hv_balloon.c
@@ -428,14 +428,13 @@ struct dm_info_msg {
* currently hot added. We hot add in multiples of 128M
* chunks; it is possible that we may not be able to bring
* online all the pages in the region. The range
- * covered_start_pfn : covered_end_pfn defines the pages that can
+ * covered_end_pfn defines the pages that can
* be brough online.
*/
struct hv_hotadd_state {
struct list_head list;
unsigned long start_pfn;
- unsigned long covered_start_pfn;
unsigned long covered_end_pfn;
unsigned long ha_end_pfn;
unsigned long end_pfn;
@@ -503,6 +502,8 @@ struct hv_dynmem_device {
* Number of pages we have currently ballooned out.
*/
unsigned int num_pages_ballooned;
+ unsigned int num_pages_onlined;
+ unsigned int num_pages_added;
/*
* State to manage the ballooning (up) operation.
@@ -534,7 +535,6 @@ struct hv_dynmem_device {
struct task_struct *thread;
struct mutex ha_region_mutex;
- struct completion waiter_event;
/*
* A list of hot-add regions.
@@ -554,46 +554,34 @@ static struct hv_dynmem_device dm_device;
static void post_status(struct hv_dynmem_device *dm);
#ifdef CONFIG_MEMORY_HOTPLUG
-static void acquire_region_mutex(bool trylock)
-{
- if (trylock) {
- reinit_completion(&dm_device.waiter_event);
- while (!mutex_trylock(&dm_device.ha_region_mutex))
- wait_for_completion(&dm_device.waiter_event);
- } else {
- mutex_lock(&dm_device.ha_region_mutex);
- }
-}
-
-static void release_region_mutex(bool trylock)
-{
- if (trylock) {
- mutex_unlock(&dm_device.ha_region_mutex);
- } else {
- mutex_unlock(&dm_device.ha_region_mutex);
- complete(&dm_device.waiter_event);
- }
-}
-
static int hv_memory_notifier(struct notifier_block *nb, unsigned long val,
void *v)
{
+ struct memory_notify *mem = (struct memory_notify *)v;
+
switch (val) {
case MEM_GOING_ONLINE:
- acquire_region_mutex(true);
+ mutex_lock(&dm_device.ha_region_mutex);
break;
case MEM_ONLINE:
+ dm_device.num_pages_onlined += mem->nr_pages;
case MEM_CANCEL_ONLINE:
- release_region_mutex(true);
+ if (val == MEM_ONLINE ||
+ mutex_is_locked(&dm_device.ha_region_mutex))
+ mutex_unlock(&dm_device.ha_region_mutex);
if (dm_device.ha_waiting) {
dm_device.ha_waiting = false;
complete(&dm_device.ol_waitevent);
}
break;
- case MEM_GOING_OFFLINE:
case MEM_OFFLINE:
+ mutex_lock(&dm_device.ha_region_mutex);
+ dm_device.num_pages_onlined -= mem->nr_pages;
+ mutex_unlock(&dm_device.ha_region_mutex);
+ break;
+ case MEM_GOING_OFFLINE:
case MEM_CANCEL_OFFLINE:
break;
}
@@ -646,7 +634,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size,
init_completion(&dm_device.ol_waitevent);
dm_device.ha_waiting = true;
- release_region_mutex(false);
+ mutex_unlock(&dm_device.ha_region_mutex);
nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn));
ret = add_memory(nid, PFN_PHYS((start_pfn)),
(HA_CHUNK << PAGE_SHIFT));
@@ -665,6 +653,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size,
}
has->ha_end_pfn -= HA_CHUNK;
has->covered_end_pfn -= processed_pfn;
+ mutex_lock(&dm_device.ha_region_mutex);
break;
}
@@ -675,7 +664,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size,
* have not been "onlined" within the allowed time.
*/
wait_for_completion_timeout(&dm_device.ol_waitevent, 5*HZ);
- acquire_region_mutex(false);
+ mutex_lock(&dm_device.ha_region_mutex);
post_status(&dm_device);
}
@@ -691,8 +680,7 @@ static void hv_online_page(struct page *pg)
list_for_each(cur, &dm_device.ha_region_list) {
has = list_entry(cur, struct hv_hotadd_state, list);
- cur_start_pgp = (unsigned long)
- pfn_to_page(has->covered_start_pfn);
+ cur_start_pgp = (unsigned long)pfn_to_page(has->start_pfn);
cur_end_pgp = (unsigned long)pfn_to_page(has->covered_end_pfn);
if (((unsigned long)pg >= cur_start_pgp) &&
@@ -704,7 +692,6 @@ static void hv_online_page(struct page *pg)
__online_page_set_limits(pg);
__online_page_increment_counters(pg);
__online_page_free(pg);
- has->covered_start_pfn++;
}
}
}
@@ -748,10 +735,9 @@ static bool pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt)
* is, update it.
*/
- if (has->covered_end_pfn != start_pfn) {
+ if (has->covered_end_pfn != start_pfn)
has->covered_end_pfn = start_pfn;
- has->covered_start_pfn = start_pfn;
- }
+
return true;
}
@@ -794,9 +780,18 @@ static unsigned long handle_pg_range(unsigned long pg_start,
pgs_ol = has->ha_end_pfn - start_pfn;
if (pgs_ol > pfn_cnt)
pgs_ol = pfn_cnt;
- hv_bring_pgs_online(start_pfn, pgs_ol);
+
+ /*
+ * Check if the corresponding memory block is already
+ * online by checking its last previously backed page.
+ * In case it is we need to bring rest (which was not
+ * backed previously) online too.
+ */
+ if (start_pfn > has->start_pfn &&
+ !PageReserved(pfn_to_page(start_pfn - 1)))
+ hv_bring_pgs_online(start_pfn, pgs_ol);
+
has->covered_end_pfn += pgs_ol;
- has->covered_start_pfn += pgs_ol;
pfn_cnt -= pgs_ol;
}
@@ -857,7 +852,6 @@ static unsigned long process_hot_add(unsigned long pg_start,
list_add_tail(&ha_region->list, &dm_device.ha_region_list);
ha_region->start_pfn = rg_start;
ha_region->ha_end_pfn = rg_start;
- ha_region->covered_start_pfn = pg_start;
ha_region->covered_end_pfn = pg_start;
ha_region->end_pfn = rg_start + rg_size;
}
@@ -886,7 +880,7 @@ static void hot_add_req(struct work_struct *dummy)
resp.hdr.size = sizeof(struct dm_hot_add_response);
#ifdef CONFIG_MEMORY_HOTPLUG
- acquire_region_mutex(false);
+ mutex_lock(&dm_device.ha_region_mutex);
pg_start = dm->ha_wrk.ha_page_range.finfo.start_page;
pfn_cnt = dm->ha_wrk.ha_page_range.finfo.page_cnt;
@@ -918,7 +912,9 @@ static void hot_add_req(struct work_struct *dummy)
if (do_hot_add)
resp.page_count = process_hot_add(pg_start, pfn_cnt,
rg_start, rg_sz);
- release_region_mutex(false);
+
+ dm->num_pages_added += resp.page_count;
+ mutex_unlock(&dm_device.ha_region_mutex);
#endif
/*
* The result field of the response structure has the
@@ -982,8 +978,8 @@ static unsigned long compute_balloon_floor(void)
* 128 72 (1/2)
* 512 168 (1/4)
* 2048 360 (1/8)
- * 8192 768 (1/16)
- * 32768 1536 (1/32)
+ * 8192 744 (1/16)
+ * 32768 1512 (1/32)
*/
if (totalram_pages < MB2PAGES(128))
min_pages = MB2PAGES(8) + (totalram_pages >> 1);
@@ -992,9 +988,9 @@ static unsigned long compute_balloon_floor(void)
else if (totalram_pages < MB2PAGES(2048))
min_pages = MB2PAGES(104) + (totalram_pages >> 3);
else if (totalram_pages < MB2PAGES(8192))
- min_pages = MB2PAGES(256) + (totalram_pages >> 4);
+ min_pages = MB2PAGES(232) + (totalram_pages >> 4);
else
- min_pages = MB2PAGES(512) + (totalram_pages >> 5);
+ min_pages = MB2PAGES(488) + (totalram_pages >> 5);
#undef MB2PAGES
return min_pages;
}
@@ -1031,17 +1027,21 @@ static void post_status(struct hv_dynmem_device *dm)
status.hdr.trans_id = atomic_inc_return(&trans_id);
/*
- * The host expects the guest to report free memory.
- * Further, the host expects the pressure information to
- * include the ballooned out pages.
- * For a given amount of memory that we are managing, we
- * need to compute a floor below which we should not balloon.
- * Compute this and add it to the pressure report.
+ * The host expects the guest to report free and committed memory.
+ * Furthermore, the host expects the pressure information to include
+ * the ballooned out pages. For a given amount of memory that we are
+ * managing we need to compute a floor below which we should not
+ * balloon. Compute this and add it to the pressure report.
+ * We also need to report all offline pages (num_pages_added -
+ * num_pages_onlined) as committed to the host, otherwise it can try
+ * asking us to balloon them out.
*/
status.num_avail = val.freeram;
status.num_committed = vm_memory_committed() +
- dm->num_pages_ballooned +
- compute_balloon_floor();
+ dm->num_pages_ballooned +
+ (dm->num_pages_added > dm->num_pages_onlined ?
+ dm->num_pages_added - dm->num_pages_onlined : 0) +
+ compute_balloon_floor();
/*
* If our transaction ID is no longer current, just don't
@@ -1083,11 +1083,12 @@ static void free_balloon_pages(struct hv_dynmem_device *dm,
-static int alloc_balloon_pages(struct hv_dynmem_device *dm, int num_pages,
- struct dm_balloon_response *bl_resp, int alloc_unit,
- bool *alloc_error)
+static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm,
+ unsigned int num_pages,
+ struct dm_balloon_response *bl_resp,
+ int alloc_unit)
{
- int i = 0;
+ unsigned int i = 0;
struct page *pg;
if (num_pages < alloc_unit)
@@ -1106,11 +1107,8 @@ static int alloc_balloon_pages(struct hv_dynmem_device *dm, int num_pages,
__GFP_NOMEMALLOC | __GFP_NOWARN,
get_order(alloc_unit << PAGE_SHIFT));
- if (!pg) {
- *alloc_error = true;
+ if (!pg)
return i * alloc_unit;
- }
-
dm->num_pages_ballooned += alloc_unit;
@@ -1137,14 +1135,15 @@ static int alloc_balloon_pages(struct hv_dynmem_device *dm, int num_pages,
static void balloon_up(struct work_struct *dummy)
{
- int num_pages = dm_device.balloon_wrk.num_pages;
- int num_ballooned = 0;
+ unsigned int num_pages = dm_device.balloon_wrk.num_pages;
+ unsigned int num_ballooned = 0;
struct dm_balloon_response *bl_resp;
int alloc_unit;
int ret;
- bool alloc_error;
bool done = false;
int i;
+ struct sysinfo val;
+ unsigned long floor;
/* The host balloons pages in 2M granularity. */
WARN_ON_ONCE(num_pages % PAGES_IN_2M != 0);
@@ -1155,6 +1154,15 @@ static void balloon_up(struct work_struct *dummy)
*/
alloc_unit = 512;
+ si_meminfo(&val);
+ floor = compute_balloon_floor();
+
+ /* Refuse to balloon below the floor, keep the 2M granularity. */
+ if (val.freeram < num_pages || val.freeram - num_pages < floor) {
+ num_pages = val.freeram > floor ? (val.freeram - floor) : 0;
+ num_pages -= num_pages % PAGES_IN_2M;
+ }
+
while (!done) {
bl_resp = (struct dm_balloon_response *)send_buffer;
memset(send_buffer, 0, PAGE_SIZE);
@@ -1164,18 +1172,15 @@ static void balloon_up(struct work_struct *dummy)
num_pages -= num_ballooned;
- alloc_error = false;
num_ballooned = alloc_balloon_pages(&dm_device, num_pages,
- bl_resp, alloc_unit,
- &alloc_error);
+ bl_resp, alloc_unit);
if (alloc_unit != 1 && num_ballooned == 0) {
alloc_unit = 1;
continue;
}
- if ((alloc_unit == 1 && alloc_error) ||
- (num_ballooned == num_pages)) {
+ if (num_ballooned == 0 || num_ballooned == num_pages) {
bl_resp->more_pages = 0;
done = true;
dm_device.state = DM_INITIALIZED;
@@ -1414,7 +1419,8 @@ static void balloon_onchannelcallback(void *context)
static int balloon_probe(struct hv_device *dev,
const struct hv_vmbus_device_id *dev_id)
{
- int ret, t;
+ int ret;
+ unsigned long t;
struct dm_version_request version_req;
struct dm_capabilities cap_msg;
@@ -1439,7 +1445,6 @@ static int balloon_probe(struct hv_device *dev,
dm_device.next_version = DYNMEM_PROTOCOL_VERSION_WIN7;
init_completion(&dm_device.host_event);
init_completion(&dm_device.config_event);
- init_completion(&dm_device.waiter_event);
INIT_LIST_HEAD(&dm_device.ha_region_list);
mutex_init(&dm_device.ha_region_mutex);
INIT_WORK(&dm_device.balloon_wrk.wrk, balloon_up);
diff --git a/drivers/hv/hv_fcopy.c b/drivers/hv/hv_fcopy.c
index cd453e4b2a07..b50dd330cf31 100644
--- a/drivers/hv/hv_fcopy.c
+++ b/drivers/hv/hv_fcopy.c
@@ -19,17 +19,13 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/semaphore.h>
-#include <linux/fs.h>
#include <linux/nls.h>
#include <linux/workqueue.h>
-#include <linux/cdev.h>
#include <linux/hyperv.h>
#include <linux/sched.h>
-#include <linux/uaccess.h>
-#include <linux/miscdevice.h>
#include "hyperv_vmbus.h"
+#include "hv_utils_transport.h"
#define WIN8_SRV_MAJOR 1
#define WIN8_SRV_MINOR 1
@@ -47,39 +43,31 @@
* ensure this by serializing packet processing in this driver - we do not
* read additional packets from the VMBUs until the current packet is fully
* handled.
- *
- * The transaction "active" state is set when we receive a request from the
- * host and we cleanup this state when the transaction is completed - when we
- * respond to the host with our response. When the transaction active state is
- * set, we defer handling incoming packets.
*/
static struct {
- bool active; /* transaction status - active or not */
+ int state; /* hvutil_device_state */
int recv_len; /* number of bytes received. */
struct hv_fcopy_hdr *fcopy_msg; /* current message */
- struct hv_start_fcopy message; /* sent to daemon */
struct vmbus_channel *recv_channel; /* chn we got the request */
u64 recv_req_id; /* request ID. */
void *fcopy_context; /* for the channel callback */
- struct semaphore read_sema;
} fcopy_transaction;
-static bool opened; /* currently device opened */
-
-/*
- * Before we can accept copy messages from the host, we need
- * to handshake with the user level daemon. This state tracks
- * if we are in the handshake phase.
- */
-static bool in_hand_shake = true;
-static void fcopy_send_data(void);
static void fcopy_respond_to_host(int error);
-static void fcopy_work_func(struct work_struct *dummy);
-static DECLARE_DELAYED_WORK(fcopy_work, fcopy_work_func);
+static void fcopy_send_data(struct work_struct *dummy);
+static void fcopy_timeout_func(struct work_struct *dummy);
+static DECLARE_DELAYED_WORK(fcopy_timeout_work, fcopy_timeout_func);
+static DECLARE_WORK(fcopy_send_work, fcopy_send_data);
+static const char fcopy_devname[] = "vmbus/hv_fcopy";
static u8 *recv_buffer;
+static struct hvutil_transport *hvt;
+/*
+ * This state maintains the version number registered by the daemon.
+ */
+static int dm_reg_value;
-static void fcopy_work_func(struct work_struct *dummy)
+static void fcopy_timeout_func(struct work_struct *dummy)
{
/*
* If the timer fires, the user-mode component has not responded;
@@ -87,23 +75,28 @@ static void fcopy_work_func(struct work_struct *dummy)
*/
fcopy_respond_to_host(HV_E_FAIL);
- /* In the case the user-space daemon crashes, hangs or is killed, we
- * need to down the semaphore, otherwise, after the daemon starts next
- * time, the obsolete data in fcopy_transaction.message or
- * fcopy_transaction.fcopy_msg will be used immediately.
- *
- * NOTE: fcopy_read() happens to get the semaphore (very rare)? We're
- * still OK, because we've reported the failure to the host.
- */
- if (down_trylock(&fcopy_transaction.read_sema))
- ;
+ /* Transaction is finished, reset the state. */
+ if (fcopy_transaction.state > HVUTIL_READY)
+ fcopy_transaction.state = HVUTIL_READY;
+ hv_poll_channel(fcopy_transaction.fcopy_context,
+ hv_fcopy_onchannelcallback);
}
static int fcopy_handle_handshake(u32 version)
{
+ u32 our_ver = FCOPY_CURRENT_VERSION;
+
switch (version) {
- case FCOPY_CURRENT_VERSION:
+ case FCOPY_VERSION_0:
+ /* Daemon doesn't expect us to reply */
+ dm_reg_value = version;
+ break;
+ case FCOPY_VERSION_1:
+ /* Daemon expects us to reply with our own version */
+ if (hvutil_transport_send(hvt, &our_ver, sizeof(our_ver)))
+ return -EFAULT;
+ dm_reg_value = version;
break;
default:
/*
@@ -114,20 +107,20 @@ static int fcopy_handle_handshake(u32 version)
*/
return -EINVAL;
}
- pr_info("FCP: user-mode registering done. Daemon version: %d\n",
- version);
- fcopy_transaction.active = false;
- if (fcopy_transaction.fcopy_context)
- hv_fcopy_onchannelcallback(fcopy_transaction.fcopy_context);
- in_hand_shake = false;
+ pr_debug("FCP: userspace daemon ver. %d registered\n", version);
+ fcopy_transaction.state = HVUTIL_READY;
+ hv_poll_channel(fcopy_transaction.fcopy_context,
+ hv_fcopy_onchannelcallback);
return 0;
}
-static void fcopy_send_data(void)
+static void fcopy_send_data(struct work_struct *dummy)
{
- struct hv_start_fcopy *smsg_out = &fcopy_transaction.message;
+ struct hv_start_fcopy smsg_out;
int operation = fcopy_transaction.fcopy_msg->operation;
struct hv_start_fcopy *smsg_in;
+ void *out_src;
+ int rc, out_len;
/*
* The strings sent from the host are encoded in
@@ -142,26 +135,39 @@ static void fcopy_send_data(void)
switch (operation) {
case START_FILE_COPY:
- memset(smsg_out, 0, sizeof(struct hv_start_fcopy));
- smsg_out->hdr.operation = operation;
+ out_len = sizeof(struct hv_start_fcopy);
+ memset(&smsg_out, 0, out_len);
+ smsg_out.hdr.operation = operation;
smsg_in = (struct hv_start_fcopy *)fcopy_transaction.fcopy_msg;
utf16s_to_utf8s((wchar_t *)smsg_in->file_name, W_MAX_PATH,
UTF16_LITTLE_ENDIAN,
- (__u8 *)smsg_out->file_name, W_MAX_PATH - 1);
+ (__u8 *)&smsg_out.file_name, W_MAX_PATH - 1);
utf16s_to_utf8s((wchar_t *)smsg_in->path_name, W_MAX_PATH,
UTF16_LITTLE_ENDIAN,
- (__u8 *)smsg_out->path_name, W_MAX_PATH - 1);
+ (__u8 *)&smsg_out.path_name, W_MAX_PATH - 1);
- smsg_out->copy_flags = smsg_in->copy_flags;
- smsg_out->file_size = smsg_in->file_size;
+ smsg_out.copy_flags = smsg_in->copy_flags;
+ smsg_out.file_size = smsg_in->file_size;
+ out_src = &smsg_out;
break;
default:
+ out_src = fcopy_transaction.fcopy_msg;
+ out_len = fcopy_transaction.recv_len;
break;
}
- up(&fcopy_transaction.read_sema);
+
+ fcopy_transaction.state = HVUTIL_USERSPACE_REQ;
+ rc = hvutil_transport_send(hvt, out_src, out_len);
+ if (rc) {
+ pr_debug("FCP: failed to communicate to the daemon: %d\n", rc);
+ if (cancel_delayed_work_sync(&fcopy_timeout_work)) {
+ fcopy_respond_to_host(HV_E_FAIL);
+ fcopy_transaction.state = HVUTIL_READY;
+ }
+ }
return;
}
@@ -189,8 +195,6 @@ fcopy_respond_to_host(int error)
channel = fcopy_transaction.recv_channel;
req_id = fcopy_transaction.recv_req_id;
- fcopy_transaction.active = false;
-
icmsghdr = (struct icmsg_hdr *)
&recv_buffer[sizeof(struct vmbuspipe_hdr)];
@@ -218,7 +222,7 @@ void hv_fcopy_onchannelcallback(void *context)
int util_fw_version;
int fcopy_srv_version;
- if (fcopy_transaction.active) {
+ if (fcopy_transaction.state > HVUTIL_READY) {
/*
* We will defer processing this callback once
* the current transaction is complete.
@@ -226,6 +230,7 @@ void hv_fcopy_onchannelcallback(void *context)
fcopy_transaction.fcopy_context = context;
return;
}
+ fcopy_transaction.fcopy_context = NULL;
vmbus_recvpacket(channel, recv_buffer, PAGE_SIZE * 2, &recvlen,
&requestid);
@@ -249,17 +254,23 @@ void hv_fcopy_onchannelcallback(void *context)
* transaction; note transactions are serialized.
*/
- fcopy_transaction.active = true;
fcopy_transaction.recv_len = recvlen;
fcopy_transaction.recv_channel = channel;
fcopy_transaction.recv_req_id = requestid;
fcopy_transaction.fcopy_msg = fcopy_msg;
+ if (fcopy_transaction.state < HVUTIL_READY) {
+ /* Userspace is not registered yet */
+ fcopy_respond_to_host(HV_E_FAIL);
+ return;
+ }
+ fcopy_transaction.state = HVUTIL_HOSTMSG_RECEIVED;
+
/*
* Send the information to the user-level daemon.
*/
- schedule_delayed_work(&fcopy_work, 5*HZ);
- fcopy_send_data();
+ schedule_work(&fcopy_send_work);
+ schedule_delayed_work(&fcopy_timeout_work, 5*HZ);
return;
}
icmsghdr->icflags = ICMSGHDRFLAG_TRANSACTION | ICMSGHDRFLAG_RESPONSE;
@@ -267,155 +278,44 @@ void hv_fcopy_onchannelcallback(void *context)
VM_PKT_DATA_INBAND, 0);
}
-/*
- * Create a char device that can support read/write for passing
- * the payload.
- */
-
-static ssize_t fcopy_read(struct file *file, char __user *buf,
- size_t count, loff_t *ppos)
-{
- void *src;
- size_t copy_size;
- int operation;
-
- /*
- * Wait until there is something to be read.
- */
- if (down_interruptible(&fcopy_transaction.read_sema))
- return -EINTR;
-
- /*
- * The channel may be rescinded and in this case, we will wakeup the
- * the thread blocked on the semaphore and we will use the opened
- * state to correctly handle this case.
- */
- if (!opened)
- return -ENODEV;
-
- operation = fcopy_transaction.fcopy_msg->operation;
-
- if (operation == START_FILE_COPY) {
- src = &fcopy_transaction.message;
- copy_size = sizeof(struct hv_start_fcopy);
- if (count < copy_size)
- return 0;
- } else {
- src = fcopy_transaction.fcopy_msg;
- copy_size = sizeof(struct hv_do_fcopy);
- if (count < copy_size)
- return 0;
- }
- if (copy_to_user(buf, src, copy_size))
- return -EFAULT;
-
- return copy_size;
-}
-
-static ssize_t fcopy_write(struct file *file, const char __user *buf,
- size_t count, loff_t *ppos)
+/* Callback when data is received from userspace */
+static int fcopy_on_msg(void *msg, int len)
{
- int response = 0;
+ int *val = (int *)msg;
- if (count != sizeof(int))
+ if (len != sizeof(int))
return -EINVAL;
- if (copy_from_user(&response, buf, sizeof(int)))
- return -EFAULT;
+ if (fcopy_transaction.state == HVUTIL_DEVICE_INIT)
+ return fcopy_handle_handshake(*val);
- if (in_hand_shake) {
- if (fcopy_handle_handshake(response))
- return -EINVAL;
- return sizeof(int);
- }
+ if (fcopy_transaction.state != HVUTIL_USERSPACE_REQ)
+ return -EINVAL;
/*
* Complete the transaction by forwarding the result
* to the host. But first, cancel the timeout.
*/
- if (cancel_delayed_work_sync(&fcopy_work))
- fcopy_respond_to_host(response);
-
- return sizeof(int);
-}
-
-static int fcopy_open(struct inode *inode, struct file *f)
-{
- /*
- * The user level daemon that will open this device is
- * really an extension of this driver. We can have only
- * active open at a time.
- */
- if (opened)
- return -EBUSY;
+ if (cancel_delayed_work_sync(&fcopy_timeout_work)) {
+ fcopy_transaction.state = HVUTIL_USERSPACE_RECV;
+ fcopy_respond_to_host(*val);
+ fcopy_transaction.state = HVUTIL_READY;
+ hv_poll_channel(fcopy_transaction.fcopy_context,
+ hv_fcopy_onchannelcallback);
+ }
- /*
- * The daemon is alive; setup the state.
- */
- opened = true;
return 0;
}
-/* XXX: there are still some tricky corner cases, e.g.,
- * 1) In a SMP guest, when fcopy_release() runs between
- * schedule_delayed_work() and fcopy_send_data(), there is
- * still a chance an obsolete message will be queued.
- *
- * 2) When the fcopy daemon is running, if we unload the driver,
- * we'll notice a kernel oops when we kill the daemon later.
- */
-static int fcopy_release(struct inode *inode, struct file *f)
+static void fcopy_on_reset(void)
{
/*
* The daemon has exited; reset the state.
*/
- in_hand_shake = true;
- opened = false;
+ fcopy_transaction.state = HVUTIL_DEVICE_INIT;
- if (cancel_delayed_work_sync(&fcopy_work)) {
- /* We haven't up()-ed the semaphore(very rare)? */
- if (down_trylock(&fcopy_transaction.read_sema))
- ;
+ if (cancel_delayed_work_sync(&fcopy_timeout_work))
fcopy_respond_to_host(HV_E_FAIL);
- }
- return 0;
-}
-
-
-static const struct file_operations fcopy_fops = {
- .read = fcopy_read,
- .write = fcopy_write,
- .release = fcopy_release,
- .open = fcopy_open,
-};
-
-static struct miscdevice fcopy_misc = {
- .minor = MISC_DYNAMIC_MINOR,
- .name = "vmbus/hv_fcopy",
- .fops = &fcopy_fops,
-};
-
-static int fcopy_dev_init(void)
-{
- return misc_register(&fcopy_misc);
-}
-
-static void fcopy_dev_deinit(void)
-{
-
- /*
- * The device is going away - perhaps because the
- * host has rescinded the channel. Setup state so that
- * user level daemon can gracefully exit if it is blocked
- * on the read semaphore.
- */
- opened = false;
- /*
- * Signal the semaphore as the device is
- * going away.
- */
- up(&fcopy_transaction.read_sema);
- misc_deregister(&fcopy_misc);
}
int hv_fcopy_init(struct hv_util_service *srv)
@@ -428,14 +328,19 @@ int hv_fcopy_init(struct hv_util_service *srv)
* Defer processing channel callbacks until the daemon
* has registered.
*/
- fcopy_transaction.active = true;
- sema_init(&fcopy_transaction.read_sema, 0);
+ fcopy_transaction.state = HVUTIL_DEVICE_INIT;
+
+ hvt = hvutil_transport_init(fcopy_devname, 0, 0,
+ fcopy_on_msg, fcopy_on_reset);
+ if (!hvt)
+ return -EFAULT;
- return fcopy_dev_init();
+ return 0;
}
void hv_fcopy_deinit(void)
{
- cancel_delayed_work_sync(&fcopy_work);
- fcopy_dev_deinit();
+ fcopy_transaction.state = HVUTIL_DEVICE_DYING;
+ cancel_delayed_work_sync(&fcopy_timeout_work);
+ hvutil_transport_destroy(hvt);
}
diff --git a/drivers/hv/hv_kvp.c b/drivers/hv/hv_kvp.c
index beb8105c0e7b..d85798d5992c 100644
--- a/drivers/hv/hv_kvp.c
+++ b/drivers/hv/hv_kvp.c
@@ -28,6 +28,8 @@
#include <linux/workqueue.h>
#include <linux/hyperv.h>
+#include "hyperv_vmbus.h"
+#include "hv_utils_transport.h"
/*
* Pre win8 version numbers used in ws2008 and ws 2008 r2 (win7)
@@ -45,16 +47,21 @@
#define WIN8_SRV_VERSION (WIN8_SRV_MAJOR << 16 | WIN8_SRV_MINOR)
/*
- * Global state maintained for transaction that is being processed.
- * Note that only one transaction can be active at any point in time.
+ * Global state maintained for transaction that is being processed. For a class
+ * of integration services, including the "KVP service", the specified protocol
+ * is a "request/response" protocol which means that there can only be single
+ * outstanding transaction from the host at any given point in time. We use
+ * this to simplify memory management in this driver - we cache and process
+ * only one message at a time.
*
- * This state is set when we receive a request from the host; we
- * cleanup this state when the transaction is completed - when we respond
- * to the host with the key value.
+ * While the request/response protocol is guaranteed by the host, we further
+ * ensure this by serializing packet processing in this driver - we do not
+ * read additional packets from the VMBUs until the current packet is fully
+ * handled.
*/
static struct {
- bool active; /* transaction status - active or not */
+ int state; /* hvutil_device_state */
int recv_len; /* number of bytes received. */
struct hv_kvp_msg *kvp_msg; /* current message */
struct vmbus_channel *recv_channel; /* chn we got the request */
@@ -63,13 +70,6 @@ static struct {
} kvp_transaction;
/*
- * Before we can accept KVP messages from the host, we need
- * to handshake with the user level daemon. This state tracks
- * if we are in the handshake phase.
- */
-static bool in_hand_shake = true;
-
-/*
* This state maintains the version number registered by the daemon.
*/
static int dm_reg_value;
@@ -78,15 +78,15 @@ static void kvp_send_key(struct work_struct *dummy);
static void kvp_respond_to_host(struct hv_kvp_msg *msg, int error);
-static void kvp_work_func(struct work_struct *dummy);
+static void kvp_timeout_func(struct work_struct *dummy);
static void kvp_register(int);
-static DECLARE_DELAYED_WORK(kvp_work, kvp_work_func);
+static DECLARE_DELAYED_WORK(kvp_timeout_work, kvp_timeout_func);
static DECLARE_WORK(kvp_sendkey_work, kvp_send_key);
-static struct cb_id kvp_id = { CN_KVP_IDX, CN_KVP_VAL };
-static const char kvp_name[] = "kvp_kernel_module";
+static const char kvp_devname[] = "vmbus/hv_kvp";
static u8 *recv_buffer;
+static struct hvutil_transport *hvt;
/*
* Register the kernel component with the user-level daemon.
* As part of this registration, pass the LIC version number.
@@ -98,50 +98,39 @@ static void
kvp_register(int reg_value)
{
- struct cn_msg *msg;
struct hv_kvp_msg *kvp_msg;
char *version;
- msg = kzalloc(sizeof(*msg) + sizeof(struct hv_kvp_msg), GFP_ATOMIC);
+ kvp_msg = kzalloc(sizeof(*kvp_msg), GFP_KERNEL);
- if (msg) {
- kvp_msg = (struct hv_kvp_msg *)msg->data;
+ if (kvp_msg) {
version = kvp_msg->body.kvp_register.version;
- msg->id.idx = CN_KVP_IDX;
- msg->id.val = CN_KVP_VAL;
-
kvp_msg->kvp_hdr.operation = reg_value;
strcpy(version, HV_DRV_VERSION);
- msg->len = sizeof(struct hv_kvp_msg);
- cn_netlink_send(msg, 0, 0, GFP_ATOMIC);
- kfree(msg);
+
+ hvutil_transport_send(hvt, kvp_msg, sizeof(*kvp_msg));
+ kfree(kvp_msg);
}
}
-static void
-kvp_work_func(struct work_struct *dummy)
+
+static void kvp_timeout_func(struct work_struct *dummy)
{
/*
* If the timer fires, the user-mode component has not responded;
* process the pending transaction.
*/
kvp_respond_to_host(NULL, HV_E_FAIL);
-}
-static void poll_channel(struct vmbus_channel *channel)
-{
- if (channel->target_cpu != smp_processor_id())
- smp_call_function_single(channel->target_cpu,
- hv_kvp_onchannelcallback,
- channel, true);
- else
- hv_kvp_onchannelcallback(channel);
-}
+ /* Transaction is finished, reset the state. */
+ if (kvp_transaction.state > HVUTIL_READY)
+ kvp_transaction.state = HVUTIL_READY;
+ hv_poll_channel(kvp_transaction.kvp_context,
+ hv_kvp_onchannelcallback);
+}
static int kvp_handle_handshake(struct hv_kvp_msg *msg)
{
- int ret = 1;
-
switch (msg->kvp_hdr.operation) {
case KVP_OP_REGISTER:
dm_reg_value = KVP_OP_REGISTER;
@@ -155,20 +144,18 @@ static int kvp_handle_handshake(struct hv_kvp_msg *msg)
pr_info("KVP: incompatible daemon\n");
pr_info("KVP: KVP version: %d, Daemon version: %d\n",
KVP_OP_REGISTER1, msg->kvp_hdr.operation);
- ret = 0;
+ return -EINVAL;
}
- if (ret) {
- /*
- * We have a compatible daemon; complete the handshake.
- */
- pr_info("KVP: user-mode registering done.\n");
- kvp_register(dm_reg_value);
- kvp_transaction.active = false;
- if (kvp_transaction.kvp_context)
- poll_channel(kvp_transaction.kvp_context);
- }
- return ret;
+ /*
+ * We have a compatible daemon; complete the handshake.
+ */
+ pr_debug("KVP: userspace daemon ver. %d registered\n",
+ KVP_OP_REGISTER);
+ kvp_register(dm_reg_value);
+ kvp_transaction.state = HVUTIL_READY;
+
+ return 0;
}
@@ -176,26 +163,30 @@ static int kvp_handle_handshake(struct hv_kvp_msg *msg)
* Callback when data is received from user mode.
*/
-static void
-kvp_cn_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp)
+static int kvp_on_msg(void *msg, int len)
{
- struct hv_kvp_msg *message;
+ struct hv_kvp_msg *message = (struct hv_kvp_msg *)msg;
struct hv_kvp_msg_enumerate *data;
int error = 0;
- message = (struct hv_kvp_msg *)msg->data;
+ if (len < sizeof(*message))
+ return -EINVAL;
/*
* If we are negotiating the version information
* with the daemon; handle that first.
*/
- if (in_hand_shake) {
- if (kvp_handle_handshake(message))
- in_hand_shake = false;
- return;
+ if (kvp_transaction.state < HVUTIL_READY) {
+ return kvp_handle_handshake(message);
}
+ /* We didn't send anything to userspace so the reply is spurious */
+ if (kvp_transaction.state < HVUTIL_USERSPACE_REQ)
+ return -EINVAL;
+
+ kvp_transaction.state = HVUTIL_USERSPACE_RECV;
+
/*
* Based on the version of the daemon, we propagate errors from the
* daemon differently.
@@ -225,8 +216,14 @@ kvp_cn_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp)
* Complete the transaction by forwarding the key value
* to the host. But first, cancel the timeout.
*/
- if (cancel_delayed_work_sync(&kvp_work))
+ if (cancel_delayed_work_sync(&kvp_timeout_work)) {
kvp_respond_to_host(message, error);
+ kvp_transaction.state = HVUTIL_READY;
+ hv_poll_channel(kvp_transaction.kvp_context,
+ hv_kvp_onchannelcallback);
+ }
+
+ return 0;
}
@@ -343,7 +340,6 @@ static void process_ib_ipinfo(void *in_msg, void *out_msg, int op)
static void
kvp_send_key(struct work_struct *dummy)
{
- struct cn_msg *msg;
struct hv_kvp_msg *message;
struct hv_kvp_msg *in_msg;
__u8 operation = kvp_transaction.kvp_msg->kvp_hdr.operation;
@@ -352,14 +348,11 @@ kvp_send_key(struct work_struct *dummy)
__u64 val64;
int rc;
- msg = kzalloc(sizeof(*msg) + sizeof(struct hv_kvp_msg) , GFP_ATOMIC);
- if (!msg)
+ /* The transaction state is wrong. */
+ if (kvp_transaction.state != HVUTIL_HOSTMSG_RECEIVED)
return;
- msg->id.idx = CN_KVP_IDX;
- msg->id.val = CN_KVP_VAL;
-
- message = (struct hv_kvp_msg *)msg->data;
+ message = kzalloc(sizeof(*message), GFP_KERNEL);
message->kvp_hdr.operation = operation;
message->kvp_hdr.pool = pool;
in_msg = kvp_transaction.kvp_msg;
@@ -446,15 +439,17 @@ kvp_send_key(struct work_struct *dummy)
break;
}
- msg->len = sizeof(struct hv_kvp_msg);
- rc = cn_netlink_send(msg, 0, 0, GFP_ATOMIC);
+ kvp_transaction.state = HVUTIL_USERSPACE_REQ;
+ rc = hvutil_transport_send(hvt, message, sizeof(*message));
if (rc) {
pr_debug("KVP: failed to communicate to the daemon: %d\n", rc);
- if (cancel_delayed_work_sync(&kvp_work))
+ if (cancel_delayed_work_sync(&kvp_timeout_work)) {
kvp_respond_to_host(message, HV_E_FAIL);
+ kvp_transaction.state = HVUTIL_READY;
+ }
}
- kfree(msg);
+ kfree(message);
return;
}
@@ -479,17 +474,6 @@ kvp_respond_to_host(struct hv_kvp_msg *msg_to_host, int error)
int ret;
/*
- * If a transaction is not active; log and return.
- */
-
- if (!kvp_transaction.active) {
- /*
- * This is a spurious call!
- */
- pr_warn("KVP: Transaction not active\n");
- return;
- }
- /*
* Copy the global state for completing the transaction. Note that
* only one transaction can be active at a time.
*/
@@ -498,8 +482,6 @@ kvp_respond_to_host(struct hv_kvp_msg *msg_to_host, int error)
channel = kvp_transaction.recv_channel;
req_id = kvp_transaction.recv_req_id;
- kvp_transaction.active = false;
-
icmsghdrp = (struct icmsg_hdr *)
&recv_buffer[sizeof(struct vmbuspipe_hdr)];
@@ -586,7 +568,6 @@ response_done:
vmbus_sendpacket(channel, recv_buffer, buf_len, req_id,
VM_PKT_DATA_INBAND, 0);
- poll_channel(channel);
}
/*
@@ -612,7 +593,7 @@ void hv_kvp_onchannelcallback(void *context)
int util_fw_version;
int kvp_srv_version;
- if (kvp_transaction.active) {
+ if (kvp_transaction.state > HVUTIL_READY) {
/*
* We will defer processing this callback once
* the current transaction is complete.
@@ -620,6 +601,7 @@ void hv_kvp_onchannelcallback(void *context)
kvp_transaction.kvp_context = context;
return;
}
+ kvp_transaction.kvp_context = NULL;
vmbus_recvpacket(channel, recv_buffer, PAGE_SIZE * 4, &recvlen,
&requestid);
@@ -664,9 +646,15 @@ void hv_kvp_onchannelcallback(void *context)
kvp_transaction.recv_len = recvlen;
kvp_transaction.recv_channel = channel;
kvp_transaction.recv_req_id = requestid;
- kvp_transaction.active = true;
kvp_transaction.kvp_msg = kvp_msg;
+ if (kvp_transaction.state < HVUTIL_READY) {
+ /* Userspace is not registered yet */
+ kvp_respond_to_host(NULL, HV_E_FAIL);
+ return;
+ }
+ kvp_transaction.state = HVUTIL_HOSTMSG_RECEIVED;
+
/*
* Get the information from the
* user-mode component.
@@ -677,7 +665,7 @@ void hv_kvp_onchannelcallback(void *context)
* user-mode not responding.
*/
schedule_work(&kvp_sendkey_work);
- schedule_delayed_work(&kvp_work, 5*HZ);
+ schedule_delayed_work(&kvp_timeout_work, 5*HZ);
return;
@@ -693,14 +681,16 @@ void hv_kvp_onchannelcallback(void *context)
}
+static void kvp_on_reset(void)
+{
+ if (cancel_delayed_work_sync(&kvp_timeout_work))
+ kvp_respond_to_host(NULL, HV_E_FAIL);
+ kvp_transaction.state = HVUTIL_DEVICE_INIT;
+}
+
int
hv_kvp_init(struct hv_util_service *srv)
{
- int err;
-
- err = cn_add_callback(&kvp_id, kvp_name, kvp_cn_callback);
- if (err)
- return err;
recv_buffer = srv->recv_buffer;
/*
@@ -709,14 +699,20 @@ hv_kvp_init(struct hv_util_service *srv)
* Defer processing channel callbacks until the daemon
* has registered.
*/
- kvp_transaction.active = true;
+ kvp_transaction.state = HVUTIL_DEVICE_INIT;
+
+ hvt = hvutil_transport_init(kvp_devname, CN_KVP_IDX, CN_KVP_VAL,
+ kvp_on_msg, kvp_on_reset);
+ if (!hvt)
+ return -EFAULT;
return 0;
}
void hv_kvp_deinit(void)
{
- cn_del_callback(&kvp_id);
- cancel_delayed_work_sync(&kvp_work);
+ kvp_transaction.state = HVUTIL_DEVICE_DYING;
+ cancel_delayed_work_sync(&kvp_timeout_work);
cancel_work_sync(&kvp_sendkey_work);
+ hvutil_transport_destroy(hvt);
}
diff --git a/drivers/hv/hv_snapshot.c b/drivers/hv/hv_snapshot.c
index 9d5e0d1efdb5..815405f2e777 100644
--- a/drivers/hv/hv_snapshot.c
+++ b/drivers/hv/hv_snapshot.c
@@ -24,6 +24,9 @@
#include <linux/workqueue.h>
#include <linux/hyperv.h>
+#include "hyperv_vmbus.h"
+#include "hv_utils_transport.h"
+
#define VSS_MAJOR 5
#define VSS_MINOR 0
#define VSS_VERSION (VSS_MAJOR << 16 | VSS_MINOR)
@@ -31,28 +34,39 @@
#define VSS_USERSPACE_TIMEOUT (msecs_to_jiffies(10 * 1000))
/*
- * Global state maintained for transaction that is being processed.
- * Note that only one transaction can be active at any point in time.
+ * Global state maintained for transaction that is being processed. For a class
+ * of integration services, including the "VSS service", the specified protocol
+ * is a "request/response" protocol which means that there can only be single
+ * outstanding transaction from the host at any given point in time. We use
+ * this to simplify memory management in this driver - we cache and process
+ * only one message at a time.
*
- * This state is set when we receive a request from the host; we
- * cleanup this state when the transaction is completed - when we respond
- * to the host with the key value.
+ * While the request/response protocol is guaranteed by the host, we further
+ * ensure this by serializing packet processing in this driver - we do not
+ * read additional packets from the VMBUs until the current packet is fully
+ * handled.
*/
static struct {
- bool active; /* transaction status - active or not */
+ int state; /* hvutil_device_state */
int recv_len; /* number of bytes received. */
struct vmbus_channel *recv_channel; /* chn we got the request */
u64 recv_req_id; /* request ID. */
struct hv_vss_msg *msg; /* current message */
+ void *vss_context; /* for the channel callback */
} vss_transaction;
static void vss_respond_to_host(int error);
-static struct cb_id vss_id = { CN_VSS_IDX, CN_VSS_VAL };
-static const char vss_name[] = "vss_kernel_module";
+/*
+ * This state maintains the version number registered by the daemon.
+ */
+static int dm_reg_value;
+
+static const char vss_devname[] = "vmbus/hv_vss";
static __u8 *recv_buffer;
+static struct hvutil_transport *hvt;
static void vss_send_op(struct work_struct *dummy);
static void vss_timeout_func(struct work_struct *dummy);
@@ -71,25 +85,69 @@ static void vss_timeout_func(struct work_struct *dummy)
*/
pr_warn("VSS: timeout waiting for daemon to reply\n");
vss_respond_to_host(HV_E_FAIL);
+
+ /* Transaction is finished, reset the state. */
+ if (vss_transaction.state > HVUTIL_READY)
+ vss_transaction.state = HVUTIL_READY;
+
+ hv_poll_channel(vss_transaction.vss_context,
+ hv_vss_onchannelcallback);
}
-static void
-vss_cn_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp)
+static int vss_handle_handshake(struct hv_vss_msg *vss_msg)
{
- struct hv_vss_msg *vss_msg;
+ u32 our_ver = VSS_OP_REGISTER1;
+
+ switch (vss_msg->vss_hdr.operation) {
+ case VSS_OP_REGISTER:
+ /* Daemon doesn't expect us to reply */
+ dm_reg_value = VSS_OP_REGISTER;
+ break;
+ case VSS_OP_REGISTER1:
+ /* Daemon expects us to reply with our own version*/
+ if (hvutil_transport_send(hvt, &our_ver, sizeof(our_ver)))
+ return -EFAULT;
+ dm_reg_value = VSS_OP_REGISTER1;
+ break;
+ default:
+ return -EINVAL;
+ }
+ vss_transaction.state = HVUTIL_READY;
+ pr_debug("VSS: userspace daemon ver. %d registered\n", dm_reg_value);
+ return 0;
+}
- vss_msg = (struct hv_vss_msg *)msg->data;
+static int vss_on_msg(void *msg, int len)
+{
+ struct hv_vss_msg *vss_msg = (struct hv_vss_msg *)msg;
- if (vss_msg->vss_hdr.operation == VSS_OP_REGISTER) {
- pr_info("VSS daemon registered\n");
- vss_transaction.active = false;
- if (vss_transaction.recv_channel != NULL)
- hv_vss_onchannelcallback(vss_transaction.recv_channel);
- return;
+ if (len != sizeof(*vss_msg))
+ return -EINVAL;
+ if (vss_msg->vss_hdr.operation == VSS_OP_REGISTER ||
+ vss_msg->vss_hdr.operation == VSS_OP_REGISTER1) {
+ /*
+ * Don't process registration messages if we're in the middle
+ * of a transaction processing.
+ */
+ if (vss_transaction.state > HVUTIL_READY)
+ return -EINVAL;
+ return vss_handle_handshake(vss_msg);
+ } else if (vss_transaction.state == HVUTIL_USERSPACE_REQ) {
+ vss_transaction.state = HVUTIL_USERSPACE_RECV;
+ if (cancel_delayed_work_sync(&vss_timeout_work)) {
+ vss_respond_to_host(vss_msg->error);
+ /* Transaction is finished, reset the state. */
+ vss_transaction.state = HVUTIL_READY;
+ hv_poll_channel(vss_transaction.vss_context,
+ hv_vss_onchannelcallback);
+ }
+ } else {
+ /* This is a spurious call! */
+ pr_warn("VSS: Transaction not active\n");
+ return -EINVAL;
}
- if (cancel_delayed_work_sync(&vss_timeout_work))
- vss_respond_to_host(vss_msg->error);
+ return 0;
}
@@ -97,28 +155,29 @@ static void vss_send_op(struct work_struct *dummy)
{
int op = vss_transaction.msg->vss_hdr.operation;
int rc;
- struct cn_msg *msg;
struct hv_vss_msg *vss_msg;
- msg = kzalloc(sizeof(*msg) + sizeof(*vss_msg), GFP_ATOMIC);
- if (!msg)
+ /* The transaction state is wrong. */
+ if (vss_transaction.state != HVUTIL_HOSTMSG_RECEIVED)
return;
- vss_msg = (struct hv_vss_msg *)msg->data;
-
- msg->id.idx = CN_VSS_IDX;
- msg->id.val = CN_VSS_VAL;
+ vss_msg = kzalloc(sizeof(*vss_msg), GFP_KERNEL);
+ if (!vss_msg)
+ return;
vss_msg->vss_hdr.operation = op;
- msg->len = sizeof(struct hv_vss_msg);
- rc = cn_netlink_send(msg, 0, 0, GFP_ATOMIC);
+ vss_transaction.state = HVUTIL_USERSPACE_REQ;
+ rc = hvutil_transport_send(hvt, vss_msg, sizeof(*vss_msg));
if (rc) {
pr_warn("VSS: failed to communicate to the daemon: %d\n", rc);
- if (cancel_delayed_work_sync(&vss_timeout_work))
+ if (cancel_delayed_work_sync(&vss_timeout_work)) {
vss_respond_to_host(HV_E_FAIL);
+ vss_transaction.state = HVUTIL_READY;
+ }
}
- kfree(msg);
+
+ kfree(vss_msg);
return;
}
@@ -136,17 +195,6 @@ vss_respond_to_host(int error)
u64 req_id;
/*
- * If a transaction is not active; log and return.
- */
-
- if (!vss_transaction.active) {
- /*
- * This is a spurious call!
- */
- pr_warn("VSS: Transaction not active\n");
- return;
- }
- /*
* Copy the global state for completing the transaction. Note that
* only one transaction can be active at a time.
*/
@@ -154,7 +202,6 @@ vss_respond_to_host(int error)
buf_len = vss_transaction.recv_len;
channel = vss_transaction.recv_channel;
req_id = vss_transaction.recv_req_id;
- vss_transaction.active = false;
icmsghdrp = (struct icmsg_hdr *)
&recv_buffer[sizeof(struct vmbuspipe_hdr)];
@@ -191,14 +238,15 @@ void hv_vss_onchannelcallback(void *context)
struct icmsg_hdr *icmsghdrp;
struct icmsg_negotiate *negop = NULL;
- if (vss_transaction.active) {
+ if (vss_transaction.state > HVUTIL_READY) {
/*
* We will defer processing this callback once
* the current transaction is complete.
*/
- vss_transaction.recv_channel = channel;
+ vss_transaction.vss_context = context;
return;
}
+ vss_transaction.vss_context = NULL;
vmbus_recvpacket(channel, recv_buffer, PAGE_SIZE * 2, &recvlen,
&requestid);
@@ -224,7 +272,6 @@ void hv_vss_onchannelcallback(void *context)
vss_transaction.recv_len = recvlen;
vss_transaction.recv_channel = channel;
vss_transaction.recv_req_id = requestid;
- vss_transaction.active = true;
vss_transaction.msg = (struct hv_vss_msg *)vss_msg;
switch (vss_msg->vss_hdr.operation) {
@@ -241,6 +288,12 @@ void hv_vss_onchannelcallback(void *context)
*/
case VSS_OP_FREEZE:
case VSS_OP_THAW:
+ if (vss_transaction.state < HVUTIL_READY) {
+ /* Userspace is not registered yet */
+ vss_respond_to_host(HV_E_FAIL);
+ return;
+ }
+ vss_transaction.state = HVUTIL_HOSTMSG_RECEIVED;
schedule_work(&vss_send_op_work);
schedule_delayed_work(&vss_timeout_work,
VSS_USERSPACE_TIMEOUT);
@@ -275,14 +328,16 @@ void hv_vss_onchannelcallback(void *context)
}
+static void vss_on_reset(void)
+{
+ if (cancel_delayed_work_sync(&vss_timeout_work))
+ vss_respond_to_host(HV_E_FAIL);
+ vss_transaction.state = HVUTIL_DEVICE_INIT;
+}
+
int
hv_vss_init(struct hv_util_service *srv)
{
- int err;
-
- err = cn_add_callback(&vss_id, vss_name, vss_cn_callback);
- if (err)
- return err;
recv_buffer = srv->recv_buffer;
/*
@@ -291,13 +346,20 @@ hv_vss_init(struct hv_util_service *srv)
* Defer processing channel callbacks until the daemon
* has registered.
*/
- vss_transaction.active = true;
+ vss_transaction.state = HVUTIL_DEVICE_INIT;
+
+ hvt = hvutil_transport_init(vss_devname, CN_VSS_IDX, CN_VSS_VAL,
+ vss_on_msg, vss_on_reset);
+ if (!hvt)
+ return -EFAULT;
+
return 0;
}
void hv_vss_deinit(void)
{
- cn_del_callback(&vss_id);
+ vss_transaction.state = HVUTIL_DEVICE_DYING;
cancel_delayed_work_sync(&vss_timeout_work);
cancel_work_sync(&vss_send_op_work);
+ hvutil_transport_destroy(hvt);
}
diff --git a/drivers/hv/hv_util.c b/drivers/hv/hv_util.c
index 3b9c9ef0deb8..7994ec2e4151 100644
--- a/drivers/hv/hv_util.c
+++ b/drivers/hv/hv_util.c
@@ -340,12 +340,8 @@ static int util_probe(struct hv_device *dev,
set_channel_read_state(dev->channel, false);
- ret = vmbus_open(dev->channel, 4 * PAGE_SIZE, 4 * PAGE_SIZE, NULL, 0,
- srv->util_cb, dev->channel);
- if (ret)
- goto error;
-
hv_set_drvdata(dev, srv);
+
/*
* Based on the host; initialize the framework and
* service version numbers we will negotiate.
@@ -365,6 +361,11 @@ static int util_probe(struct hv_device *dev,
hb_srv_version = HB_VERSION;
}
+ ret = vmbus_open(dev->channel, 4 * PAGE_SIZE, 4 * PAGE_SIZE, NULL, 0,
+ srv->util_cb, dev->channel);
+ if (ret)
+ goto error;
+
return 0;
error:
@@ -379,9 +380,9 @@ static int util_remove(struct hv_device *dev)
{
struct hv_util_service *srv = hv_get_drvdata(dev);
- vmbus_close(dev->channel);
if (srv->util_deinit)
srv->util_deinit();
+ vmbus_close(dev->channel);
kfree(srv->recv_buffer);
return 0;
diff --git a/drivers/hv/hv_utils_transport.c b/drivers/hv/hv_utils_transport.c
new file mode 100644
index 000000000000..ea7ba5ef16a9
--- /dev/null
+++ b/drivers/hv/hv_utils_transport.c
@@ -0,0 +1,276 @@
+/*
+ * Kernel/userspace transport abstraction for Hyper-V util driver.
+ *
+ * Copyright (C) 2015, Vitaly Kuznetsov <vkuznets@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+
+#include "hyperv_vmbus.h"
+#include "hv_utils_transport.h"
+
+static DEFINE_SPINLOCK(hvt_list_lock);
+static struct list_head hvt_list = LIST_HEAD_INIT(hvt_list);
+
+static void hvt_reset(struct hvutil_transport *hvt)
+{
+ mutex_lock(&hvt->outmsg_lock);
+ kfree(hvt->outmsg);
+ hvt->outmsg = NULL;
+ hvt->outmsg_len = 0;
+ mutex_unlock(&hvt->outmsg_lock);
+ if (hvt->on_reset)
+ hvt->on_reset();
+}
+
+static ssize_t hvt_op_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct hvutil_transport *hvt;
+ int ret;
+
+ hvt = container_of(file->f_op, struct hvutil_transport, fops);
+
+ if (wait_event_interruptible(hvt->outmsg_q, hvt->outmsg_len > 0))
+ return -EINTR;
+
+ mutex_lock(&hvt->outmsg_lock);
+ if (!hvt->outmsg) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
+
+ if (count < hvt->outmsg_len) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ if (!copy_to_user(buf, hvt->outmsg, hvt->outmsg_len))
+ ret = hvt->outmsg_len;
+ else
+ ret = -EFAULT;
+
+ kfree(hvt->outmsg);
+ hvt->outmsg = NULL;
+ hvt->outmsg_len = 0;
+
+out_unlock:
+ mutex_unlock(&hvt->outmsg_lock);
+ return ret;
+}
+
+static ssize_t hvt_op_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct hvutil_transport *hvt;
+ u8 *inmsg;
+
+ hvt = container_of(file->f_op, struct hvutil_transport, fops);
+
+ inmsg = kzalloc(count, GFP_KERNEL);
+ if (copy_from_user(inmsg, buf, count)) {
+ kfree(inmsg);
+ return -EFAULT;
+ }
+ if (hvt->on_msg(inmsg, count))
+ return -EFAULT;
+ kfree(inmsg);
+
+ return count;
+}
+
+static unsigned int hvt_op_poll(struct file *file, poll_table *wait)
+{
+ struct hvutil_transport *hvt;
+
+ hvt = container_of(file->f_op, struct hvutil_transport, fops);
+
+ poll_wait(file, &hvt->outmsg_q, wait);
+ if (hvt->outmsg_len > 0)
+ return POLLIN | POLLRDNORM;
+
+ return 0;
+}
+
+static int hvt_op_open(struct inode *inode, struct file *file)
+{
+ struct hvutil_transport *hvt;
+
+ hvt = container_of(file->f_op, struct hvutil_transport, fops);
+
+ /*
+ * Switching to CHARDEV mode. We switch bach to INIT when device
+ * gets released.
+ */
+ if (hvt->mode == HVUTIL_TRANSPORT_INIT)
+ hvt->mode = HVUTIL_TRANSPORT_CHARDEV;
+ else if (hvt->mode == HVUTIL_TRANSPORT_NETLINK) {
+ /*
+ * We're switching from netlink communication to using char
+ * device. Issue the reset first.
+ */
+ hvt_reset(hvt);
+ hvt->mode = HVUTIL_TRANSPORT_CHARDEV;
+ } else
+ return -EBUSY;
+
+ return 0;
+}
+
+static int hvt_op_release(struct inode *inode, struct file *file)
+{
+ struct hvutil_transport *hvt;
+
+ hvt = container_of(file->f_op, struct hvutil_transport, fops);
+
+ hvt->mode = HVUTIL_TRANSPORT_INIT;
+ /*
+ * Cleanup message buffers to avoid spurious messages when the daemon
+ * connects back.
+ */
+ hvt_reset(hvt);
+
+ return 0;
+}
+
+static void hvt_cn_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp)
+{
+ struct hvutil_transport *hvt, *hvt_found = NULL;
+
+ spin_lock(&hvt_list_lock);
+ list_for_each_entry(hvt, &hvt_list, list) {
+ if (hvt->cn_id.idx == msg->id.idx &&
+ hvt->cn_id.val == msg->id.val) {
+ hvt_found = hvt;
+ break;
+ }
+ }
+ spin_unlock(&hvt_list_lock);
+ if (!hvt_found) {
+ pr_warn("hvt_cn_callback: spurious message received!\n");
+ return;
+ }
+
+ /*
+ * Switching to NETLINK mode. Switching to CHARDEV happens when someone
+ * opens the device.
+ */
+ if (hvt->mode == HVUTIL_TRANSPORT_INIT)
+ hvt->mode = HVUTIL_TRANSPORT_NETLINK;
+
+ if (hvt->mode == HVUTIL_TRANSPORT_NETLINK)
+ hvt_found->on_msg(msg->data, msg->len);
+ else
+ pr_warn("hvt_cn_callback: unexpected netlink message!\n");
+}
+
+int hvutil_transport_send(struct hvutil_transport *hvt, void *msg, int len)
+{
+ struct cn_msg *cn_msg;
+ int ret = 0;
+
+ if (hvt->mode == HVUTIL_TRANSPORT_INIT) {
+ return -EINVAL;
+ } else if (hvt->mode == HVUTIL_TRANSPORT_NETLINK) {
+ cn_msg = kzalloc(sizeof(*cn_msg) + len, GFP_ATOMIC);
+ if (!msg)
+ return -ENOMEM;
+ cn_msg->id.idx = hvt->cn_id.idx;
+ cn_msg->id.val = hvt->cn_id.val;
+ cn_msg->len = len;
+ memcpy(cn_msg->data, msg, len);
+ ret = cn_netlink_send(cn_msg, 0, 0, GFP_ATOMIC);
+ kfree(cn_msg);
+ return ret;
+ }
+ /* HVUTIL_TRANSPORT_CHARDEV */
+ mutex_lock(&hvt->outmsg_lock);
+ if (hvt->outmsg) {
+ /* Previous message wasn't received */
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+ hvt->outmsg = kzalloc(len, GFP_KERNEL);
+ memcpy(hvt->outmsg, msg, len);
+ hvt->outmsg_len = len;
+ wake_up_interruptible(&hvt->outmsg_q);
+out_unlock:
+ mutex_unlock(&hvt->outmsg_lock);
+ return ret;
+}
+
+struct hvutil_transport *hvutil_transport_init(const char *name,
+ u32 cn_idx, u32 cn_val,
+ int (*on_msg)(void *, int),
+ void (*on_reset)(void))
+{
+ struct hvutil_transport *hvt;
+
+ hvt = kzalloc(sizeof(*hvt), GFP_KERNEL);
+ if (!hvt)
+ return NULL;
+
+ hvt->cn_id.idx = cn_idx;
+ hvt->cn_id.val = cn_val;
+
+ hvt->mdev.minor = MISC_DYNAMIC_MINOR;
+ hvt->mdev.name = name;
+
+ hvt->fops.owner = THIS_MODULE;
+ hvt->fops.read = hvt_op_read;
+ hvt->fops.write = hvt_op_write;
+ hvt->fops.poll = hvt_op_poll;
+ hvt->fops.open = hvt_op_open;
+ hvt->fops.release = hvt_op_release;
+
+ hvt->mdev.fops = &hvt->fops;
+
+ init_waitqueue_head(&hvt->outmsg_q);
+ mutex_init(&hvt->outmsg_lock);
+
+ spin_lock(&hvt_list_lock);
+ list_add(&hvt->list, &hvt_list);
+ spin_unlock(&hvt_list_lock);
+
+ hvt->on_msg = on_msg;
+ hvt->on_reset = on_reset;
+
+ if (misc_register(&hvt->mdev))
+ goto err_free_hvt;
+
+ /* Use cn_id.idx/cn_id.val to determine if we need to setup netlink */
+ if (hvt->cn_id.idx > 0 && hvt->cn_id.val > 0 &&
+ cn_add_callback(&hvt->cn_id, name, hvt_cn_callback))
+ goto err_free_hvt;
+
+ return hvt;
+
+err_free_hvt:
+ kfree(hvt);
+ return NULL;
+}
+
+void hvutil_transport_destroy(struct hvutil_transport *hvt)
+{
+ spin_lock(&hvt_list_lock);
+ list_del(&hvt->list);
+ spin_unlock(&hvt_list_lock);
+ if (hvt->cn_id.idx > 0 && hvt->cn_id.val > 0)
+ cn_del_callback(&hvt->cn_id);
+ misc_deregister(&hvt->mdev);
+ kfree(hvt->outmsg);
+ kfree(hvt);
+}
diff --git a/drivers/hv/hv_utils_transport.h b/drivers/hv/hv_utils_transport.h
new file mode 100644
index 000000000000..314c76ce1b07
--- /dev/null
+++ b/drivers/hv/hv_utils_transport.h
@@ -0,0 +1,51 @@
+/*
+ * Kernel/userspace transport abstraction for Hyper-V util driver.
+ *
+ * Copyright (C) 2015, Vitaly Kuznetsov <vkuznets@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ */
+
+#ifndef _HV_UTILS_TRANSPORT_H
+#define _HV_UTILS_TRANSPORT_H
+
+#include <linux/connector.h>
+#include <linux/miscdevice.h>
+
+enum hvutil_transport_mode {
+ HVUTIL_TRANSPORT_INIT = 0,
+ HVUTIL_TRANSPORT_NETLINK,
+ HVUTIL_TRANSPORT_CHARDEV,
+};
+
+struct hvutil_transport {
+ int mode; /* hvutil_transport_mode */
+ struct file_operations fops; /* file operations */
+ struct miscdevice mdev; /* misc device */
+ struct cb_id cn_id; /* CN_*_IDX/CN_*_VAL */
+ struct list_head list; /* hvt_list */
+ int (*on_msg)(void *, int); /* callback on new user message */
+ void (*on_reset)(void); /* callback when userspace drops */
+ u8 *outmsg; /* message to the userspace */
+ int outmsg_len; /* its length */
+ wait_queue_head_t outmsg_q; /* poll/read wait queue */
+ struct mutex outmsg_lock; /* protects outmsg */
+};
+
+struct hvutil_transport *hvutil_transport_init(const char *name,
+ u32 cn_idx, u32 cn_val,
+ int (*on_msg)(void *, int),
+ void (*on_reset)(void));
+int hvutil_transport_send(struct hvutil_transport *hvt, void *msg, int len);
+void hvutil_transport_destroy(struct hvutil_transport *hvt);
+
+#endif /* _HV_UTILS_TRANSPORT_H */
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index 44b1c9424712..cddc0c9f6bf9 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -49,6 +49,17 @@ enum hv_cpuid_function {
HVCPUID_IMPLEMENTATION_LIMITS = 0x40000005,
};
+#define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE 0x400
+
+#define HV_X64_MSR_CRASH_P0 0x40000100
+#define HV_X64_MSR_CRASH_P1 0x40000101
+#define HV_X64_MSR_CRASH_P2 0x40000102
+#define HV_X64_MSR_CRASH_P3 0x40000103
+#define HV_X64_MSR_CRASH_P4 0x40000104
+#define HV_X64_MSR_CRASH_CTL 0x40000105
+
+#define HV_CRASH_CTL_CRASH_NOTIFY (1ULL << 63)
+
/* Define version of the synthetic interrupt controller. */
#define HV_SYNIC_VERSION (1)
@@ -572,6 +583,8 @@ extern void hv_synic_init(void *irqarg);
extern void hv_synic_cleanup(void *arg);
+extern void hv_synic_clockevents_cleanup(void);
+
/*
* Host version information.
*/
@@ -634,6 +647,7 @@ struct vmbus_connection {
atomic_t next_gpadl_handle;
+ struct completion unload_event;
/*
* Represents channel interrupts. Each bit position represents a
* channel. When a channel sends an interrupt via VMBUS, it finds its
@@ -672,6 +686,23 @@ struct vmbus_msginfo {
extern struct vmbus_connection vmbus_connection;
+enum vmbus_message_handler_type {
+ /* The related handler can sleep. */
+ VMHT_BLOCKING = 0,
+
+ /* The related handler must NOT sleep. */
+ VMHT_NON_BLOCKING = 1,
+};
+
+struct vmbus_channel_message_table_entry {
+ enum vmbus_channel_message_type message_type;
+ enum vmbus_message_handler_type handler_type;
+ void (*message_handler)(struct vmbus_channel_message_header *msg);
+};
+
+extern struct vmbus_channel_message_table_entry
+ channel_message_table[CHANNELMSG_COUNT];
+
/* General vmbus interface */
struct hv_device *vmbus_device_create(const uuid_le *type,
@@ -692,6 +723,7 @@ void vmbus_free_channels(void);
/* Connection interface */
int vmbus_connect(void);
+void vmbus_disconnect(void);
int vmbus_post_msg(void *buffer, size_t buflen);
@@ -699,9 +731,39 @@ int vmbus_set_event(struct vmbus_channel *channel);
void vmbus_on_event(unsigned long data);
+int hv_kvp_init(struct hv_util_service *);
+void hv_kvp_deinit(void);
+void hv_kvp_onchannelcallback(void *);
+
+int hv_vss_init(struct hv_util_service *);
+void hv_vss_deinit(void);
+void hv_vss_onchannelcallback(void *);
+
int hv_fcopy_init(struct hv_util_service *);
void hv_fcopy_deinit(void);
void hv_fcopy_onchannelcallback(void *);
+void vmbus_initiate_unload(void);
+static inline void hv_poll_channel(struct vmbus_channel *channel,
+ void (*cb)(void *))
+{
+ if (!channel)
+ return;
+
+ if (channel->target_cpu != smp_processor_id())
+ smp_call_function_single(channel->target_cpu,
+ cb, channel, true);
+ else
+ cb(channel);
+}
+
+enum hvutil_device_state {
+ HVUTIL_DEVICE_INIT = 0, /* driver is loaded, waiting for userspace */
+ HVUTIL_READY, /* userspace is registered */
+ HVUTIL_HOSTMSG_RECEIVED, /* message from the host was received */
+ HVUTIL_USERSPACE_REQ, /* request to userspace was sent */
+ HVUTIL_USERSPACE_RECV, /* reply from userspace was received */
+ HVUTIL_DEVICE_DYING, /* driver unload is in progress */
+};
#endif /* _HYPERV_VMBUS_H */
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index f518b8d7a5b5..cf204005ee78 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -33,9 +33,12 @@
#include <linux/hyperv.h>
#include <linux/kernel_stat.h>
#include <linux/clockchips.h>
+#include <linux/cpu.h>
#include <asm/hyperv.h>
#include <asm/hypervisor.h>
#include <asm/mshyperv.h>
+#include <linux/notifier.h>
+#include <linux/ptrace.h>
#include "hyperv_vmbus.h"
static struct acpi_device *hv_acpi_dev;
@@ -44,6 +47,31 @@ static struct tasklet_struct msg_dpc;
static struct completion probe_event;
static int irq;
+
+static int hyperv_panic_event(struct notifier_block *nb,
+ unsigned long event, void *ptr)
+{
+ struct pt_regs *regs;
+
+ regs = current_pt_regs();
+
+ wrmsrl(HV_X64_MSR_CRASH_P0, regs->ip);
+ wrmsrl(HV_X64_MSR_CRASH_P1, regs->ax);
+ wrmsrl(HV_X64_MSR_CRASH_P2, regs->bx);
+ wrmsrl(HV_X64_MSR_CRASH_P3, regs->cx);
+ wrmsrl(HV_X64_MSR_CRASH_P4, regs->dx);
+
+ /*
+ * Let Hyper-V know there is crash data available
+ */
+ wrmsrl(HV_X64_MSR_CRASH_CTL, HV_CRASH_CTL_CRASH_NOTIFY);
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block hyperv_panic_block = {
+ .notifier_call = hyperv_panic_event,
+};
+
struct resource hyperv_mmio = {
.name = "hyperv mmio",
.flags = IORESOURCE_MEM,
@@ -507,14 +535,26 @@ static int vmbus_probe(struct device *child_device)
*/
static int vmbus_remove(struct device *child_device)
{
- struct hv_driver *drv = drv_to_hv_drv(child_device->driver);
+ struct hv_driver *drv;
struct hv_device *dev = device_to_hv_device(child_device);
-
- if (drv->remove)
- drv->remove(dev);
- else
- pr_err("remove not set for driver %s\n",
- dev_name(child_device));
+ u32 relid = dev->channel->offermsg.child_relid;
+
+ if (child_device->driver) {
+ drv = drv_to_hv_drv(child_device->driver);
+ if (drv->remove)
+ drv->remove(dev);
+ else {
+ hv_process_channel_removal(dev->channel, relid);
+ pr_err("remove not set for driver %s\n",
+ dev_name(child_device));
+ }
+ } else {
+ /*
+ * We don't have a driver for this device; deal with the
+ * rescind message by removing the channel.
+ */
+ hv_process_channel_removal(dev->channel, relid);
+ }
return 0;
}
@@ -573,6 +613,10 @@ static void vmbus_onmessage_work(struct work_struct *work)
{
struct onmessage_work_context *ctx;
+ /* Do not process messages if we're in DISCONNECTED state */
+ if (vmbus_connection.conn_state == DISCONNECTED)
+ return;
+
ctx = container_of(work, struct onmessage_work_context,
work);
vmbus_onmessage(&ctx->msg);
@@ -613,21 +657,36 @@ static void vmbus_on_msg_dpc(unsigned long data)
void *page_addr = hv_context.synic_message_page[cpu];
struct hv_message *msg = (struct hv_message *)page_addr +
VMBUS_MESSAGE_SINT;
+ struct vmbus_channel_message_header *hdr;
+ struct vmbus_channel_message_table_entry *entry;
struct onmessage_work_context *ctx;
while (1) {
- if (msg->header.message_type == HVMSG_NONE) {
+ if (msg->header.message_type == HVMSG_NONE)
/* no msg */
break;
- } else {
+
+ hdr = (struct vmbus_channel_message_header *)msg->u.payload;
+
+ if (hdr->msgtype >= CHANNELMSG_COUNT) {
+ WARN_ONCE(1, "unknown msgtype=%d\n", hdr->msgtype);
+ goto msg_handled;
+ }
+
+ entry = &channel_message_table[hdr->msgtype];
+ if (entry->handler_type == VMHT_BLOCKING) {
ctx = kmalloc(sizeof(*ctx), GFP_ATOMIC);
if (ctx == NULL)
continue;
+
INIT_WORK(&ctx->work, vmbus_onmessage_work);
memcpy(&ctx->msg, msg, sizeof(*msg));
+
queue_work(vmbus_connection.work_queue, &ctx->work);
- }
+ } else
+ entry->message_handler(hdr);
+msg_handled:
msg->header.message_type = HVMSG_NONE;
/*
@@ -704,6 +763,39 @@ static void vmbus_isr(void)
}
}
+#ifdef CONFIG_HOTPLUG_CPU
+static int hyperv_cpu_disable(void)
+{
+ return -ENOSYS;
+}
+
+static void hv_cpu_hotplug_quirk(bool vmbus_loaded)
+{
+ static void *previous_cpu_disable;
+
+ /*
+ * Offlining a CPU when running on newer hypervisors (WS2012R2, Win8,
+ * ...) is not supported at this moment as channel interrupts are
+ * distributed across all of them.
+ */
+
+ if ((vmbus_proto_version == VERSION_WS2008) ||
+ (vmbus_proto_version == VERSION_WIN7))
+ return;
+
+ if (vmbus_loaded) {
+ previous_cpu_disable = smp_ops.cpu_disable;
+ smp_ops.cpu_disable = hyperv_cpu_disable;
+ pr_notice("CPU offlining is not supported by hypervisor\n");
+ } else if (previous_cpu_disable)
+ smp_ops.cpu_disable = previous_cpu_disable;
+}
+#else
+static void hv_cpu_hotplug_quirk(bool vmbus_loaded)
+{
+}
+#endif
+
/*
* vmbus_bus_init -Main vmbus driver initialization routine.
*
@@ -744,6 +836,16 @@ static int vmbus_bus_init(int irq)
if (ret)
goto err_alloc;
+ hv_cpu_hotplug_quirk(true);
+
+ /*
+ * Only register if the crash MSRs are available
+ */
+ if (ms_hyperv.features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) {
+ atomic_notifier_chain_register(&panic_notifier_list,
+ &hyperv_panic_block);
+ }
+
vmbus_request_offers();
return 0;
@@ -840,10 +942,8 @@ int vmbus_device_register(struct hv_device *child_device_obj)
{
int ret = 0;
- static atomic_t device_num = ATOMIC_INIT(0);
-
- dev_set_name(&child_device_obj->device, "vmbus_0_%d",
- atomic_inc_return(&device_num));
+ dev_set_name(&child_device_obj->device, "vmbus_%d",
+ child_device_obj->channel->id);
child_device_obj->device.bus = &hv_bus;
child_device_obj->device.parent = &hv_acpi_dev->dev;
@@ -935,6 +1035,15 @@ acpi_walk_err:
return ret_val;
}
+static int vmbus_acpi_remove(struct acpi_device *device)
+{
+ int ret = 0;
+
+ if (hyperv_mmio.start && hyperv_mmio.end)
+ ret = release_resource(&hyperv_mmio);
+ return ret;
+}
+
static const struct acpi_device_id vmbus_acpi_device_ids[] = {
{"VMBUS", 0},
{"VMBus", 0},
@@ -947,6 +1056,7 @@ static struct acpi_driver vmbus_acpi_driver = {
.ids = vmbus_acpi_device_ids,
.ops = {
.add = vmbus_acpi_add,
+ .remove = vmbus_acpi_remove,
},
};
@@ -992,11 +1102,26 @@ cleanup:
static void __exit vmbus_exit(void)
{
+ int cpu;
+
+ vmbus_connection.conn_state = DISCONNECTED;
+ hv_synic_clockevents_cleanup();
+ vmbus_disconnect();
hv_remove_vmbus_irq();
+ tasklet_kill(&msg_dpc);
vmbus_free_channels();
+ if (ms_hyperv.features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) {
+ atomic_notifier_chain_unregister(&panic_notifier_list,
+ &hyperv_panic_block);
+ }
bus_unregister(&hv_bus);
hv_cleanup();
+ for_each_online_cpu(cpu) {
+ tasklet_kill(hv_context.event_dpc[cpu]);
+ smp_call_function_single(cpu, hv_synic_cleanup, NULL, 1);
+ }
acpi_bus_unregister_driver(&vmbus_acpi_driver);
+ hv_cpu_hotplug_quirk(false);
}