From patchwork Wed Jan 17 13:49:25 2018
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
Subject: [dpdk-dev,
 v5] vhost_user: protect active rings from async ring changes
From: Victor Kaplansky <victork@redhat.com>
X-Patchwork-Id: 33921
X-Patchwork-Delegate: yuanhan.liu@linux.intel.com
List-Id: dev.dpdk.org
To: dev@dpdk.org
Cc: stable@dpdk.org, Jens Freimann <jfreiman@redhat.com>,
 Maxime Coquelin <maxime.coquelin@redhat.com>,
 Yuanhan Liu <yliu@fridaylinux.org>, Tiwei Bie <tiwei.bie@intel.com>,
 "Tan, Jianfeng" <jianfeng.tan@intel.com>,
 Stephen Hemminger <stephen@networkplumber.org>,
 Victor Kaplansky <victork@redhat.com>
Date: Wed, 17 Jan 2018 15:49:25 +0200

When performing live migration or memory hot-plugging,
the changes to the device and vrings made by the message handler
are done independently of vring usage by the PMD threads.

This causes, for example, segfaults during live migration
with MQ enabled, but in general virtually any request
sent by QEMU that changes the device state can cause
problems.

This patch fixes all of the above issues by adding a spinlock
to every vring and requiring the message handler to start an
operation only after ensuring that all PMD threads related to the
device are out of the critical section accessing the vring data.
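
To illustrate the idea, here is a minimal sketch of the control-path
side of this handshake (condensed from vhost_user.c in the diff below;
handle_msg() is a hypothetical stand-in for the actual message
processing, not a function in this patch):

	/* Quiesce the datapath: take every vring's access_lock before
	 * changing device state, then release them all afterwards.
	 * PMD threads hold the lock only around ring processing, so
	 * once all locks are held, no ring is in active use.
	 */
	static void
	process_state_changing_msg(struct virtio_net *dev, VhostUserMsg *msg)
	{
		unsigned int i, vq_num;

		/* virtqueue[] may be sparse, so count locked vrings. */
		for (i = 0, vq_num = 0; vq_num < dev->nr_vring; i++) {
			struct vhost_virtqueue *vq = dev->virtqueue[i];

			if (vq) {
				rte_spinlock_lock(&vq->access_lock);
				vq_num++;
			}
		}

		handle_msg(dev, msg);	/* safe: no PMD thread is inside a ring */

		for (i = 0, vq_num = 0; vq_num < dev->nr_vring; i++) {
			struct vhost_virtqueue *vq = dev->virtqueue[i];

			if (vq) {
				rte_spinlock_unlock(&vq->access_lock);
				vq_num++;
			}
		}
	}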

Each vring has its own lock in order to avoid contention
between PMD threads working on different vrings and to prevent
performance degradation as the number of queue pairs scales.
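
On the data path, each PMD thread takes only the lock of the vring it
is processing. For example, a sketch of the dequeue side (condensed
from rte_vhost_dequeue_burst() in the diff below; do_dequeue() is a
hypothetical stand-in for the actual ring processing):

	static uint16_t
	dequeue_sketch(struct vhost_virtqueue *vq,
		       struct rte_mbuf **pkts, uint16_t count)
	{
		uint16_t n = 0;

		/* trylock, not lock: the dequeue path must not block
		 * behind the message handler; returning zero packets
		 * lets the caller simply poll again later. */
		if (rte_spinlock_trylock(&vq->access_lock) == 0)
			return 0;

		if (unlikely(vq->enabled == 0))
			goto out;

		n = do_dequeue(vq, pkts, count);

	out:
		rte_spinlock_unlock(&vq->access_lock);
		return n;
	}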

See https://bugzilla.redhat.com/show_bug.cgi?id=1450680

Signed-off-by: Victor Kaplansky <victork@redhat.com>
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
---
v5:
 o get rid of the spinlock-wrapping functions in vhost.h

v4:
 o moved access_lock before accessing the enable flag and
   access_unlock after iommu_unlock, consistently.
 o cosmetics: removed a blank line.
 o moved the access_lock variable into the same
   cache line as the enable and access_ok flags.
 o the dequeue path is now guarded with a trylock, returning
   zero if unsuccessful.
 o the GET_VRING_BASE operation is not guarded by the access lock,
   to avoid a deadlock with device_destroy; see the comment
   in the code.
 o fixed the error-path exits from enqueue and dequeue to carefully
   unlock the access and IOMMU locks as appropriate (see the sketch
   after this list).
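
For reference, a minimal sketch of the lock nesting that the error
paths must preserve (mirroring virtio_dev_rx() in the diff below;
do_enqueue() is a hypothetical stand-in for the actual ring
processing):

	static uint32_t
	enqueue_sketch(struct virtio_net *dev, struct vhost_virtqueue *vq)
	{
		uint32_t count = 0;

		rte_spinlock_lock(&vq->access_lock);

		/* An error exit taken before the IOTLB lock is acquired
		 * must skip the IOTLB unlock. */
		if (unlikely(vq->enabled == 0))
			goto out_access_unlock;

		if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
			vhost_user_iotlb_rd_lock(vq);

		count = do_enqueue(dev, vq);

		/* Unlock in reverse order: the inner IOTLB lock first... */
		if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
			vhost_user_iotlb_rd_unlock(vq);

	out_access_unlock:
		/* ...and the outer access_lock last. */
		rte_spinlock_unlock(&vq->access_lock);
		return count;
	}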

v3:
   o Added locking to the enqueue flow.
   o The enqueue path is now guarded as well as the dequeue path.
   o Changed the name of active_lock.
   o Added initialization of the guarding spinlock.
   o Reworked the functions iterating over all virtqueues.
   o Performance measurements done by Maxime Coquelin show
     no degradation in bandwidth or throughput.
   o Spelling.
   o Taking the lock only on set operations.
   o IOMMU messages are not guarded by the access lock.

v2:
   o Fixed checkpatch complaints.
   o Added Signed-off-by.
   o Refined placement of the guard to exclude IOMMU messages.
   o TODO: performance degradation measurement.

 dpdk-17.11/lib/librte_vhost/vhost.h      |  6 ++--
 dpdk-17.11/lib/librte_vhost/vhost.c      |  1 +
 dpdk-17.11/lib/librte_vhost/vhost_user.c | 70 ++++++++++++++++++++++++++++++++
 dpdk-17.11/lib/librte_vhost/virtio_net.c | 28 ++++++++++++++---
 4 files changed, 99 insertions(+), 6 deletions(-)

diff --git a/dpdk-17.11/lib/librte_vhost/vhost.h b/dpdk-17.11/lib/librte_vhost/vhost.h
index 1cc81c17..c8f2a817 100644
--- a/dpdk-17.11/lib/librte_vhost/vhost.h
+++ b/dpdk-17.11/lib/librte_vhost/vhost.h
@@ -108,12 +108,14 @@ struct vhost_virtqueue {
 
 	/* Backend value to determine if device should started/stopped */
 	int			backend;
+	int			enabled;
+	int			access_ok;
+	rte_spinlock_t		access_lock;
+
 	/* Used to notify the guest (trigger interrupt) */
 	int			callfd;
 	/* Currently unused as polling mode is enabled */
 	int			kickfd;
-	int			enabled;
-	int			access_ok;
 
 	/* Physical address of used ring, for logging */
 	uint64_t		log_guest_addr;
diff --git a/dpdk-17.11/lib/librte_vhost/vhost.c b/dpdk-17.11/lib/librte_vhost/vhost.c
index 4f8b73a0..dcc42fc7 100644
--- a/dpdk-17.11/lib/librte_vhost/vhost.c
+++ b/dpdk-17.11/lib/librte_vhost/vhost.c
@@ -259,6 +259,7 @@ alloc_vring_queue(struct virtio_net *dev, uint32_t vring_idx)
 
 	dev->virtqueue[vring_idx] = vq;
 	init_vring_queue(dev, vring_idx);
+	rte_spinlock_init(&vq->access_lock);
 
 	dev->nr_vring += 1;
 
diff --git a/dpdk-17.11/lib/librte_vhost/vhost_user.c b/dpdk-17.11/lib/librte_vhost/vhost_user.c
index f4c7ce46..0685d4e7 100644
--- a/dpdk-17.11/lib/librte_vhost/vhost_user.c
+++ b/dpdk-17.11/lib/librte_vhost/vhost_user.c
@@ -1190,12 +1190,47 @@ vhost_user_check_and_alloc_queue_pair(struct virtio_net *dev, VhostUserMsg *msg)
 	return alloc_vring_queue(dev, vring_idx);
 }
 
+static void
+vhost_user_lock_all_queue_pairs(struct virtio_net *dev)
+{
+	unsigned int i = 0;
+	unsigned int vq_num = 0;
+
+	while (vq_num < dev->nr_vring) {
+		struct vhost_virtqueue *vq = dev->virtqueue[i];
+
+		if (vq) {
+			rte_spinlock_lock(&vq->access_lock);
+			vq_num++;
+		}
+		i++;
+	}
+}
+
+static void
+vhost_user_unlock_all_queue_pairs(struct virtio_net *dev)
+{
+	unsigned int i = 0;
+	unsigned int vq_num = 0;
+
+	while (vq_num < dev->nr_vring) {
+		struct vhost_virtqueue *vq = dev->virtqueue[i];
+
+		if (vq) {
+			rte_spinlock_unlock(&vq->access_lock);
+			vq_num++;
+		}
+		i++;
+	}
+}
+
 int
 vhost_user_msg_handler(int vid, int fd)
 {
 	struct virtio_net *dev;
 	struct VhostUserMsg msg;
 	int ret;
+	int unlock_required = 0;
 
 	dev = get_device(vid);
 	if (dev == NULL)
@@ -1241,6 +1276,38 @@ vhost_user_msg_handler(int vid, int fd)
 		return -1;
 	}
 
+	/*
+	 * Note: we don't lock all queues on VHOST_USER_GET_VRING_BASE,
+	 * since it is sent when virtio stops and the device is destroyed.
+	 * destroy_device waits for queues to be inactive, so it is safe.
+	 * Otherwise taking the access_lock would cause a deadlock.
+	 */
+	switch (msg.request.master) {
+	case VHOST_USER_SET_FEATURES:
+	case VHOST_USER_SET_PROTOCOL_FEATURES:
+	case VHOST_USER_SET_OWNER:
+	case VHOST_USER_RESET_OWNER:
+	case VHOST_USER_SET_MEM_TABLE:
+	case VHOST_USER_SET_LOG_BASE:
+	case VHOST_USER_SET_LOG_FD:
+	case VHOST_USER_SET_VRING_NUM:
+	case VHOST_USER_SET_VRING_ADDR:
+	case VHOST_USER_SET_VRING_BASE:
+	case VHOST_USER_SET_VRING_KICK:
+	case VHOST_USER_SET_VRING_CALL:
+	case VHOST_USER_SET_VRING_ERR:
+	case VHOST_USER_SET_VRING_ENABLE:
+	case VHOST_USER_SEND_RARP:
+	case VHOST_USER_NET_SET_MTU:
+	case VHOST_USER_SET_SLAVE_REQ_FD:
+		vhost_user_lock_all_queue_pairs(dev);
+		unlock_required = 1;
+		break;
+	default:
+		break;
+
+	}
+
 	switch (msg.request.master) {
 	case VHOST_USER_GET_FEATURES:
 		msg.payload.u64 = vhost_user_get_features(dev);
@@ -1342,6 +1409,9 @@ vhost_user_msg_handler(int vid, int fd)
 
 	}
 
+	if (unlock_required)
+		vhost_user_unlock_all_queue_pairs(dev);
+
 	if (msg.flags & VHOST_USER_NEED_REPLY) {
 		msg.payload.u64 = !!ret;
 		msg.size = sizeof(msg.payload.u64);
diff --git a/dpdk-17.11/lib/librte_vhost/virtio_net.c b/dpdk-17.11/lib/librte_vhost/virtio_net.c
index 6fee16e5..e09a927d 100644
--- a/dpdk-17.11/lib/librte_vhost/virtio_net.c
+++ b/dpdk-17.11/lib/librte_vhost/virtio_net.c
@@ -44,6 +44,7 @@
 #include <rte_udp.h>
 #include <rte_sctp.h>
 #include <rte_arp.h>
+#include <rte_spinlock.h>
 
 #include "iotlb.h"
 #include "vhost.h"
@@ -326,8 +327,11 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
 	}
 
 	vq = dev->virtqueue[queue_id];
+
+	rte_spinlock_lock(&vq->access_lock);
+
 	if (unlikely(vq->enabled == 0))
-		return 0;
+		goto out_access_unlock;
 
 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
 		vhost_user_iotlb_rd_lock(vq);
@@ -419,6 +423,9 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
 		vhost_user_iotlb_rd_unlock(vq);
 
+out_access_unlock:
+	rte_spinlock_unlock(&vq->access_lock);
+
 	return count;
 }
 
@@ -651,8 +658,11 @@ virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
 	}
 
 	vq = dev->virtqueue[queue_id];
+
+	rte_spinlock_lock(&vq->access_lock);
+
 	if (unlikely(vq->enabled == 0))
-		return 0;
+		goto out_access_unlock;
 
 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
 		vhost_user_iotlb_rd_lock(vq);
@@ -715,6 +725,9 @@ virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
 		vhost_user_iotlb_rd_unlock(vq);
 
+out_access_unlock:
+	rte_spinlock_unlock(&vq->access_lock);
+
 	return pkt_idx;
 }
 
@@ -1180,9 +1193,13 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
 	}
 
 	vq = dev->virtqueue[queue_id];
-	if (unlikely(vq->enabled == 0))
+
+	if (unlikely(rte_spinlock_trylock(&vq->access_lock) == 0))
 		return 0;
 
+	if (unlikely(vq->enabled == 0))
+		goto out_access_unlock;
+
 	vq->batch_copy_nb_elems = 0;
 
 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
@@ -1240,7 +1257,7 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
 		if (rarp_mbuf == NULL) {
 			RTE_LOG(ERR, VHOST_DATA,
 				"Failed to allocate memory for mbuf.\n");
-			return 0;
+			goto out;
 		}
 
 		if (make_rarp_packet(rarp_mbuf, &dev->mac)) {
@@ -1356,6 +1373,9 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
 		vhost_user_iotlb_rd_unlock(vq);
 
+out_access_unlock:
+	rte_spinlock_unlock(&vq->access_lock);
+
 	if (unlikely(rarp_mbuf != NULL)) {
 		/*
 		 * Inject it to the head of "pkts" array, so that switch's mac