[ovs-dev] [PATCH v2 1/1] netdev-dpdk: NUMA Aware vHost User

Ciara Loftus ciara.loftus at intel.com
Mon Jun 13 10:10:09 UTC 2016


This commit allows for vHost User memory from QEMU, DPDK and OVS, as
well as the servicing PMD, to all come from the same socket.

The socket id of a vhost-user port used to be set to that of the master
lcore. Now it is possible to update the socket id if it is detected
(during VM boot) that the vhost device memory is not on this node. If
this is the case, a new mempool is created from the new node, and the
PMD thread currently servicing the port will no longer, in favour of a
thread from the new node (if enabled in the pmd-cpu-mask).

To avail of this functionality, one must enable the
CONFIG_RTE_LIBRTE_VHOST_NUMA DPDK configuration option.

Signed-off-by: Ciara Loftus <ciara.loftus at intel.com>
---

v2:
- Remove numactl dependencies from travis & fedora spec files
- Updated log message

 .travis.yml                     |  1 +
 INSTALL.DPDK.md                 |  8 ++++++--
 NEWS                            |  3 +++
 acinclude.m4                    |  2 +-
 lib/netdev-dpdk.c               | 37 ++++++++++++++++++++++++++++++++++---
 rhel/openvswitch-fedora.spec.in |  2 ++
 6 files changed, 47 insertions(+), 6 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index ee2cf21..6c818cb 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -11,6 +11,7 @@ addons:
     packages:
       - bc
       - gcc-multilib
+      - libnuma-dev
       - libssl-dev
       - llvm-dev
       - libjemalloc1
diff --git a/INSTALL.DPDK.md b/INSTALL.DPDK.md
index c2e32bf..00e75bd 100644
--- a/INSTALL.DPDK.md
+++ b/INSTALL.DPDK.md
@@ -16,7 +16,7 @@ OVS needs a system with 1GB hugepages support.
 Building and Installing:
 ------------------------
 
-Required: DPDK 16.04
+Required: DPDK 16.04, libnuma
 Optional (if building with vhost-cuse): `fuse`, `fuse-devel` (`libfuse-dev`
 on Debian/Ubuntu)
 
@@ -465,7 +465,11 @@ Performance Tuning:
 
    It is good practice to ensure that threads that are in the datapath are
    pinned to cores in the same NUMA area. e.g. pmd threads and QEMU vCPUs
-   responsible for forwarding.
+   responsible for forwarding. If DPDK is built with
+   CONFIG_RTE_LIBRTE_VHOST_NUMA=y, vHost User ports automatically
+   detect the NUMA socket of the QEMU vCPUs and will be serviced by a PMD
+   from the same node provided a core on this node is enabled in the
+   pmd-cpu-mask.
 
 9. Rx Mergeable buffers
 
diff --git a/NEWS b/NEWS
index ba201cf..fe24449 100644
--- a/NEWS
+++ b/NEWS
@@ -33,6 +33,9 @@ Post-v2.5.0
        arguments. Additional arguments can be passed via the dpdk-extra
        entry.
      * Add ingress policing functionality.
+     * PMD threads servicing vHost User ports can now come from the NUMA
+       node that device memory is located on if CONFIG_RTE_LIBRTE_VHOST_NUMA
+       is enabled in DPDK.
    - ovs-benchmark: This utility has been removed due to lack of use and
      bitrot.
    - ovs-appctl:
diff --git a/acinclude.m4 b/acinclude.m4
index 0a14856..3978980 100644
--- a/acinclude.m4
+++ b/acinclude.m4
@@ -219,7 +219,7 @@ AC_DEFUN([OVS_CHECK_DPDK], [
     DPDKLIB_FOUND=false
     save_LIBS=$LIBS
     for extras in "" "-ldl"; do
-        LIBS="$DPDK_LIB $extras $save_LIBS $DPDK_EXTRA_LIB"
+        LIBS="$DPDK_LIB $extras $save_LIBS $DPDK_EXTRA_LIB -lnuma"
         AC_LINK_IFELSE(
            [AC_LANG_PROGRAM([#include <rte_config.h>
                              #include <rte_eal.h>],
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 19d355f..fc683e9 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -30,6 +30,7 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <getopt.h>
+#include <numaif.h>
 
 #include "dirs.h"
 #include "dp-packet.h"
@@ -385,6 +386,9 @@ struct netdev_dpdk {
     int requested_n_txq;
     int requested_n_rxq;
 
+    /* Socket ID detected when vHost device is brought up */
+    int requested_socket_id;
+
     /* Ingress Policer */
     OVSRCU_TYPE(struct ingress_policer *) ingress_policer;
     uint32_t policer_rate;
@@ -761,6 +765,7 @@ netdev_dpdk_init(struct netdev *netdev, unsigned int port_no,
     }
 
     dev->socket_id = sid < 0 ? SOCKET0 : sid;
+    dev->requested_socket_id = dev->socket_id;
     dev->port_id = port_no;
     dev->type = type;
     dev->flags = 0;
@@ -2344,6 +2349,8 @@ new_device(struct virtio_net *virtio_dev)
 {
     struct netdev_dpdk *dev;
     bool exists = false;
+    int newnode = 0;
+    long err = 0;
 
     ovs_mutex_lock(&dpdk_mutex);
     /* Add device to the vhost port with the same name as that passed down. */
@@ -2357,6 +2364,19 @@ new_device(struct virtio_net *virtio_dev)
             }
             ovsrcu_set(&dev->virtio_dev, virtio_dev);
             exists = true;
+
+            /* Get NUMA information */
+            err = get_mempolicy(&newnode, NULL, 0, virtio_dev,
+                                MPOL_F_NODE | MPOL_F_ADDR);
+            if (err) {
+                VLOG_INFO("Error getting NUMA info for vHost Device '%s'",
+                        virtio_dev->ifname);
+                newnode = dev->socket_id;
+            } else if (newnode != dev->socket_id) {
+                dev->requested_socket_id = newnode;
+                netdev_request_reconfigure(&dev->up);
+            }
+
             virtio_dev->flags |= VIRTIO_DEV_RUNNING;
             /* Disable notifications. */
             set_irq_status(virtio_dev);
@@ -2374,8 +2394,8 @@ new_device(struct virtio_net *virtio_dev)
         return -1;
     }
 
-    VLOG_INFO("vHost Device '%s' %"PRIu64" has been added", virtio_dev->ifname,
-              virtio_dev->device_fh);
+    VLOG_INFO("vHost Device '%s' %"PRIu64" has been added on numa node %i",
+              virtio_dev->ifname, virtio_dev->device_fh, newnode);
     return 0;
 }
 
@@ -2937,6 +2957,7 @@ static int
 netdev_dpdk_vhost_user_reconfigure(struct netdev *netdev)
 {
     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
+    int err = 0;
 
     ovs_mutex_lock(&dpdk_mutex);
     ovs_mutex_lock(&dev->mutex);
@@ -2944,10 +2965,20 @@ netdev_dpdk_vhost_user_reconfigure(struct netdev *netdev)
     netdev->n_txq = dev->requested_n_txq;
     netdev->n_rxq = dev->requested_n_rxq;
 
+    if (dev->requested_socket_id != dev->socket_id) {
+        dev->socket_id = dev->requested_socket_id;
+        /* Change mempool to new NUMA Node */
+        dpdk_mp_put(dev->dpdk_mp);
+        dev->dpdk_mp = dpdk_mp_get(dev->socket_id, dev->mtu);
+        if (!dev->dpdk_mp) {
+            err = ENOMEM;
+        }
+    }
+
     ovs_mutex_unlock(&dev->mutex);
     ovs_mutex_unlock(&dpdk_mutex);
 
-    return 0;
+    return err;
 }
 
 static int
diff --git a/rhel/openvswitch-fedora.spec.in b/rhel/openvswitch-fedora.spec.in
index 0759096..959c90a 100644
--- a/rhel/openvswitch-fedora.spec.in
+++ b/rhel/openvswitch-fedora.spec.in
@@ -54,6 +54,8 @@ BuildRequires: libcap-ng libcap-ng-devel
 %endif
 %if %{with dpdk}
 BuildRequires: dpdk-devel >= 2.2.0
+BuildRequires: numactl-devel
+Requires: numactl-libs
 Provides: %{name}-dpdk = %{version}-%{release}
 %endif
 
-- 
2.4.3




More information about the dev mailing list