[ovs-dev] [PATCH 2/6] netdev-dpdk: Adapt the requested number of tx and rx queues.

Daniele Di Proietto diproiettod at vmware.com
Thu Mar 12 18:04:33 UTC 2015


This commit changes the semantics of 'netdev_set_multiq()' to allow OVS
DPDK to run on devices with limited multiqueue support.

* If a netdev doesn't have the requested number of rxqs, it can simply
  inform the datapath without failing.
* If a netdev doesn't have the requested number of txqs, it should try
  to create as many as possible and use locking (see the sketch below).
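
For reference, the resulting scheme can be summarized by the following
standalone sketch.  It is illustrative only: 'example_dev', 'example_txq',
'example_setup_txq' and 'example_send' are hypothetical names (the real code
lives in netdev-dpdk.c below), and a POSIX spinlock stands in for DPDK's
rte_spinlock_t so that the snippet builds without DPDK headers.

#include <pthread.h>
#include <stdbool.h>

#define EXAMPLE_MAX_TXQ 64

struct example_txq {
    pthread_spinlock_t lock;     /* Taken only when the queue is shared. */
};

struct example_dev {
    int requested_n_txq;         /* What the datapath asked for ('up.n_txq'). */
    int real_n_txq;              /* What the NIC can actually provide. */
    bool txq_needs_locking;      /* True when the two numbers differ. */
    struct example_txq txq[EXAMPLE_MAX_TXQ];
};

/* Mirrors what dpdk_eth_dev_init()/netdev_dpdk_set_multiq() do below: clamp
 * the tx queue count to what the hardware supports and decide whether the
 * send path needs locking. */
static void
example_setup_txq(struct example_dev *dev, int requested, int hw_max)
{
    dev->requested_n_txq = requested;
    dev->real_n_txq = requested < hw_max ? requested : hw_max;
    dev->txq_needs_locking = dev->real_n_txq != dev->requested_n_txq;

    for (int i = 0; i < dev->real_n_txq; i++) {
        pthread_spin_init(&dev->txq[i].lock, PTHREAD_PROCESS_PRIVATE);
    }
}

/* Mirrors netdev_dpdk_send(): remap 'qid' onto a real queue and serialize
 * access when several pmd threads end up sharing that queue. */
static void
example_send(struct example_dev *dev, int qid)
{
    if (dev->txq_needs_locking) {
        qid = qid % dev->real_n_txq;
        pthread_spin_lock(&dev->txq[qid].lock);
    }

    /* The real code would call rte_eth_tx_burst() on queue 'qid' here. */

    if (dev->txq_needs_locking) {
        pthread_spin_unlock(&dev->txq[qid].lock);
    }
}

int
main(void)
{
    struct example_dev dev;

    example_setup_txq(&dev, 8, 4);   /* 8 txqs requested, NIC supports 4. */
    example_send(&dev, 6);           /* qid 6 is remapped to real queue 2. */
    return 0;
}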

Signed-off-by: Daniele Di Proietto <diproiettod at vmware.com>
---
 lib/netdev-dpdk.c     | 94 +++++++++++++++++++++++++++++++--------------------
 lib/netdev-provider.h | 11 ++++++
 lib/netdev.c          | 10 ++++++
 vswitchd/vswitch.xml  |  2 +-
 4 files changed, 80 insertions(+), 37 deletions(-)

diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 54bc318..2278377 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -175,6 +175,10 @@ struct dpdk_tx_queue {
     bool flush_tx;                 /* Set to true to flush queue everytime */
                                    /* pkts are queued. */
     int count;
+    rte_spinlock_t tx_lock;        /* Protects the members and the NIC queue
+                                    * from concurrent access.  It is used only
+                                    * if the queue is shared among different
+                                    * pmd threads (see 'txq_needs_locking'). */
     uint64_t tsc;
     struct rte_mbuf *burst_pkts[MAX_TX_QUEUE_LEN];
 };
@@ -216,9 +220,15 @@ struct netdev_dpdk {
     struct rte_eth_link link;
     int link_reset_cnt;
 
+    /* The user might request more txqs than the NIC has.  We remap the
+     * requested txqs ('up.n_txq') onto the ones the NIC provides
+     * ('real_n_txq').  If the numbers match, 'txq_needs_locking' is false,
+     * otherwise it is true and we take a spinlock on transmission. */
+    int real_n_txq;
+    bool txq_needs_locking;
+
     /* In dpdk_list. */
     struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
-    rte_spinlock_t dpdkr_tx_lock;
 };
 
 struct netdev_rxq_dpdk {
@@ -414,6 +424,7 @@ static int
 dpdk_eth_dev_init(struct netdev_dpdk *dev) OVS_REQUIRES(dpdk_mutex)
 {
     struct rte_pktmbuf_pool_private *mbp_priv;
+    struct rte_eth_dev_info info;
     struct ether_addr eth_addr;
     int diag;
     int i;
@@ -422,14 +433,24 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev) OVS_REQUIRES(dpdk_mutex)
         return ENODEV;
     }
 
-    diag = rte_eth_dev_configure(dev->port_id, dev->up.n_rxq, dev->up.n_txq,
+    rte_eth_dev_info_get(dev->port_id, &info);
+    /* Adjust the number of rx queues to those available on the device. */
+    dev->up.n_rxq = MIN(info.max_rx_queues, dev->up.n_rxq);
+
+    /* Adjust the number of tx queues.  This change is not visible to the
+     * user: we keep reporting 'dev->up.n_txq' queues while actually using
+     * only 'dev->real_n_txq'. */
+    dev->real_n_txq = MIN(info.max_tx_queues, dev->up.n_txq);
+
+    diag = rte_eth_dev_configure(dev->port_id, dev->up.n_rxq, dev->real_n_txq,
                                  &port_conf);
     if (diag) {
-        VLOG_ERR("eth dev config error %d",diag);
+        VLOG_ERR("eth dev config error %d. rxq:%d txq:%d", diag, dev->up.n_rxq,
+                 dev->real_n_txq);
         return -diag;
     }
 
-    for (i = 0; i < dev->up.n_txq; i++) {
+    for (i = 0; i < dev->real_n_txq; i++) {
         diag = rte_eth_tx_queue_setup(dev->port_id, i, NIC_PORT_TX_Q_SIZE,
                                       dev->socket_id, &tx_conf);
         if (diag) {
@@ -491,14 +512,20 @@ netdev_dpdk_alloc_txq(struct netdev_dpdk *netdev, unsigned int n_txqs)
     int i;
 
     netdev->tx_q = dpdk_rte_mzalloc(n_txqs * sizeof *netdev->tx_q);
-    /* Each index is considered as a cpu core id, since there should
-     * be one tx queue for each cpu core. */
     for (i = 0; i < n_txqs; i++) {
         int numa_id = ovs_numa_get_numa_id(i);
 
-        /* If the corresponding core is not on the same numa node
-         * as 'netdev', flags the 'flush_tx'. */
-        netdev->tx_q[i].flush_tx = netdev->socket_id == numa_id;
+        if (!netdev->txq_needs_locking) {
+            /* Each index is considered as a cpu core id, since there should
+             * be one tx queue for each cpu core.
+             * If the corresponding core is not on the same numa node
+             * as 'netdev', flag 'flush_tx'. */
+            netdev->tx_q[i].flush_tx = netdev->socket_id == numa_id;
+        } else {
+            /* Queues are shared among CPUs.  Always flush. */
+            netdev->tx_q[i].flush_tx = true;
+        }
+        rte_spinlock_init(&netdev->tx_q[i].tx_lock);
     }
 }
 
@@ -524,7 +551,6 @@ netdev_dpdk_init(struct netdev *netdev_, unsigned int port_no)
     netdev->flags = 0;
     netdev->mtu = ETHER_MTU;
     netdev->max_packet_len = MTU_TO_MAX_LEN(netdev->mtu);
-    rte_spinlock_init(&netdev->dpdkr_tx_lock);
 
     netdev->dpdk_mp = dpdk_mp_get(netdev->socket_id, netdev->mtu);
     if (!netdev->dpdk_mp) {
@@ -534,6 +560,7 @@ netdev_dpdk_init(struct netdev *netdev_, unsigned int port_no)
 
     netdev_->n_txq = NR_QUEUE;
     netdev_->n_rxq = NR_QUEUE;
+    netdev->real_n_txq = NR_QUEUE;
     err = dpdk_eth_dev_init(netdev);
     if (err) {
         goto unlock;
@@ -620,7 +647,8 @@ netdev_dpdk_get_config(const struct netdev *netdev_, struct smap *args)
     ovs_mutex_lock(&dev->mutex);
 
     smap_add_format(args, "configured_rx_queues", "%d", netdev_->n_rxq);
-    smap_add_format(args, "configured_tx_queues", "%d", netdev_->n_txq);
+    smap_add_format(args, "requested_tx_queues", "%d", netdev_->n_txq);
+    smap_add_format(args, "configured_tx_queues", "%d", dev->real_n_txq);
     ovs_mutex_unlock(&dev->mutex);
 
     return 0;
@@ -656,8 +684,10 @@ netdev_dpdk_set_multiq(struct netdev *netdev_, unsigned int n_txq,
     netdev->up.n_txq = n_txq;
     netdev->up.n_rxq = n_rxq;
     rte_free(netdev->tx_q);
-    netdev_dpdk_alloc_txq(netdev, n_txq);
     err = dpdk_eth_dev_init(netdev);
+    netdev_dpdk_alloc_txq(netdev, netdev->real_n_txq);
+
+    netdev->txq_needs_locking = netdev->real_n_txq != netdev->up.n_txq;
 
     ovs_mutex_unlock(&netdev->mutex);
     ovs_mutex_unlock(&dpdk_mutex);
@@ -921,12 +951,21 @@ netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
 }
 
 static int
-netdev_dpdk_eth_send(struct netdev *netdev, int qid,
-                     struct dp_packet **pkts, int cnt, bool may_steal)
+netdev_dpdk_send(struct netdev *netdev, int qid,
+                 struct dp_packet **pkts, int cnt, bool may_steal)
 {
     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
 
+    if (dev->txq_needs_locking) {
+        qid = qid % dev->real_n_txq;
+        rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
+    }
+
     netdev_dpdk_send__(dev, qid, pkts, cnt, may_steal);
+
+    if (dev->txq_needs_locking) {
+        rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
+    }
     return 0;
 }
 
@@ -1375,19 +1414,6 @@ dpdk_ring_open(const char dev_name[], unsigned int *eth_port_id) OVS_REQUIRES(dp
 }
 
 static int
-netdev_dpdk_ring_send(struct netdev *netdev, int qid OVS_UNUSED,
-                      struct dp_packet **pkts, int cnt, bool may_steal)
-{
-    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
-
-    /* DPDK Rings have a single TX queue, Therefore needs locking. */
-    rte_spinlock_lock(&dev->dpdkr_tx_lock);
-    netdev_dpdk_send__(dev, 0, pkts, cnt, may_steal);
-    rte_spinlock_unlock(&dev->dpdkr_tx_lock);
-    return 0;
-}
-
-static int
 netdev_dpdk_ring_construct(struct netdev *netdev)
 {
     unsigned int port_no = 0;
@@ -1411,7 +1437,7 @@ unlock_dpdk:
     return err;
 }
 
-#define NETDEV_DPDK_CLASS(NAME, INIT, CONSTRUCT, MULTIQ, SEND)      \
+#define NETDEV_DPDK_CLASS(NAME, INIT, CONSTRUCT)              \
 {                                                             \
     NAME,                                                     \
     INIT,                       /* init */                    \
@@ -1429,9 +1455,9 @@ unlock_dpdk:
     NULL, /* push header */                                   \
     NULL, /* pop header */                                    \
     netdev_dpdk_get_numa_id,    /* get_numa_id */             \
-    MULTIQ,                     /* set_multiq */              \
+    netdev_dpdk_set_multiq,     /* set_multiq */              \
                                                               \
-    SEND,                       /* send */                    \
+    netdev_dpdk_send,           /* send */                    \
     NULL,                       /* send_wait */               \
                                                               \
     netdev_dpdk_set_etheraddr,                                \
@@ -1516,17 +1542,13 @@ const struct netdev_class dpdk_class =
     NETDEV_DPDK_CLASS(
         "dpdk",
         NULL,
-        netdev_dpdk_construct,
-        netdev_dpdk_set_multiq,
-        netdev_dpdk_eth_send);
+        netdev_dpdk_construct);
 
 const struct netdev_class dpdk_ring_class =
     NETDEV_DPDK_CLASS(
         "dpdkr",
         NULL,
-        netdev_dpdk_ring_construct,
-        NULL,
-        netdev_dpdk_ring_send);
+        netdev_dpdk_ring_construct);
 
 void
 netdev_dpdk_register(void)
diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h
index 915e54a..8f6fab6 100644
--- a/lib/netdev-provider.h
+++ b/lib/netdev-provider.h
@@ -279,6 +279,17 @@ struct netdev_class {
     /* Configures the number of tx queues and rx queues of 'netdev'.
      * Return 0 if successful, otherwise a positive errno value.
      *
+     * 'n_rxq' specifies the maximum number of receive queues to create.
+     * The netdev provider might choose to create fewer (e.g. if the hardware
+     * supports only a smaller number).  The actual number of queues created
+     * is stored in the 'netdev->n_rxq' field.
+     *
+     * 'n_txq' specifies the exact number of transmission queues to create.
+     * The caller will call netdev_send() concurrently from 'n_txq' different
+     * threads (with different qid).  The netdev provider is responsible for
+     * making sure that these concurrent calls do not create a race condition
+     * by using multiple hw queues or locking.
+     *
      * On error, the tx queue and rx queue configuration is indeterminant.
      * Caller should make decision on whether to restore the previous or
      * the default configuration.  Also, caller must make sure there is no
diff --git a/lib/netdev.c b/lib/netdev.c
index b76da13..25994b8 100644
--- a/lib/netdev.c
+++ b/lib/netdev.c
@@ -672,6 +672,16 @@ netdev_rxq_drain(struct netdev_rxq *rx)
 /* Configures the number of tx queues and rx queues of 'netdev'.
  * Return 0 if successful, otherwise a positive errno value.
  *
+ * 'n_rxq' specifies the maximum number of receive queues to create.
+ * The netdev provider might choose to create fewer (e.g. if the hardware
+ * supports only a smaller number).  The caller can check how many have
+ * actually been created by calling 'netdev_n_rxq()'.
+ *
+ * 'n_txq' specifies the exact number of transmission queues to create.
+ * If this function returns successfully, the caller can make 'n_txq'
+ * concurrent calls to netdev_send() (each one with a different 'qid' in the
+ * range [0..'n_txq'-1]).
+ *
  * On error, the tx queue and rx queue configuration is indeterminant.
  * Caller should make decision on whether to restore the previous or
  * the default configuration.  Also, caller must make sure there is no
diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
index dac3756..e346119 100644
--- a/vswitchd/vswitch.xml
+++ b/vswitchd/vswitch.xml
@@ -155,7 +155,7 @@
       <column name="other_config" key="n-dpdk-rxqs"
               type='{"type": "integer", "minInteger": 1}'>
         <p>
-          Specifies the number of rx queues to be created for each dpdk
+          Specifies the maximum number of rx queues to be created for each dpdk
           interface.  If not specified or specified to 0, one rx queue will
           be created for each dpdk interface by default.
         </p>
-- 
2.1.4



