[ovs-dev] [PATCH] netdev-dpdk: Use intermediate queue during packet transmission.

Bhanuprakash Bodireddy bhanuprakash.bodireddy at intel.com
Fri Dec 16 14:48:43 UTC 2016


In exact match cache processing, on an EMC hit, packets are queued into
batches matching the flow. Thereafter, packets are processed in batches
for faster packet processing. This is particularly inefficient if there
are fewer packets in a batch, as rte_eth_tx_burst() incurs an expensive
MMIO write.

This commit adds back the intermediate queue implementation. Packets are
queued and burst when the packet count >= NETDEV_MAX_BURST. Also, the
drain logic is refactored to handle fewer packets in the tx queues.
Testing shows significant performance gains with queueing.

Fixes: b59cc14e032d("netdev-dpdk: Use instant sending instead of
queueing of packets.")
Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy at intel.com>
Signed-off-by: Antonio Fischetti <antonio.fischetti at intel.com>
Co-authored-by: Antonio Fischetti <antonio.fischetti at intel.com>
Signed-off-by: Markus Magnusson <markus.magnusson at ericsson.com>
Co-authored-by: Markus Magnusson <markus.magnusson at ericsson.com>
---
This patch is based on commit '8b6987d799fb0bc530ebb7f767767b1c661548c9'
and [PATCH v2 00/19] DPDK/pmd reconfiguration refactor and bugfixes

Test results:
 * In the worst-case scenario, with fewer packets in a batch, a
   significant bottleneck is observed at the netdev_dpdk_eth_send()
   function due to expensive MMIO writes.

 * It is also observed that the CPI (cycles per instruction) rate for the
   function stood between 3.15 and 4.1, which is significantly higher than
   the acceptable limit of 1.0 for HPC applications and the theoretical limit
   of 0.25 (as the backend pipeline can retire 4 micro-operations per cycle).

 * With this patch, CPI for netdev_dpdk_eth_send() is at 0.55 and the overall
   throughput improved significantly.

 lib/dpif-netdev.c     | 42 +++++++++++++++++++++++++++++-
 lib/netdev-bsd.c      |  1 +
 lib/netdev-dpdk.c     | 71 +++++++++++++++++++++++++++++++++++++++++++++------
 lib/netdev-dummy.c    |  1 +
 lib/netdev-linux.c    |  1 +
 lib/netdev-provider.h |  3 +++
 lib/netdev-vport.c    |  3 ++-
 lib/netdev.c          |  9 +++++++
 lib/netdev.h          |  1 +
 9 files changed, 122 insertions(+), 10 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 3509493..65dff83 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -622,6 +622,9 @@ static int dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
 static inline bool emc_entry_alive(struct emc_entry *ce);
 static void emc_clear_entry(struct emc_entry *ce);
 
+static struct tx_port *pmd_send_port_cache_lookup
+    (const struct dp_netdev_pmd_thread *pmd, odp_port_t port_no);
+
 static void
 emc_cache_init(struct emc_cache *flow_cache)
 {
@@ -2877,6 +2880,31 @@ cycles_count_end(struct dp_netdev_pmd_thread *pmd,
 }
 
 static void
+dp_netdev_drain_txq_port(struct dp_netdev_pmd_thread *pmd,
+                         struct dp_netdev_port *port,
+                         uint64_t now)
+{
+    int tx_qid;
+
+    if (!strcmp(port->type, "dpdk")) {
+        struct tx_port *tx = pmd_send_port_cache_lookup(pmd,
+                 u32_to_odp(port->port_no));
+
+        if (OVS_LIKELY(tx)) {
+            bool dynamic_txqs = tx->port->dynamic_txqs;
+
+            if (dynamic_txqs) {
+                tx_qid = dpif_netdev_xps_get_tx_qid(pmd, tx, now);
+            } else {
+                tx_qid = pmd->static_tx_qid;
+            }
+
+            netdev_txq_drain(port->netdev, tx_qid);
+        }
+    }
+}
+
+static void
 dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
                            struct netdev_rxq *rx,
                            odp_port_t port_no)
@@ -3505,6 +3533,8 @@ pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd,
     return i;
 }
 
+enum { DRAIN_TSC = 20000ULL };
+
 static void *
 pmd_thread_main(void *f_)
 {
@@ -3512,6 +3542,7 @@ pmd_thread_main(void *f_)
     unsigned int lc = 0;
     struct polled_queue *poll_list;
     bool exiting;
+    uint64_t prev = 0, now = 0;
     int poll_cnt;
     int i;
 
@@ -3547,10 +3578,19 @@ reload:
         }
 
         if (lc++ > 1024) {
+            prev = now;
+            now = pmd->last_cycles;
+            struct tx_port *tx_port;
             bool reload;
 
             lc = 0;
 
+            if ((now - prev) > DRAIN_TSC) {
+                HMAP_FOR_EACH (tx_port, node, &pmd->send_port_cache) {
+                    dp_netdev_drain_txq_port(pmd, tx_port->port, now);
+                }
+            }
+
             coverage_try_clear();
             dp_netdev_pmd_try_optimize(pmd);
             if (!ovsrcu_try_quiesce()) {
@@ -4951,7 +4991,7 @@ dpif_dummy_register(enum dummy_level level)
                              "dp port new-number",
                              3, 3, dpif_dummy_change_port_number, NULL);
 }
-
+
 /* Datapath Classifier. */
 
 /* A set of rules that all have the same fields wildcarded. */
diff --git a/lib/netdev-bsd.c b/lib/netdev-bsd.c
index 75a330b..e13d418 100644
--- a/lib/netdev-bsd.c
+++ b/lib/netdev-bsd.c
@@ -1543,6 +1543,7 @@ netdev_bsd_update_flags(struct netdev *netdev_, enum netdev_flags off,
     netdev_bsd_rxq_recv,                             \
     netdev_bsd_rxq_wait,                             \
     netdev_bsd_rxq_drain,                            \
+    NULL,                                            \
 }
 
 const struct netdev_class netdev_bsd_class =
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index eb4ed02..945f6de 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -166,7 +166,6 @@ static const struct rte_eth_conf port_conf = {
 
 enum { DPDK_RING_SIZE = 256 };
 BUILD_ASSERT_DECL(IS_POW2(DPDK_RING_SIZE));
-enum { DRAIN_TSC = 200000ULL };
 
 enum dpdk_dev_type {
     DPDK_DEV_ETH = 0,
@@ -289,12 +288,17 @@ struct dpdk_mp {
 /* There should be one 'struct dpdk_tx_queue' created for
  * each cpu core. */
 struct dpdk_tx_queue {
+    int count;                     /* Number of buffered packets waiting to
+                                      be sent. */
     rte_spinlock_t tx_lock;        /* Protects the members and the NIC queue
                                     * from concurrent access.  It is used only
                                     * if the queue is shared among different
                                     * pmd threads (see 'concurrent_txq'). */
     int map;                       /* Mapping of configured vhost-user queues
                                     * to enabled by guest. */
+    struct rte_mbuf *burst_pkts[NETDEV_MAX_BURST];
+                                   /* Copy the packets in to intermediate queue
+                                    * to amortize the cost of MMIO write. */
 };
 
 /* dpdk has no way to remove dpdk ring ethernet devices
@@ -1242,6 +1246,7 @@ static inline int
 netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
                          struct rte_mbuf **pkts, int cnt)
 {
+    struct dpdk_tx_queue *txq = &dev->tx_q[qid];
     uint32_t nb_tx = 0;
 
     while (nb_tx != cnt) {
@@ -1265,7 +1270,8 @@ netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
         }
     }
 
-    return cnt - nb_tx;
+    txq->count = cnt - nb_tx;
+    return txq->count;
 }
 
 static inline bool
@@ -1649,6 +1655,34 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
     }
 }
 
+/* Enqueue packets in an intermediate queue and call the burst
+ * function when the queue is full. This way we can amortize the
+ * cost of MMIO writes. */
+static inline void
+netdev_dpdk_eth_tx_queue(struct netdev_dpdk *dev, int qid,
+                            struct rte_mbuf **pkts, int cnt)
+{
+    struct dpdk_tx_queue *txq = &dev->tx_q[qid];
+
+    int i = 0;
+
+    while (i < cnt) {
+        int freeslots = NETDEV_MAX_BURST - txq->count;
+        int tocopy = MIN(freeslots, cnt-i);
+
+        memcpy(&txq->burst_pkts[txq->count], &pkts[i],
+               tocopy * sizeof (struct rte_mbuf *));
+
+        txq->count += tocopy;
+        i += tocopy;
+
+        /* Queue full, burst the packets */
+        if (txq->count >= NETDEV_MAX_BURST) {
+           netdev_dpdk_eth_tx_burst(dev, qid, txq->burst_pkts, txq->count);
+        }
+    }
+}
+
 static int
 netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
                        struct dp_packet_batch *batch,
@@ -1697,7 +1731,7 @@ netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
         cnt = netdev_dpdk_qos_run(dev, pkts, cnt);
         dropped = batch->count - cnt;
 
-        dropped += netdev_dpdk_eth_tx_burst(dev, qid, pkts, cnt);
+        netdev_dpdk_eth_tx_queue(dev, qid, pkts, cnt);
 
         if (OVS_UNLIKELY(dropped)) {
             rte_spinlock_lock(&dev->stats_lock);
@@ -1711,6 +1745,22 @@ netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
     }
 }
 
+/* Drain tx queues, this is called periodically to empty the
+ * intermediate queue in case of fewer packets(< NETDEV_MAX_BURST)
+ * in the queue */
+static int
+netdev_dpdk_txq_drain(struct netdev *netdev, int qid)
+{
+    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
+    struct dpdk_tx_queue *txq = &dev->tx_q[qid];
+
+    if (OVS_LIKELY(txq->count)) {
+        netdev_dpdk_eth_tx_burst(dev, qid, txq->burst_pkts, txq->count);
+    }
+
+    return 0;
+}
+
 static int
 netdev_dpdk_eth_send(struct netdev *netdev, int qid,
                      struct dp_packet_batch *batch, bool may_steal,
@@ -3051,7 +3101,7 @@ unlock:
                           SET_CONFIG, SET_TX_MULTIQ, SEND,    \
                           GET_CARRIER, GET_STATS,             \
                           GET_FEATURES, GET_STATUS,           \
-                          RECONFIGURE, RXQ_RECV)              \
+                          RECONFIGURE, RXQ_RECV, TXQ_DRAIN)   \
 {                                                             \
     NAME,                                                     \
     true,                       /* is_pmd */                  \
@@ -3118,6 +3168,7 @@ unlock:
     RXQ_RECV,                                                 \
     NULL,                       /* rx_wait */                 \
     NULL,                       /* rxq_drain */               \
+    TXQ_DRAIN,                  /* txq_drain */               \
 }
 
 static const struct netdev_class dpdk_class =
@@ -3134,7 +3185,8 @@ static const struct netdev_class dpdk_class =
         netdev_dpdk_get_features,
         netdev_dpdk_get_status,
         netdev_dpdk_reconfigure,
-        netdev_dpdk_rxq_recv);
+        netdev_dpdk_rxq_recv,
+        netdev_dpdk_txq_drain);
 
 static const struct netdev_class dpdk_ring_class =
     NETDEV_DPDK_CLASS(
@@ -3150,7 +3202,8 @@ static const struct netdev_class dpdk_ring_class =
         netdev_dpdk_get_features,
         netdev_dpdk_get_status,
         netdev_dpdk_reconfigure,
-        netdev_dpdk_rxq_recv);
+        netdev_dpdk_rxq_recv,
+        NULL);
 
 static const struct netdev_class dpdk_vhost_class =
     NETDEV_DPDK_CLASS(
@@ -3166,7 +3219,8 @@ static const struct netdev_class dpdk_vhost_class =
         NULL,
         NULL,
         netdev_dpdk_vhost_reconfigure,
-        netdev_dpdk_vhost_rxq_recv);
+        netdev_dpdk_vhost_rxq_recv,
+        NULL);
 static const struct netdev_class dpdk_vhost_client_class =
     NETDEV_DPDK_CLASS(
         "dpdkvhostuserclient",
@@ -3181,7 +3235,8 @@ static const struct netdev_class dpdk_vhost_client_class =
         NULL,
         NULL,
         netdev_dpdk_vhost_client_reconfigure,
-        netdev_dpdk_vhost_rxq_recv);
+        netdev_dpdk_vhost_rxq_recv,
+        NULL);
 
 void
 netdev_dpdk_register(void)
diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c
index dec1a8e..113d334 100644
--- a/lib/netdev-dummy.c
+++ b/lib/netdev-dummy.c
@@ -1384,6 +1384,7 @@ netdev_dummy_update_flags(struct netdev *netdev_,
     netdev_dummy_rxq_recv,                                      \
     netdev_dummy_rxq_wait,                                      \
     netdev_dummy_rxq_drain,                                     \
+    NULL,                                                       \
 }
 
 static const struct netdev_class dummy_class =
diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
index a5a9ec1..2499b3e 100644
--- a/lib/netdev-linux.c
+++ b/lib/netdev-linux.c
@@ -2831,6 +2831,7 @@ netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
     netdev_linux_rxq_recv,                                      \
     netdev_linux_rxq_wait,                                      \
     netdev_linux_rxq_drain,                                     \
+    NULL,                                                       \
 }
 
 const struct netdev_class netdev_linux_class =
diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h
index c8507a5..fa2d8ba 100644
--- a/lib/netdev-provider.h
+++ b/lib/netdev-provider.h
@@ -764,6 +764,9 @@ struct netdev_class {
 
     /* Discards all packets waiting to be received from 'rx'. */
     int (*rxq_drain)(struct netdev_rxq *rx);
+
+    /* Drain all packets on queue 'qid'. */
+    int (*txq_drain)(struct netdev *netdev, int qid);
 };
 
 int netdev_register_provider(const struct netdev_class *);
diff --git a/lib/netdev-vport.c b/lib/netdev-vport.c
index 02a246a..359932c 100644
--- a/lib/netdev-vport.c
+++ b/lib/netdev-vport.c
@@ -810,7 +810,8 @@ get_stats(const struct netdev *netdev, struct netdev_stats *stats)
     NULL,                   /* rx_dealloc */                \
     NULL,                   /* rx_recv */                   \
     NULL,                   /* rx_wait */                   \
-    NULL,                   /* rx_drain */
+    NULL,                   /* rx_drain */                  \
+    NULL,                   /* tx_drain */
 
 
 #define TUNNEL_CLASS(NAME, DPIF_PORT, BUILD_HEADER, PUSH_HEADER, POP_HEADER)   \
diff --git a/lib/netdev.c b/lib/netdev.c
index 0258b32..6b4fa9d 100644
--- a/lib/netdev.c
+++ b/lib/netdev.c
@@ -664,6 +664,15 @@ netdev_rxq_drain(struct netdev_rxq *rx)
             : 0);
 }
 
+/* Flush packets on the queue 'qid'. */
+int
+netdev_txq_drain(struct netdev *netdev, int qid)
+{
+    return (netdev->netdev_class->txq_drain
+            ? netdev->netdev_class->txq_drain(netdev, qid)
+            : EOPNOTSUPP);
+}
+
 /* Configures the number of tx queues of 'netdev'. Returns 0 if successful,
  * otherwise a positive errno value.
  *
diff --git a/lib/netdev.h b/lib/netdev.h
index 03059ca..d8b0ab3 100644
--- a/lib/netdev.h
+++ b/lib/netdev.h
@@ -153,6 +153,7 @@ int netdev_rxq_drain(struct netdev_rxq *);
 int netdev_send(struct netdev *, int qid, struct dp_packet_batch *,
                 bool may_steal, bool concurrent_txq);
 void netdev_send_wait(struct netdev *, int qid);
+int netdev_txq_drain(struct netdev *, int qid);
 
 /* native tunnel APIs */
 /* Structure to pass parameters required to build a tunnel header. */
-- 
2.4.11



More information about the dev mailing list