[ovs-dev] [PATCH RFC v2 1/6] netdev-dpdk: Use instant sending instead of queueing of packets.

Ilya Maximets i.maximets at samsung.com
Tue May 24 13:34:04 UTC 2016


The current implementation of TX packet queueing is broken in several ways:

	* TX queue flushing on receive assumes that all core ids are
	  sequential and start from zero.  This can leave packets stuck
	  in a queue forever, and it also hurts latency (see the snippet
	  after this list).

	* The flushing logic has long depended on the uninitialized
	  'txq_needs_locking', because that flag is usually calculated
	  after 'netdev_dpdk_alloc_txq()' but is used inside that
	  function to initialize 'flush_tx'.
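
For reference, this is the flush-on-receive check being removed,
condensed from the netdev_dpdk_rxq_recv() hunk below.  A TX queue is
flushed here only when its id happens to match the polling core's
lcore id, so with non-sequential core ids some queues are never
flushed:

    /* Removed from netdev_dpdk_rxq_recv(): flush the TX queue whose
     * id matches this core's id.  If no PMD thread runs on a core
     * with that id, packets queued there are never sent. */
    if (rxq->queue_id == rte_lcore_id() &&
        OVS_LIKELY(!dev->txq_needs_locking)) {
        dpdk_queue_flush(dev, rxq->queue_id);
    }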

Under the current flushing logic, constant flushing is required
whenever TX queues are shared among different CPUs.  Later patches in
this series will introduce mechanisms for manipulating TX queues at
runtime; a PMD thread will then have no way to know whether its
current queue is shared, so constant flushing would be required
anyway.
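
The removed per-queue initialization, condensed from the
netdev_dpdk_alloc_txq() hunk below, shows both problems at once:
'txq_needs_locking' is read before it has been set, and shared queues
have to be flagged for flushing on every send:

    /* Removed from netdev_dpdk_alloc_txq(): 'txq_needs_locking' is
     * normally assigned only after this function runs, so this read
     * sees an uninitialized value. */
    if (!dev->txq_needs_locking) {
        /* One TX queue per core: flush eagerly only when the core
         * and the device are on the same NUMA node. */
        dev->tx_q[i].flush_tx = dev->socket_id == numa_id;
    } else {
        /* Queues are shared among CPUs.  Always flush. */
        dev->tx_q[i].flush_tx = true;
    }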

Conclusion: let's remove the queueing altogether, since it doesn't
work properly now and constant flushing would be required anyway.
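
The replacement send path, condensed from the diff below, reduces to
a retry loop around rte_eth_tx_burst() that frees and counts whatever
the NIC ultimately refuses:

    static inline void
    netdev_dpdk_eth_instant_send(struct netdev_dpdk *dev, int qid,
                                 struct rte_mbuf **pkts, int cnt)
    {
        uint32_t nb_tx = 0;

        /* Retry while the NIC keeps accepting packets. */
        while (nb_tx != cnt) {
            uint32_t ret = rte_eth_tx_burst(dev->port_id, qid,
                                            pkts + nb_tx, cnt - nb_tx);
            if (!ret) {
                break;
            }
            nb_tx += ret;
        }

        if (OVS_UNLIKELY(nb_tx != cnt)) {
            /* Free untransmitted packets one at a time: each may
             * come from a different mempool. */
            int i;

            for (i = nb_tx; i < cnt; i++) {
                rte_pktmbuf_free(pkts[i]);
            }
            rte_spinlock_lock(&dev->stats_lock);
            dev->stats.tx_dropped += cnt - nb_tx;
            rte_spinlock_unlock(&dev->stats_lock);
        }
    }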

Testing of basic PHY-OVS-PHY and PHY-OVS-VM-OVS-PHY scenarios shows
an insignificant performance drop (less than 0.5 percent) compared
with the gains we can achieve in the future using XPS or other
features.

Signed-off-by: Ilya Maximets <i.maximets at samsung.com>
---
 lib/netdev-dpdk.c | 102 ++++++++----------------------------------------------
 1 file changed, 14 insertions(+), 88 deletions(-)

diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 0d1b8c9..66e33df 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -167,7 +167,6 @@ static const struct rte_eth_conf port_conf = {
     },
 };
 
-enum { MAX_TX_QUEUE_LEN = 384 };
 enum { DPDK_RING_SIZE = 256 };
 BUILD_ASSERT_DECL(IS_POW2(DPDK_RING_SIZE));
 enum { DRAIN_TSC = 200000ULL };
@@ -284,8 +283,7 @@ static struct ovs_list dpdk_mp_list OVS_GUARDED_BY(dpdk_mutex)
     = OVS_LIST_INITIALIZER(&dpdk_mp_list);
 
 /* This mutex must be used by non pmd threads when allocating or freeing
- * mbufs through mempools. Since dpdk_queue_pkts() and dpdk_queue_flush() may
- * use mempools, a non pmd thread should hold this mutex while calling them */
+ * mbufs through mempools. */
 static struct ovs_mutex nonpmd_mempool_mutex = OVS_MUTEX_INITIALIZER;
 
 struct dpdk_mp {
@@ -299,17 +297,12 @@ struct dpdk_mp {
 /* There should be one 'struct dpdk_tx_queue' created for
  * each cpu core. */
 struct dpdk_tx_queue {
-    bool flush_tx;                 /* Set to true to flush queue everytime */
-                                   /* pkts are queued. */
-    int count;
     rte_spinlock_t tx_lock;        /* Protects the members and the NIC queue
                                     * from concurrent access.  It is used only
                                     * if the queue is shared among different
                                     * pmd threads (see 'txq_needs_locking'). */
     int map;                       /* Mapping of configured vhost-user queues
                                     * to enabled by guest. */
-    uint64_t tsc;
-    struct rte_mbuf *burst_pkts[MAX_TX_QUEUE_LEN];
 };
 
 /* dpdk has no way to remove dpdk ring ethernet devices
@@ -703,19 +696,6 @@ netdev_dpdk_alloc_txq(struct netdev_dpdk *dev, unsigned int n_txqs)
 
     dev->tx_q = dpdk_rte_mzalloc(n_txqs * sizeof *dev->tx_q);
     for (i = 0; i < n_txqs; i++) {
-        int numa_id = ovs_numa_get_numa_id(i);
-
-        if (!dev->txq_needs_locking) {
-            /* Each index is considered as a cpu core id, since there should
-             * be one tx queue for each cpu core.  If the corresponding core
-             * is not on the same numa node as 'dev', flags the
-             * 'flush_tx'. */
-            dev->tx_q[i].flush_tx = dev->socket_id == numa_id;
-        } else {
-            /* Queues are shared among CPUs. Always flush */
-            dev->tx_q[i].flush_tx = true;
-        }
-
         /* Initialize map for vhost devices. */
         dev->tx_q[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
         rte_spinlock_init(&dev->tx_q[i].tx_lock);
@@ -1056,16 +1036,15 @@ netdev_dpdk_rxq_dealloc(struct netdev_rxq *rxq)
 }
 
 static inline void
-dpdk_queue_flush__(struct netdev_dpdk *dev, int qid)
+netdev_dpdk_eth_instant_send(struct netdev_dpdk *dev, int qid,
+                             struct rte_mbuf **pkts, int cnt)
 {
-    struct dpdk_tx_queue *txq = &dev->tx_q[qid];
     uint32_t nb_tx = 0;
 
-    while (nb_tx != txq->count) {
+    while (nb_tx != cnt) {
         uint32_t ret;
 
-        ret = rte_eth_tx_burst(dev->port_id, qid, txq->burst_pkts + nb_tx,
-                               txq->count - nb_tx);
+        ret = rte_eth_tx_burst(dev->port_id, qid, pkts + nb_tx, cnt - nb_tx);
         if (!ret) {
             break;
         }
@@ -1073,32 +1052,18 @@ dpdk_queue_flush__(struct netdev_dpdk *dev, int qid)
         nb_tx += ret;
     }
 
-    if (OVS_UNLIKELY(nb_tx != txq->count)) {
+    if (OVS_UNLIKELY(nb_tx != cnt)) {
         /* free buffers, which we couldn't transmit, one at a time (each
          * packet could come from a different mempool) */
         int i;
 
-        for (i = nb_tx; i < txq->count; i++) {
-            rte_pktmbuf_free(txq->burst_pkts[i]);
+        for (i = nb_tx; i < cnt; i++) {
+            rte_pktmbuf_free(pkts[i]);
         }
         rte_spinlock_lock(&dev->stats_lock);
-        dev->stats.tx_dropped += txq->count-nb_tx;
+        dev->stats.tx_dropped += cnt - nb_tx;
         rte_spinlock_unlock(&dev->stats_lock);
     }
-
-    txq->count = 0;
-    txq->tsc = rte_get_timer_cycles();
-}
-
-static inline void
-dpdk_queue_flush(struct netdev_dpdk *dev, int qid)
-{
-    struct dpdk_tx_queue *txq = &dev->tx_q[qid];
-
-    if (txq->count == 0) {
-        return;
-    }
-    dpdk_queue_flush__(dev, qid);
 }
 
 static bool
@@ -1207,18 +1172,8 @@ netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet **packets,
                      int *c)
 {
     struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
-    struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
     int nb_rx;
 
-    /* There is only one tx queue for this core.  Do not flush other
-     * queues.
-     * Do not flush tx queue which is shared among CPUs
-     * since it is always flushed */
-    if (rxq->queue_id == rte_lcore_id() &&
-        OVS_LIKELY(!dev->txq_needs_locking)) {
-        dpdk_queue_flush(dev, rxq->queue_id);
-    }
-
     nb_rx = rte_eth_rx_burst(rx->port_id, rxq->queue_id,
                              (struct rte_mbuf **) packets,
                              NETDEV_MAX_BURST);
@@ -1345,35 +1300,6 @@ out:
     }
 }
 
-inline static void
-dpdk_queue_pkts(struct netdev_dpdk *dev, int qid,
-               struct rte_mbuf **pkts, int cnt)
-{
-    struct dpdk_tx_queue *txq = &dev->tx_q[qid];
-    uint64_t diff_tsc;
-
-    int i = 0;
-
-    while (i < cnt) {
-        int freeslots = MAX_TX_QUEUE_LEN - txq->count;
-        int tocopy = MIN(freeslots, cnt-i);
-
-        memcpy(&txq->burst_pkts[txq->count], &pkts[i],
-               tocopy * sizeof (struct rte_mbuf *));
-
-        txq->count += tocopy;
-        i += tocopy;
-
-        if (txq->count == MAX_TX_QUEUE_LEN || txq->flush_tx) {
-            dpdk_queue_flush__(dev, qid);
-        }
-        diff_tsc = rte_get_timer_cycles() - txq->tsc;
-        if (diff_tsc >= DRAIN_TSC) {
-            dpdk_queue_flush__(dev, qid);
-        }
-    }
-}
-
 /* Tx function. Transmit packets indefinitely */
 static void
 dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet **pkts,
@@ -1435,8 +1361,7 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet **pkts,
         newcnt = netdev_dpdk_qos_run__(dev, mbufs, newcnt);
 
         dropped += qos_pkts - newcnt;
-        dpdk_queue_pkts(dev, qid, mbufs, newcnt);
-        dpdk_queue_flush(dev, qid);
+        netdev_dpdk_eth_instant_send(dev, qid, mbufs, newcnt);
     }
 
     if (OVS_UNLIKELY(dropped)) {
@@ -1508,7 +1433,7 @@ netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
                     temp_cnt = netdev_dpdk_qos_run__(dev, (struct rte_mbuf**)pkts,
                                                 temp_cnt);
                     dropped += qos_pkts - temp_cnt;
-                    dpdk_queue_pkts(dev, qid,
+                    netdev_dpdk_eth_instant_send(dev, qid,
                                     (struct rte_mbuf **)&pkts[next_tx_idx],
                                     temp_cnt);
 
@@ -1528,8 +1453,9 @@ netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
 
             cnt = netdev_dpdk_qos_run__(dev, (struct rte_mbuf**)pkts, cnt);
             dropped += qos_pkts - cnt;
-            dpdk_queue_pkts(dev, qid, (struct rte_mbuf **)&pkts[next_tx_idx],
-                            cnt);
+            netdev_dpdk_eth_instant_send(dev, qid,
+                                         (struct rte_mbuf **)&pkts[next_tx_idx],
+                                         cnt);
         }
 
         if (OVS_UNLIKELY(dropped)) {
-- 
2.5.0
