[ovs-dev] [PATCH v3] dpif-netdev: Report overhead busy cycles per pmd.

David Marchand david.marchand at redhat.com
Fri Jul 16 16:21:16 UTC 2021


Users complained that the per-rxq pmd usage was confusing: summing those
values per pmd would never reach 100%, even when the traffic load was
increased beyond the pmd capacity.

This is because the dpif-netdev/pmd-rxq-show command only reports "pure"
rxq cycles, while some cycles are also spent in the pmd main loop and
count towards the total pmd load.

dpif-netdev/pmd-stats-show does report the per pmd load.
However, this load is measured since the last dpif-netdev/pmd-stats-clear
call, while the per rxq pmd usage reflects the pmd load over a 10s sliding
window, which makes the two non-trivial to correlate.

Gather the per pmd busy cycles with the same periodicity and report the
difference from the per rxq cycles as "overhead" in
dpif-netdev/pmd-rxq-show, so that all the info is available from a single
command.
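
The reported value boils down to the following (a simplified sketch of the
logic added in pmd_info_show_rxq(); busy and per rxq cycles are summed over
the same PMD_INTERVAL_MAX intervals, and "overhead_pct" is only an
illustrative name):

    /* Sketch: busy cycles not attributed to any polled rxq are reported
     * as overhead, relative to the estimated total pmd cycles. */
    uint64_t overhead_cycles = 0;

    if (total_rxq_proc_cycles < busy_cycles) {
        overhead_cycles = busy_cycles - total_rxq_proc_cycles;
    }
    overhead_pct = overhead_cycles * 100 / total_cycles;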

Example:
$ ovs-appctl dpif-netdev/pmd-rxq-show
pmd thread numa_id 1 core_id 3:
  isolated : true
  port: dpdk0             queue-id:  0 (enabled)   pmd usage: 90 %
  overhead:  4 %
pmd thread numa_id 1 core_id 5:
  isolated : false
  port: vhost0            queue-id:  0 (enabled)   pmd usage:  0 %
  port: vhost1            queue-id:  0 (enabled)   pmd usage: 93 %
  port: vhost2            queue-id:  0 (enabled)   pmd usage:  0 %
  port: vhost6            queue-id:  0 (enabled)   pmd usage:  0 %
  overhead:  6 %
pmd thread numa_id 1 core_id 31:
  isolated : true
  port: dpdk1             queue-id:  0 (enabled)   pmd usage: 86 %
  overhead:  4 %
pmd thread numa_id 1 core_id 33:
  isolated : false
  port: vhost3            queue-id:  0 (enabled)   pmd usage:  0 %
  port: vhost4            queue-id:  0 (enabled)   pmd usage:  0 %
  port: vhost5            queue-id:  0 (enabled)   pmd usage: 92 %
  port: vhost7            queue-id:  0 (enabled)   pmd usage:  0 %
  overhead:  7 %
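
In this example, on core_id 5, the per rxq usages plus the overhead now
roughly add up to the pmd busy load: 0 + 93 + 0 + 0 + 6 = 99 %.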

Signed-off-by: David Marchand <david.marchand at redhat.com>
---
Changes since v2:
- rebased on master, dynamically allocating the added stats array to avoid
  exposing the internal dpif-netdev array size,
- fixed UT on FreeBSD,
- rebased on top of Kevin's series to ease merging wrt the UT update,
- GHA result: https://github.com/david-marchand/ovs/runs/3087888172

Changes since v1:
- fixed unit test and documentation update,
- moved documentation update under pmd-rxq-show command description,
- updated commitlog,
- renamed variables for better readability,
- avoided reporting an N/A overhead for idle PMDs,
- reset overhead stats on PMD reconfigure,

---
 Documentation/topics/dpdk/pmd.rst |   5 ++
 lib/dpif-netdev-private-thread.h  |   7 +-
 lib/dpif-netdev.c                 | 105 +++++++++++++++++++++---------
 tests/pmd.at                      |   9 +++
 4 files changed, 93 insertions(+), 33 deletions(-)

diff --git a/Documentation/topics/dpdk/pmd.rst b/Documentation/topics/dpdk/pmd.rst
index 30040d7033..95fa7af128 100644
--- a/Documentation/topics/dpdk/pmd.rst
+++ b/Documentation/topics/dpdk/pmd.rst
@@ -195,6 +195,11 @@ queue::
    due to traffic pattern or reconfig changes, will take one minute to be fully
    reflected in the stats.
 
+.. versionchanged:: 2.16.0
+
+   An ``overhead`` statistic is shown per PMD: it represents the number of
+   cycles inherently consumed by the OVS PMD processing loop.
+
 Rx queue to PMD assignment takes place whenever there are configuration changes
 or can be triggered by using::
 
diff --git a/lib/dpif-netdev-private-thread.h b/lib/dpif-netdev-private-thread.h
index a4c092b692..a782d9678a 100644
--- a/lib/dpif-netdev-private-thread.h
+++ b/lib/dpif-netdev-private-thread.h
@@ -99,13 +99,18 @@ struct dp_netdev_pmd_thread {
     long long int next_optimization;
     /* End of the next time interval for which processing cycles
        are stored for each polled rxq. */
-    long long int rxq_next_cycle_store;
+    long long int next_cycle_store;
 
     /* Last interval timestamp. */
     uint64_t intrvl_tsc_prev;
     /* Last interval cycles. */
     atomic_ullong intrvl_cycles;
 
+    /* Write index for 'busy_cycles_intrvl'. */
+    unsigned int intrvl_idx;
+    /* Busy cycles in last PMD_INTERVAL_MAX intervals. */
+    atomic_ullong *busy_cycles_intrvl;
+
     /* Current context of the PMD thread. */
     struct dp_netdev_pmd_thread_ctx ctx;
 
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 05d414ad96..bb27c79857 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -155,11 +155,11 @@ static struct odp_support dp_netdev_support = {
 
 /* Time in microseconds of the interval in which rxq processing cycles used
  * in rxq to pmd assignments is measured and stored. */
-#define PMD_RXQ_INTERVAL_LEN 10000000LL
+#define PMD_INTERVAL_LEN 10000000LL
 
 /* Number of intervals for which cycles are stored
  * and used during rxq to pmd assignment. */
-#define PMD_RXQ_INTERVAL_MAX 6
+#define PMD_INTERVAL_MAX 6
 
 /* Time in microseconds to try RCU quiescing. */
 #define PMD_RCU_QUIESCE_INTERVAL 10000LL
@@ -379,9 +379,9 @@ struct dp_netdev_rxq {
 
     /* Counters of cycles spent successfully polling and processing pkts. */
     atomic_ullong cycles[RXQ_N_CYCLES];
-    /* We store PMD_RXQ_INTERVAL_MAX intervals of data for an rxq and then
+    /* We store PMD_INTERVAL_MAX intervals of data for an rxq and then
        sum them to yield the cycles used for an rxq. */
-    atomic_ullong cycles_intrvl[PMD_RXQ_INTERVAL_MAX];
+    atomic_ullong cycles_intrvl[PMD_INTERVAL_MAX];
 };
 
 /* A port in a netdev-based datapath. */
@@ -791,6 +791,8 @@ pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
         struct rxq_poll *list;
         size_t n_rxq;
         uint64_t total_cycles = 0;
+        uint64_t busy_cycles = 0;
+        uint64_t total_rxq_proc_cycles = 0;
 
         ds_put_format(reply,
                       "pmd thread numa_id %d core_id %u:\n  isolated : %s\n",
@@ -803,16 +805,27 @@ pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
         /* Get the total pmd cycles for an interval. */
         atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
         /* Estimate the cycles to cover all intervals. */
-        total_cycles *= PMD_RXQ_INTERVAL_MAX;
+        total_cycles *= PMD_INTERVAL_MAX;
+
+        for (int j = 0; j < PMD_INTERVAL_MAX; j++) {
+            uint64_t cycles;
+
+            atomic_read_relaxed(&pmd->busy_cycles_intrvl[j], &cycles);
+            busy_cycles += cycles;
+        }
+        if (busy_cycles > total_cycles) {
+            busy_cycles = total_cycles;
+        }
 
         for (int i = 0; i < n_rxq; i++) {
             struct dp_netdev_rxq *rxq = list[i].rxq;
             const char *name = netdev_rxq_get_name(rxq->rx);
-            uint64_t proc_cycles = 0;
+            uint64_t rxq_proc_cycles = 0;
 
-            for (int j = 0; j < PMD_RXQ_INTERVAL_MAX; j++) {
-                proc_cycles += dp_netdev_rxq_get_intrvl_cycles(rxq, j);
+            for (int j = 0; j < PMD_INTERVAL_MAX; j++) {
+                rxq_proc_cycles += dp_netdev_rxq_get_intrvl_cycles(rxq, j);
             }
+            total_rxq_proc_cycles += rxq_proc_cycles;
             ds_put_format(reply, "  port: %-16s  queue-id: %2d", name,
                           netdev_rxq_get_queue_id(list[i].rxq->rx));
             ds_put_format(reply, " %s", netdev_rxq_enabled(list[i].rxq->rx)
@@ -820,13 +833,30 @@ pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
             ds_put_format(reply, "  pmd usage: ");
             if (total_cycles) {
                 ds_put_format(reply, "%2"PRIu64"",
-                              proc_cycles * 100 / total_cycles);
+                              rxq_proc_cycles * 100 / total_cycles);
                 ds_put_cstr(reply, " %");
             } else {
                 ds_put_format(reply, "%s", "NOT AVAIL");
             }
             ds_put_cstr(reply, "\n");
         }
+
+        if (n_rxq > 0) {
+            ds_put_cstr(reply, "  overhead: ");
+            if (total_cycles) {
+                uint64_t overhead_cycles = 0;
+
+                if (total_rxq_proc_cycles < busy_cycles) {
+                    overhead_cycles = busy_cycles - total_rxq_proc_cycles;
+                }
+                ds_put_format(reply, "%2"PRIu64" %%",
+                              overhead_cycles * 100 / total_cycles);
+            } else {
+                ds_put_cstr(reply, "NOT AVAIL");
+            }
+            ds_put_cstr(reply, "\n");
+        }
+
         ovs_mutex_unlock(&pmd->port_mutex);
         free(list);
     }
@@ -4521,7 +4551,7 @@ static void
 dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
                                 unsigned long long cycles)
 {
-    unsigned int idx = rx->intrvl_idx++ % PMD_RXQ_INTERVAL_MAX;
+    unsigned int idx = rx->intrvl_idx++ % PMD_INTERVAL_MAX;
     atomic_store_relaxed(&rx->cycles_intrvl[idx], cycles);
 }
 
@@ -4978,7 +5008,7 @@ sched_numa_list_assignments(struct sched_numa_list *numa_list,
             struct sched_pmd *sched_pmd;
             uint64_t proc_cycles = 0;
 
-            for (int i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
+            for (int i = 0; i < PMD_INTERVAL_MAX; i++) {
                 proc_cycles  += dp_netdev_rxq_get_intrvl_cycles(rxq, i);
             }
 
@@ -5238,7 +5268,7 @@ sched_numa_list_schedule(struct sched_numa_list *numa_list,
                 uint64_t cycle_hist = 0;
 
                 /* Sum the queue intervals and store the cycle history. */
-                for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
+                for (unsigned i = 0; i < PMD_INTERVAL_MAX; i++) {
                     cycle_hist += dp_netdev_rxq_get_intrvl_cycles(rxq, i);
                 }
                 dp_netdev_rxq_set_cycles(rxq, RXQ_CYCLES_PROC_HIST,
@@ -5418,7 +5448,7 @@ sched_numa_list_variance(struct sched_numa_list *numa_list)
 
             if (total_cycles) {
                 /* Estimate the cycles to cover all intervals. */
-                total_cycles *= PMD_RXQ_INTERVAL_MAX;
+                total_cycles *= PMD_INTERVAL_MAX;
                 percent_busy[n_proc++] = (sched_pmd->pmd_proc_cycles * 100)
                                              / total_cycles;
             } else {
@@ -5935,7 +5965,7 @@ dpif_netdev_run(struct dpif *dpif)
             pmd_alb->rebalance_poll_timer = now;
             CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
                 if (atomic_count_get(&pmd->pmd_overloaded) >=
-                                    PMD_RXQ_INTERVAL_MAX) {
+                                    PMD_INTERVAL_MAX) {
                     pmd_rebalance = true;
                     break;
                 }
@@ -6145,6 +6175,10 @@ reload:
 
     pmd->intrvl_tsc_prev = 0;
     atomic_store_relaxed(&pmd->intrvl_cycles, 0);
+    for (i = 0; i < PMD_INTERVAL_MAX; i++) {
+        atomic_store_relaxed(&pmd->busy_cycles_intrvl[i], 0);
+    }
+    pmd->intrvl_idx = 0;
     cycles_counter_update(s);
 
     pmd->next_rcu_quiesce = pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
@@ -6677,7 +6711,9 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
     pmd_thread_ctx_time_update(pmd);
     pmd->next_optimization = pmd->ctx.now + DPCLS_OPTIMIZATION_INTERVAL;
     pmd->next_rcu_quiesce = pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
-    pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
+    pmd->next_cycle_store = pmd->ctx.now + PMD_INTERVAL_LEN;
+    pmd->busy_cycles_intrvl = xzalloc(PMD_INTERVAL_MAX *
+                                      sizeof *pmd->busy_cycles_intrvl);
     hmap_init(&pmd->poll_list);
     hmap_init(&pmd->tx_ports);
     hmap_init(&pmd->tnl_port_cache);
@@ -6716,6 +6752,7 @@ dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd)
     hmap_destroy(&pmd->tx_ports);
     cmap_destroy(&pmd->tx_bonds);
     hmap_destroy(&pmd->poll_list);
+    free(pmd->busy_cycles_intrvl);
     /* All flows (including their dpcls_rules) have been deleted already */
     CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
         dpcls_destroy(cls);
@@ -8992,31 +9029,33 @@ dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
     uint64_t tot_idle = 0, tot_proc = 0;
     unsigned int pmd_load = 0;
 
-    if (pmd->ctx.now > pmd->rxq_next_cycle_store) {
+    if (pmd->ctx.now > pmd->next_cycle_store) {
         uint64_t curr_tsc;
         uint8_t rebalance_load_trigger;
         struct pmd_auto_lb *pmd_alb = &pmd->dp->pmd_alb;
-        if (pmd_alb->is_enabled && !pmd->isolated
-            && (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] >=
-                                       pmd->prev_stats[PMD_CYCLES_ITER_IDLE])
-            && (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] >=
-                                        pmd->prev_stats[PMD_CYCLES_ITER_BUSY]))
-            {
+        unsigned int idx;
+
+        if (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] >=
+                pmd->prev_stats[PMD_CYCLES_ITER_IDLE] &&
+            pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] >=
+                pmd->prev_stats[PMD_CYCLES_ITER_BUSY]) {
             tot_idle = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] -
                        pmd->prev_stats[PMD_CYCLES_ITER_IDLE];
             tot_proc = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] -
                        pmd->prev_stats[PMD_CYCLES_ITER_BUSY];
 
-            if (tot_proc) {
-                pmd_load = ((tot_proc * 100) / (tot_idle + tot_proc));
-            }
+            if (pmd_alb->is_enabled && !pmd->isolated) {
+                if (tot_proc) {
+                    pmd_load = ((tot_proc * 100) / (tot_idle + tot_proc));
+                }
 
-            atomic_read_relaxed(&pmd_alb->rebalance_load_thresh,
-                                &rebalance_load_trigger);
-            if (pmd_load >= rebalance_load_trigger) {
-                atomic_count_inc(&pmd->pmd_overloaded);
-            } else {
-                atomic_count_set(&pmd->pmd_overloaded, 0);
+                atomic_read_relaxed(&pmd_alb->rebalance_load_thresh,
+                                    &rebalance_load_trigger);
+                if (pmd_load >= rebalance_load_trigger) {
+                    atomic_count_inc(&pmd->pmd_overloaded);
+                } else {
+                    atomic_count_set(&pmd->pmd_overloaded, 0);
+                }
             }
         }
 
@@ -9039,9 +9078,11 @@ dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
             atomic_store_relaxed(&pmd->intrvl_cycles,
                                  curr_tsc - pmd->intrvl_tsc_prev);
         }
+        idx = pmd->intrvl_idx++ % PMD_INTERVAL_MAX;
+        atomic_store_relaxed(&pmd->busy_cycles_intrvl[idx], tot_proc);
         pmd->intrvl_tsc_prev = curr_tsc;
         /* Start new measuring interval */
-        pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
+        pmd->next_cycle_store = pmd->ctx.now + PMD_INTERVAL_LEN;
     }
 
     if (pmd->ctx.now > pmd->next_optimization) {
diff --git a/tests/pmd.at b/tests/pmd.at
index 08939bfef0..225d4ee3a4 100644
--- a/tests/pmd.at
+++ b/tests/pmd.at
@@ -73,6 +73,7 @@ AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | sed SED_NUMA_CORE_PATTERN], [0],
 pmd thread numa_id <cleared> core_id <cleared>:
   isolated : false
   port: p0                queue-id:  0 (enabled)   pmd usage: NOT AVAIL
+  overhead: NOT AVAIL
 ])
 
 AT_CHECK([ovs-appctl dpif/show | sed 's/\(tx_queues=\)[[0-9]]*/\1<cleared>/g'], [0], [dnl
@@ -111,6 +112,7 @@ pmd thread numa_id <cleared> core_id <cleared>:
   port: p0                queue-id:  5 (enabled)   pmd usage: NOT AVAIL
   port: p0                queue-id:  6 (enabled)   pmd usage: NOT AVAIL
   port: p0                queue-id:  7 (enabled)   pmd usage: NOT AVAIL
+  overhead: NOT AVAIL
 ])
 
 OVS_VSWITCHD_STOP
@@ -142,6 +144,7 @@ pmd thread numa_id <cleared> core_id <cleared>:
   port: p0                queue-id:  5 (enabled)   pmd usage: NOT AVAIL
   port: p0                queue-id:  6 (enabled)   pmd usage: NOT AVAIL
   port: p0                queue-id:  7 (enabled)   pmd usage: NOT AVAIL
+  overhead: NOT AVAIL
 ])
 
 TMP=$(($(cat ovs-vswitchd.log | wc -l | tr -d [[:blank:]])+1))
@@ -190,6 +193,7 @@ pmd thread numa_id <cleared> core_id <cleared>:
   port: p0                queue-id:  5 (enabled)   pmd usage: NOT AVAIL
   port: p0                queue-id:  6 (enabled)   pmd usage: NOT AVAIL
   port: p0                queue-id:  7 (enabled)   pmd usage: NOT AVAIL
+  overhead: NOT AVAIL
 ])
 
 OVS_VSWITCHD_STOP
@@ -221,6 +225,7 @@ pmd thread numa_id <cleared> core_id <cleared>:
   port: p0                queue-id:  5 (enabled)   pmd usage: NOT AVAIL
   port: p0                queue-id:  6 (enabled)   pmd usage: NOT AVAIL
   port: p0                queue-id:  7 (enabled)   pmd usage: NOT AVAIL
+  overhead: NOT AVAIL
 ])
 
 # Force cross-numa polling
@@ -285,6 +290,7 @@ pmd thread numa_id 1 core_id 1:
   port: p0                queue-id:  5 (enabled)   pmd usage: NOT AVAIL
   port: p0                queue-id:  6 (enabled)   pmd usage: NOT AVAIL
   port: p0                queue-id:  7 (enabled)   pmd usage: NOT AVAIL
+  overhead: NOT AVAIL
 pmd thread numa_id 0 core_id 2:
   isolated : false
 ])
@@ -306,6 +312,7 @@ pmd thread numa_id 1 core_id 1:
   port: p0                queue-id:  5 (enabled)   pmd usage: NOT AVAIL
   port: p0                queue-id:  6 (enabled)   pmd usage: NOT AVAIL
   port: p0                queue-id:  7 (enabled)   pmd usage: NOT AVAIL
+  overhead: NOT AVAIL
 pmd thread numa_id 0 core_id 2:
   isolated : false
 ])
@@ -325,6 +332,7 @@ pmd thread numa_id 1 core_id 1:
   port: p0                queue-id:  5 (enabled)   pmd usage: NOT AVAIL
   port: p0                queue-id:  6 (enabled)   pmd usage: NOT AVAIL
   port: p0                queue-id:  7 (enabled)   pmd usage: NOT AVAIL
+  overhead: NOT AVAIL
 pmd thread numa_id 0 core_id 2:
   isolated : false
 ])
@@ -345,6 +353,7 @@ pmd thread numa_id 1 core_id 0:
   port: p0                queue-id:  5 (enabled)   pmd usage: NOT AVAIL
   port: p0                queue-id:  6 (enabled)   pmd usage: NOT AVAIL
   port: p0                queue-id:  7 (enabled)   pmd usage: NOT AVAIL
+  overhead: NOT AVAIL
 ])
 
 OVS_VSWITCHD_STOP
-- 
2.23.0


