[ovs-dev] [PATCH] dpif-netdev: Report overhead busy cycles per pmd.

David Marchand david.marchand at redhat.com
Mon Jun 7 11:48:11 UTC 2021


When setting the PMD auto load balance parameters and observing the
feature (and the per-rxq statistics) in action, you can easily see
rebalances triggered while the sum of the per-rxq pmd usage is below
the pmd load threshold you had set.

This is because the dpif-netdev/pmd-rxq-show command only reports "pure"
rxq cycles, while some cycles are spent in the pmd main loop itself and
also add up to the total pmd load.

dpif-netdev/pmd-stats-show does report a per-pmd load, but this load is
measured since the last dpif-netdev/pmd-stats-clear call. On the other
hand, the auto load balance feature evaluates the pmd load over a 10s
sliding window, which makes the two figures non-trivial to correlate.

Gather the per-pmd busy cycles with the same periodicity as the per-rxq
cycles and report the difference as overhead in dpif-netdev/pmd-rxq-show,
so that all the information is available from a single command: the sum
of the per-rxq usages plus this overhead reflects the pmd load that the
auto load balance feature evaluates over the same intervals.

Example:
$ ovs-appctl dpif-netdev/pmd-rxq-show
pmd thread numa_id 0 core_id 4:
  isolated : false
  port: dpdk0             queue-id:  0 (enabled)   pmd usage: 37 %
  port: dpdk1             queue-id:  0 (enabled)   pmd usage: 36 %
  port: vhost3            queue-id:  0 (enabled)   pmd usage:  0 %
  port: vhost6            queue-id:  0 (enabled)   pmd usage:  0 %
  port: vhost7            queue-id:  0 (enabled)   pmd usage:  0 %
  overhead:  4 %
pmd thread numa_id 0 core_id 18:
  isolated : false
  port: vhost0            queue-id:  0 (enabled)   pmd usage: 37 %
  port: vhost1            queue-id:  0 (enabled)   pmd usage: 39 %
  port: vhost2            queue-id:  0 (enabled)   pmd usage:  0 %
  port: vhost4            queue-id:  0 (enabled)   pmd usage:  0 %
  port: vhost5            queue-id:  0 (enabled)   pmd usage:  0 %
  overhead:  5 %
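
In this example, core 4 spends 37 + 36 = 73 % of its cycles polling and
processing packets from its rx queues and another 4 % in the pmd main
loop itself, so its overall load over the reported intervals is roughly
77 % (respectively 81 % for core 18).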

Signed-off-by: David Marchand <david.marchand at redhat.com>
---
 Documentation/topics/dpdk/pmd.rst |   6 ++
 lib/dpif-netdev.c                 | 100 ++++++++++++++++++++----------
 tests/pmd.at                      |   8 ++-
 3 files changed, 80 insertions(+), 34 deletions(-)
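
As a side note for reviewers, here is a minimal standalone sketch of the
arithmetic implemented in pmd_info_show_rxq() below. It is not OVS code:
it drops the OVS structures and atomics and uses hypothetical per-interval
numbers matching the core 4 example from the commit log.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define PMD_INTERVAL_MAX 6          /* Stored intervals, as in the patch. */
#define N_RXQ 2                     /* Hypothetical: two polled rxqs. */

int
main(void)
{
    uint64_t intrvl_cycles = 1000000;   /* Cycles of the last interval. */
    uint64_t busy_cycles_intrvl[PMD_INTERVAL_MAX];
    uint64_t rxq_cycles_intrvl[N_RXQ][PMD_INTERVAL_MAX];
    uint64_t total_cycles, busy_cycles = 0, polling_cycles = 0;

    /* Hypothetical measurements: 77 % of the core is busy per interval,
     * 37 % and 36 % of it attributed to the two rxqs. */
    for (int j = 0; j < PMD_INTERVAL_MAX; j++) {
        busy_cycles_intrvl[j] = 770000;
        rxq_cycles_intrvl[0][j] = 370000;
        rxq_cycles_intrvl[1][j] = 360000;
    }

    /* Estimate the cycles available over all stored intervals. */
    total_cycles = intrvl_cycles * PMD_INTERVAL_MAX;

    /* Sum the pmd busy cycles, capped to the available cycles. */
    for (int j = 0; j < PMD_INTERVAL_MAX; j++) {
        busy_cycles += busy_cycles_intrvl[j];
    }
    if (busy_cycles > total_cycles) {
        busy_cycles = total_cycles;
    }

    /* Per-rxq usage and the sum of all polling cycles. */
    for (int i = 0; i < N_RXQ; i++) {
        uint64_t proc_cycles = 0;

        for (int j = 0; j < PMD_INTERVAL_MAX; j++) {
            proc_cycles += rxq_cycles_intrvl[i][j];
        }
        polling_cycles += proc_cycles;
        printf("  rxq %d pmd usage: %2"PRIu64" %%\n",
               i, proc_cycles * 100 / total_cycles);
    }

    /* Busy cycles not attributed to any rxq are reported as overhead. */
    if (polling_cycles > busy_cycles) {
        polling_cycles = busy_cycles;
    }
    printf("  overhead: %2"PRIu64" %%\n",
           (busy_cycles - polling_cycles) * 100 / total_cycles);
    return 0;
}

Compiled standalone, this prints the 37 %, 36 % and 4 % figures shown in
the example above.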

diff --git a/Documentation/topics/dpdk/pmd.rst b/Documentation/topics/dpdk/pmd.rst
index e481e79414..2df10dcc5d 100644
--- a/Documentation/topics/dpdk/pmd.rst
+++ b/Documentation/topics/dpdk/pmd.rst
@@ -213,6 +213,12 @@ load for each of its associated queues every 10 seconds. If the aggregated PMD
 load reaches the load threshold for 6 consecutive intervals then PMD considers
 itself to be overloaded.
 
+.. versionchanged:: 2.16.0
+
+   The per PMD load is shown in the same fashion as the Rx queues' usage
+   in the output of ``pmd-rxq-show``. It accounts for the processing cycles
+   of all Rx queues plus the internal PMD main loop cost.
+
 For example, to set the load threshold to 70%::
 
     $ ovs-vsctl set open_vswitch .\
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 650e67ab30..bcebcebe0e 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -235,11 +235,11 @@ struct dfc_cache {
 
 /* Time in microseconds of the interval in which rxq processing cycles used
  * in rxq to pmd assignments is measured and stored. */
-#define PMD_RXQ_INTERVAL_LEN 10000000LL
+#define PMD_INTERVAL_LEN 10000000LL
 
 /* Number of intervals for which cycles are stored
  * and used during rxq to pmd assignment. */
-#define PMD_RXQ_INTERVAL_MAX 6
+#define PMD_INTERVAL_MAX 6
 
 /* Time in microseconds to try RCU quiescing. */
 #define PMD_RCU_QUIESCE_INTERVAL 10000LL
@@ -465,9 +465,9 @@ struct dp_netdev_rxq {
 
     /* Counters of cycles spent successfully polling and processing pkts. */
     atomic_ullong cycles[RXQ_N_CYCLES];
-    /* We store PMD_RXQ_INTERVAL_MAX intervals of data for an rxq and then
+    /* We store PMD_INTERVAL_MAX intervals of data for an rxq and then
        sum them to yield the cycles used for an rxq. */
-    atomic_ullong cycles_intrvl[PMD_RXQ_INTERVAL_MAX];
+    atomic_ullong cycles_intrvl[PMD_INTERVAL_MAX];
 };
 
 /* A port in a netdev-based datapath. */
@@ -703,13 +703,18 @@ struct dp_netdev_pmd_thread {
     long long int next_optimization;
     /* End of the next time interval for which processing cycles
        are stored for each polled rxq. */
-    long long int rxq_next_cycle_store;
+    long long int next_cycle_store;
 
     /* Last interval timestamp. */
     uint64_t intrvl_tsc_prev;
     /* Last interval cycles. */
     atomic_ullong intrvl_cycles;
 
+    /* Write index for 'busy_cycles_intrvl'. */
+    unsigned int intrvl_idx;
+    /* Busy cycles in last PMD_INTERVAL_MAX intervals. */
+    atomic_ullong busy_cycles_intrvl[PMD_INTERVAL_MAX];
+
     /* Current context of the PMD thread. */
     struct dp_netdev_pmd_thread_ctx ctx;
 
@@ -1229,6 +1234,8 @@ pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
         struct rxq_poll *list;
         size_t n_rxq;
         uint64_t total_cycles = 0;
+        uint64_t busy_cycles = 0;
+        uint64_t polling_cycles = 0;
 
         ds_put_format(reply,
                       "pmd thread numa_id %d core_id %u:\n  isolated : %s\n",
@@ -1241,16 +1248,27 @@ pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
         /* Get the total pmd cycles for an interval. */
         atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
         /* Estimate the cycles to cover all intervals. */
-        total_cycles *= PMD_RXQ_INTERVAL_MAX;
+        total_cycles *= PMD_INTERVAL_MAX;
+
+        for (int j = 0; j < PMD_INTERVAL_MAX; j++) {
+            uint64_t cycles;
+
+            atomic_read_relaxed(&pmd->busy_cycles_intrvl[j], &cycles);
+            busy_cycles += cycles;
+        }
+        if (busy_cycles > total_cycles) {
+            busy_cycles = total_cycles;
+        }
 
         for (int i = 0; i < n_rxq; i++) {
             struct dp_netdev_rxq *rxq = list[i].rxq;
             const char *name = netdev_rxq_get_name(rxq->rx);
             uint64_t proc_cycles = 0;
 
-            for (int j = 0; j < PMD_RXQ_INTERVAL_MAX; j++) {
+            for (int j = 0; j < PMD_INTERVAL_MAX; j++) {
                 proc_cycles += dp_netdev_rxq_get_intrvl_cycles(rxq, j);
             }
+            polling_cycles += proc_cycles;
             ds_put_format(reply, "  port: %-16s  queue-id: %2d", name,
                           netdev_rxq_get_queue_id(list[i].rxq->rx));
             ds_put_format(reply, " %s", netdev_rxq_enabled(list[i].rxq->rx)
@@ -1265,6 +1283,19 @@ pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
             }
             ds_put_cstr(reply, "\n");
         }
+
+        ds_put_cstr(reply, "  overhead: ");
+        if (total_cycles) {
+            if (polling_cycles > busy_cycles) {
+                polling_cycles = busy_cycles;
+            }
+            ds_put_format(reply, "%2"PRIu64" %%",
+                          (busy_cycles - polling_cycles) * 100 / total_cycles);
+        } else {
+            ds_put_cstr(reply, "NOT AVAIL");
+        }
+        ds_put_cstr(reply, "\n");
+
         ovs_mutex_unlock(&pmd->port_mutex);
         free(list);
     }
@@ -4621,7 +4652,7 @@ static void
 dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
                                 unsigned long long cycles)
 {
-    unsigned int idx = rx->intrvl_idx++ % PMD_RXQ_INTERVAL_MAX;
+    unsigned int idx = rx->intrvl_idx++ % PMD_INTERVAL_MAX;
     atomic_store_relaxed(&rx->cycles_intrvl[idx], cycles);
 }
 
@@ -5090,7 +5121,7 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
 
                 if (assign_cyc) {
                     /* Sum the queue intervals and store the cycle history. */
-                    for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
+                    for (unsigned i = 0; i < PMD_INTERVAL_MAX; i++) {
                         cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
                     }
                     dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST,
@@ -5587,7 +5618,7 @@ get_dry_run_variance(struct dp_netdev *dp, uint32_t *core_list,
             }
 
             /* Sum the queue intervals and store the cycle history. */
-            for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
+            for (unsigned i = 0; i < PMD_INTERVAL_MAX; i++) {
                 cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
             }
             dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST,
@@ -5648,7 +5679,7 @@ get_dry_run_variance(struct dp_netdev *dp, uint32_t *core_list,
         /* Get the total pmd cycles for an interval. */
         atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
         /* Estimate the cycles to cover all intervals. */
-        total_cycles *= PMD_RXQ_INTERVAL_MAX;
+        total_cycles *= PMD_INTERVAL_MAX;
         for (int id = 0; id < num_pmds; id++) {
             if (pmd->core_id == core_list[id]) {
                 if (pmd_usage[id]) {
@@ -5707,11 +5738,11 @@ pmd_rebalance_dry_run(struct dp_netdev *dp)
         /* Get the total pmd cycles for an interval. */
         atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
         /* Estimate the cycles to cover all intervals. */
-        total_cycles *= PMD_RXQ_INTERVAL_MAX;
+        total_cycles *= PMD_INTERVAL_MAX;
 
         ovs_mutex_lock(&pmd->port_mutex);
         HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
-            for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
+            for (unsigned i = 0; i < PMD_INTERVAL_MAX; i++) {
                 total_proc += dp_netdev_rxq_get_intrvl_cycles(poll->rxq, i);
             }
         }
@@ -5820,7 +5851,7 @@ dpif_netdev_run(struct dpif *dpif)
             pmd_alb->rebalance_poll_timer = now;
             CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
                 if (atomic_count_get(&pmd->pmd_overloaded) >=
-                                    PMD_RXQ_INTERVAL_MAX) {
+                                    PMD_INTERVAL_MAX) {
                     pmd_rebalance = true;
                     break;
                 }
@@ -6030,6 +6061,7 @@ reload:
 
     pmd->intrvl_tsc_prev = 0;
     atomic_store_relaxed(&pmd->intrvl_cycles, 0);
+    pmd->intrvl_idx = 0;
     cycles_counter_update(s);
 
     pmd->next_rcu_quiesce = pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
@@ -6556,7 +6588,7 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
     pmd_thread_ctx_time_update(pmd);
     pmd->next_optimization = pmd->ctx.now + DPCLS_OPTIMIZATION_INTERVAL;
     pmd->next_rcu_quiesce = pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
-    pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
+    pmd->next_cycle_store = pmd->ctx.now + PMD_INTERVAL_LEN;
     hmap_init(&pmd->poll_list);
     hmap_init(&pmd->tx_ports);
     hmap_init(&pmd->tnl_port_cache);
@@ -8777,31 +8809,33 @@ dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
     uint64_t tot_idle = 0, tot_proc = 0;
     unsigned int pmd_load = 0;
 
-    if (pmd->ctx.now > pmd->rxq_next_cycle_store) {
+    if (pmd->ctx.now > pmd->next_cycle_store) {
         uint64_t curr_tsc;
         uint8_t rebalance_load_trigger;
         struct pmd_auto_lb *pmd_alb = &pmd->dp->pmd_alb;
-        if (pmd_alb->is_enabled && !pmd->isolated
-            && (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] >=
-                                       pmd->prev_stats[PMD_CYCLES_ITER_IDLE])
-            && (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] >=
-                                        pmd->prev_stats[PMD_CYCLES_ITER_BUSY]))
-            {
+        unsigned int idx;
+
+        if (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] >=
+                pmd->prev_stats[PMD_CYCLES_ITER_IDLE] &&
+            pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] >=
+                pmd->prev_stats[PMD_CYCLES_ITER_BUSY]) {
             tot_idle = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] -
                        pmd->prev_stats[PMD_CYCLES_ITER_IDLE];
             tot_proc = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] -
                        pmd->prev_stats[PMD_CYCLES_ITER_BUSY];
 
-            if (tot_proc) {
-                pmd_load = ((tot_proc * 100) / (tot_idle + tot_proc));
-            }
+            if (pmd_alb->is_enabled && !pmd->isolated) {
+                if (tot_proc) {
+                    pmd_load = ((tot_proc * 100) / (tot_idle + tot_proc));
+                }
 
-            atomic_read_relaxed(&pmd_alb->rebalance_load_thresh,
-                                &rebalance_load_trigger);
-            if (pmd_load >= rebalance_load_trigger) {
-                atomic_count_inc(&pmd->pmd_overloaded);
-            } else {
-                atomic_count_set(&pmd->pmd_overloaded, 0);
+                atomic_read_relaxed(&pmd_alb->rebalance_load_thresh,
+                                    &rebalance_load_trigger);
+                if (pmd_load >= rebalance_load_trigger) {
+                    atomic_count_inc(&pmd->pmd_overloaded);
+                } else {
+                    atomic_count_set(&pmd->pmd_overloaded, 0);
+                }
             }
         }
 
@@ -8824,9 +8858,11 @@ dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
             atomic_store_relaxed(&pmd->intrvl_cycles,
                                  curr_tsc - pmd->intrvl_tsc_prev);
         }
+        idx = pmd->intrvl_idx++ % PMD_INTERVAL_MAX;
+        atomic_store_relaxed(&pmd->busy_cycles_intrvl[idx], tot_proc);
         pmd->intrvl_tsc_prev = curr_tsc;
         /* Start new measuring interval */
-        pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
+        pmd->next_cycle_store = pmd->ctx.now + PMD_INTERVAL_LEN;
     }
 
     if (pmd->ctx.now > pmd->next_optimization) {
diff --git a/tests/pmd.at b/tests/pmd.at
index cc5371d5a5..256adb83f0 100644
--- a/tests/pmd.at
+++ b/tests/pmd.at
@@ -72,6 +72,7 @@ CHECK_PMD_THREADS_CREATED()
 AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | sed SED_NUMA_CORE_PATTERN], [0], [dnl
 pmd thread numa_id <cleared> core_id <cleared>:
   isolated : false
+  overhead: NOT AVAIL
   port: p0                queue-id:  0 (enabled)   pmd usage: NOT AVAIL
 ])
 
@@ -103,6 +104,7 @@ dummy at ovs-dummy: hit:0 missed:0
 AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | sed SED_NUMA_CORE_PATTERN], [0], [dnl
 pmd thread numa_id <cleared> core_id <cleared>:
   isolated : false
+  overhead: NOT AVAIL
   port: p0                queue-id:  0 (enabled)   pmd usage: NOT AVAIL
   port: p0                queue-id:  1 (enabled)   pmd usage: NOT AVAIL
   port: p0                queue-id:  2 (enabled)   pmd usage: NOT AVAIL
@@ -134,6 +136,7 @@ dummy at ovs-dummy: hit:0 missed:0
 AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | sed SED_NUMA_CORE_PATTERN], [0], [dnl
 pmd thread numa_id <cleared> core_id <cleared>:
   isolated : false
+  overhead: NOT AVAIL
   port: p0                queue-id:  0 (enabled)   pmd usage: NOT AVAIL
   port: p0                queue-id:  1 (enabled)   pmd usage: NOT AVAIL
   port: p0                queue-id:  2 (enabled)   pmd usage: NOT AVAIL
@@ -149,13 +152,13 @@ TMP=$(cat ovs-vswitchd.log | wc -l | tr -d [[:blank:]])
 AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=0x3])
 CHECK_PMD_THREADS_CREATED([2], [], [+$TMP])
 
-AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | awk '/AVAIL$/ { printf("%s\t", $0); next } 1' | parse_pmd_rxq_show_group | sort], [0], [dnl
+AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | awk '/\<port:.*AVAIL$/ { printf("%s\t", $0); next } 1' | parse_pmd_rxq_show_group | sort], [0], [dnl
 port: p0 queue-id: 0 3 4 7
 port: p0 queue-id: 1 2 5 6
 ])
 
 AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-rxq-assign=roundrobin])
-AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | awk '/AVAIL$/ { printf("%s\t", $0); next } 1' | parse_pmd_rxq_show_group | sort], [0], [dnl
+AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | awk '/\<port:.*AVAIL$/ { printf("%s\t", $0); next } 1' | parse_pmd_rxq_show_group | sort], [0], [dnl
 port: p0 queue-id: 0 2 4 6
 port: p0 queue-id: 1 3 5 7
 ])
@@ -167,6 +170,7 @@ CHECK_PMD_THREADS_CREATED([1], [], [+$TMP])
 AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | sed SED_NUMA_CORE_PATTERN], [0], [dnl
 pmd thread numa_id <cleared> core_id <cleared>:
   isolated : false
+  overhead: NOT AVAIL
   port: p0                queue-id:  0 (enabled)   pmd usage: NOT AVAIL
   port: p0                queue-id:  1 (enabled)   pmd usage: NOT AVAIL
   port: p0                queue-id:  2 (enabled)   pmd usage: NOT AVAIL
-- 
2.23.0


