[ovs-dev] [PATCH v2] dpif-netdev: Report overhead busy cycles per pmd.

Kevin Traynor ktraynor at redhat.com
Wed Jul 7 15:49:57 UTC 2021


On 01/07/2021 17:41, David Marchand wrote:
> Users complained that the per rxq pmd usage was confusing: summing those
> values per pmd would never reach 100% even when the traffic load was
> increased beyond the pmd capacity.
> 
> This is because the dpif-netdev/pmd-rxq-show command only reports "pure"
> rxq cycles, while some cycles are spent in the pmd mainloop and also count
> toward the total pmd load.
> 
> dpif-netdev/pmd-stats-show does report a per pmd load, but this load is
> measured since the last dpif-netdev/pmd-stats-clear call.
> On the other hand, the per rxq pmd usage reflects the pmd load on a 10s
> sliding window, which makes the two non-trivial to correlate.
> 
> Gather per pmd busy cycles with the same periodicity and report the
> difference as overhead in dpif-netdev/pmd-rxq-show so that we have all
> info in a single command.
> 
> Example:
> $ ovs-appctl dpif-netdev/pmd-rxq-show
> pmd thread numa_id 1 core_id 3:
>   isolated : true
>   port: dpdk0             queue-id:  0 (enabled)   pmd usage: 90 %
>   overhead:  4 %
> pmd thread numa_id 1 core_id 5:
>   isolated : false
>   port: vhost0            queue-id:  0 (enabled)   pmd usage:  0 %
>   port: vhost1            queue-id:  0 (enabled)   pmd usage: 93 %
>   port: vhost2            queue-id:  0 (enabled)   pmd usage:  0 %
>   port: vhost6            queue-id:  0 (enabled)   pmd usage:  0 %
>   overhead:  6 %
> pmd thread numa_id 1 core_id 31:
>   isolated : true
>   port: dpdk1             queue-id:  0 (enabled)   pmd usage: 86 %
>   overhead:  4 %
> pmd thread numa_id 1 core_id 33:
>   isolated : false
>   port: vhost3            queue-id:  0 (enabled)   pmd usage:  0 %
>   port: vhost4            queue-id:  0 (enabled)   pmd usage:  0 %
>   port: vhost5            queue-id:  0 (enabled)   pmd usage: 92 %
>   port: vhost7            queue-id:  0 (enabled)   pmd usage:  0 %
>   overhead:  7 %
> 
> Signed-off-by: David Marchand <david.marchand at redhat.com>

LGTM. I did some tests with various configurations of pmds, rxqs,
traffic/no traffic, etc. Everything works as expected, and the issues I
reported in v1 are resolved. Unit tests ok, checkpatch ok, GHA ok. Thanks!

Acked-by: Kevin Traynor <ktraynor at redhat.com>
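
As a side note for anyone reading this in the archive: the new "overhead"
value is simply the pmd's busy cycles that could not be attributed to any
polled rxq, gathered over the same PMD_INTERVAL_MAX stored intervals that
the per-rxq usage already uses, and clamped so it never goes negative.
Below is a minimal standalone sketch of that arithmetic. It is not the
patch code, only an illustration mirroring the logic of
pmd_info_show_rxq(); the cycle counts are invented and were chosen so the
output matches the core_id 3 numbers in the commit message.

/* Standalone illustration of how dpif-netdev/pmd-rxq-show derives the
 * "overhead" figure.  Not OVS code; all cycle counts are invented. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define PMD_INTERVAL_MAX 6      /* number of stored intervals, as in OVS */

int main(void)
{
    /* Busy cycles the pmd recorded in each interval (hypothetical). */
    uint64_t busy_cycles_intrvl[PMD_INTERVAL_MAX] = {
        940, 950, 930, 945, 935, 940
    };
    /* Cycles attributed to the single polled rxq in each interval. */
    uint64_t rxq_cycles_intrvl[PMD_INTERVAL_MAX] = {
        900, 910, 890, 905, 895, 900
    };
    /* One interval's total cycles, scaled to cover all intervals, the
     * same way pmd_info_show_rxq() scales pmd->intrvl_cycles. */
    uint64_t total_cycles = 1000 * PMD_INTERVAL_MAX;
    uint64_t busy_cycles = 0;
    uint64_t rxq_proc_cycles = 0;
    uint64_t overhead_cycles = 0;

    for (int i = 0; i < PMD_INTERVAL_MAX; i++) {
        busy_cycles += busy_cycles_intrvl[i];
        rxq_proc_cycles += rxq_cycles_intrvl[i];
    }
    if (busy_cycles > total_cycles) {
        busy_cycles = total_cycles;         /* clamp, as the patch does */
    }
    if (rxq_proc_cycles < busy_cycles) {
        overhead_cycles = busy_cycles - rxq_proc_cycles;
    }

    printf("pmd usage: %2"PRIu64" %%\n", rxq_proc_cycles * 100 / total_cycles);
    printf("overhead:  %2"PRIu64" %%\n", overhead_cycles * 100 / total_cycles);
    return 0;
}

Running it prints a pmd usage of 90 % and an overhead of 4 %, i.e. the
per-rxq usage plus the overhead adds up to the pmd's overall busy share of
the measured window.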

> ---
> Changes since v1:
> - fixed unit test and documentation update,
> - moved documentation update under pmd-rxq-show command description,
> - updated commitlog,
> - renamed variables for better readability,
> - avoided reporting an N/A overhead for an idle PMD,
> - reset overhead stats on PMD reconfigure,
> 
> ---
>  Documentation/topics/dpdk/pmd.rst |   5 ++
>  lib/dpif-netdev.c                 | 113 +++++++++++++++++++++---------
>  tests/pmd.at                      |   8 ++-
>  3 files changed, 89 insertions(+), 37 deletions(-)
> 
> diff --git a/Documentation/topics/dpdk/pmd.rst b/Documentation/topics/dpdk/pmd.rst
> index e481e79414..6dfc65724c 100644
> --- a/Documentation/topics/dpdk/pmd.rst
> +++ b/Documentation/topics/dpdk/pmd.rst
> @@ -164,6 +164,11 @@ queue::
>     due to traffic pattern or reconfig changes, will take one minute to be fully
>     reflected in the stats.
>  
> +.. versionchanged:: 2.16.0
> +
> +   An ``overhead`` statistic is shown per PMD: it represents the number of
> +   cycles inherently consumed by the OVS PMD processing loop.
> +
>  Rx queue to PMD assignment takes place whenever there are configuration changes
>  or can be triggered by using::
>  
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 026b52d27d..14f8e0246f 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -235,11 +235,11 @@ struct dfc_cache {
>  
>  /* Time in microseconds of the interval in which rxq processing cycles used
>   * in rxq to pmd assignments is measured and stored. */
> -#define PMD_RXQ_INTERVAL_LEN 10000000LL
> +#define PMD_INTERVAL_LEN 10000000LL
>  
>  /* Number of intervals for which cycles are stored
>   * and used during rxq to pmd assignment. */
> -#define PMD_RXQ_INTERVAL_MAX 6
> +#define PMD_INTERVAL_MAX 6
>  
>  /* Time in microseconds to try RCU quiescing. */
>  #define PMD_RCU_QUIESCE_INTERVAL 10000LL
> @@ -456,9 +456,9 @@ struct dp_netdev_rxq {
>  
>      /* Counters of cycles spent successfully polling and processing pkts. */
>      atomic_ullong cycles[RXQ_N_CYCLES];
> -    /* We store PMD_RXQ_INTERVAL_MAX intervals of data for an rxq and then
> +    /* We store PMD_INTERVAL_MAX intervals of data for an rxq and then
>         sum them to yield the cycles used for an rxq. */
> -    atomic_ullong cycles_intrvl[PMD_RXQ_INTERVAL_MAX];
> +    atomic_ullong cycles_intrvl[PMD_INTERVAL_MAX];
>  };
>  
>  /* A port in a netdev-based datapath. */
> @@ -694,13 +694,18 @@ struct dp_netdev_pmd_thread {
>      long long int next_optimization;
>      /* End of the next time interval for which processing cycles
>         are stored for each polled rxq. */
> -    long long int rxq_next_cycle_store;
> +    long long int next_cycle_store;
>  
>      /* Last interval timestamp. */
>      uint64_t intrvl_tsc_prev;
>      /* Last interval cycles. */
>      atomic_ullong intrvl_cycles;
>  
> +    /* Write index for 'busy_cycles_intrvl'. */
> +    unsigned int intrvl_idx;
> +    /* Busy cycles in last PMD_INTERVAL_MAX intervals. */
> +    atomic_ullong busy_cycles_intrvl[PMD_INTERVAL_MAX];
> +
>      /* Current context of the PMD thread. */
>      struct dp_netdev_pmd_thread_ctx ctx;
>  
> @@ -1220,6 +1225,8 @@ pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
>          struct rxq_poll *list;
>          size_t n_rxq;
>          uint64_t total_cycles = 0;
> +        uint64_t busy_cycles = 0;
> +        uint64_t total_rxq_proc_cycles = 0;
>  
>          ds_put_format(reply,
>                        "pmd thread numa_id %d core_id %u:\n  isolated : %s\n",
> @@ -1232,16 +1239,27 @@ pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
>          /* Get the total pmd cycles for an interval. */
>          atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
>          /* Estimate the cycles to cover all intervals. */
> -        total_cycles *= PMD_RXQ_INTERVAL_MAX;
> +        total_cycles *= PMD_INTERVAL_MAX;
> +
> +        for (int j = 0; j < PMD_INTERVAL_MAX; j++) {
> +            uint64_t cycles;
> +
> +            atomic_read_relaxed(&pmd->busy_cycles_intrvl[j], &cycles);
> +            busy_cycles += cycles;
> +        }
> +        if (busy_cycles > total_cycles) {
> +            busy_cycles = total_cycles;
> +        }
>  
>          for (int i = 0; i < n_rxq; i++) {
>              struct dp_netdev_rxq *rxq = list[i].rxq;
>              const char *name = netdev_rxq_get_name(rxq->rx);
> -            uint64_t proc_cycles = 0;
> +            uint64_t rxq_proc_cycles = 0;
>  
> -            for (int j = 0; j < PMD_RXQ_INTERVAL_MAX; j++) {
> -                proc_cycles += dp_netdev_rxq_get_intrvl_cycles(rxq, j);
> +            for (int j = 0; j < PMD_INTERVAL_MAX; j++) {
> +                rxq_proc_cycles += dp_netdev_rxq_get_intrvl_cycles(rxq, j);
>              }
> +            total_rxq_proc_cycles += rxq_proc_cycles;
>              ds_put_format(reply, "  port: %-16s  queue-id: %2d", name,
>                            netdev_rxq_get_queue_id(list[i].rxq->rx));
>              ds_put_format(reply, " %s", netdev_rxq_enabled(list[i].rxq->rx)
> @@ -1249,13 +1267,30 @@ pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
>              ds_put_format(reply, "  pmd usage: ");
>              if (total_cycles) {
>                  ds_put_format(reply, "%2"PRIu64"",
> -                              proc_cycles * 100 / total_cycles);
> +                              rxq_proc_cycles * 100 / total_cycles);
>                  ds_put_cstr(reply, " %");
>              } else {
>                  ds_put_format(reply, "%s", "NOT AVAIL");
>              }
>              ds_put_cstr(reply, "\n");
>          }
> +
> +        if (n_rxq > 0) {
> +            ds_put_cstr(reply, "  overhead: ");
> +            if (total_cycles) {
> +                uint64_t overhead_cycles = 0;
> +
> +                if (total_rxq_proc_cycles < busy_cycles) {
> +                    overhead_cycles = busy_cycles - total_rxq_proc_cycles;
> +                }
> +                ds_put_format(reply, "%2"PRIu64" %%",
> +                              overhead_cycles * 100 / total_cycles);
> +            } else {
> +                ds_put_cstr(reply, "NOT AVAIL");
> +            }
> +            ds_put_cstr(reply, "\n");
> +        }
> +
>          ovs_mutex_unlock(&pmd->port_mutex);
>          free(list);
>      }
> @@ -4641,7 +4676,7 @@ static void
>  dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
>                                  unsigned long long cycles)
>  {
> -    unsigned int idx = rx->intrvl_idx++ % PMD_RXQ_INTERVAL_MAX;
> +    unsigned int idx = rx->intrvl_idx++ % PMD_INTERVAL_MAX;
>      atomic_store_relaxed(&rx->cycles_intrvl[idx], cycles);
>  }
>  
> @@ -5110,7 +5145,7 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
>  
>                  if (assign_cyc) {
>                      /* Sum the queue intervals and store the cycle history. */
> -                    for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
> +                    for (unsigned i = 0; i < PMD_INTERVAL_MAX; i++) {
>                          cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
>                      }
>                      dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST,
> @@ -5607,7 +5642,7 @@ get_dry_run_variance(struct dp_netdev *dp, uint32_t *core_list,
>              }
>  
>              /* Sum the queue intervals and store the cycle history. */
> -            for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
> +            for (unsigned i = 0; i < PMD_INTERVAL_MAX; i++) {
>                  cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
>              }
>              dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST,
> @@ -5668,7 +5703,7 @@ get_dry_run_variance(struct dp_netdev *dp, uint32_t *core_list,
>          /* Get the total pmd cycles for an interval. */
>          atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
>          /* Estimate the cycles to cover all intervals. */
> -        total_cycles *= PMD_RXQ_INTERVAL_MAX;
> +        total_cycles *= PMD_INTERVAL_MAX;
>          for (int id = 0; id < num_pmds; id++) {
>              if (pmd->core_id == core_list[id]) {
>                  if (pmd_usage[id]) {
> @@ -5727,11 +5762,11 @@ pmd_rebalance_dry_run(struct dp_netdev *dp)
>          /* Get the total pmd cycles for an interval. */
>          atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
>          /* Estimate the cycles to cover all intervals. */
> -        total_cycles *= PMD_RXQ_INTERVAL_MAX;
> +        total_cycles *= PMD_INTERVAL_MAX;
>  
>          ovs_mutex_lock(&pmd->port_mutex);
>          HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
> -            for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
> +            for (unsigned i = 0; i < PMD_INTERVAL_MAX; i++) {
>                  total_proc += dp_netdev_rxq_get_intrvl_cycles(poll->rxq, i);
>              }
>          }
> @@ -5840,7 +5875,7 @@ dpif_netdev_run(struct dpif *dpif)
>              pmd_alb->rebalance_poll_timer = now;
>              CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
>                  if (atomic_count_get(&pmd->pmd_overloaded) >=
> -                                    PMD_RXQ_INTERVAL_MAX) {
> +                                    PMD_INTERVAL_MAX) {
>                      pmd_rebalance = true;
>                      break;
>                  }
> @@ -6050,6 +6085,10 @@ reload:
>  
>      pmd->intrvl_tsc_prev = 0;
>      atomic_store_relaxed(&pmd->intrvl_cycles, 0);
> +    for (i = 0; i < PMD_INTERVAL_MAX; i++) {
> +        atomic_store_relaxed(&pmd->busy_cycles_intrvl[i], 0);
> +    }
> +    pmd->intrvl_idx = 0;
>      cycles_counter_update(s);
>  
>      pmd->next_rcu_quiesce = pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
> @@ -6580,7 +6619,7 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
>      pmd_thread_ctx_time_update(pmd);
>      pmd->next_optimization = pmd->ctx.now + DPCLS_OPTIMIZATION_INTERVAL;
>      pmd->next_rcu_quiesce = pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
> -    pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
> +    pmd->next_cycle_store = pmd->ctx.now + PMD_INTERVAL_LEN;
>      hmap_init(&pmd->poll_list);
>      hmap_init(&pmd->tx_ports);
>      hmap_init(&pmd->tnl_port_cache);
> @@ -8838,31 +8877,33 @@ dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
>      uint64_t tot_idle = 0, tot_proc = 0;
>      unsigned int pmd_load = 0;
>  
> -    if (pmd->ctx.now > pmd->rxq_next_cycle_store) {
> +    if (pmd->ctx.now > pmd->next_cycle_store) {
>          uint64_t curr_tsc;
>          uint8_t rebalance_load_trigger;
>          struct pmd_auto_lb *pmd_alb = &pmd->dp->pmd_alb;
> -        if (pmd_alb->is_enabled && !pmd->isolated
> -            && (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] >=
> -                                       pmd->prev_stats[PMD_CYCLES_ITER_IDLE])
> -            && (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] >=
> -                                        pmd->prev_stats[PMD_CYCLES_ITER_BUSY]))
> -            {
> +        unsigned int idx;
> +
> +        if (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] >=
> +                pmd->prev_stats[PMD_CYCLES_ITER_IDLE] &&
> +            pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] >=
> +                pmd->prev_stats[PMD_CYCLES_ITER_BUSY]) {
>              tot_idle = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] -
>                         pmd->prev_stats[PMD_CYCLES_ITER_IDLE];
>              tot_proc = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] -
>                         pmd->prev_stats[PMD_CYCLES_ITER_BUSY];
>  
> -            if (tot_proc) {
> -                pmd_load = ((tot_proc * 100) / (tot_idle + tot_proc));
> -            }
> +            if (pmd_alb->is_enabled && !pmd->isolated) {
> +                if (tot_proc) {
> +                    pmd_load = ((tot_proc * 100) / (tot_idle + tot_proc));
> +                }
>  
> -            atomic_read_relaxed(&pmd_alb->rebalance_load_thresh,
> -                                &rebalance_load_trigger);
> -            if (pmd_load >= rebalance_load_trigger) {
> -                atomic_count_inc(&pmd->pmd_overloaded);
> -            } else {
> -                atomic_count_set(&pmd->pmd_overloaded, 0);
> +                atomic_read_relaxed(&pmd_alb->rebalance_load_thresh,
> +                                    &rebalance_load_trigger);
> +                if (pmd_load >= rebalance_load_trigger) {
> +                    atomic_count_inc(&pmd->pmd_overloaded);
> +                } else {
> +                    atomic_count_set(&pmd->pmd_overloaded, 0);
> +                }
>              }
>          }
>  
> @@ -8885,9 +8926,11 @@ dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
>              atomic_store_relaxed(&pmd->intrvl_cycles,
>                                   curr_tsc - pmd->intrvl_tsc_prev);
>          }
> +        idx = pmd->intrvl_idx++ % PMD_INTERVAL_MAX;
> +        atomic_store_relaxed(&pmd->busy_cycles_intrvl[idx], tot_proc);
>          pmd->intrvl_tsc_prev = curr_tsc;
>          /* Start new measuring interval */
> -        pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
> +        pmd->next_cycle_store = pmd->ctx.now + PMD_INTERVAL_LEN;
>      }
>  
>      if (pmd->ctx.now > pmd->next_optimization) {
> diff --git a/tests/pmd.at b/tests/pmd.at
> index e0d54c1dd3..e6925ed4da 100644
> --- a/tests/pmd.at
> +++ b/tests/pmd.at
> @@ -73,6 +73,7 @@ AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | sed SED_NUMA_CORE_PATTERN], [0],
>  pmd thread numa_id <cleared> core_id <cleared>:
>    isolated : false
>    port: p0                queue-id:  0 (enabled)   pmd usage: NOT AVAIL
> +  overhead: NOT AVAIL
>  ])
>  
>  AT_CHECK([ovs-appctl dpif/show | sed 's/\(tx_queues=\)[[0-9]]*/\1<cleared>/g'], [0], [dnl
> @@ -111,6 +112,7 @@ pmd thread numa_id <cleared> core_id <cleared>:
>    port: p0                queue-id:  5 (enabled)   pmd usage: NOT AVAIL
>    port: p0                queue-id:  6 (enabled)   pmd usage: NOT AVAIL
>    port: p0                queue-id:  7 (enabled)   pmd usage: NOT AVAIL
> +  overhead: NOT AVAIL
>  ])
>  
>  OVS_VSWITCHD_STOP
> @@ -142,6 +144,7 @@ pmd thread numa_id <cleared> core_id <cleared>:
>    port: p0                queue-id:  5 (enabled)   pmd usage: NOT AVAIL
>    port: p0                queue-id:  6 (enabled)   pmd usage: NOT AVAIL
>    port: p0                queue-id:  7 (enabled)   pmd usage: NOT AVAIL
> +  overhead: NOT AVAIL
>  ])
>  
>  AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-rxq-assign=cycles])
> @@ -149,13 +152,13 @@ TMP=$(($(cat ovs-vswitchd.log | wc -l | tr -d [[:blank:]])+1))
>  AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=0x3])
>  CHECK_PMD_THREADS_CREATED([2], [], [+$TMP])
>  
> -AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | awk '/AVAIL$/ { printf("%s\t", $0); next } 1' | parse_pmd_rxq_show_group | sort], [0], [dnl
> +AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | awk '/\<port:.*AVAIL$/ { printf("%s\t", $0); next } 1' | parse_pmd_rxq_show_group | sort], [0], [dnl
>  port: p0 queue-id: 0 3 4 7
>  port: p0 queue-id: 1 2 5 6
>  ])
>  
>  AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-rxq-assign=roundrobin])
> -AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | awk '/AVAIL$/ { printf("%s\t", $0); next } 1' | parse_pmd_rxq_show_group | sort], [0], [dnl
> +AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | awk '/\<port:.*AVAIL$/ { printf("%s\t", $0); next } 1' | parse_pmd_rxq_show_group | sort], [0], [dnl
>  port: p0 queue-id: 0 2 4 6
>  port: p0 queue-id: 1 3 5 7
>  ])
> @@ -175,6 +178,7 @@ pmd thread numa_id <cleared> core_id <cleared>:
>    port: p0                queue-id:  5 (enabled)   pmd usage: NOT AVAIL
>    port: p0                queue-id:  6 (enabled)   pmd usage: NOT AVAIL
>    port: p0                queue-id:  7 (enabled)   pmd usage: NOT AVAIL
> +  overhead: NOT AVAIL
>  ])
>  
>  OVS_VSWITCHD_STOP
> 


