[ovs-dev] [PATCH v5 2/3] dpif-netdev: Detailed performance stats for PMDs

O Mahony, Billy billy.o.mahony at intel.com
Tue Jan 9 16:25:06 UTC 2018


Hi Jan,

Thanks, these patches are really useful.

I haven't finished trying them out but thought you'd prefer to get initial comments earlier. I'll continue trying them out and revert with any further comments later.

Regards,
Billy.

> -----Original Message-----
> From: ovs-dev-bounces at openvswitch.org [mailto:ovs-dev-
> bounces at openvswitch.org] On Behalf Of Jan Scheurich
> Sent: Thursday, January 4, 2018 12:08 PM
> To: dev at openvswitch.org
> Subject: [ovs-dev] [PATCH v5 2/3] dpif-netdev: Detailed performance stats
> for PMDs
> 
> This patch instruments the dpif-netdev datapath to record detailed statistics
> of what is happening in every iteration of a PMD thread.
> 
> The collection of detailed statistics can be controlled by a new configuration
> parameter "other_config:pmd-perf-metrics". By default it is disabled. The
[[BO'M]] Specify the table here, i.e. Open_vSwitch.
> run-time overhead, when enabled, is in the order of 1%.
[[BO'M]] When I enable metrics I see a reduction in max lossless throughput of about 0.6%. That is with simple port forwarding (~200 cycles per packet), so it is probably a worst-case scenario. Really very little overhead for such detailed stats.
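[[BO'M]] For anyone else trying these out, enabling the collection is a single command (Open_vSwitch being the table in question):
    ovs-vsctl set Open_vSwitch . other_config:pmd-perf-metrics=true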
> 
> The covered metrics per iteration are:
>   - cycles
>   - packets
>   - (rx) batches
>   - packets/batch
>   - max. vhostuser qlen
>   - upcalls
>   - cycles spent in upcalls
> 
> This raw recorded data is used threefold:
> 
> 1. In histograms for each of the following metrics:
>    - cycles/iteration (log.)
>    - packets/iteration (log.)
>    - cycles/packet
>    - packets/batch
>    - max. vhostuser qlen (log.)
>    - upcalls
>    - cycles/upcall (log)
>    The histogram bins are spaced linearly or logarithmically.
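[[BO'M]] To make the spacing concrete, my reading of histogram_walls_set_lin()/_log() further down is (a sketch of the math, assuming min > 0 in the log case):
    linear:      wall[i] = min + i * (max - min) / (NUM_BINS - 2)
    logarithmic: wall[i] = exp(log(min) + i * (log(max) - log(min)) / (bins - 1))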
> 
> 2. A cyclic history of the above statistics for 1024 iterations
[[BO'M]] I only get a max of 999 iterations printed.
> 
> 3. A cyclic history of the cummulative/average values per millisecond
[[BO'M]] typo: 'cummulative' -> 'cumulative'
>    wall clock for the last 1024 milliseconds:
[[BO'M]] I only get a max of 1000 previous ms printed.
>    - number of iterations
>    - avg. cycles/iteration
>    - packets (Kpps)
>    - avg. packets/batch
>    - avg. max vhost qlen
>    - upcalls
>    - avg. cycles/upcall
> 
> The gathered performance statists can be printed at any time with the new
[[BO'M]] typo: 'statists' -> 'statistics'
> CLI command
> 
> ovs-appctl dpif-netdev/pmd-perf-show [-nh] [-it iter_len] [-ms ms_len]
> 	[-pmd core] [dp]
> 
> The options are
> 
> -nh:            Suppress the histograms
> -it iter_len:   Display the last iter_len iteration stats
> -ms ms_len:     Display the last ms_len millisecond stats
> -pmd core:      Display only the specified PMD
> 
> The performance statistics are reset with the existing dpif-netdev/pmd-
> stats-clear command.
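[[BO'M]] For reference, example invocations based on the options above (the core number here is just an example):
    ovs-appctl dpif-netdev/pmd-perf-show -nh -it 10 -pmd 3
    ovs-appctl dpif-netdev/pmd-stats-clear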
> 
> The output always contains the following global PMD statistics, similar to the
> pmd-stats-show command:
> 
> Time: 15:24:55.270
> Measurement duration: 1.008 s
> 
> pmd thread numa_id 0 core_id 1:
> 
>   Cycles:            2419034712  (2.40 GHz)
>   Iterations:            572817  (1.76 us/it)
>   - idle:                486808  (15.9 % cycles)
>   - busy:                 86009  (84.1 % cycles)
>   Packets:              2399607  (2381 Kpps, 848 cycles/pkt)
>   Datapath passes:      3599415  (1.50 passes/pkt)
>   - EMC hits:            336472  ( 9.3 %)
>   - Megaflow hits:      3262943  (90.7 %, 1.00 subtbl lookups/hit)
>   - Upcalls:                  0  ( 0.0 %, 0.0 us/upcall)
>   - Lost upcalls:             0  ( 0.0 %)
> 
> Signed-off-by: Jan Scheurich <jan.scheurich at ericsson.com>
> ---
>  lib/dp-packet.h        |   2 +
>  lib/dpif-netdev-perf.c | 309 ++++++++++++++++++++++++++++++++++++++++++++++++-
>  lib/dpif-netdev-perf.h | 173 ++++++++++++++++++++++++++-
>  lib/dpif-netdev.c      | 158 +++++++++++++++++++++++--
>  lib/netdev-dpdk.c      |  23 +++-
>  lib/netdev-dpdk.h      |  14 +++
>  ofproto/ofproto-dpif.c |   3 +-
>  7 files changed, 664 insertions(+), 18 deletions(-)
> 
> diff --git a/lib/dp-packet.h b/lib/dp-packet.h
> index b4b721c..7950247 100644
> --- a/lib/dp-packet.h
> +++ b/lib/dp-packet.h
> @@ -695,8 +695,10 @@ enum { NETDEV_MAX_BURST = 32 }; /* Maximum number packets in a batch. */
> 
>  struct dp_packet_batch {
>      size_t count;
> +    size_t qfill; /* Number of packets in queue when reading rx burst. */
>      bool trunc; /* true if the batch needs truncate. */
>      struct dp_packet *packets[NETDEV_MAX_BURST];
> +
>  };
> 
>  static inline void
> diff --git a/lib/dpif-netdev-perf.c b/lib/dpif-netdev-perf.c
> index 7d8b7b2..a66a48c 100644
> --- a/lib/dpif-netdev-perf.c
> +++ b/lib/dpif-netdev-perf.c
> @@ -15,6 +15,7 @@
>   */
> 
>  #include <config.h>
> +#include <stdint.h>
> 
>  #ifdef DPDK_NETDEV
>  #include <rte_cycles.h>
> @@ -27,13 +28,307 @@
> 
>  VLOG_DEFINE_THIS_MODULE(pmd_perf);
> 
> +#ifdef DPDK_NETDEV
> +static uint64_t
> +get_tsc_hz(void)
> +{
> +    return rte_get_tsc_hz();
> +}
> +#else
> +static uint64_t
> +read_tsc(void)
> +{
> +    register uint64_t tsc asm("eax");
> +    asm volatile (".byte 15, 49" : : : "eax", "edx");
> +    return tsc;
> +}
[[BO'M]] How will this affect other instruction sets/architectures? There should at least be a case that compiles (even if it is non-functional) on non-x86.
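Something along these lines would at least keep non-x86 builds compiling (a sketch only, untested; the fallback returns nanoseconds rather than TSC cycles, so get_tsc_hz() would need matching treatment):

    #if defined(__x86_64__) || defined(__i386__)
    static uint64_t
    read_tsc(void)
    {
        uint32_t lo, hi;

        /* rdtsc returns the low 32 bits in eax, the high 32 bits in edx. */
        asm volatile ("rdtsc" : "=a" (lo), "=d" (hi));
        return ((uint64_t) hi << 32) | lo;
    }
    #else
    #include <time.h>

    static uint64_t
    read_tsc(void)
    {
        /* Not a TSC, but monotonic and cheap enough for stats. */
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (uint64_t) ts.tv_sec * 1000000000 + ts.tv_nsec;
    }
    #endif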
> +
> +static uint64_t
> +get_tsc_hz(void)
> +{
> +    uint64_t tsc_start = read_tsc();
> +    xsleep(1);
> +    return read_tsc() - tsc_start;
> +}
> +#endif
> +
> +static void
> +histogram_walls_set_lin(struct histogram *hist, uint32_t min, uint32_t max)
> +{
> +    int i;
> +
> +    ovs_assert(min < max);
> +    for (i = 0; i < NUM_BINS-1; i++) {
> +        hist->wall[i] = min + (i * (max - min)) / (NUM_BINS - 2);
> +    }
> +    hist->wall[NUM_BINS-1] = UINT32_MAX;
> +}
> +
> +static void
> +histogram_walls_set_log(struct histogram *hist, uint32_t min, uint32_t max)
> +{
> +    int i, start, bins, wall;
> +    double log_min, log_max;
> +
> +    ovs_assert(min < max);
> +    if (min > 0) {
> +        log_min = log(min);
> +        log_max = log(max);
> +        start = 0;
> +        bins = NUM_BINS - 1;
> +    } else {
> +        hist->wall[0] = 0;
> +        log_min = log(1);
> +        log_max = log(max);
> +        start = 1;
> +        bins = NUM_BINS - 2;
> +    }
> +    wall = start;
> +    for (i = 0; i < bins; i++) {
> +        /* Make sure each wall is monotonically increasing. */
> +        wall = MAX(wall, exp(log_min + (i * (log_max - log_min)) / (bins-1)));
> +        hist->wall[start + i] = wall++;
> +    }
> +    if (hist->wall[NUM_BINS-2] < max) {
> +        hist->wall[NUM_BINS-2] = max;
> +    }
> +    hist->wall[NUM_BINS-1] = UINT32_MAX;
> +}
> +
> +uint64_t
> +histogram_samples(const struct histogram *hist)
> +{
> +    uint64_t samples = 0;
> +
> +    for (int i = 0; i < NUM_BINS; i++) {
> +        samples += hist->bin[i];
> +    }
> +    return samples;
> +}
> +
> +static void
> +histogram_clear(struct histogram *hist)
> +{
> +    int i;
> +
> +    for (i = 0; i < NUM_BINS; i++) {
> +        hist->bin[i] = 0;
> +    }
> +}
> +
> +static void
> +history_init(struct history *h)
> +{
> +    memset(h, 0, sizeof(*h));
> +}
> +
>  void
>  pmd_perf_stats_init(struct pmd_perf_stats *s)
>  {
> -    memset(s, 0 , sizeof(*s));
> +    /* The struct has been zeroed at allocation. */
> +    histogram_walls_set_log(&s->cycles, 500, 24000000);
> +    histogram_walls_set_log(&s->pkts, 0, 1000);
> +    histogram_walls_set_lin(&s->cycles_per_pkt, 100, 30000);
> +    histogram_walls_set_lin(&s->pkts_per_batch, 0, 32);
> +    histogram_walls_set_lin(&s->upcalls, 0, 30);
> +    histogram_walls_set_log(&s->cycles_per_upcall, 1000, 1000000);
> +    histogram_walls_set_log(&s->max_vhost_qfill, 0, 512);
>      s->start_ms = time_msec();
>  }
> 
>  void
> +pmd_perf_format_overall_stats(struct ds *str, struct pmd_perf_stats *s,
> +                              double duration)
> +{
> +    uint64_t stats[PMD_N_STATS];
> +    double us_per_cycle = 1000000.0 / get_tsc_hz();
> +
> +    if (duration == 0) {
> +        return;
> +    }
> +
> +    pmd_perf_read_counters(s, stats);
> +    uint64_t tot_cycles = stats[PMD_CYCLES_ITER_IDLE] +
> +                          stats[PMD_CYCLES_ITER_BUSY];
> +    uint64_t packets = stats[PMD_STAT_RECV];
> +    uint64_t passes = stats[PMD_STAT_RECV] +
> +                      stats[PMD_STAT_RECIRC];
> +    uint64_t upcalls = stats[PMD_STAT_MISS];
> +    uint64_t upcall_cycles = stats[PMD_CYCLES_UPCALL];
> +    uint64_t tot_iter = histogram_samples(&s->pkts);
> +    uint64_t idle_iter = s->pkts.bin[0];
> +    uint64_t busy_iter = tot_iter >= idle_iter ? tot_iter - idle_iter : 0;
> +
> +    ds_put_format(str,
> +            "  Cycles:          %12"PRIu64"  (%.2f GHz)\n"
> +            "  Iterations:      %12"PRIu64"  (%.2f us/it)\n"
> +            "  - idle:          %12"PRIu64"  (%4.1f %% cycles)\n"
> +            "  - busy:          %12"PRIu64"  (%4.1f %% cycles)\n",
> +            tot_cycles, (tot_cycles / duration) / 1E9,
> +            tot_iter, tot_cycles * us_per_cycle / tot_iter,
> +            idle_iter,
> +            100.0 * stats[PMD_CYCLES_ITER_IDLE] / tot_cycles,
> +            busy_iter,
> +            100.0 * stats[PMD_CYCLES_ITER_BUSY] / tot_cycles);
> +    if (packets > 0) {
> +        ds_put_format(str,
> +            "  Packets:         %12"PRIu64"  (%.0f Kpps, %.0f cycles/pkt)\n"
> +            "  Datapath passes: %12"PRIu64"  (%.2f passes/pkt)\n"
> +            "  - EMC hits:      %12"PRIu64"  (%4.1f %%)\n"
> +            "  - Megaflow hits: %12"PRIu64"  (%4.1f %%, %.2f subtbl lookups/"
> +                                                                     "hit)\n"
> +            "  - Upcalls:       %12"PRIu64"  (%4.1f %%, %.1f us/upcall)\n"
> +            "  - Lost upcalls:  %12"PRIu64"  (%4.1f %%)\n"
> +            "\n",
> +            packets, (packets / duration) / 1000,
> +            1.0 * stats[PMD_CYCLES_ITER_BUSY] / packets,
> +            passes, packets ? 1.0 * passes / packets : 0,
> +            stats[PMD_STAT_EXACT_HIT],
> +            100.0 * stats[PMD_STAT_EXACT_HIT] / passes,
> +            stats[PMD_STAT_MASKED_HIT],
> +            100.0 * stats[PMD_STAT_MASKED_HIT] / passes,
> +            stats[PMD_STAT_MASKED_HIT]
> +            ? 1.0 * stats[PMD_STAT_MASKED_LOOKUP] /
> stats[PMD_STAT_MASKED_HIT]
> +            : 0,
> +            upcalls, 100.0 * upcalls / passes,
> +            upcalls ? (upcall_cycles * us_per_cycle) / upcalls : 0,
> +            stats[PMD_STAT_LOST],
> +            100.0 * stats[PMD_STAT_LOST] / passes);
> +    } else {
> +        ds_put_format(str,
> +                "  Packets:         %12"PRIu64"\n"
> +                "\n",
> +                0UL);
> +    }
> +}
> +
> +void
> +pmd_perf_format_histograms(struct ds *str, struct pmd_perf_stats *s)
> +{
> +    int i;
> +
> +    ds_put_cstr(str, "Histograms\n");
> +    ds_put_format(str,
> +                  "   %-21s  %-21s  %-21s  %-21s  %-21s  %-21s  %-21s\n",
> +                  "cycles/it", "packets/it", "cycles/pkt", "pkts/batch",
> +                  "max vhost qlen", "upcalls/it", "cycles/upcall");
> +    for (i = 0; i < NUM_BINS-1; i++) {
> +        ds_put_format(str,
> +            "   %-9d %-11"PRIu64"  %-9d %-11"PRIu64"  %-9d %-11"PRIu64
> +            "  %-9d %-11"PRIu64"  %-9d %-11"PRIu64"  %-9d %-11"PRIu64
> +            "  %-9d %-11"PRIu64"\n",
> +            s->cycles.wall[i], s->cycles.bin[i],
> +            s->pkts.wall[i],s->pkts.bin[i],
> +            s->cycles_per_pkt.wall[i], s->cycles_per_pkt.bin[i],
> +            s->pkts_per_batch.wall[i], s->pkts_per_batch.bin[i],
> +            s->max_vhost_qfill.wall[i], s->max_vhost_qfill.bin[i],
> +            s->upcalls.wall[i], s->upcalls.bin[i],
> +            s->cycles_per_upcall.wall[i], s->cycles_per_upcall.bin[i]);
> +    }
> +    ds_put_format(str,
> +                  "   %-9s %-11"PRIu64"  %-9s %-11"PRIu64"  %-9s %-11"PRIu64
> +                  "  %-9s %-11"PRIu64"  %-9s %-11"PRIu64"  %-9s %-11"PRIu64
> +                  "  %-9s %-11"PRIu64"\n",
> +                  ">", s->cycles.bin[i],
> +                  ">", s->pkts.bin[i],
> +                  ">", s->cycles_per_pkt.bin[i],
> +                  ">", s->pkts_per_batch.bin[i],
> +                  ">", s->max_vhost_qfill.bin[i],
> +                  ">", s->upcalls.bin[i],
> +                  ">", s->cycles_per_upcall.bin[i]);
> +    if (s->totals.iterations > 0) {
> +        ds_put_cstr(str,
> +                    "-----------------------------------------------------"
> +                    "-----------------------------------------------------"
> +                    "------------------------------------------------\n");
> +        ds_put_format(str,
> +                      "   %-21s  %-21s  %-21s  %-21s  %-21s  %-21s  %-21s\n",
> +                      "cycles/it", "packets/it", "cycles/pkt", "pkts/batch",
> +                      "vhost qlen", "upcalls/it", "cycles/upcall");
> +        ds_put_format(str,
> +                      "   %-21"PRIu64"  %-21.5f  %-21"PRIu64
> +                      "  %-21.5f  %-21.5f  %-21.5f  %-21"PRIu32"\n",
> +                      s->totals.cycles / s->totals.iterations,
> +                      1.0 * s->totals.pkts / s->totals.iterations,
> +                      s->totals.pkts
> +                          ? s->totals.busy_cycles / s->totals.pkts : 0,
> +                      s->totals.batches
> +                          ? 1.0 * s->totals.pkts / s->totals.batches : 0,
> +                      1.0 * s->totals.max_vhost_qfill / s->totals.iterations,
> +                      1.0 * s->totals.upcalls / s->totals.iterations,
> +                      s->totals.upcalls
> +                          ? s->totals.upcall_cycles / s->totals.upcalls : 0);
> +    }
> +}
> +
> +void
> +pmd_perf_format_iteration_history(struct ds *str, struct pmd_perf_stats *s,
> +                                  int n_iter)
> +{
> +    struct iter_stats *is;
> +    size_t index;
> +    int i;
> +
> +    if (n_iter == 0) {
> +        return;
> +    }
> +    ds_put_format(str, "   %-17s   %-10s   %-10s   %-10s   %-10s   "
> +                  "%-10s   %-10s   %-10s\n",
> +                  "tsc", "cycles", "packets", "cycles/pkt", "pkts/batch",
> +                  "vhost qlen", "upcalls", "cycles/upcall");
> +    for (i = 1; i <= n_iter; i++) {
> +        index = (s->iterations.idx + HISTORY_LEN - i) % HISTORY_LEN;
> +        is = &s->iterations.sample[index];
> +        ds_put_format(str,
> +                      "   %-17"PRIu64"   %-11"PRIu64"  %-11"PRIu32
> +                      "  %-11"PRIu64"  %-11"PRIu32"  %-11"PRIu32
> +                      "  %-11"PRIu32"  %-11"PRIu32"\n",
> +                      is->timestamp,
> +                      is->cycles,
> +                      is->pkts,
> +                      is->pkts ? is->cycles / is->pkts : 0,
> +                      is->batches ? is->pkts / is->batches : 0,
> +                      is->max_vhost_qfill,
> +                      is->upcalls,
> +                      is->upcalls ? is->upcall_cycles / is->upcalls : 0);
> +    }
> +}
> +
> +void
> +pmd_perf_format_ms_history(struct ds *str, struct pmd_perf_stats *s,
> +                           int n_ms)
> +{
> +    struct iter_stats *is;
> +    size_t index;
> +    int i;
> +
> +    if (n_ms == 0) {
> +        return;
> +    }
> +    ds_put_format(str,
> +                  "   %-12s   %-10s   %-10s   %-10s   %-10s"
> +                  "   %-10s   %-10s   %-10s   %-10s\n",
> +                  "ms", "iterations", "cycles/it", "Kpps", "cycles/pkt",
> +                  "pkts/batch", "vhost qlen", "upcalls", "cycles/upcall");
> +    for (i = 1; i <= n_ms; i++) {
> +        index = (s->milliseconds.idx + HISTORY_LEN - i) % HISTORY_LEN;
> +        is = &s->milliseconds.sample[index];
> +        ds_put_format(str,
> +                      "   %-12"PRIu64"   %-11"PRIu32"  %-11"PRIu64
> +                      "  %-11"PRIu32"  %-11"PRIu64"  %-11"PRIu32
> +                      "  %-11"PRIu32"  %-11"PRIu32"  %-11"PRIu32"\n",
> +                      is->timestamp,
> +                      is->iterations,
> +                      is->iterations ? is->cycles / is->iterations : 0,
> +                      is->pkts,
> +                      is->pkts ? is->busy_cycles / is->pkts : 0,
> +                      is->batches ? is->pkts / is->batches : 0,
> +                      is->iterations
> +                          ? is->max_vhost_qfill / is->iterations : 0,
> +                      is->upcalls,
> +                      is->upcalls ? is->upcall_cycles / is->upcalls : 0);
> +    }
> +}
> +
> +void
>  pmd_perf_read_counters(struct pmd_perf_stats *s,
>                         uint64_t stats[PMD_N_STATS])
>  {
> @@ -59,8 +354,20 @@ void
>  pmd_perf_stats_clear(struct pmd_perf_stats *s)
>  {
>      s->start_ms = 0; /* Marks start of clearing. */
> +    memset(&s->current, 0 , sizeof(struct iter_stats));
> +    memset(&s->totals, 0 , sizeof(struct iter_stats));
>      for (int i = 0; i < PMD_N_STATS; i++) {
>          atomic_read_relaxed(&s->counters.n[i], &s->counters.zero[i]);
>      }
> +    histogram_clear(&s->cycles);
> +    histogram_clear(&s->pkts);
> +    histogram_clear(&s->cycles_per_pkt);
> +    histogram_clear(&s->upcalls);
> +    histogram_clear(&s->cycles_per_upcall);
> +    histogram_clear(&s->pkts_per_batch);
> +    histogram_clear(&s->max_vhost_qfill);
> +    history_init(&s->iterations);
> +    history_init(&s->milliseconds);
>      s->start_ms = time_msec(); /* Clearing finished. */
> +    s->milliseconds.sample[0].timestamp = s->start_ms;
>  }
[[BO'M]] This new metrics data (histograms & history) does not have a shadow 'zero' mark like the existing counters do. That ought to make the metric data more susceptible to a race where an old (large) value is written back while the pmd thread is incrementing it.
The use of start_ms to avoid this helps, but the check on it in pmd_perf_start_iteration will, I think, itself be subject to a race.
Apologies if that is not the case. If it is, then we can exploit the facts that 1) the cli thread is not in a particular hurry to get done and 2) we don't mind missing some iterations when we reset the stats, to effectively avoid the race:
1. cli-thread - uses start_ms (or some other flag) to tell the pmd to stop incrementing stats
2. cli-thread - waits long enough for several pmd iterations to pass
3. cli-thread - clears the data
4. cli-thread - uses start_ms to tell the pmd to continue incrementing stats
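Roughly, as a sketch (untested, reusing the existing start_ms flag and the clearing calls from the patch):

    void
    pmd_perf_stats_clear(struct pmd_perf_stats *s)
    {
        s->start_ms = 0;              /* 1. Tell the PMD to stop updating. */
        xsleep(1);                    /* 2. Wait long enough for in-flight
                                       *    iterations to drain. */
        memset(&s->current, 0, sizeof s->current);    /* 3. Clear the data, */
        memset(&s->totals, 0, sizeof s->totals);      /*    histograms and  */
        histogram_clear(&s->cycles);                  /*    histories as in */
        history_init(&s->iterations);                 /*    the patch...    */
        s->start_ms = time_msec();    /* 4. Tell the PMD to resume. */
    }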

> diff --git a/lib/dpif-netdev-perf.h b/lib/dpif-netdev-perf.h index
> fa5cf37..3611723 100644
> --- a/lib/dpif-netdev-perf.h
> +++ b/lib/dpif-netdev-perf.h
> @@ -33,6 +33,11 @@
>  extern "C" {
>  #endif
> 
> +#define NUM_BINS 32             /* Number of histogram bins. */
> +#define HISTORY_LEN 1000        /* Length of recorded history
> +                                   (iterations and ms). */
[[BO'M]] That explains why I don't see 1024 history elements! There is still a difference between the ms and iter histories though: one shows 1000 entries, the other 999 (the last one is all zeros).

> +#define DEF_HIST_SHOW 20        /* Default number of history samples. */
> +
>  enum pmd_stat_type {
>      PMD_STAT_EXACT_HIT,     /* Packets that had an exact match (emc). */
>      PMD_STAT_MASKED_HIT,    /* Packets that matched in the flow table. */
> @@ -62,6 +67,28 @@ enum pmd_stat_type {
>      PMD_N_STATS
>  };
> 
> +struct histogram {
> +    uint32_t wall[NUM_BINS];
> +    uint64_t bin[NUM_BINS];
> +};
> +
> +struct iter_stats {
> +    uint64_t timestamp;
> +    uint64_t cycles;
> +    uint64_t busy_cycles;
> +    uint32_t iterations;
> +    uint32_t pkts;
> +    uint32_t upcalls;
> +    uint32_t upcall_cycles;
> +    uint32_t batches;
> +    uint32_t max_vhost_qfill;
> +};
> +
> +struct history {
> +    uint64_t idx;
> +    struct iter_stats sample[HISTORY_LEN];
> +};
> +
>  struct pmd_counters {
>      atomic_uint64_t n[PMD_N_STATS];     /* Indexed by PMD_STAT_*. */
>      uint64_t zero[PMD_N_STATS];
> @@ -71,6 +98,17 @@ struct pmd_perf_stats {
>      uint64_t start_ms;
>      uint64_t last_tsc;
>      struct pmd_counters counters;
> +    struct iter_stats current;
> +    struct iter_stats totals;
> +    struct histogram cycles;
> +    struct histogram pkts;
> +    struct histogram cycles_per_pkt;
> +    struct histogram upcalls;
> +    struct histogram cycles_per_upcall;
> +    struct histogram pkts_per_batch;
> +    struct histogram max_vhost_qfill;
> +    struct history iterations;
> +    struct history milliseconds;
>  };
> 
>  enum pmd_info_type;
> @@ -98,6 +136,63 @@ pmd_perf_update_counter(struct pmd_perf_stats *s,
>      atomic_store_relaxed(&s->counters.n[counter], tmp);
>  }
> 
> +void pmd_perf_format_overall_stats(struct ds *str, struct pmd_perf_stats *s,
> +                                   double duration);
> +void pmd_perf_format_histograms(struct ds *str, struct pmd_perf_stats *s);
> +void pmd_perf_format_iteration_history(struct ds *str,
> +                                       struct pmd_perf_stats *s,
> +                                       int n_iter);
> +void pmd_perf_format_ms_history(struct ds *str, struct pmd_perf_stats *s,
> +                                int n_ms);
> +
> +void pmd_perf_show(struct unixctl_conn *conn, int argc,
> +                   const char *argv[],
> +                   void *aux OVS_UNUSED);
> +
> +static inline void
> +histogram_add_sample(struct histogram *hist, uint32_t val)
> +{
> +    /* TODO: Can do better with binary search? */
> +    for (int i = 0; i < NUM_BINS-1; i++) {
> +        if (val <= hist->wall[i]) {
> +            hist->bin[i]++;
> +            return;
> +        }
> +    }
> +    hist->bin[NUM_BINS-1]++;
> +}
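[[BO'M]] Re the TODO above: the walls are monotonically increasing and the last wall is UINT32_MAX, so a binary search for the first wall >= val would work. A sketch (untested):

    static inline void
    histogram_add_sample(struct histogram *hist, uint32_t val)
    {
        int lo = 0, hi = NUM_BINS - 1;

        /* Find the first bin whose wall is >= val. */
        while (lo < hi) {
            int mid = (lo + hi) / 2;

            if (val <= hist->wall[mid]) {
                hi = mid;
            } else {
                lo = mid + 1;
            }
        }
        hist->bin[lo]++;
    }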
> +
> +uint64_t histogram_samples(const struct histogram *hist);
> +
> +static inline struct iter_stats *
> +history_current(struct history *h)
> +{
> +    return &h->sample[h->idx];
> +}
> +
> +static inline struct iter_stats *
> +history_next(struct history *h)
> +{
> +    struct iter_stats *next;
> +
> +    h->idx++;
> +    if (h->idx == HISTORY_LEN) {
> +        h->idx = 0;
> +    }
> +    next = &h->sample[h->idx];
> +    memset(next, 0, sizeof(*next));
> +    return next;
> +}
> +
> +static inline struct iter_stats *
> +history_store(struct history *h, struct iter_stats *is)
> +{
> +    if (is) {
> +        h->sample[h->idx] = *is;
> +    }
> +    /* Advance the history pointer */
> +    return history_next(h);
> +}
> +
>  static inline void
>  pmd_perf_start_iteration(struct pmd_perf_stats *s, uint64_t now_tsc)
>  {
> @@ -105,14 +200,25 @@ pmd_perf_start_iteration(struct pmd_perf_stats *s, uint64_t now_tsc)
>          /* Stats not yet fully initialised. */
>          return;
>      }
> -    s->last_tsc = now_tsc;
> +    memset(&s->current, 0, sizeof(struct iter_stats));
> +    s->current.timestamp = now_tsc;
>  }
> 
>  static inline void
>  pmd_perf_end_iteration(struct pmd_perf_stats *s, uint64_t now_tsc,
> -                       int packets)
> +                       int packets, bool full_metrics)
>  {
> -    uint64_t cycles = now_tsc - s->last_tsc;
> +    struct iter_stats *cum_ms;
> +    uint64_t cycles, cycles_per_pkt = 0;
> +
> +    if (OVS_UNLIKELY(s->current.timestamp == 0)) {
> +        /* Stats were cleared during the ongoing iteration. */
> +        return;
> +    }
> +
> +    cycles = now_tsc - s->current.timestamp;
> +    s->current.cycles = cycles;
> +    s->current.pkts = packets;
> 
>      /* No need for atomic updates as only invoked within PMD thread. */
>      if (packets > 0) {
> @@ -120,7 +226,68 @@ pmd_perf_end_iteration(struct pmd_perf_stats *s, uint64_t now_tsc,
>      } else {
>          s->counters.n[PMD_CYCLES_ITER_IDLE] += cycles;
>      }
> +    /* Add iteration sample to histograms. */
> +    histogram_add_sample(&s->cycles, cycles);
> +    histogram_add_sample(&s->pkts, packets);
> +
> +    if (!full_metrics) {
> +        return;
> +    }
> +
> +    s->counters.n[PMD_CYCLES_UPCALL] += s->current.upcall_cycles;
> +
> +    if (packets > 0) {
> +        cycles_per_pkt = cycles / packets;
> +        histogram_add_sample(&s->cycles_per_pkt, cycles_per_pkt);
> +    }
> +    if (s->current.batches > 0) {
> +        histogram_add_sample(&s->pkts_per_batch, packets / s->current.batches);
> +    }
> +    histogram_add_sample(&s->upcalls, s->current.upcalls);
> +    if (s->current.upcalls > 0) {
> +        histogram_add_sample(&s->cycles_per_upcall,
> +                             s->current.upcall_cycles / s->current.upcalls);
> +    }
> +    histogram_add_sample(&s->max_vhost_qfill, s->current.max_vhost_qfill);
> +
> +    /* Add iteration samples to millisecond stats. */
> +    cum_ms = history_current(&s->milliseconds);
> +    cum_ms->iterations++;
> +    cum_ms->cycles += cycles;
> +    if (packets > 0) {
> +        cum_ms->busy_cycles += cycles;
> +    }
> +    cum_ms->pkts += s->current.pkts;
> +    cum_ms->upcalls += s->current.upcalls;
> +    cum_ms->upcall_cycles += s->current.upcall_cycles;
> +    cum_ms->batches += s->current.batches;
> +    cum_ms->max_vhost_qfill += s->current.max_vhost_qfill;
> +
> +    /* Store in iteration history. */
> +    history_store(&s->iterations, &s->current);
> +    if (now_tsc - s->last_tsc > 10000) {
> +        /* Check if ms is completed and store in milliseconds history. */
> +        uint64_t now = time_msec();
> +        if (now != cum_ms->timestamp) {
> +            /* Add ms stats to totals. */
> +            s->totals.iterations += cum_ms->iterations;
> +            s->totals.cycles += cum_ms->cycles;
> +            s->totals.busy_cycles += cum_ms->busy_cycles;
> +            s->totals.pkts += cum_ms->pkts;
> +            s->totals.upcalls += cum_ms->upcalls;
> +            s->totals.upcall_cycles += cum_ms->upcall_cycles;
> +            s->totals.batches += cum_ms->batches;
> +            s->totals.max_vhost_qfill += cum_ms->max_vhost_qfill;
> +            cum_ms = history_next(&s->milliseconds);
> +            cum_ms->timestamp = now;
> +        }
> +        s->last_tsc = now_tsc;
> +    }
> +}
> +
> +#ifdef  __cplusplus
>  }
> +#endif
> 
>  #ifdef  __cplusplus
>  }
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 8b9698a..8f64df3 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -53,6 +53,7 @@
>  #include "id-pool.h"
>  #include "latch.h"
>  #include "netdev.h"
> +#include "netdev-provider.h"
>  #include "netdev-vport.h"
>  #include "netlink.h"
>  #include "odp-execute.h"
> @@ -278,6 +279,8 @@ struct dp_netdev {
> 
>      /* Probability of EMC insertions is a factor of 'emc_insert_min'.*/
>      OVS_ALIGNED_VAR(CACHE_LINE_SIZE) atomic_uint32_t emc_insert_min;
> +    /* Enable collection of PMD performance metrics. */
> +    ATOMIC(bool) pmd_perf_metrics;
> 
>      /* Protects access to ofproto-dpif-upcall interface during revalidator
>       * thread synchronization. */
> @@ -703,6 +706,7 @@ static inline bool emc_entry_alive(struct emc_entry *ce);
>  static void emc_clear_entry(struct emc_entry *ce);
> 
>  static void dp_netdev_request_reconfigure(struct dp_netdev *dp);
> +static inline bool
> +pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd);
> 
>  static void
>  emc_cache_init(struct emc_cache *flow_cache)
> @@ -786,7 +790,8 @@ get_dp_netdev(const struct dpif *dpif)
>  enum pmd_info_type {
>      PMD_INFO_SHOW_STATS,  /* Show how cpu cycles are spent. */
>      PMD_INFO_CLEAR_STATS, /* Set the cycles count to 0. */
> -    PMD_INFO_SHOW_RXQ     /* Show poll-lists of pmd threads. */
> +    PMD_INFO_SHOW_RXQ,    /* Show poll-lists of pmd threads. */
> +    PMD_INFO_PERF_SHOW,   /* Show pmd performance details. */
>  };
> 
>  static void
> @@ -873,8 +878,46 @@ pmd_info_show_stats(struct ds *reply,
>      ds_put_format(reply,
>                    "\tavg processing cycles per packet: "
>                    "%.02f (%"PRIu64"/%"PRIu64")\n",
> -                  stats[PMD_CYCLES_POLL_BUSY] / (double)total_packets,
> -                  stats[PMD_CYCLES_POLL_BUSY], total_packets);
> +                  stats[PMD_CYCLES_ITER_BUSY] / (double)total_packets,
> +                  stats[PMD_CYCLES_ITER_BUSY], total_packets);
>  }
> +
> +static void
> +pmd_info_show_perf(struct ds *reply,
> +                   struct dp_netdev_pmd_thread *pmd,
> +                   struct pmd_perf_params *par)
> +{
> +    if (pmd->core_id != NON_PMD_CORE_ID) {
> +        char *time_str =
> +                xastrftime_msec("%H:%M:%S.###", time_wall_msec(), true);
> +        long long now = time_msec();
> +        double duration = (now - pmd->perf_stats.start_ms) / 1000.0;
> +
> +        ds_put_cstr(reply, "\n");
> +        ds_put_format(reply, "Time: %s\n", time_str);
> +        ds_put_format(reply, "Measurement duration: %.3f s\n", duration);
> +        ds_put_cstr(reply, "\n");
> +        format_pmd_thread(reply, pmd);
> +        ds_put_cstr(reply, "\n");
> +        pmd_perf_format_overall_stats(reply, &pmd->perf_stats, duration);
> +        if (pmd_perf_metrics_enabled(pmd)) {
> +            if (par->histograms) {
> +                ds_put_cstr(reply, "\n");
> +                pmd_perf_format_histograms(reply, &pmd->perf_stats);
> +            }
> +            if (par->iter_hist_len > 0) {
> +                ds_put_cstr(reply, "\n");
> +                pmd_perf_format_iteration_history(reply, &pmd->perf_stats,
> +                        par->iter_hist_len);
> +            }
> +            if (par->ms_hist_len > 0) {
> +                ds_put_cstr(reply, "\n");
> +                pmd_perf_format_ms_history(reply, &pmd->perf_stats,
> +                        par->ms_hist_len);
> +            }
> +        }
> +        free(time_str);
> +    }
>  }
> 
>  static int
> @@ -1079,6 +1122,8 @@ dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
>              pmd_perf_stats_clear(&pmd->perf_stats);
>          } else if (type == PMD_INFO_SHOW_STATS) {
>              pmd_info_show_stats(&reply, pmd);
> +        } else if (type == PMD_INFO_PERF_SHOW) {
> +            pmd_info_show_perf(&reply, pmd,
> +                               (struct pmd_perf_params *)aux);
>          }
>      }
>      free(pmd_list);
> @@ -1088,6 +1133,48 @@ dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
>      unixctl_command_reply(conn, ds_cstr(&reply));
>      ds_destroy(&reply);
>  }
> +
> +static void
> +pmd_perf_show_cmd(struct unixctl_conn *conn, int argc,
> +                          const char *argv[],
> +                          void *aux OVS_UNUSED)
> +{
> +    struct pmd_perf_params par;
> +    long int it_hist = 0, ms_hist = 0;
> +    par.histograms = true;
> +
> +    while (argc > 1) {
> +        if (!strcmp(argv[1], "-nh")) {
> +            par.histograms = false;
> +            argc -= 1;
> +            argv += 1;
> +        } else if (!strcmp(argv[1], "-it") && argc > 2) {
> +            it_hist = strtol(argv[2], NULL, 10);
> +            if (it_hist < 0) {
> +                it_hist = 0;
> +            } else if (it_hist > HISTORY_LEN) {
> +                it_hist = HISTORY_LEN;
> +            }
> +            argc -= 2;
> +            argv += 2;
> +        } else if (!strcmp(argv[1], "-ms") && argc > 2) {
> +            ms_hist = strtol(argv[2], NULL, 10);
> +            if (ms_hist < 0) {
> +                ms_hist = 0;
> +            } else if (ms_hist > HISTORY_LEN) {
> +                ms_hist = HISTORY_LEN;
> +            }
> +            argc -= 2;
> +            argv += 2;
> +        } else {
> +            break;
> +        }
> +    }
> +    par.iter_hist_len = it_hist;
> +    par.ms_hist_len = ms_hist;
> +    par.command_type = PMD_INFO_PERF_SHOW;
> +    dpif_netdev_pmd_info(conn, argc, argv, &par);
> +}
> 
>  static int
>  dpif_netdev_init(void)
> @@ -1105,6 +1192,12 @@ dpif_netdev_init(void)
>      unixctl_command_register("dpif-netdev/pmd-rxq-show", "[-pmd core | dp]",
>                               0, 2, dpif_netdev_pmd_info,
>                               (void *)&poll_aux);
> +    unixctl_command_register("dpif-netdev/pmd-perf-show",
> +                             "[-nh] [-it iter-history-len]"
> +                             " [-ms ms-history-len]"
> +                             " [-pmd core | dp]",
> +                             0, 7, pmd_perf_show_cmd,
> +                             NULL);
>      unixctl_command_register("dpif-netdev/pmd-rxq-rebalance", "[dp]",
>                               0, 1, dpif_netdev_pmd_rebalance,
>                               NULL);
> @@ -2980,6 +3073,18 @@ dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
>          }
>      }
> 
> +    bool perf_enabled = smap_get_bool(other_config, "pmd-perf-metrics",
> +                                      false);
> +    bool cur_perf_enabled;
> +    atomic_read_relaxed(&dp->pmd_perf_metrics, &cur_perf_enabled);
> +    if (perf_enabled != cur_perf_enabled) {
> +        atomic_store_relaxed(&dp->pmd_perf_metrics, perf_enabled);
> +        if (perf_enabled) {
> +            VLOG_INFO("PMD performance metrics collection enabled");
> +        } else {
> +            VLOG_INFO("PMD performance metrics collection disabled");
> +        }
> +    }
> +
>      return 0;
>  }
> 
> @@ -3160,6 +3265,14 @@ cycles_count_intermediate(struct dp_netdev_pmd_thread *pmd,
>      }
>  }
> 
> +static inline bool
> +pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd)
> +{
> +    bool pmd_perf_enabled;
> +    atomic_read_relaxed(&pmd->dp->pmd_perf_metrics, &pmd_perf_enabled);
> +    return pmd_perf_enabled;
> +}
> +
>  static void
>  dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
>                           enum rxq_cycles_counter_type type,
> @@ -3236,6 +3349,7 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
>                             struct netdev_rxq *rx,
>                             odp_port_t port_no)
>  {
> +    struct pmd_perf_stats *s = &pmd->perf_stats;
>      struct dp_packet_batch batch;
>      int error;
>      int batch_cnt = 0;
> @@ -3245,8 +3359,25 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
>      if (!error) {
>          *recirc_depth_get() = 0;
>          pmd_thread_ctx_time_update(pmd);
> -
>          batch_cnt = batch.count;
> +        if (pmd_perf_metrics_enabled(pmd)) {
> +            /* Update batch histogram. */
> +            s->current.batches++;
> +            histogram_add_sample(&s->pkts_per_batch, batch_cnt);
> +            /* Update the maximum Rx queue fill level. */
> +            uint32_t qfill = batch.qfill;
> +            switch (netdev_dpdk_get_type(netdev_rxq_get_netdev(rx))) {
> +            case DPDK_DEV_VHOST:
> +                if (qfill > s->current.max_vhost_qfill) {
> +                    s->current.max_vhost_qfill = qfill;
> +                }
> +                break;
> +            case DPDK_DEV_ETH:
> +            default:
> +                break;
> +            }
> +        }
> +        /* Process packet batch. */
>          dp_netdev_input(pmd, &batch, port_no);
>          dp_netdev_pmd_flush_output_packets(pmd);
>      } else if (error != EAGAIN && error != EOPNOTSUPP) {
> @@ -4084,7 +4215,9 @@ reload:
>      cycles_count_start(pmd);
>      for (;;) {
>          uint64_t iter_packets = 0;
> +
>          pmd_perf_start_iteration(s, pmd->ctx.last_cycles);
> +
>          for (i = 0; i < poll_cnt; i++) {
>              process_packets =
>                  dp_netdev_process_rxq_port(pmd, poll_list[i].rxq->rx,
> @@ -4116,7 +4249,9 @@ reload:
>              }
>              cycles_count_intermediate(pmd, NULL, PMD_CYCLES_OVERHEAD);
>          }
> -        pmd_perf_end_iteration(s, pmd->ctx.last_cycles, iter_packets);
> +
> +        pmd_perf_end_iteration(s, pmd->ctx.last_cycles, iter_packets,
> +                               pmd_perf_metrics_enabled(pmd));
>      }
> 
>      cycles_count_end(pmd, PMD_CYCLES_OVERHEAD);
> @@ -5012,6 +5147,7 @@ handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
>      struct match match;
>      ovs_u128 ufid;
>      int error;
> +    uint64_t cycles = cycles_counter();
> 
>      match.tun_md.valid = false;
>      miniflow_expand(&key->mf, &match.flow);
> @@ -5065,9 +5201,15 @@ handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
>          ovs_mutex_unlock(&pmd->flow_mutex);
>          emc_probabilistic_insert(pmd, key, netdev_flow);
>      }
> -    /* Only error ENOSPC can reach here. We process the packet but do not
> -     * install a datapath flow. Treat as successful. */
> -    return 0;
> +    if (pmd_perf_metrics_enabled(pmd)) {
> +        /* Update upcall stats. */
> +        cycles = cycles_counter() - cycles;
> +        struct pmd_perf_stats *s = &pmd->perf_stats;
> +        s->current.upcalls++;
> +        s->current.upcall_cycles += cycles;
> +        histogram_add_sample(&s->cycles_per_upcall, cycles);
> +    }
> +    return error;
>  }
> 
>  static inline void
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index 364f545..d701621 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -36,6 +36,7 @@
>  #include <rte_mbuf.h>
>  #include <rte_meter.h>
>  #include <rte_pci.h>
> +#include <rte_version.h>
>  #include <rte_vhost.h>
>  #include <rte_version.h>
> 
> @@ -188,11 +189,6 @@ enum { DPDK_RING_SIZE = 256 };
>  BUILD_ASSERT_DECL(IS_POW2(DPDK_RING_SIZE));
>  enum { DRAIN_TSC = 200000ULL };
> 
> -enum dpdk_dev_type {
> -    DPDK_DEV_ETH = 0,
> -    DPDK_DEV_VHOST = 1,
> -};
> -
>  /* Quality of Service */
> 
>  /* An instance of a QoS configuration.  Always associated with a particular
> @@ -835,6 +831,13 @@ netdev_dpdk_cast(const struct netdev *netdev)
>      return CONTAINER_OF(netdev, struct netdev_dpdk, up);
>  }
> 
> +enum dpdk_dev_type
> +netdev_dpdk_get_type(const struct netdev *netdev)
> +{
> +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> +    return dev->type;
> +}
> +
>  static struct netdev *
>  netdev_dpdk_alloc(void)
>  {
> @@ -1644,6 +1647,14 @@ netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
>          return EAGAIN;
>      }
> 
> +    batch->qfill = nb_rx;
> +
> +    if (OVS_UNLIKELY(nb_rx == NETDEV_MAX_BURST)) {
> +        batch->qfill += rte_vhost_rx_queue_count(netdev_dpdk_get_vid(dev),
> +                                                 qid * VIRTIO_QNUM
> +                                                     + VIRTIO_TXQ);
> +    }
> +
>      if (policer) {
>          dropped = nb_rx;
>          nb_rx = ingress_policer_run(policer,
> @@ -1683,6 +1694,8 @@ netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet_batch *batch)
>          return EAGAIN;
>      }
> 
> +    batch->qfill = nb_rx;
> +
>      if (policer) {
>          dropped = nb_rx;
>          nb_rx = ingress_policer_run(policer,
> diff --git a/lib/netdev-dpdk.h b/lib/netdev-dpdk.h
> index b7d02a7..2b357db 100644
> --- a/lib/netdev-dpdk.h
> +++ b/lib/netdev-dpdk.h
> @@ -22,11 +22,18 @@
>  #include "openvswitch/compiler.h"
> 
>  struct dp_packet;
> +struct netdev;
> +
> +enum dpdk_dev_type {
> +    DPDK_DEV_ETH = 0,
> +    DPDK_DEV_VHOST = 1,
> +};
> 
>  #ifdef DPDK_NETDEV
> 
>  void netdev_dpdk_register(void);
>  void free_dpdk_buf(struct dp_packet *);
> +enum dpdk_dev_type netdev_dpdk_get_type(const struct netdev *netdev);
> 
>  #else
> 
> @@ -41,6 +48,13 @@ free_dpdk_buf(struct dp_packet *buf OVS_UNUSED)
>      /* Nothing */
>  }
> 
> +static inline enum dpdk_dev_type
> +netdev_dpdk_get_type(const struct netdev *netdev OVS_UNUSED)
> +{
> +    /* Nothing to do. Return value zero to make compiler happy. */
> +    return DPDK_DEV_ETH;
> +}
> +
>  #endif
> 
>  #endif /* netdev-dpdk.h */
> diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c
> index 43b7b89..3845b12 100644
> --- a/ofproto/ofproto-dpif.c
> +++ b/ofproto/ofproto-dpif.c
> @@ -5317,7 +5317,8 @@ dpif_show_backer(const struct dpif_backer *backer, struct ds *ds)
> 
>      dpif_get_dp_stats(backer->dpif, &dp_stats);
>      ds_put_format(ds, "%s: hit:%"PRIu64" missed:%"PRIu64"\n",
> -                  dpif_name(backer->dpif), dp_stats.n_hit, dp_stats.n_missed);
> +                  dpif_name(backer->dpif),
> +                  dp_stats.n_hit + dp_stats.n_mask_hit, dp_stats.n_missed);
> 
>      shash_init(&ofproto_shash);
>      ofprotos = get_ofprotos(&ofproto_shash);
> --
> 1.9.1
> 
> _______________________________________________
> dev mailing list
> dev at openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
