[ovs-dev] [PATCH v7 2/2] dpif-netdev: Refactor cycle counting

Ilya Maximets i.maximets at samsung.com
Thu Jan 11 08:48:21 UTC 2018


On 11.01.2018 10:39, Jan Scheurich wrote:
> Simplify the historically grown TSC cycle counting in PMD threads.
> Cycles are currently counted for the following purposes:
> 
> 1. Measure PMD utilization
> 
> PMD utilization is defined as the ratio of cycles spent in busy iterations
> (at least one packet received or sent) over the total number of cycles.
> 
> This is already done in pmd_perf_start_iteration() and
> pmd_perf_end_iteration(), based on a TSC timestamp saved at the start of
> the iteration in start_iteration() and the actual TSC read in
> end_iteration(). There is no dependency on intermediate cycle accounting.
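
For reference, the resulting utilization ratio can be derived from the two
iteration counters roughly as below. This is only an illustrative sketch and
not part of the patch; the helper name pmd_utilization() is made up and the
zero offsets applied by pmd_perf_read_counters() are ignored.

    static double
    pmd_utilization(const struct pmd_perf_stats *s)
    {
        uint64_t busy = s->counters.n[PMD_CYCLES_ITER_BUSY];
        uint64_t idle = s->counters.n[PMD_CYCLES_ITER_IDLE];
        uint64_t total = busy + idle;

        /* Fraction of total cycles spent in busy iterations. */
        return total ? (double) busy / total : 0.0;
    }
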
> 
> 2. Measure the processing load per RX queue
> 
> This comprises the cycles spent on polling and processing packets received
> from the rx queue and the cycles spent on delayed sending of these packets
> to tx queues (with time-based batching).
> 
> The previous scheme using cycles_count_start(), cycles_count_intermediate()
> and cycles_count_end(), originally introduced to simplify cycle counting
> and save calls to rte_get_tsc_cycles(), was rather obscuring things.
> 
> Replace it with a nestable cycle_timer with start and stop functions to
> embrace a code segment to be timed. The timed code may contain arbitrary
> nested cycle_timers. The duration of nested timers is excluded from the
> outer timer.
> 
> The caller must ensure that each call to cycle_timer_start() is
> followed by a call to cycle_timer_stop(). Failure to do so will lead to
> an assertion failure or a memory leak.
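
As an illustration of the nesting semantics (a sketch only, mirroring the
usage comment added to dpif-netdev-perf.h; the surrounding variables and the
'pmd' thread context are assumed, not taken verbatim from the patch):

    struct cycle_timer outer, inner;
    uint64_t inner_cycles, outer_cycles;

    cycle_timer_start(&pmd->perf_stats, &outer);
    /* ... processing attributed to the outer timer ... */
    cycle_timer_start(&pmd->perf_stats, &inner);
    /* ... e.g. delayed sending of batched packets ... */
    inner_cycles = cycle_timer_stop(&pmd->perf_stats, &inner);
    /* ... more processing attributed to the outer timer ... */
    outer_cycles = cycle_timer_stop(&pmd->perf_stats, &outer);
    /* outer_cycles excludes inner_cycles. */
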
> 
> The new cycle_timer is used to measure the processing cycles per rx queue.
> This is not yet strictly necessary here but will be used by a subsequent
> commit.
> 
> All cycle count functions and data are relocated to module
> dpif-netdev-perf.
> 
> Signed-off-by: Jan Scheurich <jan.scheurich at ericsson.com>
> ---
>  lib/dpif-netdev-perf.c |   1 +
>  lib/dpif-netdev-perf.h |  99 ++++++++++++++++++++++++++++++++++++-----
>  lib/dpif-netdev.c      | 118 ++++++++++++++-----------------------------------
>  3 files changed, 124 insertions(+), 94 deletions(-)
> 
> diff --git a/lib/dpif-netdev-perf.c b/lib/dpif-netdev-perf.c
> index f853fd8..073e668 100644
> --- a/lib/dpif-netdev-perf.c
> +++ b/lib/dpif-netdev-perf.c
> @@ -32,6 +32,7 @@ pmd_perf_stats_init(struct pmd_perf_stats *s)
>  {
>      memset(s, 0 , sizeof(*s));
>      s->start_ms = time_msec();
> +    cycles_counter_update(s);

This should not be here. pmd_perf_stats_init() is called by the main
thread, which most likely runs on a different CPU core than the PMD.
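
A possible alternative (just a sketch, assuming the call is simply dropped
here): keep pmd_perf_stats_init() free of TSC reads and rely on the
cycles_counter_update(s) call that this patch already adds in
pmd_thread_main() before the iteration loop, which runs on the PMD's own
core:

    void
    pmd_perf_stats_init(struct pmd_perf_stats *s)
    {
        memset(s, 0, sizeof(*s));
        s->start_ms = time_msec();
        /* No TSC read here; the first sample is taken by the PMD thread
         * itself via cycles_counter_update() right before its main loop. */
    }
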

>  }
>  
>  void
> diff --git a/lib/dpif-netdev-perf.h b/lib/dpif-netdev-perf.h
> index 41f4e85..30d6e1a 100644
> --- a/lib/dpif-netdev-perf.h
> +++ b/lib/dpif-netdev-perf.h
> @@ -59,10 +59,6 @@ enum pmd_stat_type {
>                               * recirculation. */
>      PMD_STAT_SENT_PKTS,     /* Packets that have been sent. */
>      PMD_STAT_SENT_BATCHES,  /* Number of batches sent. */
> -    PMD_CYCLES_POLL_IDLE,   /* Cycles spent unsuccessful polling. */
> -    PMD_CYCLES_POLL_BUSY,   /* Cycles spent successfully polling and
> -                             * processing polled packets. */
> -    PMD_CYCLES_OVERHEAD,    /* Cycles spent for other tasks. */
>      PMD_CYCLES_ITER_IDLE,   /* Cycles spent in idle iterations. */
>      PMD_CYCLES_ITER_BUSY,   /* Cycles spent in busy iterations. */
>      PMD_N_STATS
> @@ -91,11 +87,95 @@ struct pmd_perf_stats {
>       * data. */
>      uint64_t start_ms;
>      /* Start of the current PMD iteration in TSC cycles.*/
> +    uint64_t start_it_tsc;
> +    /* Latest TSC time stamp taken in PMD. */
>      uint64_t last_tsc;
> +    /* If non-NULL, outermost cycle timer currently running in PMD. */
> +    struct cycle_timer *cur_timer;
>      /* Set of PMD counters with their zero offsets. */
>      struct pmd_counters counters;
>  };
>  
> +/* Support for accurate timing of PMD execution on TSC clock cycle level.
> + * These functions are intended to be invoked in the context of pmd threads. */
> +
> +/* Read the TSC cycle register and cache it. Any function not requiring clock
> + * cycle accuracy should read the cached value using cycles_counter_get() to
> + * avoid the overhead of reading the TSC register. */
> +
> +static inline uint64_t
> +cycles_counter_update(struct pmd_perf_stats *s)
> +{
> +#ifdef DPDK_NETDEV
> +    return s->last_tsc = rte_get_tsc_cycles();
> +#else
> +    return s->last_tsc = 0;
> +#endif
> +}
> +
> +static inline uint64_t
> +cycles_counter_get(struct pmd_perf_stats *s)
> +{
> +    return s->last_tsc;
> +}
> +
> +/* A nestable timer for measuring execution time in TSC cycles.
> + *
> + * Usage:
> + * struct cycle_timer timer;
> + *
> + * cycle_timer_start(pmd, &timer);
> + * <Timed execution>
> + * uint64_t cycles = cycle_timer_stop(pmd, &timer);
> + *
> + * The caller must guarantee that a call to cycle_timer_start() is always
> + * paired with a call to cycle_timer_stop().
> + *
> + * It is possible to have nested cycle timers within the timed code. The
> + * execution time measured by the nested timers is excluded from the time
> + * measured by the embracing timer.
> + */
> +
> +struct cycle_timer {
> +    uint64_t start;
> +    uint64_t suspended;
> +    struct cycle_timer *interrupted;
> +};
> +
> +static inline void
> +cycle_timer_start(struct pmd_perf_stats *s,
> +                  struct cycle_timer *timer)
> +{
> +    struct cycle_timer *cur_timer = s->cur_timer;
> +    uint64_t now = cycles_counter_update(s);
> +
> +    if (cur_timer) {
> +        cur_timer->suspended = now;
> +    }
> +    timer->interrupted = cur_timer;
> +    timer->start = now;
> +    timer->suspended = 0;
> +    s->cur_timer = timer;
> +}
> +
> +static inline uint64_t
> +cycle_timer_stop(struct pmd_perf_stats *s,
> +                 struct cycle_timer *timer)
> +{
> +    /* Assert that this is the current cycle timer. */
> +    ovs_assert(s->cur_timer == timer);
> +    uint64_t now = cycles_counter_update(s);
> +    struct cycle_timer *intr_timer = timer->interrupted;
> +
> +    if (intr_timer) {
> +        /* Adjust the start offset by the suspended cycles. */
> +        intr_timer->start += now - intr_timer->suspended;
> +    }
> +    /* Restore suspended timer, if any. */
> +    s->cur_timer = intr_timer;
> +    return now - timer->start;
> +}
> +
>  void pmd_perf_stats_init(struct pmd_perf_stats *s);
>  void pmd_perf_stats_clear(struct pmd_perf_stats *s);
>  void pmd_perf_read_counters(struct pmd_perf_stats *s,
> @@ -121,23 +201,22 @@ pmd_perf_update_counter(struct pmd_perf_stats *s,
>  }
>  
>  static inline void
> -pmd_perf_start_iteration(struct pmd_perf_stats *s, uint64_t now_tsc)
> +pmd_perf_start_iteration(struct pmd_perf_stats *s)
>  {
>      if (OVS_UNLIKELY(s->start_ms == 0)) {
>          /* Stats not yet fully initialized. */
>          return;
>      }
> -    s->last_tsc = now_tsc;
> +    s->start_it_tsc = s->last_tsc;
>  }
>  
>  static inline void
> -pmd_perf_end_iteration(struct pmd_perf_stats *s, uint64_t now_tsc,
> -                       int packets)
> +pmd_perf_end_iteration(struct pmd_perf_stats *s, int rx_packets)
>  {
> -    uint64_t cycles = now_tsc - s->last_tsc;
> +    uint64_t cycles = cycles_counter_update(s) - s->start_it_tsc;
>  
>      /* No need for atomic updates as only invoked within PMD thread. */
> -    if (packets > 0) {
> +    if (rx_packets > 0) {
>          s->counters.n[PMD_CYCLES_ITER_BUSY] += cycles;
>      } else {
>          s->counters.n[PMD_CYCLES_ITER_IDLE] += cycles;
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 538d5ce..edc5e2e 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -509,8 +509,6 @@ struct tx_port {
>  struct dp_netdev_pmd_thread_ctx {
>      /* Latest measured time. See 'pmd_thread_ctx_time_update()'. */
>      long long now;
> -    /* Used to count cycles. See 'cycles_count_end()' */
> -    unsigned long long last_cycles;
>  };
>  
>  /* PMD: Poll modes drivers.  PMD accesses devices via polling to eliminate
> @@ -3111,64 +3109,20 @@ dp_netdev_actions_free(struct dp_netdev_actions *actions)
>      free(actions);
>  }
>  
> -static inline unsigned long long
> -cycles_counter(void)
> -{
> -#ifdef DPDK_NETDEV
> -    return rte_get_tsc_cycles();
> -#else
> -    return 0;
> -#endif
> -}
> -
> -/* Fake mutex to make sure that the calls to cycles_count_* are balanced */
> -extern struct ovs_mutex cycles_counter_fake_mutex;
> -
> -/* Start counting cycles.  Must be followed by 'cycles_count_end()' */
> -static inline void
> -cycles_count_start(struct dp_netdev_pmd_thread *pmd)
> -    OVS_ACQUIRES(&cycles_counter_fake_mutex)
> -    OVS_NO_THREAD_SAFETY_ANALYSIS
> -{
> -    pmd->ctx.last_cycles = cycles_counter();
> -}
> -
> -/* Stop counting cycles and add them to the counter 'type' */
> -static inline void
> -cycles_count_end(struct dp_netdev_pmd_thread *pmd,
> -                 enum pmd_stat_type type)
> -    OVS_RELEASES(&cycles_counter_fake_mutex)
> -    OVS_NO_THREAD_SAFETY_ANALYSIS
> -{
> -    unsigned long long interval = cycles_counter() - pmd->ctx.last_cycles;
> -
> -    pmd_perf_update_counter(&pmd->perf_stats, type, interval);
> -}
> -
> -/* Calculate the intermediate cycle result and add to the counter 'type' */
> -static inline void
> -cycles_count_intermediate(struct dp_netdev_pmd_thread *pmd,
> -                          struct dp_netdev_rxq *rxq,
> -                          enum pmd_stat_type type)
> -    OVS_NO_THREAD_SAFETY_ANALYSIS
> +static void
> +dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
> +                         enum rxq_cycles_counter_type type,
> +                         unsigned long long cycles)
>  {
> -    unsigned long long new_cycles = cycles_counter();
> -    unsigned long long interval = new_cycles - pmd->ctx.last_cycles;
> -    pmd->ctx.last_cycles = new_cycles;
> -
> -    pmd_perf_update_counter(&pmd->perf_stats, type, interval);
> -    if (rxq && (type == PMD_CYCLES_POLL_BUSY)) {
> -        /* Add to the amount of current processing cycles. */
> -        non_atomic_ullong_add(&rxq->cycles[RXQ_CYCLES_PROC_CURR], interval);
> -    }
> +   atomic_store_relaxed(&rx->cycles[type], cycles);
>  }
>  
>  static void
> -dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
> +dp_netdev_rxq_add_cycles(struct dp_netdev_rxq *rx,
>                           enum rxq_cycles_counter_type type,
>                           unsigned long long cycles)
>  {
> -   atomic_store_relaxed(&rx->cycles[type], cycles);
> +    non_atomic_ullong_add(&rx->cycles[type], cycles);
>  }
>  
>  static uint64_t
> @@ -3234,27 +3188,40 @@ dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd)
>  
>  static int
>  dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
> -                           struct netdev_rxq *rx,
> +                           struct dp_netdev_rxq *rxq,
>                             odp_port_t port_no)
>  {
>      struct dp_packet_batch batch;
> +    struct cycle_timer timer;
>      int error;
>      int batch_cnt = 0;
>  
> +    /* Measure duration for polling and processing rx burst. */
> +    cycle_timer_start(&pmd->perf_stats, &timer);
>      dp_packet_batch_init(&batch);
> -    error = netdev_rxq_recv(rx, &batch);
> +    error = netdev_rxq_recv(rxq->rx, &batch);
>      if (!error) {
> +        /* At least one packet received. */
>          *recirc_depth_get() = 0;
>          pmd_thread_ctx_time_update(pmd);
>  
>          batch_cnt = batch.count;
>          dp_netdev_input(pmd, &batch, port_no);
>          dp_netdev_pmd_flush_output_packets(pmd);
> -    } else if (error != EAGAIN && error != EOPNOTSUPP) {
> -        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
>  
> -        VLOG_ERR_RL(&rl, "error receiving data from %s: %s",
> -                    netdev_rxq_get_name(rx), ovs_strerror(error));
> +        /* Assign processing cycles to rx queue. */
> +        uint64_t cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
> +        dp_netdev_rxq_add_cycles(rxq, RXQ_CYCLES_PROC_CURR, cycles);
> +
> +    } else {
> +        /* Discard cycles. */
> +        cycle_timer_stop(&pmd->perf_stats, &timer);
> +        if (error != EAGAIN && error != EOPNOTSUPP) {
> +            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
> +
> +            VLOG_ERR_RL(&rl, "error receiving data from %s: %s",
> +                    netdev_rxq_get_name(rxq->rx), ovs_strerror(error));
> +        }
>      }
>  
>      return batch_cnt;
> @@ -3880,30 +3847,22 @@ dpif_netdev_run(struct dpif *dpif)
>      struct dp_netdev *dp = get_dp_netdev(dpif);
>      struct dp_netdev_pmd_thread *non_pmd;
>      uint64_t new_tnl_seq;
> -    int process_packets = 0;
>  
>      ovs_mutex_lock(&dp->port_mutex);
>      non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
>      if (non_pmd) {
>          ovs_mutex_lock(&dp->non_pmd_mutex);
> -        cycles_count_start(non_pmd);
>          HMAP_FOR_EACH (port, node, &dp->ports) {
>              if (!netdev_is_pmd(port->netdev)) {
>                  int i;
>  
>                  for (i = 0; i < port->n_rxq; i++) {
> -                    process_packets =
> -                        dp_netdev_process_rxq_port(non_pmd,
> -                                                   port->rxqs[i].rx,
> -                                                   port->port_no);
> -                    cycles_count_intermediate(non_pmd, NULL,
> -                                              process_packets
> -                                              ? PMD_CYCLES_POLL_BUSY
> -                                              : PMD_CYCLES_POLL_IDLE);
> +                    dp_netdev_process_rxq_port(non_pmd,
> +                                               &port->rxqs[i],
> +                                               port->port_no);
>                  }
>              }
>          }
> -        cycles_count_end(non_pmd, PMD_CYCLES_POLL_IDLE);
>          pmd_thread_ctx_time_update(non_pmd);
>          dpif_netdev_xps_revalidate_pmd(non_pmd, false);
>          ovs_mutex_unlock(&dp->non_pmd_mutex);
> @@ -4082,18 +4041,14 @@ reload:
>          lc = UINT_MAX;
>      }
>  
> -    cycles_count_start(pmd);
> +    cycles_counter_update(s);
>      for (;;) {
>          uint64_t iter_packets = 0;
> -        pmd_perf_start_iteration(s, pmd->ctx.last_cycles);
> +        pmd_perf_start_iteration(s);
>          for (i = 0; i < poll_cnt; i++) {
>              process_packets =
> -                dp_netdev_process_rxq_port(pmd, poll_list[i].rxq->rx,
> +                dp_netdev_process_rxq_port(pmd, poll_list[i].rxq,
>                                             poll_list[i].port_no);
> -            cycles_count_intermediate(pmd, poll_list[i].rxq,
> -                                      process_packets
> -                                      ? PMD_CYCLES_POLL_BUSY
> -                                      : PMD_CYCLES_POLL_IDLE);
>              iter_packets += process_packets;
>          }
>  
> @@ -4115,13 +4070,10 @@ reload:
>              if (reload) {
>                  break;
>              }
> -            cycles_count_intermediate(pmd, NULL, PMD_CYCLES_OVERHEAD);
>          }
> -        pmd_perf_end_iteration(s, pmd->ctx.last_cycles, iter_packets);
> +        pmd_perf_end_iteration(s, iter_packets);
>      }
>  
> -    cycles_count_end(pmd, PMD_CYCLES_OVERHEAD);
> -
>      poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
>      exiting = latch_is_set(&pmd->exit_latch);
>      /* Signal here to make sure the pmd finishes
> @@ -5066,9 +5018,7 @@ handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
>          ovs_mutex_unlock(&pmd->flow_mutex);
>          emc_probabilistic_insert(pmd, key, netdev_flow);
>      }
> -    /* Only error ENOSPC can reach here. We process the packet but do not
> -     * install a datapath flow. Treat as successful. */
> -    return 0;
> +    return error;
>  }
>  
>  static inline void
> 

