[ovs-dev] [RFC PATCH 6/8] dpif-netdev: Refactor cycle counting

Ilya Maximets i.maximets at samsung.com
Tue Jan 9 13:43:46 UTC 2018


On 09.01.2018 03:51, Jan Scheurich wrote:
> 
>>>> @@ -3429,16 +3381,17 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
>>>>      struct pmd_perf_stats *s = &pmd->perf_stats;
>>>>      struct dp_packet_batch batch;
>>>>      int error;
>>>> -    int batch_cnt = 0, output_cnt = 0;
>>>> +    int batch_cnt = 0;
>>>>
>>>>      dp_packet_batch_init(&batch);
>>>> -
>>>> -    cycles_count_intermediate(pmd, NULL, 0);
>>>>      pmd->ctx.last_rxq = rxq;
>>>> -    pmd->ctx.current_pmd_cycles_type = PMD_CYCLES_POLL_BUSY;
>>>> +
>>>> +    /* Measure duration for polling and processing rx burst. */
>>>> +    uint64_t cycles = cycles_counter(pmd);
>>>
>>> hi Jan - not a full review. Consider that this function is called from
>>> dp_netdev_input()...dp_execute_cb(); if the code below is hit, the above
>>> measurement will result in double counting.
>>>
>>> <snip>
>>> #ifdef DPDK_NETDEV
>>>             if (OVS_UNLIKELY(!dp_packet_batch_is_empty(&p->output_pkts)
>>>                              && packets_->packets[0]->source
>>>                                 != p->output_pkts.packets[0]->source)) {
>>>                 /* XXX: netdev-dpdk assumes that all packets in a single
>>>                  *      output batch has the same source. Flush here to
>>>                  *      avoid memory access issues. */
>>>                 dp_netdev_pmd_flush_output_on_port(pmd, p);
>>>             }
>>> #endif
>>>             if (dp_packet_batch_size(&p->output_pkts)
>>>                 + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST) {
>>>                 /* Flush here to avoid overflow. */
>>>                 dp_netdev_pmd_flush_output_on_port(pmd, p);
>>>             }
>>> </snip>
>>>
>>> I wouldn't be too concerned about the first one because it's unlikely. I
>>> would be more concerned about the second one, as it is quite likely that
>>> multiple rxqs are sending to a single port and the cycles for tx could
>>> be significant. Take 2 rxqs sending packets to a vhost port: unless we
>>> got batches of 32 on each rxq, there would be double counting for one of
>>> the queues every time. There could be more cases like this, as there are
>>> flushes from a lot of places. I didn't compare, but from memory I don't
>>> think this would be an issue in Ilya's patches.
>>>
>>> start/intermediate/stop type functions might be better than storing
>>> cycles locally in functions, because you could stop and start the count
>>> from any function you need. In this case, you could avoid the double
>>> counting by something like stop/call
>>> dp_netdev_pmd_flush_output_on_port()/start. That might start to make the
>>> code more like Ilya's, so it's probably good to get his review.
>>
>> You are right, I overlooked the possibility of a tx burst being triggered immediately
>> by executing an output action. This needs fixing, and I agree that a scheme which
>> is able to "suspend" an ongoing measurement at any time will be needed for
>> that. Let me have another look at the previous scheme to see if I can simplify
>> it. If not, we can stick to the original solution.
>>
> 
> I have prepared a slightly more advanced version that allows arbitrary nesting of cycle measurements, where the nested measurements are excluded from the outer measurement. Here's the outline; I am including it in the next version of my umbrella series. This version isn't tested yet:
> 
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 4761d3b..e487828 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -516,6 +516,8 @@ struct tx_port {
>      struct dp_netdev_rxq *output_pkts_rxqs[NETDEV_MAX_BURST];
>  };
>  
> +struct cycle_timer;
> +
>  /* A set of properties for the current processing loop that is not directly
>   * associated with the pmd thread itself, but with the packets being
>   * processed or the short-term system configuration (for example, time).
> @@ -527,6 +529,7 @@ struct dp_netdev_pmd_thread_ctx {
>      uint64_t last_cycles;
>      /* RX queue from which last packet was received. */
>      struct dp_netdev_rxq *last_rxq;
> +    struct cycle_timer *cur_timer;
>  };
>  
>  /* PMD: Poll modes drivers.  PMD accesses devices via polling to eliminate
> @@ -632,6 +635,71 @@ struct dpif_netdev {
>      uint64_t last_port_seq;
>  };
>  
> +/* Support for accurate timing of PMD execution on TSC clock cycle level.
> + * These functions are intended to be invoked in the context of pmd threads. */
> +
> +/* Read the TSC cycle register and cache it in the pmd context. Any function
> + * not requiring clock cycle accuracy should read the cached value from the
> + * context. */
> +static inline uint64_t
> +cycles_counter(struct dp_netdev_pmd_thread *pmd)
> +{
> +#ifdef DPDK_NETDEV
> +    return pmd->ctx.last_cycles = rte_get_tsc_cycles();
> +#else
> +    return pmd->ctx.last_cycles = 0;
> +#endif
> +}
> +
> +/* A nestable timer for measuring execution time in TSC cycles.
> + *
> + * Usage:
> + * struct cycle_timer timer;
> + *
> + * cycle_timer_start(pmd, &timer);
> + * <Timed execution>
> + * uint64_t cycles = cycle_timer_stop(pmd, &timer);
> + *
> + * The caller must guarantee that a call to cycle_timer_start() is always
> + * paired with a call to cycle_timer_stop().
> + *
> + * It is possible to have nested cycle timers within the timed code. The
> + * execution time measured by the nested timers is excluded from the time
> + * measured by the embracing timer.
> + */
> +
> +struct cycle_timer {
> +    uint64_t offset;
> +    struct cycle_timer *interrupted;
> +};
> +
> +static void
> +cycle_timer_start(struct dp_netdev_pmd_thread *pmd,
> +                  struct cycle_timer *timer)
> +{
> +    /* Suspend current timer, if any. */
> +    timer->interrupted = pmd->ctx.cur_timer;
> +    timer->offset = cycles_counter(pmd);
> +    pmd->ctx.cur_timer = timer;
> +}
> +
> +static uint64_t
> +cycle_timer_stop(struct dp_netdev_pmd_thread *pmd,
> +                 struct cycle_timer *timer)
> +{
> +    /* Assert that this is the current cycle timer. */
> +    ovs_assert(pmd->ctx.cur_timer == timer);
> +    uint64_t cycles = cycles_counter(pmd) - timer->offset;
> +    if (timer->interrupted) {
> +        /* Adjust the start offset by the used cycles. */
> +        timer->interrupted->offset += cycles;

I think this will not work for nesting more than 2 timers, because the
outer timer's offset is adjusted only by the exclusive time of the timer
one level deeper, so the time spent in deeper nested timers leaks back
into the outer measurement. You probably need one more field in the
timer structure to accumulate the suspended time while unwinding the
timer stack, instead of modifying the original offsets, like:

	uint64_t cycles = cycles_counter(pmd) - timer->offset;
	if (timer->interrupted) {
		timer->interrupted->interrupted_time += cycles;
	}
	pmd->ctx.cur_timer = timer->interrupted;
	return cycles - timer->interrupted_time;
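
Just to make the idea concrete, here is a rough, untested sketch of both
functions with such an accumulator field ('interrupted_time' is only a
working name; the important part is that it is zeroed in
cycle_timer_start() and that the full span of a stopped timer, including
its own nested timers, is charged to the timer it interrupted):

    struct cycle_timer {
        uint64_t offset;              /* TSC value at start. */
        uint64_t interrupted_time;    /* Cycles spent in nested timers. */
        struct cycle_timer *interrupted;
    };

    static inline void
    cycle_timer_start(struct dp_netdev_pmd_thread *pmd,
                      struct cycle_timer *timer)
    {
        /* Suspend current timer, if any. */
        timer->interrupted = pmd->ctx.cur_timer;
        timer->interrupted_time = 0;
        timer->offset = cycles_counter(pmd);
        pmd->ctx.cur_timer = timer;
    }

    static inline uint64_t
    cycle_timer_stop(struct dp_netdev_pmd_thread *pmd,
                     struct cycle_timer *timer)
    {
        /* Assert that this is the current cycle timer. */
        ovs_assert(pmd->ctx.cur_timer == timer);

        uint64_t cycles = cycles_counter(pmd) - timer->offset;

        if (timer->interrupted) {
            /* Charge the full span of this timer, including its own
             * nested timers, to the suspended outer timer. */
            timer->interrupted->interrupted_time += cycles;
        }
        /* Restore suspended timer, if any. */
        pmd->ctx.cur_timer = timer->interrupted;

        /* Return only the cycles not spent in nested timers. */
        return cycles - timer->interrupted_time;
    }

With this, three or more levels of nesting should still yield the
exclusive time at every level, because each stopped timer charges its
whole span to its parent, and the parent subtracts that accumulated
value only once at its own stop.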

> +    }
> +    /* Restore suspended timer, if any. */
> +    pmd->ctx.cur_timer = timer->interrupted;
> +    return cycles;
> +}
> +
>  static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
>                                struct dp_netdev_port **portp)
>      OVS_REQUIRES(dp->port_mutex);
> @@ -3267,16 +3335,6 @@ dp_netdev_actions_free(struct dp_netdev_actions *actions)
>      free(actions);
>  }
>  