[ovs-dev] [PATCH v8 5/6] dpif-netdev: Time based output batching.

Stokes, Ian ian.stokes at intel.com
Wed Dec 20 14:06:39 UTC 2017


> >
> > Good catch,
> >
> > I was going to push this today, but I'll hold off until this issue is
> > resolved; I don't want an existing feature such as the rxq balancing to
> > be negatively impacted if we can avoid it.
> >
> > Ian
> 
> Ian, in general I'm good with pushing the series without this patch,
> because simple output batching by itself provides a significant
> performance improvement for bonding/multiflow cases. Maybe we can
> delay only the time-based approach until we have a solution for the
> rxq-cycles issue.
> 

I think that's reasonable. With the RXQ balancing there will be a number of situations and corner cases we need to consider and test for, so the extra time for feedback on it would be useful. I'll also hold off on your DPDK docs patch for the TX flushing, so both can be merged together at a later stage.
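
For anyone skimming the thread, the gist of the time-based approach is
that a non-empty output batch is flushed either when it fills up or when
a per-port deadline, armed from tx-flush-interval when the first packet
is queued, expires. A toy model of that decision (illustrative names
only, not the patch's actual identifiers):

    #include <stdbool.h>
    #include <stdint.h>

    #define MAX_BURST 32            /* stand-in for NETDEV_MAX_BURST */

    struct toy_output_batch {
        int count;                  /* packets currently queued */
        int64_t flush_time_us;      /* deadline for sending this batch */
    };

    /* Arm the deadline when the first packet enters an empty batch. */
    static void
    toy_batch_add(struct toy_output_batch *b, int64_t now_us,
                  int64_t tx_flush_interval_us)
    {
        if (b->count == 0) {
            b->flush_time_us = now_us + tx_flush_interval_us;
        }
        b->count++;
    }

    /* Flush when full or past the deadline.  With tx-flush-interval=0
     * the deadline is 'now', i.e. instant sending. */
    static bool
    toy_batch_should_flush(const struct toy_output_batch *b,
                           int64_t now_us)
    {
        return b->count >= MAX_BURST
               || (b->count > 0 && now_us >= b->flush_time_us);
    }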

Kevin, would this solution be acceptable to you?

If so, I think I'm correct in saying that it's just this patch that should not be merged; patches 1-4 of the set can still be applied, as can patch 6?

Ian

> Best regards, Ilya Maximets.
> 
> >>
> >>>          }
> >>>
> >>>          if (lc++ > 1024) {
> >>> @@ -4150,9 +4228,6 @@ reload:
> >>>              lc = 0;
> >>>
> >>>              coverage_try_clear();
> >>> -            /* It's possible that the time was not updated on current
> >>> -             * iteration, if there were no received packets. */
> >>> -            pmd_thread_ctx_time_update(pmd);
> >>>              dp_netdev_pmd_try_optimize(pmd, poll_list, poll_cnt);
> >>>              if (!ovsrcu_try_quiesce()) {
> >>>                  emc_cache_slow_sweep(&pmd->flow_cache);
> >>> @@ -4238,7 +4313,7 @@ dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_,
> >>>      memset(exceeded_rate, 0, cnt * sizeof *exceeded_rate);
> >>>
> >>>      /* All packets will hit the meter at the same time. */
> >>> -    long_delta_t = (now - meter->used); /* msec */
> >>> +    long_delta_t = (now - meter->used) / 1000; /* msec */
> >>>
> >>>      /* Make sure delta_t will not be too large, so that bucket will not
> >>>       * wrap around below. */
> >>> @@ -4394,7 +4469,7 @@ dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id *meter_id,
> >>>          meter->flags = config->flags;
> >>>          meter->n_bands = config->n_bands;
> >>>          meter->max_delta_t = 0;
> >>> -        meter->used = time_msec();
> >>> +        meter->used = time_usec();
> >>>
> >>>          /* set up bands */
> >>>          for (i = 0; i < config->n_bands; ++i) {
> >>> @@ -4592,6 +4667,7 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
> >>>      pmd->core_id = core_id;
> >>>      pmd->numa_id = numa_id;
> >>>      pmd->need_reload = false;
> >>> +    pmd->n_output_batches = 0;
> >>>
> >>>      ovs_refcount_init(&pmd->ref_cnt);
> >>>      latch_init(&pmd->exit_latch);
> >>> @@ -4779,6 +4855,7 @@ dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
> >>>
> >>>      tx->port = port;
> >>>      tx->qid = -1;
> >>> +    tx->flush_time = 0LL;
> >>>      dp_packet_batch_init(&tx->output_pkts);
> >>>
> >>>      hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port->port_no));
> >>> @@ -4942,7 +5019,7 @@ packet_batch_per_flow_execute(struct packet_batch_per_flow *batch,
> >>>      struct dp_netdev_flow *flow = batch->flow;
> >>>
> >>>      dp_netdev_flow_used(flow, batch->array.count, batch->byte_count,
> >>> -                        batch->tcp_flags, pmd->ctx.now);
> >>> +                        batch->tcp_flags, pmd->ctx.now / 1000);
> >>>
> >>>      actions = dp_netdev_flow_get_actions(flow);
> >>>
> >>> @@ -5317,7 +5394,7 @@ dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
> >>>              continue;
> >>>          }
> >>>          interval = pmd->ctx.now - tx->last_used;
> >>> -        if (tx->qid >= 0 && (purge || interval >= XPS_TIMEOUT_MS)) {
> >>> +        if (tx->qid >= 0 && (purge || interval >= XPS_TIMEOUT)) {
> >>>              port = tx->port;
> >>>              ovs_mutex_lock(&port->txq_used_mutex);
> >>>              port->txq_used[tx->qid]--;
> >>> @@ -5338,7 +5415,7 @@ dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
> >>>      interval = pmd->ctx.now - tx->last_used;
> >>>      tx->last_used = pmd->ctx.now;
> >>>
> >>> -    if (OVS_LIKELY(tx->qid >= 0 && interval < XPS_TIMEOUT_MS)) {
> >>> +    if (OVS_LIKELY(tx->qid >= 0 && interval < XPS_TIMEOUT)) {
> >>>          return tx->qid;
> >>>      }
> >>>
> >>> @@ -5470,12 +5547,16 @@ dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
> >>>                  dp_netdev_pmd_flush_output_on_port(pmd, p);
> >>>              }
> >>>  #endif
> >>> -            if (OVS_UNLIKELY(dp_packet_batch_size(&p->output_pkts)
> >>> -                       + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST)) {
> >>> -                /* Some packets was generated while input batch processing.
> >>> -                 * Flush here to avoid overflow. */
> >>> +            if (dp_packet_batch_size(&p->output_pkts)
> >>> +                + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST) {
> >>> +                /* Flush here to avoid overflow. */
> >>>                  dp_netdev_pmd_flush_output_on_port(pmd, p);
> >>>              }
> >>> +
> >>> +            if (dp_packet_batch_is_empty(&p->output_pkts)) {
> >>> +                pmd->n_output_batches++;
> >>> +            }
> >>> +
> >>>              DP_PACKET_BATCH_FOR_EACH (packet, packets_) {
> >>>                  dp_packet_batch_add(&p->output_pkts, packet);
> >>>              }
> >>> @@ -5717,7 +5798,7 @@ dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
> >>>          conntrack_execute(&dp->conntrack, packets_, aux->flow->dl_type, force,
> >>>                            commit, zone, setmark, setlabel, aux->flow->tp_src,
> >>>                            aux->flow->tp_dst, helper, nat_action_info_ref,
> >>> -                          pmd->ctx.now);
> >>> +                          pmd->ctx.now / 1000);
> >>>          break;
> >>>      }
> >>>
> >>> diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
> >>> index 21ffaf5..bc6b1be 100644
> >>> --- a/vswitchd/vswitch.xml
> >>> +++ b/vswitchd/vswitch.xml
> >>> @@ -359,6 +359,22 @@
> >>>          </p>
> >>>        </column>
> >>>
> >>> +      <column name="other_config" key="tx-flush-interval"
> >>> +              type='{"type": "integer",
> >>> +                     "minInteger": 0, "maxInteger": 1000000}'>
> >>> +        <p>
> >>> +          Specifies the time in microseconds that a packet can wait in an
> >>> +          output batch for sending, i.e. the amount of time that a packet
> >>> +          can spend in an intermediate output queue before being sent to
> >>> +          the netdev. This option can be used to balance throughput
> >>> +          against latency: lower values decrease latency, while higher
> >>> +          values may help achieve higher throughput.
> >>> +        </p>
> >>> +        <p>
> >>> +          Defaults to 0, i.e. instant packet sending (latency optimized).
> >>> +        </p>
> >>> +      </column>
> >>> +
> >>>        <column name="other_config" key="n-handler-threads"
> >>>                type='{"type": "integer", "minInteger": 1}'>
> >>>          <p>
> >>>
> >
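
(For reference once this lands: like other datapath settings, the new
knob would be set through other_config, e.g.

    ovs-vsctl set Open_vSwitch . other_config:tx-flush-interval=50

where 50 microseconds is just an arbitrary example value, not a
recommendation.)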

