[ovs-dev] [PATCH 4/4] tunneling: Avoid recirculation on datapath by computing the recirculate actions at translate time.

Joe Stringer joe at ovn.org
Mon Jun 26 18:05:56 UTC 2017


On 16 June 2017 at 09:45, Sugesh Chandran <sugesh.chandran at intel.com> wrote:
> This patch set removes the recirculation of encapsulated tunnel packets
> if possible. It is done by computing the post tunnel actions at the time of
> translation. The combined nested action set are programmed in the datapath
> using CLONE action.
>
> Signed-off-by: Sugesh Chandran <sugesh.chandran at intel.com>
> Signed-off-by: Zoltán Balogh <zoltan.balogh at ericsson.com>
> Co-authored-by: Zoltán Balogh <zoltan.balogh at ericsson.com>
> ---

It seems like this approach attempts to simulate the later bridge
traversal to generate the actions to be processed, forgets all of this
processing, then, if it won't work, generates the recirc action; but
if it would work, it goes back through the later bridge traversal again,
this time keeping all of the generated netlink actions, attributing
stats, and applying side effects.

So, the first traversal needs to *not* attribute stats/side effects
initially, until we later do the check and determine that it's the
right thing to do... but later on, if we determine that it's the
correct set of actions, then ideally we wouldn't have to go and do the
whole same translation again. The common case should be that the
later bridge traversal produces the actual actions we want. Could we
instead, for this first traversal, clear out the
packet/resubmit_stats/etc, then provide an xlate_cache object to
collect stats/side effects, traverse the other bridge, then if we
determine that it was viable to do it this way, keep the buffer as-is,
use xlate_cache to handle the stats/side effects, then (if there is
already an xin->xc) add temporary xlate_cache entries to the end of
the xin->xc?

Modulo need for the clone action, of course... but in my eyes even if
we just needed to shift a portion of the odp buffer back/forward to
insert/remove a clone action afterwards, then doing this memmove would
be cheaper than performing the whole translation for the later bridge
again.

Now, in the packet processing path for xlate_actions I wouldn't expect
the xlate_cache to be there today. If implementing the above, then we
would introduce xlate_cache to that path, which can be expensive due
to things like taking atomic refcounts on rules. So, I am making an
assumption that in this situation, performing one xlate_actions in the
subsequent bridge with xlate_cache being collected is cheaper than
performing the two xlate_actions with no xlate_cache. It might be
worthwhile to check out that assumption.

For what it's worth, I think Andy had some thoughts on doing cloning
in this area, but he's OOO so won't be able to provide that feedback
just yet. He had posted the patches below. There is outstanding
feedback on these so I imagine that he will want to respin them. It
might be worth taking a look to see how they might interact with your
series:
http://patchwork.ozlabs.org/patch/767655/
http://patchwork.ozlabs.org/patch/767656/

>  lib/dpif-netdev.c                  | 18 +-------
>  ofproto/ofproto-dpif-xlate-cache.c | 14 ++++++-
>  ofproto/ofproto-dpif-xlate-cache.h | 15 ++++++-
>  ofproto/ofproto-dpif-xlate.c       | 85 ++++++++++++++++++++++++++++++++++----
>  ofproto/ofproto-dpif-xlate.h       |  1 +
>  ofproto/ofproto-dpif.c             |  3 +-
>  6 files changed, 107 insertions(+), 29 deletions(-)
>
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 2b65dc7..7e10f1d 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -5029,24 +5029,8 @@ dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
>
>      case OVS_ACTION_ATTR_TUNNEL_PUSH:
>          if (*depth < MAX_RECIRC_DEPTH) {
> -            struct dp_packet_batch tnl_pkt;
> -            struct dp_packet_batch *orig_packets_ = packets_;
> -            int err;
> -
> -            if (!may_steal) {
> -                dp_packet_batch_clone(&tnl_pkt, packets_);
> -                packets_ = &tnl_pkt;
> -                dp_packet_batch_reset_cutlen(orig_packets_);
> -            }
> -
>              dp_packet_batch_apply_cutlen(packets_);
> -
> -            err = push_tnl_action(pmd, a, packets_);
> -            if (!err) {
> -                (*depth)++;
> -                dp_netdev_recirculate(pmd, packets_);
> -                (*depth)--;
> -            }
> +            push_tnl_action(pmd, a, packets_);
>              return;
>          }
>          break;
> diff --git a/ofproto/ofproto-dpif-xlate-cache.c b/ofproto/ofproto-dpif-xlate-cache.c
> index 9161701..945f169 100644
> --- a/ofproto/ofproto-dpif-xlate-cache.c
> +++ b/ofproto/ofproto-dpif-xlate-cache.c
> @@ -89,7 +89,7 @@ xlate_cache_netdev(struct xc_entry *entry, const struct dpif_flow_stats *stats)
>  /* Push stats and perform side effects of flow translation. */
>  void
>  xlate_push_stats_entry(struct xc_entry *entry,
> -                       const struct dpif_flow_stats *stats)
> +                       struct dpif_flow_stats *stats)
>  {
>      struct eth_addr dmac;
>
> @@ -160,6 +160,14 @@ xlate_push_stats_entry(struct xc_entry *entry,
>              entry->controller.am = NULL; /* One time only. */
>          }
>          break;
> +    case XC_TUNNEL_HEADER:
> +        if (entry->tunnel_hdr.operation == ADD) {
> +            stats->n_bytes += stats->n_packets * entry->tunnel_hdr.hdr_size;
> +        } else {
> +            stats->n_bytes -= stats->n_packets * entry->tunnel_hdr.hdr_size;
> +        }
> +
> +        break;
>      default:
>          OVS_NOT_REACHED();
>      }
> @@ -167,7 +175,7 @@ xlate_push_stats_entry(struct xc_entry *entry,
>
>  void
>  xlate_push_stats(struct xlate_cache *xcache,
> -                 const struct dpif_flow_stats *stats)
> +                 struct dpif_flow_stats *stats)
>  {
>      if (!stats->n_packets) {
>          return;
> @@ -245,6 +253,8 @@ xlate_cache_clear_entry(struct xc_entry *entry)
>              entry->controller.am = NULL;
>          }
>          break;
> +    case XC_TUNNEL_HEADER:
> +        break;
>      default:
>          OVS_NOT_REACHED();
>      }
> diff --git a/ofproto/ofproto-dpif-xlate-cache.h b/ofproto/ofproto-dpif-xlate-cache.h
> index 13f7cbc..4ead2f6 100644
> --- a/ofproto/ofproto-dpif-xlate-cache.h
> +++ b/ofproto/ofproto-dpif-xlate-cache.h
> @@ -52,6 +52,7 @@ enum xc_type {
>      XC_GROUP,
>      XC_TNL_NEIGH,
>      XC_CONTROLLER,
> +    XC_TUNNEL_HEADER,
>  };
>
>  /* xlate_cache entries hold enough information to perform the side effects of
> @@ -119,6 +120,16 @@ struct xc_entry {
>              struct ofproto_dpif *ofproto;
>              struct ofproto_async_msg *am;
>          } controller;
> +        struct {
> +            uint64_t max_len;
> +        } truncate;

This appears to be unused.

> +        struct {
> +            enum {
> +                ADD,
> +                REMOVE,
> +            } operation;
> +            uint16_t hdr_size;
> +        } tunnel_hdr;
>      };
>  };
>
> @@ -134,8 +145,8 @@ struct xlate_cache {
>  void xlate_cache_init(struct xlate_cache *);
>  struct xlate_cache *xlate_cache_new(void);
>  struct xc_entry *xlate_cache_add_entry(struct xlate_cache *, enum xc_type);
> -void xlate_push_stats_entry(struct xc_entry *, const struct dpif_flow_stats *);
> -void xlate_push_stats(struct xlate_cache *, const struct dpif_flow_stats *);
> +void xlate_push_stats_entry(struct xc_entry *, struct dpif_flow_stats *);
> +void xlate_push_stats(struct xlate_cache *, struct dpif_flow_stats *);
>  void xlate_cache_clear_entry(struct xc_entry *);
>  void xlate_cache_clear(struct xlate_cache *);
>  void xlate_cache_uninit(struct xlate_cache *);
> diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c
> index c110f2a..bf93e8b 100644
> --- a/ofproto/ofproto-dpif-xlate.c
> +++ b/ofproto/ofproto-dpif-xlate.c
> @@ -3259,8 +3259,7 @@ is_tunnel_actions_clone_ready(struct ofpbuf *actions)
>
>  static bool
>  nested_clone_valid_on_tunnel(struct xlate_ctx *ctx, const struct xport *xport,
> -                             struct xport *out_dev,
> -                             struct ovs_action_push_tnl *tnl_push_data)
> +                             struct xport *out_dev)
>  {
>      const struct dpif_flow_stats *bckup_resubmit_stats;
>      struct xlate_cache *bckup_xcache;
> @@ -3285,7 +3284,6 @@ nested_clone_valid_on_tunnel(struct xlate_ctx *ctx, const struct xport *xport,
>      bckup_xcache = ctx->xin->xcache;
>      ctx->xin->resubmit_stats =  NULL;
>      ctx->xin->xcache = NULL;
> -    odp_put_tnl_push_action(ctx->odp_actions, tnl_push_data);
>      apply_nested_clone_actions(ctx, xport, out_dev);
>      nested_act_flag = is_tunnel_actions_clone_ready(ctx->odp_actions);
>
> @@ -3318,6 +3316,17 @@ build_tunnel_send(struct xlate_ctx *ctx, const struct xport *xport,
>      int err;
>      char buf_sip6[INET6_ADDRSTRLEN];
>      char buf_dip6[INET6_ADDRSTRLEN];
> +    /* Structures to backup Ethernet and IP of base_flow. */
> +    struct flow old_base_flow;
> +    struct flow old_flow;
> +    struct dpif_flow_stats tmp_resubmit_stats;
> +    const struct dpif_flow_stats *old_resubmit_stats = NULL;
> +    struct xlate_cache *old_xcache = NULL;
> +
> +    /* Backup base_flow data. */
> +    copy_flow(&old_base_flow, &ctx->base_flow);
> +    copy_flow(&old_flow, &ctx->xin->flow);
> +    memset(&tmp_resubmit_stats, 0, sizeof tmp_resubmit_stats);
>
>      err = tnl_route_lookup_flow(flow, &d_ip6, &s_ip6, &out_dev);
>      if (err) {
> @@ -3378,12 +3387,72 @@ build_tunnel_send(struct xlate_ctx *ctx, const struct xport *xport,
>      tnl_push_data.tnl_port = odp_to_u32(tunnel_odp_port);
>      tnl_push_data.out_port = odp_to_u32(out_dev->odp_port);
>
> -    /* After tunnel header has been added, packet_type of flow and base_flow
> -     * need to be set to PT_ETH. */
> -    ctx->xin->flow.packet_type = htonl(PT_ETH);
> -    ctx->base_flow.packet_type = htonl(PT_ETH);
> +    /* After tunnel header has been added, MAC and IP data of flow and
> +     * base_flow need to be set properly, since there is no recirculation
> +     * anymore when sending the packet to the tunnel. */
>
> +    propagate_tunnel_data_to_flow(ctx, dmac, smac, s_ip6, s_ip,
> +                                  tnl_params.is_ipv6, tnl_push_data.tnl_type);
> +
> +
> +    size_t push_action_size = 0;
> +    size_t clone_ofs = nl_msg_start_nested(ctx->odp_actions,
> +                                           OVS_ACTION_ATTR_CLONE);
>      odp_put_tnl_push_action(ctx->odp_actions, &tnl_push_data);
> +    push_action_size = ctx->odp_actions->size;
> +
> +    /* Try to see if it's possible to apply the nested clone action.
> +     * There is no way to know whether the nested clone is valid on this
> +     * tunnel without performing the translation. Doing a temporary
> +     * translation for validation.
> +     */
> +    if (!nested_clone_valid_on_tunnel(ctx, xport, out_dev)) {
> +        /* Datapath is not doing the recirculation now, so lets make it
> +         * happen explicitly.
> +         */
> +        nl_msg_put_u32(ctx->odp_actions, OVS_ACTION_ATTR_RECIRC, 0);
> +        nl_msg_end_nested(ctx->odp_actions, clone_ofs);
> +        goto out;
> +    }
> +
> +    if (ctx->xin->resubmit_stats) {
> +        ovs_mutex_lock(&ctx->xin->resubmit_stats_mutex);
> +        old_resubmit_stats = ctx->xin->resubmit_stats;
> +        tmp_resubmit_stats = *ctx->xin->resubmit_stats;
> +        tmp_resubmit_stats.n_bytes += tnl_push_data.header_len;
> +        ctx->xin->resubmit_stats = &tmp_resubmit_stats;
> +        ovs_mutex_unlock(&ctx->xin->resubmit_stats_mutex);

What exactly does this mutex protect?

> +
> +        if (ctx->xin->xcache) {
> +            struct xc_entry *entry;
> +            entry = xlate_cache_add_entry(ctx->xin->xcache, XC_TUNNEL_HEADER);
> +            entry->tunnel_hdr.hdr_size = tnl_push_data.header_len;
> +            entry->tunnel_hdr.operation = ADD;
> +            old_xcache = ctx->xin->xcache;

Is store/restore of xcache really necessary in the current approach?

> +        }
> +    }
> +
> +    apply_nested_clone_actions(ctx, xport, out_dev);
> +    if (ctx->odp_actions->size > push_action_size) {
> +        /* Update the CLONE action only when combined */
> +        nl_msg_end_nested(ctx->odp_actions, clone_ofs);
> +    }
> +    else {
> +        /* No actions after the tunnel, no need of clone */
> +        nl_msg_cancel_nested(ctx->odp_actions, clone_ofs);
> +        odp_put_tnl_push_action(ctx->odp_actions, &tnl_push_data);
> +    }
> +    if (ctx->xin->resubmit_stats) {
> +        /* Revert the resubmit stats after clone actions */
> +        ctx->xin->resubmit_stats = old_resubmit_stats;
> +        if (ctx->xin->xcache) {
> +            ctx->xin->xcache = old_xcache;
> +        }
> +    }
> +out:
> +    /* Restore flow and base_flow data. */
> +    copy_flow(&ctx->base_flow, &old_base_flow);
> +    copy_flow(&ctx->xin->flow, &old_flow);
>      return 0;
>  }
>
> @@ -6133,6 +6202,8 @@ xlate_in_init(struct xlate_in *xin, struct ofproto_dpif *ofproto,
>      xin->odp_actions = odp_actions;
>      xin->in_packet_out = false;
>
> +    ovs_mutex_init_adaptive(&xin->resubmit_stats_mutex);
> +
>      /* Do recirc lookup. */
>      xin->frozen_state = NULL;
>      if (flow->recirc_id) {
> diff --git a/ofproto/ofproto-dpif-xlate.h b/ofproto/ofproto-dpif-xlate.h
> index 68e114a..8e48665 100644
> --- a/ofproto/ofproto-dpif-xlate.h
> +++ b/ofproto/ofproto-dpif-xlate.h
> @@ -112,6 +112,7 @@ struct xlate_in {
>       * This is normally null so the client has to set it manually after
>       * calling xlate_in_init(). */
>      const struct dpif_flow_stats *resubmit_stats;
> +    struct ovs_mutex resubmit_stats_mutex;
>
>      /* Counters carried over from a pre-existing translation of a related flow.
>       * This can occur due to, e.g., the translation of an ARP packet that was
> diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c
> index 23ba1a3..a2c73c5 100644
> --- a/ofproto/ofproto-dpif.c
> +++ b/ofproto/ofproto-dpif.c
> @@ -4591,7 +4591,7 @@ packet_xlate_revert(struct ofproto *ofproto OVS_UNUSED,
>  static void
>  ofproto_dpif_xcache_execute(struct ofproto_dpif *ofproto,
>                              struct xlate_cache *xcache,
> -                            const struct dpif_flow_stats *stats)
> +                            struct dpif_flow_stats *stats)
>      OVS_REQUIRES(ofproto_mutex)
>  {
>      struct xc_entry *entry;
> @@ -4624,6 +4624,7 @@ ofproto_dpif_xcache_execute(struct ofproto_dpif *ofproto,
>          case XC_GROUP:
>          case XC_TNL_NEIGH:
>          case XC_CONTROLLER:
> +        case XC_TUNNEL_HEADER:
>              xlate_push_stats_entry(entry, stats);
>              break;
>          default:
> --
> 2.7.4
>


More information about the dev mailing list