[ovs-dev] [RFC v2 PATCH 4/5] dpif-netdev: Skip encap action during datapath execution

Sriharsha Basavapatna sriharsha.basavapatna at broadcom.com
Thu May 28 08:12:47 UTC 2020


On Tue, May 26, 2020 at 7:12 PM Eli Britstein <elibr at mellanox.com> wrote:
>
>
> On 5/18/2020 10:27 PM, Sriharsha Basavapatna wrote:
> > In this patch we check if action processing (apart from the OUTPUT action)
> > should be skipped for a given dp_netdev_flow. Specifically, we check if
> > the action is TNL_PUSH and if it has been offloaded to HW; if so, we do not
> > push the tunnel header in SW. The datapath only executes the OUTPUT action.
> > The packet will be encapsulated in HW during transmit.
> This commit should come before the actual offloading; currently, the
> version after the offload and before this skip does not work.

Sure, I'll also move the get_stats() patch before the offloading patch.

> >
> > Signed-off-by: Sriharsha Basavapatna <sriharsha.basavapatna at broadcom.com>
> > ---
> >   lib/dpif-netdev.c | 247 +++++++++++++++++++++++++++++++++++++++++-----
> >   1 file changed, 224 insertions(+), 23 deletions(-)
> >
> > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> > index 4315f237c..07d16ad61 100644
> > --- a/lib/dpif-netdev.c
> > +++ b/lib/dpif-netdev.c
> > @@ -112,6 +112,7 @@ COVERAGE_DEFINE(datapath_drop_recirc_error);
> >   COVERAGE_DEFINE(datapath_drop_invalid_port);
> >   COVERAGE_DEFINE(datapath_drop_invalid_tnl_port);
> >   COVERAGE_DEFINE(datapath_drop_rx_invalid_packet);
> > +COVERAGE_DEFINE(datapath_skip_tunnel_push);
> >
> >   /* Protects against changes to 'dp_netdevs'. */
> >   static struct ovs_mutex dp_netdev_mutex = OVS_MUTEX_INITIALIZER;
> > @@ -547,6 +548,7 @@ struct dp_netdev_flow {
> >        */
> >       bool partial_actions_offloaded;
> >       odp_port_t  egress_offload_port;
> > +    struct ovs_mutex partial_action_offload_mutex;
> Locking in the datapath should be avoided. Furthermore, the mutex is in
> struct dp_netdev_flow; there is an instance of this struct for each pmd
> thread, and the flow put is done on a mega-flow, so different pmd threads
> will have different instances of this mutex. This locking scheme doesn't
> resolve the issues from v1 about the lack of synchronization between the
> datapath and HW offload.

I understand locking is not preferred, but I wanted to start with a
mutex to get the functionality right and possibly replace it with
better synchronization primitives later (that's why it is currently
wrapped in separate lock() and unlock() functions). The lock correctly
synchronizes when the offload thread is processing the same pmd-flow
(dp_netdev_flow) that is running in the datapath. But your point about
a different pmd thread is right; we would then have to synchronize at
the mega-flow level.

Having said that, if it is not synchronized (as in patchset v1) and the
tunnel header is pushed in SW, the HW won't push the tunnel header
again, since the already-encapsulated packet no longer matches the flow
in HW and the action won't be executed there. So there is no risk of
double encapsulation. I've also confirmed this with testing (debug code
that ignores a successfully offloaded flow and still pushes the header
in SW). The cost is that a few packets at the beginning of the session
that could have been offloaded end up consuming SW cycles, but packets
eventually get offloaded. That seems acceptable, considering that the
trade-off is the additional complexity of synchronization code.

Based on this, I'm going to revert this part of the code (i.e., go back
to no-sync), and I'll add inline comments to explain it.
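
For reference, with no synchronization the check in dp_execute_cb() for
OVS_ACTION_ATTR_TUNNEL_PUSH would reduce to roughly the following (a
simplified sketch, not the final code; the partial_actions_offloaded
flag is read without a lock, on the assumption that a stale 'false'
only costs one extra SW encap before the offload takes effect):

    packet_count = dp_packet_batch_size(packets_);
    /* Skip the SW tunnel push only when the egress partial action has
     * already been offloaded; otherwise encapsulate here as usual. */
    if (dp_flow && dp_flow->partial_actions_offloaded) {
        COVERAGE_ADD(datapath_skip_tunnel_push, packet_count);
    } else if (push_tnl_action(pmd, a, packets_)) {
        COVERAGE_ADD(datapath_drop_tunnel_push_error, packet_count);
    }
    return;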

>
> >
> >       /* Statistics. */
> >       struct dp_netdev_flow_stats stats;
> > @@ -801,7 +803,8 @@ static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
> >                                         bool should_steal,
> >                                         const struct flow *flow,
> >                                         const struct nlattr *actions,
> > -                                      size_t actions_len);
> > +                                      size_t actions_len,
> > +                                      const struct dp_netdev_flow *dp_flow);
> >   static void dp_netdev_input(struct dp_netdev_pmd_thread *,
> >                               struct dp_packet_batch *, odp_port_t port_no);
> >   static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
> > @@ -2361,17 +2364,159 @@ dp_netdev_append_flow_offload(struct dp_flow_offload_item *offload)
> >       ovs_mutex_unlock(&dp_flow_offload.mutex);
> >   }
> >
> > +/*
> > + * Mapping structure to map ufid to a partial offload egress device.
> > + * Synchronization: accessed only in the context of the offload thread.
> > + */
> > +struct ufid_to_egdev_data {
> > +    const struct cmap_node node;   /* link to cmap */
> > +    ovs_u128 mega_ufid;            /* mega-ufid being mapped */
> > +    odp_port_t egress_port_num;    /* Port number mapped to */
> > +    struct dp_netdev_flow *flow;   /* flow that maps to this ufid */
> > +};
> Mapping code for offloads should be done in the offload layer and not
> in dpif. Though the partial offload (mark+rss) does the same, I think
> it was wrong for that case too.

I'd like to keep it in the dpif-netdev layer for now, since this is the
layer that implements the flow selection logic for egress partial
offload. Note that this code already runs in the context of the offload
thread (dp_netdev_flow_offload_put()).
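
To illustrate the intended semantics (a simplified sketch of the
put-side logic in the patch below, not additional code): the first
<mega-ufid, flow> mapping returns 0 and programs the egress
partial-action flow in HW; any later flow with the same mega-ufid gets
EEXIST and is only marked as covered by the existing HW flow. On
deletion, the HW flow is removed only when the last mapping for that
mega-ufid goes away.

    /* Sketch: decide whether HW already has the egress flow. */
    if (map_ufid_to_egdev(&flow->mega_ufid, flow,
                          flow->egress_offload_port) == EEXIST) {
        /* Another flow with this mega-ufid already programmed HW;
         * just mark this flow as covered by that offload. */
        flow->partial_actions_offloaded = true;
        return 0;
    }
    /* First mapping for this mega-ufid: offload the egress flow to HW. */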


> > +
> > +/*
> > + * A mapping from mega-ufid to partial-offload egress-device.
> > + */
> > +static struct cmap ufid_to_egdev = CMAP_INITIALIZER;
> > +
> > +static uint32_t
> > +ufid_to_egdev_refcnt(const ovs_u128 *mega_ufid)
> > +{
> > +    size_t hash = hash_bytes(mega_ufid, sizeof *mega_ufid, 0);
> > +    struct ufid_to_egdev_data *data;
> > +    uint32_t refcnt = 0;
> > +
> > +    CMAP_FOR_EACH_WITH_HASH (data, node, hash, &ufid_to_egdev) {
> > +        if (ovs_u128_equals(*mega_ufid, data->mega_ufid)) {
> > +            refcnt++;
> > +        }
> > +    }
> > +
> > +    return refcnt;
> > +}
> > +
> > +/* Find egdev_data @(ufid, flow) */
> > +static struct ufid_to_egdev_data *
> > +ufid_to_egdev_data_find(const ovs_u128 *ufid,
> > +                        const struct dp_netdev_flow *flow)
> > +{
> > +    size_t hash = hash_bytes(ufid, sizeof *ufid, 0);
> > +    struct ufid_to_egdev_data *data;
> > +
> > +    CMAP_FOR_EACH_WITH_HASH (data, node, hash, &ufid_to_egdev) {
> > +        if (data->flow == flow && ovs_u128_equals(*ufid, data->mega_ufid)) {
> > +            return data;
> > +        }
> > +    }
> > +
> > +    return NULL;
> > +}
> > +
> > +/* Map the given pair of mega-ufid and flow to the given port. Returns 0
> > + * when the mapping is created initially in the context of a flow. For
> > + * subsequent calls, if it is a new flow with the same mega-ufid, creates
> > + * a mapping entry but returns EEXIST (i.e, at least one other flow with
> > + * the same ufid exists in the table). If it is an already mapped mega-ufid
> > + * & flow pair, returns EEXIST.
> > + */
> >   static int
> > -partial_offload_egress_flow_del(struct dp_netdev_pmd_thread *pmd,
> > -                                struct dp_netdev_flow *flow)
> > +map_ufid_to_egdev(const ovs_u128 *mega_ufid,
> > +                  const struct dp_netdev_flow *flow,
> > +                  odp_port_t egress_port_num)
> > +{
> > +    struct ufid_to_egdev_data *data;
> > +    size_t hash;
> > +
> > +    data = ufid_to_egdev_data_find(mega_ufid, flow);
> > +    if (data) {
> > +        /* mapping already exists for the given <mega-ufid,flow> pair */
> > +        VLOG_DBG_RL("ufid_to_egdev mapping already exists for flow: %p\n",
> > +                    flow);
> > +        return EEXIST;
> > +    }
> > +
> > +    data = xzalloc(sizeof *data);
> > +    data->mega_ufid = *mega_ufid;
> > +    data->egress_port_num = egress_port_num;
> > +    data->flow = flow;
> > +
> > +    hash = hash_bytes(mega_ufid, sizeof *mega_ufid, 0);
> > +    cmap_insert(&ufid_to_egdev,
> > +                CONST_CAST(struct cmap_node *, &data->node), hash);
> > +
> > +    if (ufid_to_egdev_refcnt(mega_ufid) > 1) {
> > +        /* at least one mapping exists for the mega_ufid */
> > +        VLOG_DBG_RL("At least one ufid_to_egdev mapping exists, flow: %p\n",
> > +                    flow);
> > +        return EEXIST;
> > +    }
> > +
> > +    /* first mapping created for the mega_ufid */
> > +    VLOG_DBG_RL("Created the first ufid_to_egdev mapping; flow: %p\n", flow);
> > +    return 0;
> > +}
> > +
> > +static uint32_t
> > +unmap_ufid_to_egdev(const ovs_u128 *mega_ufid,
> > +                    const struct dp_netdev_flow *flow)
> >   {
> > -    int ret;
> > -    struct netdev *port;
> > -    odp_port_t out_port = flow->egress_offload_port;
> > +    struct ufid_to_egdev_data *data;
> > +    uint32_t refcnt;
> > +    size_t hash;
> > +
> > +    data = ufid_to_egdev_data_find(mega_ufid, flow);
> > +    ovs_assert(data);
> > +
> > +    hash = hash_bytes(&data->mega_ufid, sizeof data->mega_ufid, 0);
> > +    cmap_remove(&ufid_to_egdev,
> > +                CONST_CAST(struct cmap_node *, &data->node), hash);
> > +    ovsrcu_postpone(free, data);
> > +
> > +    refcnt = ufid_to_egdev_refcnt(mega_ufid);
> > +    VLOG_DBG_RL("Unmapped ufid_to_egdev: flow: %p, refcnt: %d\n",
> > +                flow, refcnt);
> > +
> > +    return refcnt;
> > +}
> > +
> > +static inline void
> > +partial_action_offload_lock(struct dp_netdev_flow *flow)
> > +{
> > +    ovs_mutex_lock(&flow->partial_action_offload_mutex);
> > +}
> > +
> > +static inline void
> > +partial_action_offload_unlock(struct dp_netdev_flow *flow)
> > +{
> > +    ovs_mutex_unlock(&flow->partial_action_offload_mutex);
> > +}
> > +
> > +static int
> > +partial_offload_egress_flow_del(struct dp_flow_offload_item *offload)
> > +{
> > +    struct dp_netdev_pmd_thread *pmd = offload->pmd;
> > +    struct dp_netdev_flow *flow = offload->flow;
> >       const char *dpif_type_str = dpif_normalize_type(pmd->dp->class->type);
> > +    struct netdev *port;
> > +    uint32_t refcnt;
> > +    int ret;
> > +
> > +    partial_action_offload_lock(offload->flow);
> > +    refcnt = unmap_ufid_to_egdev(&flow->mega_ufid, offload->flow);
> > +    if (refcnt) {
> > +        flow->egress_offload_port = NULL;
> > +        flow->partial_actions_offloaded = false;
> > +        partial_action_offload_unlock(flow);
> > +        return 0;
> > +    }
> >
> > -    port = netdev_ports_get(out_port, dpif_type_str);
> > +    /* The egress dev is not referenced by any flow with the given ufid.
> > +     * We can now remove the partial-action egress-flow from hardware.
> > +     */
> > +    port = netdev_ports_get(flow->egress_offload_port, dpif_type_str);
> >       if (!port) {
> > +        partial_action_offload_unlock(flow);
> >           return -1;
> >       }
> >
> > @@ -2382,15 +2527,27 @@ partial_offload_egress_flow_del(struct dp_netdev_pmd_thread *pmd,
> >       ovs_mutex_unlock(&pmd->dp->port_mutex);
> >       netdev_close(port);
> >
> > +    if (ret) {
> > +        partial_action_offload_unlock(flow);
> > +        return ret;
> > +    }
> > +
> > +    flow->egress_offload_port = NULL;
> > +    flow->partial_actions_offloaded = false;
> > +
> > +    partial_action_offload_unlock(offload->flow);
> > +
> > +    VLOG_DBG_RL("Deleted partial_offloaded egress flow: %p pmd: %p id: %d\n",
> > +                flow, pmd, offload->flow->pmd_id);
> >       return ret;
> >   }
> >
> >   static int
> >   dp_netdev_flow_offload_del(struct dp_flow_offload_item *offload)
> >   {
> > -    if (offload->flow->partial_actions_offloaded &&
> > -        offload->flow->egress_offload_port != ODPP_NONE) {
> > -        return partial_offload_egress_flow_del(offload->pmd, offload->flow);
> > +    if (unlikely(offload->flow->partial_actions_offloaded &&
> > +        offload->flow->egress_offload_port != ODPP_NONE)) {
> > +        return partial_offload_egress_flow_del(offload);
> >       } else {
> >           return mark_to_flow_disassociate(offload->pmd, offload->flow);
> >       }
> > @@ -2608,7 +2765,8 @@ dp_netdev_flow_offload_put(struct dp_flow_offload_item *offload)
> >       info.attr_egress = 0;
> >       info.partial_actions = 0;
> >
> > -    if (dp_netdev_partial_offload_supported(port, offload, &egress_port)) {
> > +    if (unlikely(dp_netdev_partial_offload_supported(port, offload,
> > +                  &egress_port))) {
> >           if (egress_port) {
> >               netdev_close(port);
> >               port = egress_port;
> > @@ -2618,11 +2776,25 @@ dp_netdev_flow_offload_put(struct dp_flow_offload_item *offload)
> >           info.partial_actions = 1;
> >       }
> >
> > -    if (alloc_mark && dp_netdev_alloc_flow_mark(flow, modification, &mark)) {
> > -            /* flow already offloaded */
> > +    if (unlikely(info.partial_actions && egress_port)) {
> > +        partial_action_offload_lock(flow);
> > +        if (map_ufid_to_egdev(&flow->mega_ufid, flow,
> > +            flow->egress_offload_port) == EEXIST) {
> > +            /* Partial action already offloaded for the ufid+egdev */
> > +            flow->partial_actions_offloaded = true;
> > +            partial_action_offload_unlock(flow);
> >               netdev_close(port);
> > +            VLOG_DBG_RL("Partial offload exists, flow: %p pmd: %p id: %d\n",
> > +                        flow, offload->pmd, flow->pmd_id);
> >               return 0;
> > +        }
> > +    } else if (alloc_mark &&
> > +               dp_netdev_alloc_flow_mark(flow, modification, &mark)) {
> > +            /* flow already offloaded */
> > +        netdev_close(port);
> > +        return 0;
> >       }
> > +
> >       info.flow_mark = mark;
> >
> >       /* Taking a global 'port_mutex' to fulfill thread safety restrictions for
> > @@ -2639,15 +2811,25 @@ dp_netdev_flow_offload_put(struct dp_flow_offload_item *offload)
> >           goto err_free;
> >       }
> >
> > -    if (info.partial_actions) {
> > +    if (unlikely(info.partial_actions && egress_port)) {
> >           flow->partial_actions_offloaded = true;
> > +        VLOG_DBG_RL("Partial offloaded (egress) flow: %p pmd: %p id: %d\n",
> > +                    flow, offload->pmd, flow->pmd_id);
> > +        partial_action_offload_unlock(flow);
> >       } else if (!modification) {
> >           megaflow_to_mark_associate(&flow->mega_ufid, mark);
> >           mark_to_flow_associate(mark, flow);
> >       }
> > +
> >       return 0;
> >
> >   err_free:
> > +    if (unlikely(info.partial_actions) && egress_port) {
> > +        VLOG_DBG_RL("Partial offload(egress) failed flow: %p pmd: %p id: %d\n",
> > +                    flow, offload->pmd, flow->pmd_id);
> > +        unmap_ufid_to_egdev(&flow->mega_ufid, offload->flow);
> > +        partial_action_offload_unlock(flow);
> > +    }
> >       if (mark != INVALID_FLOW_MARK) {
> >           if (!modification) {
> >               netdev_offload_flow_mark_free(mark);
> > @@ -3523,6 +3705,7 @@ dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
> >       flow->mark = INVALID_FLOW_MARK;
> >       flow->partial_actions_offloaded = false;
> >       flow->egress_offload_port = ODPP_NONE;
> > +    ovs_mutex_init(&flow->partial_action_offload_mutex);
> >       *CONST_CAST(unsigned *, &flow->pmd_id) = pmd->core_id;
> >       *CONST_CAST(struct flow *, &flow->flow) = match->flow;
> >       *CONST_CAST(ovs_u128 *, &flow->ufid) = *ufid;
> > @@ -4026,7 +4209,7 @@ dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
> >       dp_packet_batch_init_packet(&pp, execute->packet);
> >       pp.do_not_steal = true;
> >       dp_netdev_execute_actions(pmd, &pp, false, execute->flow,
> > -                              execute->actions, execute->actions_len);
> > +                              execute->actions, execute->actions_len, NULL);
> >       dp_netdev_pmd_flush_output_packets(pmd, true);
> >
> >       if (pmd->core_id == NON_PMD_CORE_ID) {
> > @@ -6654,7 +6837,7 @@ packet_batch_per_flow_execute(struct packet_batch_per_flow *batch,
> >       actions = dp_netdev_flow_get_actions(flow);
> >
> >       dp_netdev_execute_actions(pmd, &batch->array, true, &flow->flow,
> > -                              actions->actions, actions->size);
> > +                              actions->actions, actions->size, flow);
> >   }
> >
> >   static inline void
> > @@ -6962,7 +7145,7 @@ handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
> >        * we'll send the packet up twice. */
> >       dp_packet_batch_init_packet(&b, packet);
> >       dp_netdev_execute_actions(pmd, &b, true, &match.flow,
> > -                              actions->data, actions->size);
> > +                              actions->data, actions->size, NULL);
> >
> >       add_actions = put_actions->size ? put_actions : actions;
> >       if (OVS_LIKELY(error != ENOSPC)) {
> > @@ -7197,6 +7380,7 @@ dp_netdev_recirculate(struct dp_netdev_pmd_thread *pmd,
> >   struct dp_netdev_execute_aux {
> >       struct dp_netdev_pmd_thread *pmd;
> >       const struct flow *flow;
> > +    const void *dp_flow;    /* for partial action offload */
> >   };
> >
> >   static void
> > @@ -7341,7 +7525,7 @@ dp_execute_userspace_action(struct dp_netdev_pmd_thread *pmd,
> >       if (!error || error == ENOSPC) {
> >           dp_packet_batch_init_packet(&b, packet);
> >           dp_netdev_execute_actions(pmd, &b, should_steal, flow,
> > -                                  actions->data, actions->size);
> > +                                  actions->data, actions->size, NULL);
> >       } else if (should_steal) {
> >           dp_packet_delete(packet);
> >           COVERAGE_INC(datapath_drop_userspace_action_error);
> > @@ -7360,6 +7544,7 @@ dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
> >       int type = nl_attr_type(a);
> >       struct tx_port *p;
> >       uint32_t packet_count, packets_dropped;
> > +    struct dp_netdev_flow *dp_flow = aux->dp_flow;
> >
> >       switch ((enum ovs_action_attr)type) {
> >       case OVS_ACTION_ATTR_OUTPUT:
> > @@ -7417,9 +7602,24 @@ dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
> >           }
> >           dp_packet_batch_apply_cutlen(packets_);
> >           packet_count = dp_packet_batch_size(packets_);
> > -        if (push_tnl_action(pmd, a, packets_)) {
> > -            COVERAGE_ADD(datapath_drop_tunnel_push_error,
> > -                         packet_count);
> > +        /* Execute tnl_push action in SW, only if it is not offloaded
> > +         * as a partial action in HW. Otherwise, HW pushes the tunnel
> > +         * header during output processing. */
> > +        if (likely(!netdev_is_flow_api_enabled() || !dp_flow)) {
> > +            if (push_tnl_action(pmd, a, packets_)) {
> > +                COVERAGE_ADD(datapath_drop_tunnel_push_error, packet_count);
> > +            }
> > +        } else { /* netdev_flow_api_enabled && dp_flow */
> > +            partial_action_offload_lock(dp_flow);
> > +            if (!dp_flow->partial_actions_offloaded) {
> > +                if (push_tnl_action(pmd, a, packets_)) {
> > +                    COVERAGE_ADD(datapath_drop_tunnel_push_error,
> > +                                 packet_count);
> > +                }
> > +            } else {
> > +                COVERAGE_ADD(datapath_skip_tunnel_push, packet_count);
> > +            }
> > +            partial_action_offload_unlock(dp_flow);
> >           }
> >           return;
> >
> > @@ -7707,9 +7907,10 @@ static void
> >   dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
> >                             struct dp_packet_batch *packets,
> >                             bool should_steal, const struct flow *flow,
> > -                          const struct nlattr *actions, size_t actions_len)
> > +                          const struct nlattr *actions, size_t actions_len,
> > +                          const struct dp_netdev_flow *dp_flow)
> >   {
> > -    struct dp_netdev_execute_aux aux = { pmd, flow };
> > +    struct dp_netdev_execute_aux aux = { pmd, flow, dp_flow };
> >
> >       odp_execute_actions(&aux, packets, should_steal, actions,
> >                           actions_len, dp_execute_cb);
