[ovs-dev] [PATCH v14 4/7] netdev-offload-tc: Introduce group ID management API

Eelco Chaudron echaudro at redhat.com
Mon Sep 6 11:41:19 UTC 2021


On 15 Jul 2021, at 8:01, Chris Mi wrote:

> When offloading sample action to TC, userspace creates a unique ID
> to map sFlow action and tunnel info and passes this ID to kernel instead
> of the sFlow info. psample will send this ID and sampled packet to
> userspace. Using the ID, userspace can recover the sFlow info and send
> sampled packet to the right sFlow monitoring host.
>
> Signed-off-by: Chris Mi <cmi at nvidia.com>
> Reviewed-by: Eli Britstein <elibr at nvidia.com>
> ---
>  lib/dpif-offload-provider.h |  18 +++
>  lib/netdev-offload-tc.c     | 272 +++++++++++++++++++++++++++++++++++-
>  lib/netdev-offload.h        |   1 +
>  lib/tc.h                    |   1 +
>  4 files changed, 290 insertions(+), 2 deletions(-)
>
> diff --git a/lib/dpif-offload-provider.h b/lib/dpif-offload-provider.h
> index 97108402a..b765eb9a2 100644
> --- a/lib/dpif-offload-provider.h
> +++ b/lib/dpif-offload-provider.h
> @@ -17,9 +17,27 @@
>  #ifndef DPIF_OFFLOAD_PROVIDER_H
>  #define DPIF_OFFLOAD_PROVIDER_H
>
> +#include "netlink-protocol.h"
> +#include "openvswitch/packets.h"
> +#include "openvswitch/types.h"
> +
>  struct dpif;
>  struct dpif_offload_sflow;
>
> +/* When offloading sample action, userspace creates a unique ID to map
> + * sFlow action and tunnel info and passes this ID to datapath instead
> + * of the sFlow info. Datapath will send this ID and sampled packet to
> + * userspace. Using the ID, userspace can recover the sFlow info and send
> + * sampled packet to the right sFlow monitoring host.
> + */
> +struct dpif_sflow_attr {
> +    const struct nlattr *action; /* sFlow action */
> +    void *userdata;              /* struct user_action_cookie */
> +    size_t userdata_len;         /* struct user_action_cookie length */
> +    struct flow_tnl *tunnel;     /* tunnel info */
> +    ovs_u128 ufid;               /* flow ufid */
> +};
> +
>  struct dpif_offload_api {
>      void (*init)(void);
>      void (*uninit)(void);
> diff --git a/lib/netdev-offload-tc.c b/lib/netdev-offload-tc.c
> index 9845e8d3f..2f16cf279 100644
> --- a/lib/netdev-offload-tc.c
> +++ b/lib/netdev-offload-tc.c
> @@ -40,6 +40,7 @@
>  #include "unaligned.h"
>  #include "util.h"
>  #include "dpif-provider.h"
> +#include "cmap.h"
>
>  VLOG_DEFINE_THIS_MODULE(netdev_offload_tc);
>
> @@ -62,6 +63,262 @@ struct chain_node {
>      uint32_t chain;
>  };
>
> +/* This maps a psample group ID to struct dpif_sflow_attr for sFlow */
> +struct sgid_node {
> +    struct ovs_list exp_node OVS_GUARDED;
> +    struct cmap_node metadata_node;
> +    struct cmap_node id_node;
> +    struct ovs_refcount refcount;
> +    uint32_t hash;
> +    uint32_t id;
> +    const struct dpif_sflow_attr sflow;
> +};
> +
> +static struct ovs_rwlock sgid_rwlock = OVS_RWLOCK_INITIALIZER;
> +
> +static long long int sgid_last_run OVS_GUARDED_BY(sgid_rwlock);
> +
> +static struct cmap sgid_map = CMAP_INITIALIZER;
> +static struct cmap sgid_metadata_map = CMAP_INITIALIZER;
> +
> +static struct ovs_list sgid_expiring OVS_GUARDED_BY(sgid_rwlock)
> +    = OVS_LIST_INITIALIZER(&sgid_expiring);
> +static struct ovs_list sgid_expired OVS_GUARDED_BY(sgid_rwlock)
> +    = OVS_LIST_INITIALIZER(&sgid_expired);
> +
> +static uint32_t next_sample_group_id OVS_GUARDED_BY(sgid_rwlock) = 1;
> +
> +#define SGID_RUN_INTERVAL   250     /* msec */
> +
> +static void
> +sgid_node_free(struct sgid_node *node)
> +{
> +    free(node->sflow.tunnel);
> +    free(CONST_CAST(void *, node->sflow.action));
> +    free(node->sflow.userdata);
> +    free(node);
> +}
> +
> +static void
> +sgid_cleanup(void)
> +{
> +    long long int now = time_msec();
> +    struct sgid_node *node;
> +
> +    /* Do maintenance at most 4 times / sec. */
> +    ovs_rwlock_rdlock(&sgid_rwlock);
> +    if (now - sgid_last_run < SGID_RUN_INTERVAL) {
> +        ovs_rwlock_unlock(&sgid_rwlock);
> +        return;
> +    }
> +    ovs_rwlock_unlock(&sgid_rwlock);
> +
> +    ovs_rwlock_wrlock(&sgid_rwlock);
> +    sgid_last_run = now;
> +
> +    LIST_FOR_EACH_POP (node, exp_node, &sgid_expired) {
> +        cmap_remove(&sgid_map, &node->id_node, node->id);
> +        ovsrcu_postpone(sgid_node_free, node);
> +    }
> +
> +    if (!ovs_list_is_empty(&sgid_expiring)) {
> +        /* 'sgid_expired' is now empty, move nodes in
> +         * 'sgid_expiring' to it. */
> +        ovs_list_splice(&sgid_expired,
> +                        ovs_list_front(&sgid_expiring),
> +                        &sgid_expiring);
> +    }

I still do not believe copying some similar code without understanding the impact helps here.
So I think this still needs to be resolved. I re-introduced the previous comments and replies from the v9 patchset:

EC> Trying to understand why we have two lists? Why can't we clean up directly from the expiring list?

CM> The code takes the recirc id management for example (ofproto-dpif-rid.c). I'm not 100% sure the two lists are really needed here. But the recirc id code exists for a long time. I think it is mature. And I don't want to re-invent something.
CM> And when psample thread receives the sampled packets, it needs to lookup the sgid_node based on the group id. But the sgid_node could be deleted anytime by the revalidator.
CM> So I think the two lists can resolve this issue. We have a 250 ms interval to wait for the in-flight sampled packets to be processed.

EC> Just adding code because it exists somewhere else does not make sense to me. If you do not have a valid reason, why add all this complexity?
EC> I would like some other reviewer's comments on this, maybe I missed something, Ilya?

> +    ovs_rwlock_unlock(&sgid_rwlock);
> +}
> +
> +/* Lockless RCU protected lookup.  If node is needed across RCU quiescent
> + * state, caller should copy the contents. */
> +static const struct sgid_node *
> +sgid_find(uint32_t id)
> +{
> +    const struct cmap_node *node = cmap_find(&sgid_map, id);
> +
> +    return node ? CONTAINER_OF(node, const struct sgid_node, id_node) : NULL;
> +}
> +
> +const struct dpif_sflow_attr *
> +dpif_offload_sflow_attr_find(uint32_t id)
> +{
> +    const struct sgid_node *node;
> +
> +    node = sgid_find(id);
> +    if (!node) {
> +        return NULL;
> +    }
> +
> +    return &node->sflow;
> +}
> +
> +static uint32_t
> +dpif_sflow_attr_hash(const struct dpif_sflow_attr *sflow)
> +{
> +    return hash_bytes(&sflow->ufid, sizeof sflow->ufid, 0);
> +}
> +
> +static bool
> +dpif_sflow_attr_equal(const struct dpif_sflow_attr *a,
> +                      const struct dpif_sflow_attr *b)
> +{
> +    return ovs_u128_equals(a->ufid, b->ufid);
> +}
> +
> +/* Lockless RCU protected lookup.  If node is needed across RCU quiescent
> + * state, caller should take a reference. */
> +static struct sgid_node *
> +sgid_find_equal(const struct dpif_sflow_attr *target, uint32_t hash)
> +{
> +    struct sgid_node *node;
> +
> +    CMAP_FOR_EACH_WITH_HASH (node, metadata_node, hash, &sgid_metadata_map) {
> +        if (dpif_sflow_attr_equal(&node->sflow, target)) {
> +            return node;
> +        }
> +    }
> +    return NULL;
> +}
> +
> +static struct sgid_node *
> +sgid_ref_equal(const struct dpif_sflow_attr *target, uint32_t hash)
> +{
> +    struct sgid_node *node;
> +
> +    do {
> +        node = sgid_find_equal(target, hash);
> +        /* Try again if the node was released before we get the reference. */
> +    } while (node && !ovs_refcount_try_ref_rcu(&node->refcount));
> +
> +    return node;
> +}
> +
> +static void
> +dpif_sflow_attr_clone(struct dpif_sflow_attr *new,
> +                      const struct dpif_sflow_attr *old)
> +{
> +    new->action = xmemdup(old->action, old->action->nla_len);
> +    new->userdata = xmemdup(old->userdata, old->userdata_len);
> +    new->userdata_len = old->userdata_len;
> +    new->tunnel = old->tunnel
> +                  ? xmemdup(old->tunnel, sizeof *old->tunnel)
> +                  : NULL;
> +    new->ufid = old->ufid;
> +}
> +
> +/* We use the id as the hash value, which works due to cmap internal rehashing.
> + * We also only insert nodes with unique IDs, so all possible hash collisions
> + * remain internal to the cmap. */
> +static struct sgid_node *
> +sgid_find__(uint32_t id)
> +    OVS_REQUIRES(sgid_rwlock)
> +{
> +    struct cmap_node *node = cmap_find_protected(&sgid_map, id);
> +
> +    return node ? CONTAINER_OF(node, struct sgid_node, id_node) : NULL;
> +}
> +
> +/* Allocate a unique group id for the given set of flow metadata. The id
> + * space is 2^32, but if looping too many times, we still can't find a
> + * free one, that means something is wrong, return NULL.
> + */
> +#define SGID_ALLOC_RETRY_MAX 8192
> +static struct sgid_node *
> +sgid_alloc__(const struct dpif_sflow_attr *sflow, uint32_t hash)
> +{
> +    struct sgid_node *node = xzalloc(sizeof *node);
> +    int i = 0;
> +
> +    node->hash = hash;
> +    ovs_refcount_init(&node->refcount);
> +    dpif_sflow_attr_clone(CONST_CAST(struct dpif_sflow_attr *,
> +                                     &node->sflow), sflow);
> +
> +    ovs_rwlock_wrlock(&sgid_rwlock);
> +    for (;;) {
> +        node->id = next_sample_group_id++;
> +        if (OVS_UNLIKELY(!node->id)) {
> +            next_sample_group_id = 1;
> +            node->id = next_sample_group_id++;
> +        }
> +        /* Find if the id is free. */
> +        if (OVS_LIKELY(!sgid_find__(node->id))) {
> +            break;
> +        }
> +        if (++i == SGID_ALLOC_RETRY_MAX) {
> +            VLOG_ERR("%s: Can't find a free group ID after %d retries",
> +                     __func__, i);
> +            ovs_rwlock_unlock(&sgid_rwlock);
> +            sgid_node_free(node);
> +            return NULL;
> +        }
> +    }
> +    cmap_insert(&sgid_map, &node->id_node, node->id);
> +    cmap_insert(&sgid_metadata_map, &node->metadata_node, node->hash);
> +    ovs_rwlock_unlock(&sgid_rwlock);
> +    return node;
> +}
> +
> +/* Allocate a unique group id for the given set of flow metadata and
> +   optional actions. */
> +static uint32_t
> +sgid_alloc_ctx(const struct dpif_sflow_attr *sflow)
> +{
> +    uint32_t hash = dpif_sflow_attr_hash(sflow);
> +    struct sgid_node *node = sgid_ref_equal(sflow, hash);
> +
> +    if (!node) {
> +        node = sgid_alloc__(sflow, hash);
> +    }
> +
> +    return node ? node->id : 0;
> +}
> +
> +static void
> +sgid_node_unref(const struct sgid_node *node_)
> +    OVS_EXCLUDED(sgid_rwlock)
> +{
> +    struct sgid_node *node = CONST_CAST(struct sgid_node *, node_);
> +
> +    ovs_rwlock_wrlock(&sgid_rwlock);
> +    if (node && ovs_refcount_unref(&node->refcount) == 1) {
> +        /* Prevent re-use of this node by removing the node from
> +         * sgid_metadata_map' */
> +        cmap_remove(&sgid_metadata_map, &node->metadata_node, node->hash);
> +        /* We keep the node in the 'sgid_map' so that it can be found as
> +         * long as it lingers, and add it to the 'sgid_expiring' list. */
> +        ovs_list_insert(&sgid_expiring, &node->exp_node);

Also re-introducing my v9 comments, related to the same as above:

EC> Why do we do a delayed remove here? Any sflow packets in the pipeline can be silently deleted, based on a failing sgid_find().

CM> Please see my above comment for the two list issue.

EC> The only concern might be ID reuse. But I think with 2^32 IDs, and the current allocator scheme, this is unlikely to happen.
EC> It would make the code cleaner, and the rwlock can be a normal lock.

> +    }
> +    ovs_rwlock_unlock(&sgid_rwlock);
> +}
> +
> +static void
> +sgid_free(uint32_t id)
> +{
> +    const struct sgid_node *node;
> +
> +    if (!id) {
> +        return;
> +    }
> +
> +    node = sgid_find(id);
> +    if (node) {
> +        sgid_node_unref(node);
> +    } else {
> +        VLOG_ERR("%s: Freeing nonexistent group ID: %"PRIu32, __func__, id);
> +    }
> +}
> +
> +static int
> +del_filter_and_sgid(struct tcf_id *id)
> +{
> +    sgid_free(id->sample_group_id);
> +    id->sample_group_id = 0;
> +    return tc_del_filter(id);
> +}
> +
>  static bool
>  is_internal_port(const char *type)
>  {
> @@ -204,7 +461,7 @@ del_filter_and_ufid_mapping(struct tcf_id *id, const ovs_u128 *ufid)
>  {
>      int err;
>
> -    err = tc_del_filter(id);
> +    err = del_filter_and_sgid(id);

This function name explicitly states what it does: delete the filter and ufid mapping. Should we update the name now that we also clean up the sgid? To me, it looks like we are overloading this function to do sgid cleanup.

>      if (!err) {
>          del_ufid_tc_mapping(ufid);
>      }
> @@ -426,7 +683,7 @@ netdev_tc_flow_flush(struct netdev *netdev)
>              continue;
>          }
>
> -        err = tc_del_filter(&data->id);
> +        err = del_filter_and_sgid(&data->id);
>          if (!err) {
>              del_ufid_tc_mapping_unlocked(&data->ufid);
>          }
> @@ -466,6 +723,12 @@ netdev_tc_flow_dump_create(struct netdev *netdev,
>
>      *dump_out = dump;
>
> +    /* Cleanup the sFlow group ID nodes in sgid_expired list.
> +     * In order to avoid creating a dedicated thread to do it, put it in
> +     * the revalidator thread.
> +     */
> +    sgid_cleanup();
> +
>      return 0;
>  }
>
> @@ -1917,6 +2180,11 @@ netdev_tc_flow_put(struct netdev *netdev, struct match *match,
>              action->type = TC_ACT_GOTO;
>              action->chain = 0;  /* 0 is reserved and not used by recirc. */
>              flower.action_count++;
> +        } else if (nl_attr_type(nla) == OVS_ACTION_ATTR_SAMPLE) {
> +            struct dpif_sflow_attr sflow_attr;
> +
> +            memset(&sflow_attr, 0, sizeof sflow_attr);
> +            sgid_alloc_ctx(&sflow_attr);

Here we should have an error check, as allocation could fail!

>          } else {
>              VLOG_DBG_RL(&rl, "unsupported put action type: %d",
>                          nl_attr_type(nla));
> diff --git a/lib/netdev-offload.h b/lib/netdev-offload.h
> index e7fcedae9..d57d70470 100644
> --- a/lib/netdev-offload.h
> +++ b/lib/netdev-offload.h
> @@ -138,6 +138,7 @@ int netdev_ports_flow_get(const char *dpif_type, struct match *match,
>  int netdev_ports_get_n_flows(const char *dpif_type,
>                               odp_port_t port_no, uint64_t *n_flows);
>
> +const struct dpif_sflow_attr *dpif_offload_sflow_attr_find(uint32_t id);
>  #ifdef  __cplusplus
>  }
>  #endif
> diff --git a/lib/tc.h b/lib/tc.h
> index a147ca461..2e4084f48 100644
> --- a/lib/tc.h
> +++ b/lib/tc.h
> @@ -276,6 +276,7 @@ struct tcf_id {
>      uint32_t chain;
>      uint16_t prio;
>      uint32_t handle;
> +    uint32_t sample_group_id;
>  };
>
>  static inline struct tcf_id
> -- 
> 2.21.0



More information about the dev mailing list