[ovs-dev] [PATCH v9 08/11] netdev-offload-tc: Introduce group ID management API

Eelco Chaudron echaudro at redhat.com
Tue Jan 12 19:49:12 UTC 2021



On 15 Dec 2020, at 4:38, Chris Mi wrote:

> When offloading sample action to TC, userspace creates a unique ID
> to map sFlow action and tunnel info and passes this ID to kernel instead
> of the sFlow info. psample will send this ID and sampled packet to
> userspace. Using the ID, userspace can recover the sFlow info and send
> sampled packet to the right sFlow monitoring host.
>
> Signed-off-by: Chris Mi <cmi at nvidia.com>
> Reviewed-by: Eli Britstein <elibr at nvidia.com>
> ---
>  lib/netdev-offload-tc.c | 260 ++++++++++++++++++++++++++++++++++++++++
>  lib/tc.h                |   1 +
>  2 files changed, 261 insertions(+)
>
> diff --git a/lib/netdev-offload-tc.c b/lib/netdev-offload-tc.c
> index 9c5be9f92..5c3c811a2 100644
> --- a/lib/netdev-offload-tc.c
> +++ b/lib/netdev-offload-tc.c
> @@ -39,6 +39,7 @@
>  #include "unaligned.h"
>  #include "util.h"
>  #include "dpif-provider.h"
> +#include "cmap.h"
>
>  VLOG_DEFINE_THIS_MODULE(netdev_offload_tc);
>
> @@ -57,6 +58,255 @@ struct netlink_field {
>      int size;
>  };
>
> +/* This maps a psample group ID to struct dpif_sflow_attr for sFlow */
> +struct gid_node {
> +    struct ovs_list exp_node OVS_GUARDED;
> +    struct cmap_node metadata_node;
> +    struct cmap_node id_node;
> +    struct ovs_refcount refcount;
> +    uint32_t hash;
> +    uint32_t id;
> +    const struct dpif_sflow_attr sflow;
> +};

All your variables, functions, etc. use gid (group ID), but this seems
a bit general. Maybe you can make it more sample specific, i.e. sgid?

> +
> +static struct ovs_rwlock gid_rwlock = OVS_RWLOCK_INITIALIZER;
> +
> +static long long int gid_last_run OVS_GUARDED_BY(gid_rwlock);
> +
> +static struct cmap gid_map = CMAP_INITIALIZER;
> +static struct cmap gid_metadata_map = CMAP_INITIALIZER;
> +
> +static struct ovs_list gid_expiring OVS_GUARDED_BY(gid_rwlock)
> +    = OVS_LIST_INITIALIZER(&gid_expiring);
> +static struct ovs_list gid_expired OVS_GUARDED_BY(gid_rwlock)
> +    = OVS_LIST_INITIALIZER(&gid_expired);
> +
> +static uint32_t next_group_id OVS_GUARDED_BY(gid_rwlock) = 1;
> +
> +#define GID_RUN_INTERVAL   250     /* msec */
> +
> +static void
> +gid_node_free(struct gid_node *node)
> +{
> +    if (node->sflow.tunnel) {
> +        free(node->sflow.tunnel);
> +    }

free() does the null check, so this can just be:

    free(node->sflow.tunnel);

> +    free(CONST_CAST(void *, node->sflow.sflow));
> +    free(node->sflow.userdata);
> +    free(node);
> +}
> +
> +static void
> +gid_cleanup(void)
> +{
> +    long long int now = time_msec();
> +    struct gid_node *node;
> +
> +    /* Do maintenance at most 4 times / sec. */
> +    ovs_rwlock_rdlock(&gid_rwlock);
> +    if (now - gid_last_run < GID_RUN_INTERVAL) {
> +        ovs_rwlock_unlock(&gid_rwlock);
> +        return;
> +    }
> +    ovs_rwlock_unlock(&gid_rwlock);
> +
> +    ovs_rwlock_wrlock(&gid_rwlock);
> +    gid_last_run = now;
> +
> +    LIST_FOR_EACH_POP (node, exp_node, &gid_expired) {
> +        cmap_remove(&gid_map, &node->id_node, node->id);
> +        ovsrcu_postpone(gid_node_free, node);
> +    }
> +
> +    if (!ovs_list_is_empty(&gid_expiring)) {
> +        /* 'gid_expired' is now empty, move nodes in
> +         * 'gid_expiring' to it. */
> +        ovs_list_splice(&gid_expired,
> +                        ovs_list_front(&gid_expiring),
> +                        &gid_expiring);
> +    }

Trying to understand why we have two lists. Why can't we clean up
directly from the expiring list?
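
I.e., something like this (untested sketch; assuming ovsrcu_postpone()
alone keeps lockless gid_find() callers safe until the grace period
ends):

    ovs_rwlock_wrlock(&gid_rwlock);
    gid_last_run = now;

    LIST_FOR_EACH_POP (node, exp_node, &gid_expiring) {
        /* Readers can still find the node in 'gid_map' until the RCU
         * quiescent state is reached, after which it is actually freed. */
        cmap_remove(&gid_map, &node->id_node, node->id);
        ovsrcu_postpone(gid_node_free, node);
    }
    ovs_rwlock_unlock(&gid_rwlock);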

> +    ovs_rwlock_unlock(&gid_rwlock);
> +}
> +
> +/* Lockless RCU protected lookup.  If node is needed across RCU quiescent
> + * state, caller should copy the contents. */
> +static const struct gid_node *
> +gid_find(uint32_t id)
> +{
> +    const struct cmap_node *node = cmap_find(&gid_map, id);
> +
> +    return node
> +        ? CONTAINER_OF(node, const struct gid_node, id_node)
> +        : NULL;
> +}
> +
> +static uint32_t
> +dpif_sflow_attr_hash(const struct dpif_sflow_attr *sflow)
> +{
> +    uint32_t hash1 = hash_bytes(sflow->sflow, sflow->sflow->nla_len, 0);
> +    uint32_t hash2;
> +
> +    if (!sflow->tunnel) {
> +        return hash1;
> +    }
> +
> +    hash2 = hash_bytes(sflow->tunnel, sizeof *sflow->tunnel, 0);
> +    return hash_add(hash1, hash2);

I think this can be:
     uint32_t hash = hash_bytes(sflow->sflow, sflow->sflow->nla_len, 0);

     if (sflow->tunnel) {
         hash = hash_bytes(sflow->tunnel, sizeof *sflow->tunnel, hash);
     }
     return hash;


> +}
> +
> +static bool
> +dpif_sflow_attr_equal(const struct dpif_sflow_attr *a,
> +                      const struct dpif_sflow_attr *b)
> +{
> +    if (a->sflow->nla_len != b->sflow->nla_len
> +        || memcmp(a->sflow, b->sflow, a->sflow->nla_len)) {
> +        return false;
> +    }
> +    if (!a->tunnel && !b->tunnel) {
> +        return true;
> +    }
> +    if (a->tunnel && b->tunnel) {
> +        return !memcmp(a->tunnel, b->tunnel, sizeof *a->tunnel);
> +    }
> +
> +    return false;
> +}
> +
> +/* Lockless RCU protected lookup.  If node is needed across RCU quiescent
> + * state, caller should take a reference. */
> +static struct gid_node *
> +gid_find_equal(const struct dpif_sflow_attr *target, uint32_t hash)
> +{
> +    struct gid_node *node;
> +
> +    CMAP_FOR_EACH_WITH_HASH (node, metadata_node, hash, &gid_metadata_map) {
> +        if (dpif_sflow_attr_equal(&node->sflow, target)) {
> +            return node;
> +        }
> +    }
> +    return NULL;
> +}
> +
> +static struct gid_node *
> +gid_ref_equal(const struct dpif_sflow_attr *target, uint32_t hash)
> +{
> +    struct gid_node *node;
> +
> +    do {
> +        node = gid_find_equal(target, hash);
> +        /* Try again if the node was released before we got the reference. */
> +    } while (node && !ovs_refcount_try_ref_rcu(&node->refcount));
> +
> +    return node;
> +}
> +
> +static void
> +dpif_sflow_attr_clone(struct dpif_sflow_attr *new,
> +                      const struct dpif_sflow_attr *old)
> +{
> +    new->sflow = xmalloc(old->sflow->nla_len);
> +    memcpy(CONST_CAST(void *, new->sflow), old->sflow, old->sflow->nla_len);
> +

There is xmemdup() that you could use here and below.
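
For example (untested sketch):

    new->sflow = xmemdup(old->sflow, old->sflow->nla_len);

    new->userdata_len = old->userdata_len;
    new->userdata = xmemdup(old->userdata, new->userdata_len);

    new->tunnel = old->tunnel
                  ? xmemdup(old->tunnel, sizeof *new->tunnel)
                  : NULL;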

> +    new->userdata_len = old->userdata_len;
> +    new->userdata = xmalloc(new->userdata_len);
> +    memcpy(new->userdata, old->userdata, new->userdata_len);
> +
> +    if (old->tunnel) {
> +        new->tunnel = xzalloc(sizeof *new->tunnel);
> +        memcpy(new->tunnel, old->tunnel, sizeof *new->tunnel);
> +    } else {
> +        new->tunnel = NULL;
> +    }
> +}
> +
> +/* We use the id as the hash value, which works due to cmap internal
> + * rehashing.  We also only insert nodes with unique IDs, so all possible
> + * hash collisions remain internal to the cmap. */
> +static struct gid_node *
> +gid_find__(uint32_t id)
> +    OVS_REQUIRES(gid_rwlock)
> +{
> +    struct cmap_node *node = cmap_find_protected(&gid_map, id);
> +
> +    return node ? CONTAINER_OF(node, struct gid_node, id_node) : NULL;
> +}
> +
> +/* Allocate a unique group id for the given set of flow metadata.
> + * The ID space is 2^32, so there should never be a situation in which
> + * all the IDs are used up.  We loop until we find a free one. */

I don't agree that we should assume we always find a free ID, because if
there is a bug we will loop forever. A simple fix would be to check
whether we have wrapped around and, if so, quit with NULL. What might be
even better is a fixed number of tries (8K, 64K, or something), as I'm
wondering how long 2^32 - 1 iterations would take ;) Or some warning
message if it takes more than some number of retries? Just something
that tells us we might need to re-work this function.
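
E.g., the allocation loop below could become something like this
(untested; the retry limit and the GID_ALLOC_MAX_RETRIES name are made
up):

#define GID_ALLOC_MAX_RETRIES 8192

    ovs_rwlock_wrlock(&gid_rwlock);
    for (int i = 0; i < GID_ALLOC_MAX_RETRIES; i++) {
        node->id = next_group_id++;
        if (OVS_UNLIKELY(!node->id)) {
            next_group_id = 1;
            node->id = next_group_id++;
        }
        /* Insert and return as soon as we find a free id. */
        if (OVS_LIKELY(!gid_find__(node->id))) {
            cmap_insert(&gid_map, &node->id_node, node->id);
            cmap_insert(&gid_metadata_map, &node->metadata_node, node->hash);
            ovs_rwlock_unlock(&gid_rwlock);
            return node;
        }
    }
    /* Ran out of retries; give up and let the caller handle it. */
    ovs_rwlock_unlock(&gid_rwlock);
    gid_node_free(node);
    return NULL;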

> +static struct gid_node *
> +gid_alloc__(const struct dpif_sflow_attr *sflow, uint32_t hash)
> +{
> +    struct gid_node *node = xzalloc(sizeof *node);
> +
> +    node->hash = hash;
> +    ovs_refcount_init(&node->refcount);
> +    dpif_sflow_attr_clone(CONST_CAST(struct dpif_sflow_attr *, &node->sflow),
> +                          sflow);
> +
> +    ovs_rwlock_wrlock(&gid_rwlock);
> +    for (;;) {
> +        node->id = next_group_id++;
> +        if (OVS_UNLIKELY(!node->id)) {
> +            next_group_id = 1;
> +            node->id = next_group_id++;
> +        }
> +        /* Find if the id is free. */
> +        if (OVS_LIKELY(!gid_find__(node->id))) {
> +            break;
> +        }
> +    }
> +    cmap_insert(&gid_map, &node->id_node, node->id);
> +    cmap_insert(&gid_metadata_map, &node->metadata_node, node->hash);
> +    ovs_rwlock_unlock(&gid_rwlock);
> +    return node;
> +}
> +
> +/* Allocate a unique group id for the given set of flow metadata and
> +   optional actions. */
> +static uint32_t
> +gid_alloc_ctx(const struct dpif_sflow_attr *sflow)
> +{
> +    uint32_t hash = dpif_sflow_attr_hash(sflow);
> +    struct gid_node *node = gid_ref_equal(sflow, hash);
> +
> +    if (!node) {
> +        node = gid_alloc__(sflow, hash);
> +    }

I think gid_alloc__() should be allowed to fail (see above), so check
for it:

    return node ? node->id : 0;

> +    return node->id;
> +}
> +
> +static void
> +gid_node_unref(const struct gid_node *node_)
> +    OVS_EXCLUDED(gid_rwlock)
> +{
> +    struct gid_node *node = CONST_CAST(struct gid_node *, node_);
> +
> +    ovs_rwlock_wrlock(&gid_rwlock);
> +    if (node && ovs_refcount_unref(&node->refcount) == 1) {
> +        /* Prevent re-use of this node by removing the node from
> +         * 'gid_metadata_map'. */
> +        cmap_remove(&gid_metadata_map, &node->metadata_node, node->hash);
> +        /* We keep the node in the 'gid_map' so that it can be found as
> +         * long as it lingers, and add it to the 'gid_expiring' list. */
> +        ovs_list_insert(&gid_expiring, &node->exp_node);

Why do we do a delayed remove here? Any sFlow packets still in the
pipeline can be silently dropped, based on a failing sgid_find().

The only concern might be ID reuse, but I think with 2^32 IDs and the
current allocator scheme, this is unlikely to happen.

It would make the code cleaner, and the rwlock could be a normal lock.
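
I.e., something like this (untested sketch; 'gid_mutex' is a made-up
replacement for 'gid_rwlock'):

static struct ovs_mutex gid_mutex = OVS_MUTEX_INITIALIZER;

static void
gid_node_unref(const struct gid_node *node_)
    OVS_EXCLUDED(gid_mutex)
{
    struct gid_node *node = CONST_CAST(struct gid_node *, node_);

    ovs_mutex_lock(&gid_mutex);
    if (node && ovs_refcount_unref(&node->refcount) == 1) {
        /* Remove the node from both maps right away; lockless readers
         * that already hold a pointer stay safe until the RCU grace
         * period ends. */
        cmap_remove(&gid_metadata_map, &node->metadata_node, node->hash);
        cmap_remove(&gid_map, &node->id_node, node->id);
        ovsrcu_postpone(gid_node_free, node);
    }
    ovs_mutex_unlock(&gid_mutex);
}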

> +    }
> +    ovs_rwlock_unlock(&gid_rwlock);
> +}
> +
> +static void
> +gid_free(uint32_t id)
> +{
> +    const struct gid_node *node;

All callers of gid_free() check for 0 before calling it; maybe we can
include the check here and remove it from the other places. This would
make it behave like free().
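
Something like:

static void
gid_free(uint32_t id)
{
    const struct gid_node *node;

    /* Behave like free(): freeing group ID 0 is a no-op. */
    if (!id) {
        return;
    }

    node = gid_find(id);
    if (node) {
        gid_node_unref(node);
    } else {
        VLOG_ERR("Freeing nonexistent group ID: %"PRIu32, id);
    }
}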

> +
> +    node = gid_find(id);
> +    if (node) {
> +        gid_node_unref(node);
> +    } else {
> +        VLOG_ERR("Freeing nonexistent group ID: %"PRIu32, id);
> +    }
> +}
> +
>  static bool
>  is_internal_port(const char *type)
>  {
> @@ -203,6 +453,9 @@ del_filter_and_ufid_mapping(struct tcf_id *id, const ovs_u128 *ufid)
>      if (!err) {
>          del_ufid_tc_mapping(ufid);
>      }
> +    if (id->sflow_group_id) {
> +        gid_free(id->sflow_group_id);
> +    }
>      return err;
>  }
>
> @@ -398,6 +651,8 @@ netdev_tc_flow_dump_create(struct netdev *netdev,
>
>      *dump_out = dump;
>
> +    gid_cleanup();
> +

I guess this can be removed if we can delete the gids directly. If not,
we might want to put a comment here explaining why this is called from
this function.
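
E.g., something like this (assuming the intent is to piggyback the
periodic gid maintenance on flow dump creation, which the revalidators
invoke regularly):

    /* Flow dumps are initiated periodically by the revalidators, so this
     * is a convenient place to age out expired group IDs. */
    gid_cleanup();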

>      return 0;
>  }
>
> @@ -1797,6 +2052,11 @@ netdev_tc_flow_put(struct netdev *netdev, struct match *match,
>              action->type = TC_ACT_GOTO;
>              action->chain = 0;  /* 0 is reserved and not used by recirc. */
>              flower.action_count++;
> +        } else if (nl_attr_type(nla) == OVS_ACTION_ATTR_SAMPLE) {
> +            struct dpif_sflow_attr sflow_attr;
> +
> +            memset(&sflow_attr, 0, sizeof sflow_attr);
> +            gid_alloc_ctx(&sflow_attr);

Guess here we should report an error if gid_alloc_ctx() fails:

    if (!gid_alloc_ctx(&sflow_attr)) {
        VLOG_DBG_RL(&rl, "Failed allocating group id for sample action");
    }

>          } else {
>              VLOG_DBG_RL(&rl, "unsupported put action type: %d",
>                          nl_attr_type(nla));
> diff --git a/lib/tc.h b/lib/tc.h
> index 281231c0d..cc2ad025d 100644
> --- a/lib/tc.h
> +++ b/lib/tc.h
> @@ -273,6 +273,7 @@ struct tcf_id {
>      uint32_t chain;
>      uint16_t prio;
>      uint32_t handle;
> +    uint32_t sflow_group_id;
>  };
>
>  static inline struct tcf_id
> -- 
> 2.26.2


