[ovs-dev] [PATCH v9 08/11] netdev-offload-tc: Introduce group ID management API

Chris Mi cmi at nvidia.com
Fri Jan 22 03:27:21 UTC 2021


On 1/13/2021 3:49 AM, Eelco Chaudron wrote:
>
>
> On 15 Dec 2020, at 4:38, Chris Mi wrote:
>
>> When offloading the sample action to TC, userspace creates a unique ID
>> to map the sFlow action and tunnel info and passes this ID to the kernel
>> instead of the sFlow info. psample will send this ID and the sampled
>> packet to userspace. Using the ID, userspace can recover the sFlow info
>> and send the sampled packet to the right sFlow monitoring host.
>>
>> Signed-off-by: Chris Mi <cmi at nvidia.com>
>> Reviewed-by: Eli Britstein <elibr at nvidia.com>
>> ---
>>  lib/netdev-offload-tc.c | 260 ++++++++++++++++++++++++++++++++++++++++
>>  lib/tc.h                |   1 +
>>  2 files changed, 261 insertions(+)
>>
>> diff --git a/lib/netdev-offload-tc.c b/lib/netdev-offload-tc.c
>> index 9c5be9f92..5c3c811a2 100644
>> --- a/lib/netdev-offload-tc.c
>> +++ b/lib/netdev-offload-tc.c
>> @@ -39,6 +39,7 @@
>>  #include "unaligned.h"
>>  #include "util.h"
>>  #include "dpif-provider.h"
>> +#include "cmap.h"
>>
>>  VLOG_DEFINE_THIS_MODULE(netdev_offload_tc);
>>
>> @@ -57,6 +58,255 @@ struct netlink_field {
>>      int size;
>>  };
>>
>> +/* This maps a psample group ID to struct dpif_sflow_attr for sFlow */
>> +struct gid_node {
>> +    struct ovs_list exp_node OVS_GUARDED;
>> +    struct cmap_node metadata_node;
>> +    struct cmap_node id_node;
>> +    struct ovs_refcount refcount;
>> +    uint32_t hash;
>> +    uint32_t id;
>> +    const struct dpif_sflow_attr sflow;
>> +};
>
> Your variables/functions etc. all use gid (group ID), but this seems
> a bit general. Maybe you can make it more sample specific, i.e.
> sgid?
Done.
>
>> +
>> +static struct ovs_rwlock gid_rwlock = OVS_RWLOCK_INITIALIZER;
>> +
>> +static long long int gid_last_run OVS_GUARDED_BY(gid_rwlock);
>> +
>> +static struct cmap gid_map = CMAP_INITIALIZER;
>> +static struct cmap gid_metadata_map = CMAP_INITIALIZER;
>> +
>> +static struct ovs_list gid_expiring OVS_GUARDED_BY(gid_rwlock)
>> +    = OVS_LIST_INITIALIZER(&gid_expiring);
>> +static struct ovs_list gid_expired OVS_GUARDED_BY(gid_rwlock)
>> +    = OVS_LIST_INITIALIZER(&gid_expired);
>> +
>> +static uint32_t next_group_id OVS_GUARDED_BY(gid_rwlock) = 1;
>> +
>> +#define GID_RUN_INTERVAL   250     /* msec */
>> +
>> +static void
>> +gid_node_free(struct gid_node *node)
>> +{
>> +    if (node->sflow.tunnel) {
> free() does the null check, so this can just be free(node->sflow.tunnel);
Done.
>> +        free(node->sflow.tunnel);
>> +    }
>> +    free(CONST_CAST(void *, node->sflow.sflow));
>> +    free(node->sflow.userdata);
>> +    free(node);
>> +}
>> +
>> +static void
>> +gid_cleanup(void)
>> +{
>> +    long long int now = time_msec();
>> +    struct gid_node *node;
>> +
>> +    /* Do maintenance at most 4 times / sec. */
>> +    ovs_rwlock_rdlock(&gid_rwlock);
>> +    if (now - gid_last_run < GID_RUN_INTERVAL) {
>> +        ovs_rwlock_unlock(&gid_rwlock);
>> +        return;
>> +    }
>> +    ovs_rwlock_unlock(&gid_rwlock);
>> +
>> +    ovs_rwlock_wrlock(&gid_rwlock);
>> +    gid_last_run = now;
>> +
>> +    LIST_FOR_EACH_POP (node, exp_node, &gid_expired) {
>> +        cmap_remove(&gid_map, &node->id_node, node->id);
>> +        ovsrcu_postpone(gid_node_free, node);
>> +    }
>> +
>> +    if (!ovs_list_is_empty(&gid_expiring)) {
>> +        /* 'gid_expired' is now empty, move nodes in
>> +         * 'gid_expiring' to it. */
>> +        ovs_list_splice(&gid_expired,
>> +                        ovs_list_front(&gid_expiring),
>> +                        &gid_expiring);
>> +    }
>
> Trying to understand why we have two lists? Why can't we clean up
> directly from the expiring list?
The code follows the example of the recirculation ID management code
(ofproto-dpif-rid.c). I'm not 100% sure the two lists are really needed
here, but the recirc ID code has existed for a long time. I think it is
mature, and I don't want to re-invent something.

And when the psample thread receives sampled packets, it needs to look
up the sgid_node based on the group ID. But the sgid_node could be
deleted at any time by the revalidator. So I think the two lists resolve
this issue: we have a 250 ms interval to wait for in-flight sampled
packets to be processed.
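To make the lifecycle concrete, my understanding of a node's path after
its last unref is roughly this (a sketch, assuming cleanup runs at the
250 ms GID_RUN_INTERVAL):

    /* gid_node_unref(): refcount drops to 0.
     *   - node removed from gid_metadata_map, so no new flow can take
     *     a reference
     *   - node appended to gid_expiring; it stays in gid_map, so
     *     gid_find() from the psample path still resolves the ID
     *
     * gid_cleanup() run N (>= 250 ms later):
     *   - gid_expiring spliced onto gid_expired
     *
     * gid_cleanup() run N+1 (>= another interval later):
     *   - node removed from gid_map and freed via ovsrcu_postpone()
     */

So an unreferenced group ID stays resolvable for at least one full
cleanup interval, which covers the sampled packets still in flight.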
>> +    ovs_rwlock_unlock(&gid_rwlock);
>> +}
>> +
>> +/* Lockless RCU protected lookup.  If node is needed across RCU quiescent
>> + * state, caller should copy the contents. */
>> +static const struct gid_node *
>> +gid_find(uint32_t id)
>> +{
>> +    const struct cmap_node *node = cmap_find(&gid_map, id);
>> +
>> +    return node
>> +        ? CONTAINER_OF(node, const struct gid_node, id_node)
>> +        : NULL;
>> +}
>> +
>> +static uint32_t
>> +dpif_sflow_attr_hash(const struct dpif_sflow_attr *sflow)
>> +{
>> +    uint32_t hash1 = hash_bytes(sflow->sflow, sflow->sflow->nla_len, 0);
>> +    uint32_t hash2;
>> +
>> +    if (!sflow->tunnel) {
>> +        return hash1;
>> +    }
>> +
>> +    hash2 = hash_bytes(sflow->tunnel, sizeof *sflow->tunnel, 0);
>> +    return hash_add(hash1, hash2);
>
> I think this can be:
>     uint32_t hash = hash_bytes(sflow->sflow, sflow->sflow->nla_len, 0);
>
>     if (sflow->tunnel) {
>         hash = hash_bytes(sflow->tunnel, sizeof *sflow->tunnel, hash);
>     }
>     return hash;
>
Done. Thanks.
>
>> +}
>> +
>> +static bool
>> +dpif_sflow_attr_equal(const struct dpif_sflow_attr *a,
>> +                      const struct dpif_sflow_attr *b)
>> +{
>> +    if (a->sflow->nla_len != b->sflow->nla_len
>> +        || memcmp(a->sflow, b->sflow, a->sflow->nla_len)) {
>> +        return false;
>> +    }
>> +    if (!a->tunnel && !b->tunnel) {
>> +        return true;
>> +    }
>> +    if (a->tunnel && b->tunnel) {
>> +        return !memcmp(a->tunnel, b->tunnel, sizeof *a->tunnel);
>> +    }
>> +
>> +    return false;
>> +}
>> +
>> +/* Lockless RCU protected lookup.  If node is needed across RCU quiescent
>> + * state, caller should take a reference. */
>> +static struct gid_node *
>> +gid_find_equal(const struct dpif_sflow_attr *target, uint32_t hash)
>> +{
>> +    struct gid_node *node;
>> +
>> +    CMAP_FOR_EACH_WITH_HASH (node, metadata_node, hash, &gid_metadata_map) {
>> +        if (dpif_sflow_attr_equal(&node->sflow, target)) {
>> +            return node;
>> +        }
>> +    }
>> +    return NULL;
>> +}
>> +
>> +static struct gid_node *
>> +gid_ref_equal(const struct dpif_sflow_attr *target, uint32_t hash)
>> +{
>> +    struct gid_node *node;
>> +
>> +    do {
>> +        node = gid_find_equal(target, hash);
>> +        /* Try again if the node was released before we get the reference. */
>> +    } while (node && !ovs_refcount_try_ref_rcu(&node->refcount));
>> +
>> +    return node;
>> +}
>> +
>> +static void
>> +dpif_sflow_attr_clone(struct dpif_sflow_attr *new,
>> +                      const struct dpif_sflow_attr *old)
>> +{
>> +    new->sflow = xmalloc(old->sflow->nla_len);
>> +    memcpy(CONST_CAST(void *, new->sflow), old->sflow, old->sflow->nla_len);
>> +
>
> There is xmemdup() you could use here, and below
Done. Thanks. The code is greatly simplified.
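For the archive, the simplified clone is roughly this (untested sketch,
using OVS's xmemdup() from lib/util.h):

    static void
    dpif_sflow_attr_clone(struct dpif_sflow_attr *new,
                          const struct dpif_sflow_attr *old)
    {
        new->sflow = xmemdup(old->sflow, old->sflow->nla_len);
        new->userdata_len = old->userdata_len;
        new->userdata = xmemdup(old->userdata, old->userdata_len);
        new->tunnel = old->tunnel
                      ? xmemdup(old->tunnel, sizeof *old->tunnel)
                      : NULL;
    }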
>
>> +    new->userdata_len = old->userdata_len;
>> +    new->userdata = xmalloc(new->userdata_len);
>> +    memcpy(new->userdata, old->userdata, new->userdata_len);
>> +
>> +    if (old->tunnel) {
>> +        new->tunnel = xzalloc(sizeof *new->tunnel);
>> +        memcpy(new->tunnel, old->tunnel, sizeof *new->tunnel);
>> +    } else {
>> +        new->tunnel = NULL;
>> +    }
>> +}
>> +
>> +/* We use the id as the hash value, which works due to cmap internal rehashing.
>> + * We also only insert nodes with unique IDs, so all possible hash collisions
>> + * remain internal to the cmap. */
>> +static struct gid_node *
>> +gid_find__(uint32_t id)
>> +    OVS_REQUIRES(gid_rwlock)
>> +{
>> +    struct cmap_node *node = cmap_find_protected(&gid_map, id);
>> +
>> +    return node ? CONTAINER_OF(node, struct gid_node, id_node) : NULL;
>> +}
>> +
>> +/* Allocate a unique group id for the given set of flow metadata.
>> + * The ID space is 2^^32, so there should never be a situation in which all
>> + * the IDs are used up.  We loop until we find a free one. */
>
> I don't agree that we should assume we find an entry because if there 
> is a bug we will loop forever. A simple fix would be to make sure we 
> have wrapped around, if so quit with NULL. What might even be better 
> is to have a fixed amount of tries (8K, 64K, or something), as I'm 
> wondering how much 2^32 - 1 will take ;) Or some warning message if it 
> takes more than xK retries? Just something that we know we might need 
> to re-work this function.
Done.
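The allocation loop in v10 gives up after a fixed number of tries,
along these lines (sketch only; SGID_ALLOC_RETRY_MAX is a name I made
up here, and the final value is open for discussion):

    #define SGID_ALLOC_RETRY_MAX 8192

    ovs_rwlock_wrlock(&gid_rwlock);
    for (int i = 0; i < SGID_ALLOC_RETRY_MAX; i++) {
        node->id = next_group_id++;
        if (OVS_UNLIKELY(!node->id)) {
            /* Skip 0, it is reserved for "no group ID". */
            next_group_id = 1;
            node->id = next_group_id++;
        }
        if (OVS_LIKELY(!gid_find__(node->id))) {
            /* Found a free ID, publish the node. */
            cmap_insert(&gid_map, &node->id_node, node->id);
            cmap_insert(&gid_metadata_map, &node->metadata_node,
                        node->hash);
            ovs_rwlock_unlock(&gid_rwlock);
            return node;
        }
    }
    /* Could not find a free ID within the retry budget; give up. */
    ovs_rwlock_unlock(&gid_rwlock);
    gid_node_free(node);
    return NULL;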
>
>> +static struct gid_node *
>> +gid_alloc__(const struct dpif_sflow_attr *sflow, uint32_t hash)
>> +{
>> +    struct gid_node *node = xzalloc(sizeof *node);
>> +
>> +    node->hash = hash;
>> +    ovs_refcount_init(&node->refcount);
>> +    dpif_sflow_attr_clone(CONST_CAST(struct dpif_sflow_attr *, &node->sflow),
>> +                          sflow);
>> +
>> +    ovs_rwlock_wrlock(&gid_rwlock);
>> +    for (;;) {
>> +        node->id = next_group_id++;
>> +        if (OVS_UNLIKELY(!node->id)) {
>> +            next_group_id = 1;
>> +            node->id = next_group_id++;
>> +        }
>> +        /* Find if the id is free. */
>> +        if (OVS_LIKELY(!gid_find__(node->id))) {
>> +            break;
>> +        }
>> +    }
>> +    cmap_insert(&gid_map, &node->id_node, node->id);
>> +    cmap_insert(&gid_metadata_map, &node->metadata_node, node->hash);
>> +    ovs_rwlock_unlock(&gid_rwlock);
>> +    return node;
>> +}
>> +
>> +/* Allocate a unique group id for the given set of flow metadata and
>> +   optional actions. */
>> +static uint32_t
>> +gid_alloc_ctx(const struct dpif_sflow_attr *sflow)
>> +{
>> +    uint32_t hash = dpif_sflow_attr_hash(sflow);
>> +    struct gid_node *node = gid_ref_equal(sflow, hash);
>> +
>> +    if (!node) {
>> +        node = gid_alloc__(sflow, hash);
>> +    }
>
> I think gid_alloc__() should be allowed to fail (see above), so check 
> for it:
>        return node ? node->id : 0
Done.
>
>> +    return node->id;
>> +}
>> +
>> +static void
>> +gid_node_unref(const struct gid_node *node_)
>> +    OVS_EXCLUDED(gid_rwlock)
>> +{
>> +    struct gid_node *node = CONST_CAST(struct gid_node *, node_);
>> +
>> +    ovs_rwlock_wrlock(&gid_rwlock);
>> +    if (node && ovs_refcount_unref(&node->refcount) == 1) {
>> +        /* Prevent re-use of this node by removing the node from
>> +         * 'gid_metadata_map'. */
>> +        cmap_remove(&gid_metadata_map, &node->metadata_node, node->hash);
>> +        /* We keep the node in the 'gid_map' so that it can be found as
>> +         * long as it lingers, and add it to the 'gid_expiring' list. */
>> +        ovs_list_insert(&gid_expiring, &node->exp_node);
>
> Why do we do a delayed remove here? Any sflow packets in the pipeline 
> can be silently deleted, based on a failing sgid_find().
Please see my above comment for the two list issue.
> The only concern might be ID reuse. But I think with 2^32 IDs, and
> the current allocator schema, this is unlikely to happen.
>
> It would make the code cleaner, and the rwlock can be a normal lock.
>
>> +    }
>> +    ovs_rwlock_unlock(&gid_rwlock);
>> +}
>> +
>> +static void
>> +gid_free(uint32_t id)
>> +{
>> +    const struct gid_node *node;
>
> All use cases of gid_free() check for 0 before calling, maybe we can 
> include it here and remove it from the other places. This makes it 
> behave like free().
Done.
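So the zero check moves into the function itself and the callers can
drop it, roughly:

    static void
    gid_free(uint32_t id)
    {
        const struct gid_node *node;

        if (!id) {
            return;  /* Like free(NULL), freeing group ID 0 is a no-op. */
        }

        node = gid_find(id);
        if (node) {
            gid_node_unref(node);
        } else {
            VLOG_ERR("Freeing nonexistent group ID: %"PRIu32, id);
        }
    }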
>
>> +
>> +    node = gid_find(id);
>> +    if (node) {
>> +        gid_node_unref(node);
>> +    } else {
>> +        VLOG_ERR("Freeing nonexistent group ID: %"PRIu32, id);
>> +    }
>> +}
>> +
>>  static bool
>>  is_internal_port(const char *type)
>>  {
>> @@ -203,6 +453,9 @@ del_filter_and_ufid_mapping(struct tcf_id *id, const ovs_u128 *ufid)
>>      if (!err) {
>>          del_ufid_tc_mapping(ufid);
>>      }
>> +    if (id->sflow_group_id) {
>> +        gid_free(id->sflow_group_id);
>> +    }
>>      return err;
>>  }
>>
>> @@ -398,6 +651,8 @@ netdev_tc_flow_dump_create(struct netdev *netdev,
>>
>>      *dump_out = dump;
>>
>> +    gid_cleanup();
>> +
>
> I guess this can be removed if we can delete the gids directly; if
> not, we might want to put a comment here explaining why this is called
> from this function.
OK, I'll put a comment here.
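Something along these lines (exact wording to be polished in v10):

    /* Expired group IDs are reclaimed lazily.  The revalidator threads
     * dump flows periodically, so piggy-back the cleanup here instead
     * of running a dedicated maintenance thread. */
    gid_cleanup();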
>
>>      return 0;
>>  }
>>
>> @@ -1797,6 +2052,11 @@ netdev_tc_flow_put(struct netdev *netdev, struct match *match,
>>              action->type = TC_ACT_GOTO;
>>              action->chain = 0;  /* 0 is reserved and not used by recirc. */
>>              flower.action_count++;
>> +        } else if (nl_attr_type(nla) == OVS_ACTION_ATTR_SAMPLE) {
>> +            struct dpif_sflow_attr sflow_attr;
>> +
>> +            memset(&sflow_attr, 0, sizeof sflow_attr);
>> +            gid_alloc_ctx(&sflow_attr);
>
> Guess here we should report an error if gid_alloc_ctx() fails.
>  if (!gid_alloc_ctx(&sflow_attr)) {
>     VLOG_DBG_RL(&rl, "Failed allocating group id for sample action");
>
>  }
Will fix it in the last patch.
>
>>          } else {
>>              VLOG_DBG_RL(&rl, "unsupported put action type: %d",
>>                          nl_attr_type(nla));
>> diff --git a/lib/tc.h b/lib/tc.h
>> index 281231c0d..cc2ad025d 100644
>> --- a/lib/tc.h
>> +++ b/lib/tc.h
>> @@ -273,6 +273,7 @@ struct tcf_id {
>>      uint32_t chain;
>>      uint16_t prio;
>>      uint32_t handle;
>> +    uint32_t sflow_group_id;
>>  };
>>
>>  static inline struct tcf_id
>> -- 
>> 2.26.2
>