[ovs-dev] [PATCH v9 08/11] netdev-offload-tc: Introduce group ID management API
Chris Mi
cmi at nvidia.com
Fri Jan 22 03:27:21 UTC 2021
On 1/13/2021 3:49 AM, Eelco Chaudron wrote:
>
>
> On 15 Dec 2020, at 4:38, Chris Mi wrote:
>
>> When offloading sample action to TC, userspace creates a unique ID
>> to map sFlow action and tunnel info and passes this ID to kernel instead
>> of the sFlow info. psample will send this ID and sampled packet to
>> userspace. Using the ID, userspace can recover the sFlow info and send
>> sampled packet to the right sFlow monitoring host.
>>
>> Signed-off-by: Chris Mi <cmi at nvidia.com>
>> Reviewed-by: Eli Britstein <elibr at nvidia.com>
>> ---
>> lib/netdev-offload-tc.c | 260 ++++++++++++++++++++++++++++++++++++++++
>> lib/tc.h | 1 +
>> 2 files changed, 261 insertions(+)
>>
>> diff --git a/lib/netdev-offload-tc.c b/lib/netdev-offload-tc.c
>> index 9c5be9f92..5c3c811a2 100644
>> --- a/lib/netdev-offload-tc.c
>> +++ b/lib/netdev-offload-tc.c
>> @@ -39,6 +39,7 @@
>> #include "unaligned.h"
>> #include "util.h"
>> #include "dpif-provider.h"
>> +#include "cmap.h"
>>
>> VLOG_DEFINE_THIS_MODULE(netdev_offload_tc);
>>
>> @@ -57,6 +58,255 @@ struct netlink_field {
>> int size;
>> };
>>
>> +/* This maps a psample group ID to struct dpif_sflow_attr for sFlow */
>> +struct gid_node {
>> + struct ovs_list exp_node OVS_GUARDED;
>> + struct cmap_node metadata_node;
>> + struct cmap_node id_node;
>> + struct ovs_refcount refcount;
>> + uint32_t hash;
>> + uint32_t id;
>> + const struct dpif_sflow_attr sflow;
>> +};
>
> Your variables/functions etc. all use gid (group ID), but this
> seems a bit general. Maybe you can make it more sample-specific, i.e.
> sgid?
Done.
>
>> +
>> +static struct ovs_rwlock gid_rwlock = OVS_RWLOCK_INITIALIZER;
>> +
>> +static long long int gid_last_run OVS_GUARDED_BY(gid_rwlock);
>> +
>> +static struct cmap gid_map = CMAP_INITIALIZER;
>> +static struct cmap gid_metadata_map = CMAP_INITIALIZER;
>> +
>> +static struct ovs_list gid_expiring OVS_GUARDED_BY(gid_rwlock)
>> + = OVS_LIST_INITIALIZER(&gid_expiring);
>> +static struct ovs_list gid_expired OVS_GUARDED_BY(gid_rwlock)
>> + = OVS_LIST_INITIALIZER(&gid_expired);
>> +
>> +static uint32_t next_group_id OVS_GUARDED_BY(gid_rwlock) = 1;
>> +
>> +#define GID_RUN_INTERVAL 250 /* msec */
>> +
>> +static void
>> +gid_node_free(struct gid_node *node)
>> +{
>> + if (node->sflow.tunnel) {
> free() does the null check, so this can just be free(node->sflow.tunnel);
Done.
>> + free(node->sflow.tunnel);
>> + }
>> + free(CONST_CAST(void *, node->sflow.sflow));
>> + free(node->sflow.userdata);
>> + free(node);
>> +}
>> +
>> +static void
>> +gid_cleanup(void)
>> +{
>> + long long int now = time_msec();
>> + struct gid_node *node;
>> +
>> + /* Do maintenance at most 4 times / sec. */
>> + ovs_rwlock_rdlock(&gid_rwlock);
>> + if (now - gid_last_run < GID_RUN_INTERVAL) {
>> + ovs_rwlock_unlock(&gid_rwlock);
>> + return;
>> + }
>> + ovs_rwlock_unlock(&gid_rwlock);
>> +
>> + ovs_rwlock_wrlock(&gid_rwlock);
>> + gid_last_run = now;
>> +
>> + LIST_FOR_EACH_POP (node, exp_node, &gid_expired) {
>> + cmap_remove(&gid_map, &node->id_node, node->id);
>> + ovsrcu_postpone(gid_node_free, node);
>> + }
>> +
>> + if (!ovs_list_is_empty(&gid_expiring)) {
>> + /* 'gid_expired' is now empty, move nodes in
>> + * 'gid_expiring' to it. */
>> + ovs_list_splice(&gid_expired,
>> + ovs_list_front(&gid_expiring),
>> + &gid_expiring);
>> + }
>
> Trying to understand why we have two lists? Why can't we directly
> clean up from the expiring list?
The code follows the example of the recirc id management code (ofproto-dpif-rid.c).
I'm not 100% sure the two lists are really needed here, but the recirc
id code has existed for a long time.
I think it is mature, and I don't want to re-invent something.
Also, when the psample thread receives the sampled packets, it needs to
look up the sgid_node
based on the group id. But the sgid_node could be deleted at any time by the
revalidator.
So I think two lists can resolve this issue: we have a 250 ms interval to
wait for the sampled packets
still in flight to be processed.
>> + ovs_rwlock_unlock(&gid_rwlock);
>> +}
>> +
>> +/* Lockless RCU protected lookup. If node is needed accross RCU
>> quiescent
>> + * state, caller should copy the contents. */
>> +static const struct gid_node *
>> +gid_find(uint32_t id)
>> +{
>> + const struct cmap_node *node = cmap_find(&gid_map, id);
>> +
>> + return node
>> + ? CONTAINER_OF(node, const struct gid_node, id_node)
>> + : NULL;
>> +}
>> +
>> +static uint32_t
>> +dpif_sflow_attr_hash(const struct dpif_sflow_attr *sflow)
>> +{
>> + uint32_t hash1 = hash_bytes(sflow->sflow, sflow->sflow->nla_len,
>> 0);
>> + uint32_t hash2;
>> +
>> + if (!sflow->tunnel) {
>> + return hash1;
>> + }
>> +
>> + hash2 = hash_bytes(sflow->tunnel, sizeof *sflow->tunnel, 0);
>> + return hash_add(hash1, hash2);
>
> I think this can be:
> uint32_t hash = hash_bytes(sflow->sflow, sflow->sflow->nla_len, 0);
>
> if (sflow->tunnel) {
> hash = hash_bytes(sflow->tunnel, sizeof *sflow->tunnel, hash);
> }
> return hash;
>
Done. Thanks.
>
>> +}
>> +
>> +static bool
>> +dpif_sflow_attr_equal(const struct dpif_sflow_attr *a,
>> + const struct dpif_sflow_attr *b)
>> +{
>> + if (a->sflow->nla_len != b->sflow->nla_len
>> + || memcmp(a->sflow, b->sflow, a->sflow->nla_len)) {
>> + return false;
>> + }
>> + if (!a->tunnel && !b->tunnel) {
>> + return true;
>> + }
>> + if (a->tunnel && b->tunnel) {
>> + return !memcmp(a->tunnel, b->tunnel, sizeof *a->tunnel);
>> + }
>> +
>> + return false;
>> +}
>> +
>> +/* Lockless RCU protected lookup. If node is needed accross RCU
>> quiescent
>> + * state, caller should take a reference. */
>> +static struct gid_node *
>> +gid_find_equal(const struct dpif_sflow_attr *target, uint32_t hash)
>> +{
>> + struct gid_node *node;
>> +
>> + CMAP_FOR_EACH_WITH_HASH (node, metadata_node, hash,
>> &gid_metadata_map) {
>> + if (dpif_sflow_attr_equal(&node->sflow, target)) {
>> + return node;
>> + }
>> + }
>> + return NULL;
>> +}
>> +
>> +static struct gid_node *
>> +gid_ref_equal(const struct dpif_sflow_attr *target, uint32_t hash)
>> +{
>> + struct gid_node *node;
>> +
>> + do {
>> + node = gid_find_equal(target, hash);
>> + /* Try again if the node was released before we get the
>> reference. */
>> + } while (node && !ovs_refcount_try_ref_rcu(&node->refcount));
>> +
>> + return node;
>> +}
>> +
>> +static void
>> +dpif_sflow_attr_clone(struct dpif_sflow_attr *new,
>> + const struct dpif_sflow_attr *old)
>> +{
>> + new->sflow = xmalloc(old->sflow->nla_len);
>> + memcpy(CONST_CAST(void *, new->sflow), old->sflow,
>> old->sflow->nla_len);
>> +
>
> There is xmemdup() you could use here, and below
Done. Thanks. The code is greatly simplified.
>
>> + new->userdata_len = old->userdata_len;
>> + new->userdata = xmalloc(new->userdata_len);
>> + memcpy(new->userdata, old->userdata, new->userdata_len);
>> +
>> + if (old->tunnel) {
>> + new->tunnel = xzalloc(sizeof *new->tunnel);
>> + memcpy(new->tunnel, old->tunnel, sizeof *new->tunnel);
>> + } else {
>> + new->tunnel = NULL;
>> + }
>> +}
>> +
>> +/* We use the id as the hash value, which works due to cmap internal
>> rehashing.
>> + * We also only insert nodes with unique IDs, so all possible hash
>> collisions
>> + * remain internal to the cmap. */
>> +static struct gid_node *
>> +gid_find__(uint32_t id)
>> + OVS_REQUIRES(gid_rwlock)
>> +{
>> + struct cmap_node *node = cmap_find_protected(&gid_map, id);
>> +
>> + return node ? CONTAINER_OF(node, struct gid_node, id_node) : NULL;
>> +}
>> +
>> +/* Allocate a unique group id for the given set of flow metadata.
>> + * The ID space is 2^^32, so there should never be a situation in
>> which all
>> + * the IDs are used up. We loop until we find a free one. */
>
> I don't agree that we should assume we find an entry because if there
> is a bug we will loop forever. A simple fix would be to make sure we
> have wrapped around, if so quit with NULL. What might even be better
> is to have a fixed amount of tries (8K, 64K, or something), as I'm
> wondering how much 2^32 - 1 will take ;) Or some warning message if it
> takes more than xK retries? Just something that we know we might need
> to re-work this function.
Done.
>
>> +static struct gid_node *
>> +gid_alloc__(const struct dpif_sflow_attr *sflow, uint32_t hash)
>> +{
>> + struct gid_node *node = xzalloc(sizeof *node);
>> +
>> + node->hash = hash;
>> + ovs_refcount_init(&node->refcount);
>> + dpif_sflow_attr_clone(CONST_CAST(struct dpif_sflow_attr *,
>> &node->sflow),
>> + sflow);
>> +
>> + ovs_rwlock_wrlock(&gid_rwlock);
>> + for (;;) {
>> + node->id = next_group_id++;
>> + if (OVS_UNLIKELY(!node->id)) {
>> + next_group_id = 1;
>> + node->id = next_group_id++;
>> + }
>> + /* Find if the id is free. */
>> + if (OVS_LIKELY(!gid_find__(node->id))) {
>> + break;
>> + }
>> + }
>> + cmap_insert(&gid_map, &node->id_node, node->id);
>> + cmap_insert(&gid_metadata_map, &node->metadata_node, node->hash);
>> + ovs_rwlock_unlock(&gid_rwlock);
>> + return node;
>> +}
>> +
>> +/* Allocate a unique group id for the given set of flow metadata and
>> + optional actions. */
>> +static uint32_t
>> +gid_alloc_ctx(const struct dpif_sflow_attr *sflow)
>> +{
>> + uint32_t hash = dpif_sflow_attr_hash(sflow);
>> + struct gid_node *node = gid_ref_equal(sflow, hash);
>> +
>> + if (!node) {
>> + node = gid_alloc__(sflow, hash);
>> + }
>
> I think gid_alloc__() should be allowed to fail (see above), so check
> for it:
> return node ? node->id : 0
Done.
>
>> + return node->id;
>> +}
>> +
>> +static void
>> +gid_node_unref(const struct gid_node *node_)
>> + OVS_EXCLUDED(gid_rwlock)
>> +{
>> + struct gid_node *node = CONST_CAST(struct gid_node *, node_);
>> +
>> + ovs_rwlock_wrlock(&gid_rwlock);
>> + if (node && ovs_refcount_unref(&node->refcount) == 1) {
>> + /* Prevent re-use of this node by removing the node from
>> + * gid_metadata_map' */
>> + cmap_remove(&gid_metadata_map, &node->metadata_node,
>> node->hash);
>> + /* We keep the node in the 'gid_map' so that it can be found as
>> + * long as it lingers, and add it to the 'gid_expiring'
>> list. */
>> + ovs_list_insert(&gid_expiring, &node->exp_node);
>
> Why do we do a delayed remove here? Any sflow packets in the pipeline
> can be silently deleted, based on a failing sgid_find().
Please see my above comment for the two list issue.
> The only concern might be ID reuse. But I think with 2^32 IDs,
> and the current allocator scheme, this is unlikely to happen.
>
> It would make the code cleaner, and the rwlock can be a normal lock.
>
>> + }
>> + ovs_rwlock_unlock(&gid_rwlock);
>> +}
>> +
>> +static void
>> +gid_free(uint32_t id)
>> +{
>> + const struct gid_node *node;
>
> All use cases of gid_free() check for 0 before calling, maybe we can
> include it here and remove it from the other places. This makes it
> behave like free().
Done.
>
>> +
>> + node = gid_find(id);
>> + if (node) {
>> + gid_node_unref(node);
>> + } else {
>> + VLOG_ERR("Freeing nonexistent group ID: %"PRIu32, id);
>> + }
>> +}
>> +
>> static bool
>> is_internal_port(const char *type)
>> {
>> @@ -203,6 +453,9 @@ del_filter_and_ufid_mapping(struct tcf_id *id,
>> const ovs_u128 *ufid)
>> if (!err) {
>> del_ufid_tc_mapping(ufid);
>> }
>> + if (id->sflow_group_id) {
>> + gid_free(id->sflow_group_id);
>> + }
>> return err;
>> }
>>
>> @@ -398,6 +651,8 @@ netdev_tc_flow_dump_create(struct netdev *netdev,
>>
>> *dump_out = dump;
>>
>> + gid_cleanup();
>> +
>
> I guess this can be removed if we can delete the gids directly; if
> not, we might want to put a comment here explaining why this is called
> from this function.
OK, I'll put a comment here.
>
>> return 0;
>> }
>>
>> @@ -1797,6 +2052,11 @@ netdev_tc_flow_put(struct netdev *netdev,
>> struct match *match,
>> action->type = TC_ACT_GOTO;
>> action->chain = 0; /* 0 is reserved and not used by
>> recirc. */
>> flower.action_count++;
>> + } else if (nl_attr_type(nla) == OVS_ACTION_ATTR_SAMPLE) {
>> + struct dpif_sflow_attr sflow_attr;
>> +
>> + memset(&sflow_attr, 0, sizeof sflow_attr);
>> + gid_alloc_ctx(&sflow_attr);
>
> Guess here we should report an error if gid_alloc_ctx() fails.
> if (!gid_alloc_ctx(&sflow_attr)) {
> VLOG_DBG_RL(&rl, "Failed allocating group id for sample action");
>
> }
Will fix it in the last patch.
>
>> } else {
>> VLOG_DBG_RL(&rl, "unsupported put action type: %d",
>> nl_attr_type(nla));
>> diff --git a/lib/tc.h b/lib/tc.h
>> index 281231c0d..cc2ad025d 100644
>> --- a/lib/tc.h
>> +++ b/lib/tc.h
>> @@ -273,6 +273,7 @@ struct tcf_id {
>> uint32_t chain;
>> uint16_t prio;
>> uint32_t handle;
>> + uint32_t sflow_group_id;
>> };
>>
>> static inline struct tcf_id
>> --
>> 2.26.2
>
More information about the dev
mailing list