[ovs-dev] [PATCH v9 08/11] netdev-offload-tc: Introduce group ID management API
Eelco Chaudron
echaudro at redhat.com
Tue Jan 12 19:49:12 UTC 2021
On 15 Dec 2020, at 4:38, Chris Mi wrote:
> When offloading sample action to TC, userspace creates a unique ID
> to map sFlow action and tunnel info and passes this ID to kernel
> instead of the sFlow info. psample will send this ID and sampled
> packet to userspace. Using the ID, userspace can recover the sFlow
> info and send sampled packet to the right sFlow monitoring host.
>
> Signed-off-by: Chris Mi <cmi at nvidia.com>
> Reviewed-by: Eli Britstein <elibr at nvidia.com>
> ---
> lib/netdev-offload-tc.c | 260 ++++++++++++++++++++++++++++++++++++++++
> lib/tc.h                |   1 +
> 2 files changed, 261 insertions(+)
>
> diff --git a/lib/netdev-offload-tc.c b/lib/netdev-offload-tc.c
> index 9c5be9f92..5c3c811a2 100644
> --- a/lib/netdev-offload-tc.c
> +++ b/lib/netdev-offload-tc.c
> @@ -39,6 +39,7 @@
> #include "unaligned.h"
> #include "util.h"
> #include "dpif-provider.h"
> +#include "cmap.h"
>
> VLOG_DEFINE_THIS_MODULE(netdev_offload_tc);
>
> @@ -57,6 +58,255 @@ struct netlink_field {
> int size;
> };
>
> +/* This maps a psample group ID to struct dpif_sflow_attr for sFlow */
> +struct gid_node {
> + struct ovs_list exp_node OVS_GUARDED;
> + struct cmap_node metadata_node;
> + struct cmap_node id_node;
> + struct ovs_refcount refcount;
> + uint32_t hash;
> + uint32_t id;
> + const struct dpif_sflow_attr sflow;
> +};
All your variables/functions use gid (group ID), but this seems a bit
general. Maybe you can make them more sample specific, i.e. sgid?
> +
> +static struct ovs_rwlock gid_rwlock = OVS_RWLOCK_INITIALIZER;
> +
> +static long long int gid_last_run OVS_GUARDED_BY(gid_rwlock);
> +
> +static struct cmap gid_map = CMAP_INITIALIZER;
> +static struct cmap gid_metadata_map = CMAP_INITIALIZER;
> +
> +static struct ovs_list gid_expiring OVS_GUARDED_BY(gid_rwlock)
> + = OVS_LIST_INITIALIZER(&gid_expiring);
> +static struct ovs_list gid_expired OVS_GUARDED_BY(gid_rwlock)
> + = OVS_LIST_INITIALIZER(&gid_expired);
> +
> +static uint32_t next_group_id OVS_GUARDED_BY(gid_rwlock) = 1;
> +
> +#define GID_RUN_INTERVAL 250 /* msec */
> +
> +static void
> +gid_node_free(struct gid_node *node)
> +{
> + if (node->sflow.tunnel) {
free() does the null check, so this can just be
free(node->sflow.tunnel);
> + free(node->sflow.tunnel);
> + }
> + free(CONST_CAST(void *, node->sflow.sflow));
> + free(node->sflow.userdata);
> + free(node);
> +}
> +
> +static void
> +gid_cleanup(void)
> +{
> + long long int now = time_msec();
> + struct gid_node *node;
> +
> + /* Do maintenance at most 4 times / sec. */
> + ovs_rwlock_rdlock(&gid_rwlock);
> + if (now - gid_last_run < GID_RUN_INTERVAL) {
> + ovs_rwlock_unlock(&gid_rwlock);
> + return;
> + }
> + ovs_rwlock_unlock(&gid_rwlock);
> +
> + ovs_rwlock_wrlock(&gid_rwlock);
> + gid_last_run = now;
> +
> + LIST_FOR_EACH_POP (node, exp_node, &gid_expired) {
> + cmap_remove(&gid_map, &node->id_node, node->id);
> + ovsrcu_postpone(gid_node_free, node);
> + }
> +
> + if (!ovs_list_is_empty(&gid_expiring)) {
> + /* 'gid_expired' is now empty, move nodes in
> + * 'gid_expiring' to it. */
> + ovs_list_splice(&gid_expired,
> + ovs_list_front(&gid_expiring),
> + &gid_expiring);
> + }
Trying to understand why we have two lists? Why can't we clean up
directly from the expiring list?
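For reference, a minimal sketch of the direct variant I have in mind,
assuming ovsrcu_postpone() alone is enough to cover in-flight lockless
readers:

  /* Sketch: drain the expiring list directly; the actual free is
   * still deferred past the RCU grace period by ovsrcu_postpone(). */
  LIST_FOR_EACH_POP (node, exp_node, &gid_expiring) {
      cmap_remove(&gid_map, &node->id_node, node->id);
      ovsrcu_postpone(gid_node_free, node);
  }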
> + ovs_rwlock_unlock(&gid_rwlock);
> +}
> +
> +/* Lockless RCU protected lookup. If node is needed across RCU quiescent
> + * state, caller should copy the contents. */
> +static const struct gid_node *
> +gid_find(uint32_t id)
> +{
> + const struct cmap_node *node = cmap_find(&gid_map, id);
> +
> + return node
> + ? CONTAINER_OF(node, const struct gid_node, id_node)
> + : NULL;
> +}
> +
> +static uint32_t
> +dpif_sflow_attr_hash(const struct dpif_sflow_attr *sflow)
> +{
> + uint32_t hash1 = hash_bytes(sflow->sflow, sflow->sflow->nla_len, 0);
> + uint32_t hash2;
> +
> + if (!sflow->tunnel) {
> + return hash1;
> + }
> +
> + hash2 = hash_bytes(sflow->tunnel, sizeof *sflow->tunnel, 0);
> + return hash_add(hash1, hash2);
I think this can be:

  uint32_t hash = hash_bytes(sflow->sflow, sflow->sflow->nla_len, 0);

  if (sflow->tunnel) {
      hash = hash_bytes(sflow->tunnel, sizeof *sflow->tunnel, hash);
  }
  return hash;
> +}
> +
> +static bool
> +dpif_sflow_attr_equal(const struct dpif_sflow_attr *a,
> + const struct dpif_sflow_attr *b)
> +{
> + if (a->sflow->nla_len != b->sflow->nla_len
> + || memcmp(a->sflow, b->sflow, a->sflow->nla_len)) {
> + return false;
> + }
> + if (!a->tunnel && !b->tunnel) {
> + return true;
> + }
> + if (a->tunnel && b->tunnel) {
> + return !memcmp(a->tunnel, b->tunnel, sizeof *a->tunnel);
> + }
> +
> + return false;
> +}
> +
> +/* Lockless RCU protected lookup. If node is needed across RCU quiescent
> + * state, caller should take a reference. */
> +static struct gid_node *
> +gid_find_equal(const struct dpif_sflow_attr *target, uint32_t hash)
> +{
> + struct gid_node *node;
> +
> + CMAP_FOR_EACH_WITH_HASH (node, metadata_node, hash, &gid_metadata_map) {
> + if (dpif_sflow_attr_equal(&node->sflow, target)) {
> + return node;
> + }
> + }
> + return NULL;
> +}
> +
> +static struct gid_node *
> +gid_ref_equal(const struct dpif_sflow_attr *target, uint32_t hash)
> +{
> + struct gid_node *node;
> +
> + do {
> + node = gid_find_equal(target, hash);
> + /* Try again if the node was released before we get the reference. */
> + } while (node && !ovs_refcount_try_ref_rcu(&node->refcount));
> +
> + return node;
> +}
> +
> +static void
> +dpif_sflow_attr_clone(struct dpif_sflow_attr *new,
> + const struct dpif_sflow_attr *old)
> +{
> + new->sflow = xmalloc(old->sflow->nla_len);
> + memcpy(CONST_CAST(void *, new->sflow), old->sflow, old->sflow->nla_len);
> +
There is xmemdup() you could use here, and below
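Untested sketch with xmemdup() (from util.h); the ternary keeps the
NULL handling for the tunnel:

  new->sflow = xmemdup(old->sflow, old->sflow->nla_len);
  new->userdata_len = old->userdata_len;
  new->userdata = xmemdup(old->userdata, new->userdata_len);
  new->tunnel = old->tunnel
                ? xmemdup(old->tunnel, sizeof *new->tunnel)
                : NULL;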
> + new->userdata_len = old->userdata_len;
> + new->userdata = xmalloc(new->userdata_len);
> + memcpy(new->userdata, old->userdata, new->userdata_len);
> +
> + if (old->tunnel) {
> + new->tunnel = xzalloc(sizeof *new->tunnel);
> + memcpy(new->tunnel, old->tunnel, sizeof *new->tunnel);
> + } else {
> + new->tunnel = NULL;
> + }
> +}
> +
> +/* We use the id as the hash value, which works due to cmap internal rehashing.
> + * We also only insert nodes with unique IDs, so all possible hash collisions
> + * remain internal to the cmap. */
> +static struct gid_node *
> +gid_find__(uint32_t id)
> + OVS_REQUIRES(gid_rwlock)
> +{
> + struct cmap_node *node = cmap_find_protected(&gid_map, id);
> +
> + return node ? CONTAINER_OF(node, struct gid_node, id_node) : NULL;
> +}
> +
> +/* Allocate a unique group id for the given set of flow metadata.
> + * The ID space is 2^^32, so there should never be a situation in which all
> + * the IDs are used up. We loop until we find a free one. */
I don't agree that we should assume we always find a free entry,
because if there is a bug we will loop forever. A simple fix would be
to check whether we have wrapped around and, if so, quit with NULL.
What might be even better is a fixed number of tries (8K, 64K, or
something), as I'm wondering how long 2^32 - 1 iterations would take ;)
Or some warning message if it takes more than x K retries? Just
something so that we know we might need to re-work this function.
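Rough sketch of the bounded loop (the retry cap and warning text are
just placeholders):

  #define GID_ALLOC_MAX_RETRIES 8192   /* Arbitrary, to be tuned. */

  int i;
  for (i = 0; i < GID_ALLOC_MAX_RETRIES; i++) {
      node->id = next_group_id++;
      if (OVS_UNLIKELY(!node->id)) {
          next_group_id = 1;
          node->id = next_group_id++;
      }
      /* Stop as soon as we find a free id. */
      if (OVS_LIKELY(!gid_find__(node->id))) {
          break;
      }
  }
  if (i == GID_ALLOC_MAX_RETRIES) {
      VLOG_WARN("Failed to allocate a group ID after %d retries", i);
      ovs_rwlock_unlock(&gid_rwlock);
      gid_node_free(node);
      return NULL;
  }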
> +static struct gid_node *
> +gid_alloc__(const struct dpif_sflow_attr *sflow, uint32_t hash)
> +{
> + struct gid_node *node = xzalloc(sizeof *node);
> +
> + node->hash = hash;
> + ovs_refcount_init(&node->refcount);
> + dpif_sflow_attr_clone(CONST_CAST(struct dpif_sflow_attr *, &node->sflow),
> + sflow);
> +
> + ovs_rwlock_wrlock(&gid_rwlock);
> + for (;;) {
> + node->id = next_group_id++;
> + if (OVS_UNLIKELY(!node->id)) {
> + next_group_id = 1;
> + node->id = next_group_id++;
> + }
> + /* Find if the id is free. */
> + if (OVS_LIKELY(!gid_find__(node->id))) {
> + break;
> + }
> + }
> + cmap_insert(&gid_map, &node->id_node, node->id);
> + cmap_insert(&gid_metadata_map, &node->metadata_node, node->hash);
> + ovs_rwlock_unlock(&gid_rwlock);
> + return node;
> +}
> +
> +/* Allocate a unique group id for the given set of flow metadata and
> + optional actions. */
> +static uint32_t
> +gid_alloc_ctx(const struct dpif_sflow_attr *sflow)
> +{
> + uint32_t hash = dpif_sflow_attr_hash(sflow);
> + struct gid_node *node = gid_ref_equal(sflow, hash);
> +
> + if (!node) {
> + node = gid_alloc__(sflow, hash);
> + }
I think gid_alloc__() should be allowed to fail (see above), so check
for it:

  return node ? node->id : 0;
> + return node->id;
> +}
> +
> +static void
> +gid_node_unref(const struct gid_node *node_)
> + OVS_EXCLUDED(gid_rwlock)
> +{
> + struct gid_node *node = CONST_CAST(struct gid_node *, node_);
> +
> + ovs_rwlock_wrlock(&gid_rwlock);
> + if (node && ovs_refcount_unref(&node->refcount) == 1) {
> + /* Prevent re-use of this node by removing the node from
> + * 'gid_metadata_map'. */
> + cmap_remove(&gid_metadata_map, &node->metadata_node, node->hash);
> + /* We keep the node in the 'gid_map' so that it can be found as
> + * long as it lingers, and add it to the 'gid_expiring' list. */
> + ovs_list_insert(&gid_expiring, &node->exp_node);
Why do we do a delayed remove here? Any sFlow packets in the pipeline
can be silently dropped, based on a failing sgid_find(). The only
concern might be ID reuse, but I think with 2^32 IDs and the current
allocator scheme this is unlikely to happen. It would make the code
cleaner, and the rwlock could be a normal lock.
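Roughly what I mean (sketch only, assuming lockless readers can
tolerate the immediate removal; gid_lock would be a plain mutex
replacing the rwlock):

  ovs_mutex_lock(&gid_lock);
  if (node && ovs_refcount_unref(&node->refcount) == 1) {
      cmap_remove(&gid_metadata_map, &node->metadata_node, node->hash);
      cmap_remove(&gid_map, &node->id_node, node->id);
      ovsrcu_postpone(gid_node_free, node);
  }
  ovs_mutex_unlock(&gid_lock);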
> + }
> + ovs_rwlock_unlock(&gid_rwlock);
> +}
> +
> +static void
> +gid_free(uint32_t id)
> +{
> + const struct gid_node *node;
All callers of gid_free() check for 0 before calling it; maybe we can
move that check in here and remove it from the other places. This
makes it behave like free().
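I.e. something like this at the top of gid_free():

  if (!id) {
      return;
  }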
> +
> + node = gid_find(id);
> + if (node) {
> + gid_node_unref(node);
> + } else {
> + VLOG_ERR("Freeing nonexistent group ID: %"PRIu32, id);
> + }
> +}
> +
> static bool
> is_internal_port(const char *type)
> {
> @@ -203,6 +453,9 @@ del_filter_and_ufid_mapping(struct tcf_id *id, const ovs_u128 *ufid)
> if (!err) {
> del_ufid_tc_mapping(ufid);
> }
> + if (id->sflow_group_id) {
> + gid_free(id->sflow_group_id);
> + }
> return err;
> }
>
> @@ -398,6 +651,8 @@ netdev_tc_flow_dump_create(struct netdev *netdev,
>
> *dump_out = dump;
>
> + gid_cleanup();
> +
I guess this can be removed if we can delete the gids directly. If
not, we might want to put a comment here explaining why this is called
from this function.
> return 0;
> }
>
> @@ -1797,6 +2052,11 @@ netdev_tc_flow_put(struct netdev *netdev, struct match *match,
> action->type = TC_ACT_GOTO;
> action->chain = 0; /* 0 is reserved and not used by recirc. */
> flower.action_count++;
> + } else if (nl_attr_type(nla) == OVS_ACTION_ATTR_SAMPLE) {
> + struct dpif_sflow_attr sflow_attr;
> +
> + memset(&sflow_attr, 0, sizeof sflow_attr);
> + gid_alloc_ctx(&sflow_attr);
Guess here we should report an error if gid_alloc_ctx() fails:

  if (!gid_alloc_ctx(&sflow_attr)) {
      VLOG_DBG_RL(&rl, "Failed allocating group id for sample action");
  }
> } else {
> VLOG_DBG_RL(&rl, "unsupported put action type: %d",
> nl_attr_type(nla));
> diff --git a/lib/tc.h b/lib/tc.h
> index 281231c0d..cc2ad025d 100644
> --- a/lib/tc.h
> +++ b/lib/tc.h
> @@ -273,6 +273,7 @@ struct tcf_id {
> uint32_t chain;
> uint16_t prio;
> uint32_t handle;
> + uint32_t sflow_group_id;
> };
>
> static inline struct tcf_id
> --
> 2.26.2