[ovs-dev] [megaflow v4 1/2] datapath: Mega flow implementation

Thu Jun 13 02:23:53 UTC 2013

On Wed, Jun 12, 2013 at 10:00 AM, Andy Zhou <azhou at nicira.com> wrote:
> Add mega flow support in kernel datapath.
>
> Pravin has made significant contributions to this patch. Including
> the mega flow id look up scheme, API clean ups, and bug fixes.
>
> Co-authored-by: Pravin B Shelar <pshelar at nicira.com>
> Signed-off-by: Pravin B Shelar <pshelar at nicira.com>
> Signed-off-by: Andy Zhou <azhou at nicira.com>

Can you expand the commit message a little bit to describe the
benefits (and perhaps some performance numbers)? It would be helpful
to have more context when the patch is submitted upstream.

A number of my comments are related to kernel style. In addition to
the ones that I pointed out, can you also run checkpatch.pl from the
Linux kernel tree?

> diff --git a/datapath/datapath.c b/datapath/datapath.c
> index 42af315..755f26a 100644
> --- a/datapath/datapath.c
> +++ b/datapath/datapath.c
> @@ -1229,35 +1246,39 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
[...]
> -       error = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]);
> +
> +       ovs_match_init(&match, &key, &mfm);
> +       error = ovs_match_from_nlattrs(&match,
> +                       a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK]);
>         if (error)
> -               goto error;
> +               goto err_kfree;
>
>         /* Validate actions. */
>         if (a[OVS_FLOW_ATTR_ACTIONS]) {
>                 acts = ovs_flow_actions_alloc(nla_len(a[OVS_FLOW_ATTR_ACTIONS]));
>                 error = PTR_ERR(acts);
>                 if (IS_ERR(acts))
> -                       goto error;
> +                       goto err_kfree;
>
>                 error = validate_and_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], &key,  0, &acts);
>                 if (error)
>                         goto err_kfree;
>         } else if (info->genlhdr->cmd == OVS_FLOW_CMD_NEW) {
>                 error = -EINVAL;
> -               goto error;
> +               goto err_kfree;
>         }

I don't believe that anything has been allocated yet and therefore
there is nothing to free in the event of an error in these three
cases.

> @@ -1375,7 +1419,7 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
>         }
>
>         table = ovsl_dereference(dp->table);
> -       flow = ovs_flow_tbl_lookup(table, &key, key_len);
> +       flow = ovs_flow_tbl_lookup_unmasked(table, &key, match.range.end);

Since we've gone to the trouble of creating it, it seems like it might
be easier to just pass in the match struct to these unmasked lookup
and comparison functions.

> diff --git a/datapath/flow.c b/datapath/flow.c
> index 7f897bd..5c8faac 100644
> --- a/datapath/flow.c
> +++ b/datapath/flow.c
> +static void update_range__(struct sw_flow_match *match,
> +                         size_t offset, size_t size, bool is_mask)
> +{
> +       struct sw_flow_key_range *range = NULL;
> +       size_t start = offset;
> +       size_t end = offset + size;
> +
> +       if (!is_mask)
> +               range = &match->range;
> +       else if (match->mfm)
> +               range = &match->mfm->range;
> +
> +       if (range == NULL)
> +               return;

Please just use !range.

> +static bool ovs_match_validate(const struct sw_flow_match *match,
> +               u64 key_attrs, u64 mask_attrs)
> +{
> +       u64 key_expected = 0;
> +       u64 mask_allowed = key_attrs;  /* At most allow all key attributes */
> +
> +

Extra blank line.

> +       /* Check key attributes. */
> +       if (match->key->eth.type == htons(ETH_P_ARP)) {
> +               key_expected |= 1ULL << OVS_KEY_ATTR_ARP;
> +               if (match->mfm && (match->mfm->key.eth.type == htons(0xffff)))
> +                   mask_allowed |= 1ULL << OVS_KEY_ATTR_ARP;
> +       }
> +
> +       if (match->key->eth.type == htons(ETH_P_RARP)) {
> +               key_expected |= 1ULL << OVS_KEY_ATTR_ARP;
> +               if (match->mfm && (match->mfm->key.eth.type == htons(0xffff)))
> +                   mask_allowed |= 1ULL << OVS_KEY_ATTR_ARP;
> +       }

For the cases where we two protocol values with the same result, it's
probably best to just combine them with an ||.

> +       if (match->key->eth.type == htons(ETH_P_IP)) {
> +               key_expected |= 1ULL << OVS_KEY_ATTR_IPV4;
> +               if (match->mfm && (match->mfm->key.eth.type != htons(0xffff)))

Isn't this supposed to be == instead of !=?

[...]

> +       if ((key_attrs & key_expected) != key_expected)
> +               /* Key attributes check failed. */
> +               return false;
> +
> +       if ((mask_attrs & mask_allowed ) != mask_attrs)

Extra space in the parentheses.

> +static void flow_key_mask(struct sw_flow_key *dst,
> +                         const struct sw_flow_key *src,
> +                         const struct sw_flow_mask *mfm)
> +{
> +       u8 *m = (u8 *)&mfm->key + mfm->range.start;
> +       u8 *s = (u8 *)src + mfm->range.start;
> +       u8 *d = (u8 *)dst + mfm->range.start;
> +       int i;
> +
> +       memset(dst, 0, sizeof(*dst));
> +       for (i = 0; i < ovs_sw_flow_mask_size_roundup(mfm); i++) {
> +               *d = *s & *m;
> +               d++, s++, m++;
> +       }
> +}

What if we just did an in-place masking? This would avoid the need to
assume the hash algorithm's alignment requirements or do any roundup
at all.

> @@ -277,6 +464,7 @@ struct flow_table *ovs_flow_tbl_alloc(int new_size)
>                 kfree(table);
>                 return NULL;
>         }
> +

Can you try to avoid introducing whitespace changes unrelated to the
patch? There are a number of them.

>  static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
> -                       int *key_lenp, int nh_len)
> +                       int nh_len)
[...]
>  out:
> -       *key_lenp = key_len;
>         return error;

We can probably just remove the 'out' label altogether now.

> @@ -773,7 +947,6 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
>         }
>
>  out:
> -       *key_lenp = key_len;
>         return error;

Same thing here.

> @@ -791,36 +964,82 @@ static int flow_key_start(struct sw_flow_key *key)
>                 return offsetof(struct sw_flow_key, phy);
>  }
>
> +

Extra blank line.

> +static bool __flow_cmp_unmasked_key(const struct sw_flow *flow,
> +                 const struct sw_flow_key *key, int key_start, int key_len)
> +{
> +       return __cmp_key(&flow->unmasked_key, key, key_start, key_len);
> +}
> +
> +

Here too.

> +bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow,
> +               const struct sw_flow_key *key, int key_len)
> +{
> +       int key_start;
> +       key_start = flow_key_start(key);
> +
> +       return (__flow_cmp_unmasked_key(flow, key, key_start, key_len));

The parentheses around the return clause aren't really necessary.

> +static bool is_zero_field(const u8* fp, size_t size)

Please put a space before the asterisk.

>  {
> +       if (!fp)
> +               return false;

Does it make more logical sense for a NULL field to be considered
zero? I'm not sure that we can ever really hit this case anyways.

> +       for (i = 0; i < size; i++)
> +               if (*(fp + i))
> +                       return false;

The nicest way of doing this to just OR everything together and check
at the end.

> @@ -975,8 +1103,14 @@ static int parse_flow_nlattrs(const struct nlattr *attr,
>                 if (nla_len(nla) != expected_len && expected_len != -1)
>                         return -EINVAL;
>
> -               attrs |= 1ULL << type;
> -               a[type] = nla;
> +               if (attrs & (1ULL << type))
> +                       /* Duplicate filed. */

"field"

> +static int parse_flow_nz_nlattrs(const struct nlattr *attr,
> +                             const struct nlattr *a[], u64 *attrsp)

What is nz?

> +{
> +       return parse_flow_nlattrs_imp(attr, a, attrsp, true);

Please stick with the underscore convention.

> +}
> +
> +static int parse_flow_nlattrs(const struct nlattr *attr,
> +                             const struct nlattr *a[], u64 *attrsp)
> +{
> +       return parse_flow_nlattrs_imp(attr, a, attrsp, false);
> +}

In general, it seems that there is a number of functions that are
trivial wrappers around their implementation, which I'm not sure
really improves readability.

> +static int ovs_key_from_nlattrs(struct sw_flow_match *match,  u64 attrs,
> +               const struct nlattr **a, bool is_mask)
[...]
> +}
> +
> +

Double blank line.

> +/**
> + * ovs_match_from_nlattrs - parses Netlink attributes into a flow key and
> + * mask. In case the mask is not specified (mask == NULL), the flow is treated
> + * as a Micro flow.  Otherwise, it is a Mega flow.
> + * @match: receives the extracted flow match information.
> + * @key: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute
> + * sequence. The fields should of the packet that triggered the creation
> + * of this mega (or micro) flow.
> + * @mask: Optional. Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink
> + * attribute carries the mega flow masks fields.
> + */

Can you remove the terms mega and micro flow from this description as well?

> -int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow, int key_len, const struct nlattr *attr)
> +int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow,
> +               const struct nlattr *attr)

Can you update the comment above this function to reflect the removal
of key_len?

> +struct sw_flow *ovs_flow_lookup(struct flow_table *tbl,
> +                               const struct sw_flow_key *key)
> +{
> +       struct sw_flow *flow = NULL;
> +       struct sw_flow_mask *mfm;
> +
> +       list_for_each_entry_rcu(mfm, &mask_list, list) {
> +               flow = ovs_flow_tbl_lookup(tbl, key, mfm);
> +               if (flow)  /* Found */
> +                       break;
> +       }
> +
> +       return flow;
> +}

It would be nice to group this with the other lookup/comparison functions.

> +struct sw_flow_mask *ovs_sw_flow_mask_alloc(void)
> +{
> +       struct sw_flow_mask *mfm;
> +
> +       mfm = kmalloc(sizeof *mfm, GFP_KERNEL);
> +       if (mfm) {
> +               mfm->ref_count = 0;
> +       }

This set of braces isn't necessary.

> +void ovs_sw_flow_mask_del_ref(struct sw_flow_mask *mfm)
> +{
> +       if (mfm && mfm->ref_count) {

I probably wouldn't silently accept a double free - you could do a
BUG_ON if the ref_count is 0.

> +               mfm->ref_count--;
> +
> +               if (!mfm->ref_count) {
> +                       list_del_rcu(&mfm->list);
> +                       call_rcu(&mfm->rcu, rcu_free_sw_flow_mask_cb);
> +               }
> +       }
> +}

I'm not sure that it's actually necessary to use an RCU callback to
track the lifetime of masks - you could unlink them from the list when
the flow is deleted rather than from its RCU callback. Beyond
potentially simplifying things, it fixes a bug - we manipulate the
list and refcount from the flow RCU callback assuming that OVS lock is
held but it isn't at that point (incidentally, lockdep should have
flagged this so it might be worth running).

> diff --git a/datapath/flow.h b/datapath/flow.h
> index dba66cf..a9e18c9 100644
> --- a/datapath/flow.h
> +++ b/datapath/flow.h
> @@ -120,6 +122,8 @@ struct sw_flow {
>         u32 hash;
>
>         struct sw_flow_key key;
> +       struct sw_flow_key unmasked_key;
> +       struct sw_flow_mask __rcu *mfm;

I'm assuming that 'mfm' stands for mega flow mask - since we've
removed this from other places in the code, we should probably choose
a new name here as well.

>  struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *table,
> +                                   const struct sw_flow_key *key,
> +                                   struct sw_flow_mask *mask);

I don't think this is used outside of flow.c, so it could be made static.

> +struct sw_flow *ovs_flow_tbl_lookup_unmasked(struct flow_table *table,
>                                     struct sw_flow_key *key, int len);
> +
>  void ovs_flow_tbl_destroy(struct flow_table *table);
>  void ovs_flow_tbl_deferred_destroy(struct flow_table *table);
>  struct flow_table *ovs_flow_tbl_alloc(int new_size);
>  struct flow_table *ovs_flow_tbl_expand(struct flow_table *table);
>  struct flow_table *ovs_flow_tbl_rehash(struct flow_table *table);
>  void ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,
> -                        struct sw_flow_key *key, int key_len);
> +                        const struct sw_flow_key *key, int key_len);
> +
>  void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow);

Is there a meaning to having "tbl_" in a function's name? On lookup,
it seems to indicate a masked lookup but I'm not sure that meaning is
consistent across the other functions.

> +struct sw_flow_mask {
> +       struct sw_flow_key_range range;
> +       struct sw_flow_key key;
> +       struct list_head list;
> +       struct rcu_head rcu;
> +       int ref_count;
> +};

It would be nice to sort this by size so that we can avoid unnecessary
padding, even if it doesn't really matter now.

> +int ovs_mega_flow_from_nlattrs(struct sw_flow_key *key,
> +                              struct sw_flow_mask *mask,
> +                              const struct nlattr *);

I think this is just a prototype without a function now.