[ovs-dev] [PATCH 2/2] datapath: Hash only the part of struct sw_flow_key populated by flow_extract().

Thu Apr 28 20:37:41 UTC 2011

On Mon, Apr 25, 2011 at 6:11 PM, Andrew Evans <aevans at nicira.com> wrote:
> @@ -293,7 +294,7 @@ void dp_process_received_packet(struct vport *p, struct sk_buff *skb)
>
>                /* Look up flow. */
>                flow_node = tbl_lookup(rcu_dereference(dp->table), &key,
> -                                       flow_hash(&key), flow_cmp);
> +                                      flow_hash(&key, key_len), flow_cmp);

The key length is also useful for the flow comparison.  It's maybe not
quite as significant as the hash but it would still be good to get
that optimization as well.

> diff --git a/datapath/flow.c b/datapath/flow.c
> index 558ed19..aaf9d99 100644
> --- a/datapath/flow.c
> +++ b/datapath/flow.c
> +#define SW_FLOW_KEY_OFFSET(field)                      \
> +       offsetof(struct sw_flow_key, field) +           \
> +       sizeof(((struct sw_flow_key *)0)->field)

The second half of this macro can be replaced with FIELD_SIZEOF.

>  static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
> -                       int nh_len)
> +                       int *key_len, int nh_len)
>  {
>        struct icmp6hdr *icmp = icmp6_hdr(skb);
> +       int saved_key_len = *key_len;
>
>        /* The ICMPv6 type and code fields use the 16-bit transport port
>         * fields, so we need to store them in 16-bit network byte order.
>         */
>        key->ipv6.tp.src = htons(icmp->icmp6_type);
>        key->ipv6.tp.dst = htons(icmp->icmp6_code);
> +       *key_len = SW_FLOW_KEY_OFFSET(ipv6.tp.dst);

Don't we need to store the key_len at this point, not before?  If we
jump to the invalid block then we don't get rid of type and code.

> @@ -351,12 +360,14 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
>                                        goto invalid;
>                                memcpy(key->ipv6.nd_sha,
>                                    &nd->opt[offset+sizeof(*nd_opt)], ETH_ALEN);
> +                               *key_len = SW_FLOW_KEY_OFFSET(ipv6.nd_sha);
>                        } else if (nd_opt->nd_opt_type == ND_OPT_TARGET_LL_ADDR
>                                   && opt_len == 8) {
>                                if (unlikely(!is_zero_ether_addr(key->ipv6.nd_tha)))
>                                        goto invalid;
>                                memcpy(key->ipv6.nd_tha,
>                                    &nd->opt[offset+sizeof(*nd_opt)], ETH_ALEN);
> +                               *key_len = SW_FLOW_KEY_OFFSET(ipv6.nd_tha);

Don't we need the highest of these options, not the most recent?  If
source comes second, it will cause us to not use the target.

>  int flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
> -                bool *is_frag)
> +               int *key_len, bool *is_frag)
>  {
>        struct ethhdr *eth;
>
>        memset(key, 0, sizeof(*key));
>        key->tun_id = OVS_CB(skb)->tun_id;
>        key->in_port = in_port;
> +       *key_len = SW_FLOW_KEY_OFFSET(in_port);

I would probably keep this in a local variable and then assign to the
argument on exit.  It's probably better to have a single point of
return anyways.

> -u32 flow_hash(const struct sw_flow_key *key)
> +u32 flow_hash(const struct sw_flow_key *key, int key_len)
>  {
> -       return jhash2((u32*)key, sizeof(*key) / sizeof(u32), hash_seed);
> +       return jhash2((u32*)key, (key_len + sizeof(u32) - 1) / sizeof(u32), hash_seed);

There's a macro to do this type of round up division called
DIV_ROUND_UP.  It doesn't exist on all kernels that we support but I
had a backported version lying around that I sent out so you could use
it.

> @@ -675,12 +703,12 @@ int flow_from_nlattrs(struct sw_flow_key *swkey, const struct nlattr *attr)
>                        if (swkey->dl_type != htons(ETH_P_IPV6))
>                                return -EINVAL;
>                        ipv6_key = nla_data(nla);
> +                       swkey->nw_proto = ipv6_key->ipv6_proto;
> +                       swkey->nw_tos = ipv6_key->ipv6_tos;
>                        memcpy(&swkey->ipv6.src, ipv6_key->ipv6_src,
>                                        sizeof(swkey->ipv6.src));
>                        memcpy(&swkey->ipv6.dst, ipv6_key->ipv6_dst,
>                                        sizeof(swkey->ipv6.dst));
> -                       swkey->nw_proto = ipv6_key->ipv6_proto;
> -                       swkey->nw_tos = ipv6_key->ipv6_tos;

Is there a reason why these two lines were rearranged?  I noticed it
before as well.

> @@ -744,6 +777,7 @@ int flow_from_nlattrs(struct sw_flow_key *swkey, const struct nlattr *attr)
>                        swkey->nw_proto = ntohs(arp_key->arp_op);
>                        memcpy(swkey->ipv4.arp.sha, arp_key->arp_sha, ETH_ALEN);
>                        memcpy(swkey->ipv4.arp.tha, arp_key->arp_tha, ETH_ALEN);
> +                       *key_len = SW_FLOW_KEY_OFFSET(ipv4.arp.tha);

Can't we just use SW_FLOW_KEY_OFFSET(ipv4.arp) rather than relying on
remembering the last element?

> @@ -789,6 +826,7 @@ int flow_from_nlattrs(struct sw_flow_key *swkey, const struct nlattr *attr)
>                    swkey->nw_proto == IPPROTO_UDP ||
>                    swkey->nw_proto == IPPROTO_ICMP)
>                        return -EINVAL;
> +               *key_len = SW_FLOW_KEY_OFFSET(ipv4.dst);

Why are some of these set above and some down here?  It seems that you
could just set the length each time the we add a new segment of the
flow.  We enforce ordering, so there's no danger of an earlier element
overwriting a later one.  If we set the length for each segment, even
those where it isn't strictly necessary it would be much easier to
verify that the length is set correctly everywhere.  We might also
want to put in a warning to check that the key_len is not zero.