[ovs-dev] [PATCH 2/2] datapath: Hash only the part of struct sw_flow_key populated by flow_extract().

Andrew Evans aevans at nicira.com
Fri Apr 29 08:22:37 UTC 2011


On 4/28/11 1:37 PM, Jesse Gross wrote:
> On Mon, Apr 25, 2011 at 6:11 PM, Andrew Evans <aevans at nicira.com> wrote:
>> @@ -293,7 +294,7 @@ void dp_process_received_packet(struct vport *p, struct sk_buff *skb)
>>
>>                /* Look up flow. */
>>                flow_node = tbl_lookup(rcu_dereference(dp->table), &key,
>> -                                       flow_hash(&key), flow_cmp);
>> +                                      flow_hash(&key, key_len), flow_cmp);
> 
> The key length is also useful for the flow comparison.  It's maybe not
> quite as significant as the hash but it would still be good to get
> that optimization as well.

Ok. I considered that but stopped short when I realized that
tbl_lookup() would have to change, which would ripple into the tunnel
code. Here's an incremental for that:

--- cut here ---
diff --git a/datapath/datapath.c b/datapath/datapath.c
index 9727f87..597fbb4 100644
--- a/datapath/datapath.c
+++ b/datapath/datapath.c
@@ -293,7 +293,7 @@ void dp_process_received_packet(struct vport *p, struct sk_buff *skb)
 		}
 
 		/* Look up flow. */
-		flow_node = tbl_lookup(rcu_dereference(dp->table), &key,
+		flow_node = tbl_lookup(rcu_dereference(dp->table), &key, key_len,
 				       flow_hash(&key, key_len), flow_cmp);
 		if (unlikely(!flow_node)) {
 			struct dp_upcall_info upcall;
@@ -983,7 +983,7 @@ static int odp_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
 
 	hash = flow_hash(&key, key_len);
 	table = get_table_protected(dp);
-	flow_node = tbl_lookup(table, &key, hash, flow_cmp);
+	flow_node = tbl_lookup(table, &key, key_len, hash, flow_cmp);
 	if (!flow_node) {
 		struct sw_flow_actions *acts;
 
@@ -1106,7 +1106,7 @@ static int odp_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
 		return -ENODEV;
 
 	table = get_table_protected(dp);
-	flow_node = tbl_lookup(table, &key, flow_hash(&key, key_len),
+	flow_node = tbl_lookup(table, &key, key_len, flow_hash(&key, key_len),
 			       flow_cmp);
 	if (!flow_node)
 		return -ENOENT;
@@ -1143,7 +1143,7 @@ static int odp_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
 		return -ENODEV;
 
 	table = get_table_protected(dp);
-	flow_node = tbl_lookup(table, &key, flow_hash(&key, key_len),
+	flow_node = tbl_lookup(table, &key, key_len, flow_hash(&key, key_len),
 			       flow_cmp);
 	if (!flow_node)
 		return -ENOENT;
diff --git a/datapath/flow.c b/datapath/flow.c
index 8950c0e..b53359f 100644
--- a/datapath/flow.c
+++ b/datapath/flow.c
@@ -588,12 +588,12 @@ u32 flow_hash(const struct sw_flow_key *key, int key_len)
 	return jhash2((u32*)key, DIV_ROUND_UP(key_len, sizeof(u32)), hash_seed);
 }
 
-int flow_cmp(const struct tbl_node *node, void *key2_)
+int flow_cmp(const struct tbl_node *node, void *key2_, int len)
 {
 	const struct sw_flow_key *key1 = &flow_cast(node)->key;
 	const struct sw_flow_key *key2 = key2_;
 
-	return !memcmp(key1, key2, sizeof(struct sw_flow_key));
+	return !memcmp(key1, key2, len);
 }
 
 /**
diff --git a/datapath/flow.h b/datapath/flow.h
index 7e650b3..d7fc17d 100644
--- a/datapath/flow.h
+++ b/datapath/flow.h
@@ -123,7 +123,7 @@ void flow_used(struct sw_flow *, struct sk_buff *);
 u64 flow_used_time(unsigned long flow_jiffies);
 
 u32 flow_hash(const struct sw_flow_key *, int key_len);
-int flow_cmp(const struct tbl_node *, void *target);
+int flow_cmp(const struct tbl_node *, void *target, int len);
 
 /* Upper bound on the length of a nlattr-formatted flow key.  The longest
  * nlattr-formatted flow key would be:
diff --git a/datapath/table.c b/datapath/table.c
index 47fa016..4bc91d7 100644
--- a/datapath/table.c
+++ b/datapath/table.c
@@ -178,14 +178,14 @@ static struct tbl_bucket __rcu **find_bucket(struct tbl *table, u32 hash)
 	return &table->buckets[l1][l2];
 }
 
-static int search_bucket(const struct tbl_bucket *bucket, void *target, u32 hash,
-			 int (*cmp)(const struct tbl_node *, void *))
+static int search_bucket(const struct tbl_bucket *bucket, void *target, int len, u32 hash,
+			 int (*cmp)(const struct tbl_node *, void *, int len))
 {
 	int i;
 
 	for (i = 0; i < bucket->n_objs; i++) {
 		struct tbl_node *obj = bucket->objs[i];
-		if (obj->hash == hash && likely(cmp(obj, target)))
+		if (obj->hash == hash && likely(cmp(obj, target, len)))
 			return i;
 	}
 
@@ -204,8 +204,8 @@ static int search_bucket(const struct tbl_bucket *bucket, void *target, u32 hash
  * Searches @table for an object identified by @target.  Returns the tbl_node
  * contained in the object if successful, otherwise %NULL.
  */
-struct tbl_node *tbl_lookup(struct tbl *table, void *target, u32 hash,
-			    int (*cmp)(const struct tbl_node *, void *))
+struct tbl_node *tbl_lookup(struct tbl *table, void *target, int len, u32 hash,
+			    int (*cmp)(const struct tbl_node *, void *, int))
 {
 	struct tbl_bucket __rcu **bucketp = find_bucket(table, hash);
 	struct tbl_bucket *bucket = get_bucket(*bucketp);
@@ -214,7 +214,7 @@ struct tbl_node *tbl_lookup(struct tbl *table, void *target, u32 hash,
 	if (!bucket)
 		return NULL;
 
-	index = search_bucket(bucket, target, hash, cmp);
+	index = search_bucket(bucket, target, len, hash, cmp);
 	if (index < 0)
 		return NULL;
 
diff --git a/datapath/table.h b/datapath/table.h
index 22574be..3a0c2a6 100644
--- a/datapath/table.h
+++ b/datapath/table.h
@@ -55,8 +55,8 @@ struct tbl {
 
 struct tbl *tbl_create(unsigned int n_buckets);
 void tbl_destroy(struct tbl *, void (*destructor)(struct tbl_node *));
-struct tbl_node *tbl_lookup(struct tbl *, void *target, u32 hash,
-			    int (*cmp)(const struct tbl_node *, void *target));
+struct tbl_node *tbl_lookup(struct tbl *, void *target, int len, u32 hash,
+			    int (*cmp)(const struct tbl_node *, void *target, int len));
 int tbl_insert(struct tbl *, struct tbl_node *, u32 hash);
 int tbl_remove(struct tbl *, struct tbl_node *);
 unsigned int tbl_count(struct tbl *);
diff --git a/datapath/tunnel.c b/datapath/tunnel.c
index 5754d92..46ab416 100644
--- a/datapath/tunnel.c
+++ b/datapath/tunnel.c
@@ -186,7 +186,7 @@ struct port_lookup_key {
  * Modifies 'target' to store the rcu_dereferenced pointer that was used to do
  * the comparision.
  */
-static int port_cmp(const struct tbl_node *node, void *target)
+static int port_cmp(const struct tbl_node *node, void *target, int unused)
 {
 	const struct tnl_vport *tnl_vport = tnl_vport_table_cast(node);
 	struct port_lookup_key *lookup = target;
@@ -337,7 +337,8 @@ struct vport *tnl_find_port(__be32 saddr, __be32 daddr, __be64 key,
 		lookup.tunnel_type = tunnel_type & ~TNL_T_KEY_MATCH;
 
 		if (key_local_remote_ports) {
-			tbl_node = tbl_lookup(table, &lookup, port_hash(&lookup), port_cmp);
+			tbl_node = tbl_lookup(table, &lookup, sizeof(lookup),
+					      port_hash(&lookup), port_cmp);
 			if (tbl_node)
 				goto found;
 		}
@@ -345,7 +346,8 @@ struct vport *tnl_find_port(__be32 saddr, __be32 daddr, __be64 key,
 		if (key_remote_ports) {
 			lookup.saddr = 0;
 
-			tbl_node = tbl_lookup(table, &lookup, port_hash(&lookup), port_cmp);
+			tbl_node = tbl_lookup(table, &lookup, sizeof(lookup),
+					      port_hash(&lookup), port_cmp);
 			if (tbl_node)
 				goto found;
 
@@ -358,7 +360,8 @@ struct vport *tnl_find_port(__be32 saddr, __be32 daddr, __be64 key,
 		lookup.tunnel_type = tunnel_type & ~TNL_T_KEY_EXACT;
 
 		if (local_remote_ports) {
-			tbl_node = tbl_lookup(table, &lookup, port_hash(&lookup), port_cmp);
+			tbl_node = tbl_lookup(table, &lookup, sizeof(lookup),
+					      port_hash(&lookup), port_cmp);
 			if (tbl_node)
 				goto found;
 		}
@@ -366,7 +369,8 @@ struct vport *tnl_find_port(__be32 saddr, __be32 daddr, __be64 key,
 		if (remote_ports) {
 			lookup.saddr = 0;
 
-			tbl_node = tbl_lookup(table, &lookup, port_hash(&lookup), port_cmp);
+			tbl_node = tbl_lookup(table, &lookup, sizeof(lookup),
+					      port_hash(&lookup), port_cmp);
 			if (tbl_node)
 				goto found;
 		}
@@ -959,7 +963,7 @@ static struct tnl_cache *build_cache(struct vport *vport,
 			goto done;
 
 		flow_node = tbl_lookup(rcu_dereference(dst_vport->dp->table),
-				       &flow_key,
+				       &flow_key, flow_key_len,
 				       flow_hash(&flow_key, flow_key_len),
 				       flow_cmp);
 		if (flow_node) {
--- cut here ---

>> diff --git a/datapath/flow.c b/datapath/flow.c
>> index 558ed19..aaf9d99 100644
>> --- a/datapath/flow.c
>> +++ b/datapath/flow.c
>> +#define SW_FLOW_KEY_OFFSET(field)                      \
>> +       offsetof(struct sw_flow_key, field) +           \
>> +       sizeof(((struct sw_flow_key *)0)->field)
> 
> The second half of this macro can be replaced with FIELD_SIZEOF.

Thanks, changed.

>>  static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
>> -                       int nh_len)
>> +                       int *key_len, int nh_len)
>>  {
>>        struct icmp6hdr *icmp = icmp6_hdr(skb);
>> +       int saved_key_len = *key_len;
>>
>>        /* The ICMPv6 type and code fields use the 16-bit transport port
>>         * fields, so we need to store them in 16-bit network byte order.
>>         */
>>        key->ipv6.tp.src = htons(icmp->icmp6_type);
>>        key->ipv6.tp.dst = htons(icmp->icmp6_code);
>> +       *key_len = SW_FLOW_KEY_OFFSET(ipv6.tp.dst);
> 
> Don't we need to store the key_len at this point, not before?  If we
> jump to the invalid block then we don't get rid of type and code.

It looks to me like we abort processing of this packet if an error is
returned from this function. I guess there's no harm in reporting a
partial result on error though.

> 
>> @@ -351,12 +360,14 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
>>                                        goto invalid;
>>                                memcpy(key->ipv6.nd_sha,
>>                                    &nd->opt[offset+sizeof(*nd_opt)], ETH_ALEN);
>> +                               *key_len = SW_FLOW_KEY_OFFSET(ipv6.nd_sha);
>>                        } else if (nd_opt->nd_opt_type == ND_OPT_TARGET_LL_ADDR
>>                                   && opt_len == 8) {
>>                                if (unlikely(!is_zero_ether_addr(key->ipv6.nd_tha)))
>>                                        goto invalid;
>>                                memcpy(key->ipv6.nd_tha,
>>                                    &nd->opt[offset+sizeof(*nd_opt)], ETH_ALEN);
>> +                               *key_len = SW_FLOW_KEY_OFFSET(ipv6.nd_tha);
> 
> Don't we need the highest of these options, not the most recent?  If
> source comes second, it will cause us to not use the target.

Thanks for pointing that out. I didn't realize the ordering was indeterminate.

Here's an incremental diff to fix the above issues:

--- cut here ---
diff --git a/datapath/flow.c b/datapath/flow.c
index bdd4f7d..8950c0e 100644
--- a/datapath/flow.c
+++ b/datapath/flow.c
@@ -310,17 +310,17 @@ static __be16 parse_ethertype(struct sk_buff *skb)
 }
 
 static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
-			int *key_len, int nh_len)
+			int *key_lenp, int nh_len)
 {
 	struct icmp6hdr *icmp = icmp6_hdr(skb);
-	int saved_key_len = *key_len;
+	int key_len;
 
 	/* The ICMPv6 type and code fields use the 16-bit transport port
 	 * fields, so we need to store them in 16-bit network byte order.
 	 */
 	key->ipv6.tp.src = htons(icmp->icmp6_type);
 	key->ipv6.tp.dst = htons(icmp->icmp6_code);
-	*key_len = SW_FLOW_KEY_OFFSET(ipv6.tp.dst);
+	key_len = SW_FLOW_KEY_OFFSET(ipv6.tp.dst);
 
 	if (icmp->icmp6_code == 0 &&
 	    (icmp->icmp6_type == NDISC_NEIGHBOUR_SOLICITATION ||
@@ -339,7 +339,7 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
 
 		nd = (struct nd_msg *)skb_transport_header(skb);
 		ipv6_addr_copy(&key->ipv6.nd.target, &nd->target);
-		*key_len = SW_FLOW_KEY_OFFSET(ipv6.nd.target);
+		key_len = SW_FLOW_KEY_OFFSET(ipv6.nd.target);
 
 		icmp_len -= sizeof(*nd);
 		offset = 0;
@@ -360,14 +360,16 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
 					goto invalid;
 				memcpy(key->ipv6.nd.sll,
 				    &nd->opt[offset+sizeof(*nd_opt)], ETH_ALEN);
-				*key_len = SW_FLOW_KEY_OFFSET(ipv6.nd.sll);
+				if (key_len < SW_FLOW_KEY_OFFSET(ipv6.nd.sll))
+					key_len = SW_FLOW_KEY_OFFSET(ipv6.nd.sll);
 			} else if (nd_opt->nd_opt_type == ND_OPT_TARGET_LL_ADDR
 				   && opt_len == 8) {
 				if (unlikely(!is_zero_ether_addr(key->ipv6.nd.tll)))
 					goto invalid;
 				memcpy(key->ipv6.nd.tll,
 				    &nd->opt[offset+sizeof(*nd_opt)], ETH_ALEN);
-				*key_len = SW_FLOW_KEY_OFFSET(ipv6.nd.tll);
+				if (key_len < SW_FLOW_KEY_OFFSET(ipv6.nd.tll))
+					key_len = SW_FLOW_KEY_OFFSET(ipv6.nd.tll);
 			}
 
 			icmp_len -= opt_len;
@@ -375,14 +377,15 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
 		}
 	}
 
-	return 0;
+	goto out;
 
 invalid:
 	memset(&key->ipv6.nd.target, 0, sizeof(key->ipv6.nd.target));
 	memset(key->ipv6.nd.sll, 0, sizeof(key->ipv6.nd.sll));
 	memset(key->ipv6.nd.tll, 0, sizeof(key->ipv6.nd.tll));
-	*key_len = saved_key_len;
 
+out:
+	*key_lenp = key_len;
 	return 0;
 }
 

--- cut here ---

(I would have used the max() macro to prevent moving key_len backward,
but the embedded pointer type comparison generated warnings that I
couldn't easily work around.)

>>  int flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
>> -                bool *is_frag)
>> +               int *key_len, bool *is_frag)
>>  {
>>        struct ethhdr *eth;
>>
>>        memset(key, 0, sizeof(*key));
>>        key->tun_id = OVS_CB(skb)->tun_id;
>>        key->in_port = in_port;
>> +       *key_len = SW_FLOW_KEY_OFFSET(in_port);
> 
> I would probably keep this in a local variable and then assign to the
> argument on exit.  It's probably better to have a single point of
> return anyways.

Ok, I like that better. Here's an incremental diff. (I might have taken
"single point of return" too literally -- please tell me if so.)

--- cut here ---
diff --git a/datapath/flow.c b/datapath/flow.c
index b53359f..7eb5d79 100644
--- a/datapath/flow.c
+++ b/datapath/flow.c
@@ -416,14 +416,16 @@ out:
  *      For other key->dl_type values it is left untouched.
  */
 int flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
-		int *key_len, bool *is_frag)
+		 int *key_lenp, bool *is_frag)
 {
+	int error = 0;
 	struct ethhdr *eth;
+	int key_len;
 
 	memset(key, 0, sizeof(*key));
 	key->tun_id = OVS_CB(skb)->tun_id;
 	key->in_port = in_port;
-	*key_len = SW_FLOW_KEY_OFFSET(in_port);
+	key_len = SW_FLOW_KEY_OFFSET(in_port);
 	*is_frag = false;
 
 	/*
@@ -444,8 +446,10 @@ int flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
 	 * bytes, which is always sufficient without IP options, and then check
 	 * whether we need to pull more later when we look at the IP header.
 	 */
-	if (!pskb_may_pull(skb, min(skb->len, 64u)))
-		return -ENOMEM;
+	if (!pskb_may_pull(skb, min(skb->len, 64u))) {
+		error = -ENOMEM;
+		goto out;
+	}
 
 	skb_reset_mac_header(skb);
 
@@ -463,22 +467,21 @@ int flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
 		parse_vlan(skb, key);
 
 	key->dl_type = parse_ethertype(skb);
-	*key_len = SW_FLOW_KEY_OFFSET(dl_type);
+	key_len = SW_FLOW_KEY_OFFSET(dl_type);
 	skb_reset_network_header(skb);
 	__skb_push(skb, skb->data - (unsigned char *)eth);
 
 	/* Network layer. */
 	if (key->dl_type == htons(ETH_P_IP)) {
 		struct iphdr *nh;
-		int error;
 
 		error = check_iphdr(skb);
 		if (unlikely(error)) {
 			if (error == -EINVAL) {
 				skb->transport_header = skb->network_header;
-				return 0;
+				error = 0;
 			}
-			return error;
+			goto out;
 		}
 
 		nh = ip_hdr(skb);
@@ -486,7 +489,7 @@ int flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
 		key->ipv4.dst = nh->daddr;
 		key->nw_tos = nh->tos & ~INET_ECN_MASK;
 		key->nw_proto = nh->protocol;
-		*key_len = SW_FLOW_KEY_OFFSET(ipv4.dst);
+		key_len = SW_FLOW_KEY_OFFSET(ipv4.dst);
 
 		/* Transport layer. */
 		if (!(nh->frag_off & htons(IP_MF | IP_OFFSET)) &&
@@ -496,14 +499,14 @@ int flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
 					struct tcphdr *tcp = tcp_hdr(skb);
 					key->ipv4.tp.src = tcp->source;
 					key->ipv4.tp.dst = tcp->dest;
-					*key_len = SW_FLOW_KEY_OFFSET(ipv4.tp.dst);
+					key_len = SW_FLOW_KEY_OFFSET(ipv4.tp.dst);
 				}
 			} else if (key->nw_proto == IPPROTO_UDP) {
 				if (udphdr_ok(skb)) {
 					struct udphdr *udp = udp_hdr(skb);
 					key->ipv4.tp.src = udp->source;
 					key->ipv4.tp.dst = udp->dest;
-					*key_len = SW_FLOW_KEY_OFFSET(ipv4.tp.dst);
+					key_len = SW_FLOW_KEY_OFFSET(ipv4.tp.dst);
 				}
 			} else if (key->nw_proto == IPPROTO_ICMP) {
 				if (icmphdr_ok(skb)) {
@@ -513,7 +516,7 @@ int flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
 					 * in 16-bit network byte order. */
 					key->ipv4.tp.src = htons(icmp->type);
 					key->ipv4.tp.dst = htons(icmp->code);
-					*key_len = SW_FLOW_KEY_OFFSET(ipv4.tp.dst);
+					key_len = SW_FLOW_KEY_OFFSET(ipv4.tp.dst);
 				}
 			}
 		} else
@@ -532,7 +535,7 @@ int flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
 			/* We only match on the lower 8 bits of the opcode. */
 			if (ntohs(arp->ar_op) <= 0xff) {
 				key->nw_proto = ntohs(arp->ar_op);
-				*key_len = SW_FLOW_KEY_OFFSET(nw_proto);
+				key_len = SW_FLOW_KEY_OFFSET(nw_proto);
 			}
 
 			if (key->nw_proto == ARPOP_REQUEST
@@ -541,19 +544,20 @@ int flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
 				memcpy(&key->ipv4.dst, arp->ar_tip, sizeof(key->ipv4.dst));
 				memcpy(key->ipv4.arp.sha, arp->ar_sha, ETH_ALEN);
 				memcpy(key->ipv4.arp.tha, arp->ar_tha, ETH_ALEN);
-				*key_len = SW_FLOW_KEY_OFFSET(ipv4.arp.tha);
+				key_len = SW_FLOW_KEY_OFFSET(ipv4.arp.tha);
 			}
 		}
 	} else if (key->dl_type == htons(ETH_P_IPV6)) {
 		int nh_len;             /* IPv6 Header + Extensions */
 
-		nh_len = parse_ipv6hdr(skb, key, key_len);
+		nh_len = parse_ipv6hdr(skb, key, &key_len);
 		if (unlikely(nh_len < 0)) {
 			if (nh_len == -EINVAL) {
 				skb->transport_header = skb->network_header;
-				return 0;
+			} else {
+				error = nh_len;
 			}
-			return nh_len;
+			goto out;
 		}
 
 		/* Transport layer. */
@@ -562,25 +566,28 @@ int flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
 				struct tcphdr *tcp = tcp_hdr(skb);
 				key->ipv6.tp.src = tcp->source;
 				key->ipv6.tp.dst = tcp->dest;
-				*key_len = SW_FLOW_KEY_OFFSET(ipv6.tp.dst);
+				key_len = SW_FLOW_KEY_OFFSET(ipv6.tp.dst);
 			}
 		} else if (key->nw_proto == NEXTHDR_UDP) {
 			if (udphdr_ok(skb)) {
 				struct udphdr *udp = udp_hdr(skb);
 				key->ipv6.tp.src = udp->source;
 				key->ipv6.tp.dst = udp->dest;
-				*key_len = SW_FLOW_KEY_OFFSET(ipv6.tp.dst);
+				key_len = SW_FLOW_KEY_OFFSET(ipv6.tp.dst);
 			}
 		} else if (key->nw_proto == NEXTHDR_ICMP) {
 			if (icmp6hdr_ok(skb)) {
-				int error = parse_icmpv6(skb, key, key_len,
-							 nh_len);
+				error = parse_icmpv6(skb, key, &key_len,
+						     nh_len);
 				if (error < 0)
-					return error;
+					goto out;
 			}
 		}
 	}
-	return 0;
+
+out:
+	*key_lenp = key_len;
+	return error;
 }
 
 u32 flow_hash(const struct sw_flow_key *key, int key_len)
--- cut here ---

>> -u32 flow_hash(const struct sw_flow_key *key)
>> +u32 flow_hash(const struct sw_flow_key *key, int key_len)
>>  {
>> -       return jhash2((u32*)key, sizeof(*key) / sizeof(u32), hash_seed);
>> +       return jhash2((u32*)key, (key_len + sizeof(u32) - 1) / sizeof(u32), hash_seed);
> 
> There's a macro to do this type of round up division called
> DIV_ROUND_UP.  It doesn't exist on all kernels that we support but I
> had a backported version lying around that I sent out so you could use
> it.

Thanks, I've made that change.

>> @@ -675,12 +703,12 @@ int flow_from_nlattrs(struct sw_flow_key *swkey, const struct nlattr *attr)
>>                        if (swkey->dl_type != htons(ETH_P_IPV6))
>>                                return -EINVAL;
>>                        ipv6_key = nla_data(nla);
>> +                       swkey->nw_proto = ipv6_key->ipv6_proto;
>> +                       swkey->nw_tos = ipv6_key->ipv6_tos;
>>                        memcpy(&swkey->ipv6.src, ipv6_key->ipv6_src,
>>                                        sizeof(swkey->ipv6.src));
>>                        memcpy(&swkey->ipv6.dst, ipv6_key->ipv6_dst,
>>                                        sizeof(swkey->ipv6.dst));
>> -                       swkey->nw_proto = ipv6_key->ipv6_proto;
>> -                       swkey->nw_tos = ipv6_key->ipv6_tos;
> 
> Is there a reason why these two lines were rearranged?  I noticed it
> before as well.

I did it only to make the order of assignments match the order of the
fields in the struct so I could more easily keep track of how much of
the flow key had been populated. I'll change it back if you want me to.

> 
>> @@ -744,6 +777,7 @@ int flow_from_nlattrs(struct sw_flow_key *swkey, const struct nlattr *attr)
>>                        swkey->nw_proto = ntohs(arp_key->arp_op);
>>                        memcpy(swkey->ipv4.arp.sha, arp_key->arp_sha, ETH_ALEN);
>>                        memcpy(swkey->ipv4.arp.tha, arp_key->arp_tha, ETH_ALEN);
>> +                       *key_len = SW_FLOW_KEY_OFFSET(ipv4.arp.tha);
> 
> Can't we just use SW_FLOW_KEY_OFFSET(ipv4.arp) rather than relying on
> remembering the last element?

We could. That does make sense, though I think that it might make it
harder to notice the need to change the field name in the future if new
fields are added or fields are rearranged. Do you think that's not worth
worrying about?

>> @@ -789,6 +826,7 @@ int flow_from_nlattrs(struct sw_flow_key *swkey, const struct nlattr *attr)
>>                    swkey->nw_proto == IPPROTO_UDP ||
>>                    swkey->nw_proto == IPPROTO_ICMP)
>>                        return -EINVAL;
>> +               *key_len = SW_FLOW_KEY_OFFSET(ipv4.dst);
> 
> Why are some of these set above and some down here?  It seems that you
> could just set the length each time the we add a new segment of the
> flow.  We enforce ordering, so there's no danger of an earlier element
> overwriting a later one.  If we set the length for each segment, even
> those where it isn't strictly necessary it would be much easier to
> verify that the length is set correctly everywhere.  We might also
> want to put in a warning to check that the key_len is not zero.

I agree. I'll fix that tomorrow and post an incremental.

Thanks for reviewing this.





More information about the dev mailing list