[ovs-dev] [RFC 2/2] datapath: Cache netlink flow key, mask on flow_put.

Joe Stringer joestringer at nicira.com
Thu Jul 10 05:29:28 UTC 2014


Converting the flow key and mask back into netlink format during flow
dump is fairly expensive. By caching the netlink versions of these
during flow setup, and copying the memory directly during flow dump, we
are able to support up to 33% more flows in the datapath. Flow setup
rate decreases by around 10%.
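
The dump-side change boils down to a straight copy whenever the cache is
populated; condensed from the ovs_flow_cmd_fill_match() hunk below:

	/* If a cached netlink match exists, copy it into the reply
	 * verbatim instead of re-serializing the flow key and mask. */
	if (nl_cache)
		return skb_copy_bits(nl_cache, 0,
				     skb_put(skb, nl_cache->len),
				     nl_cache->len);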

Previous revalidator thread perf, 1 revalidator, 15 handlers:
15.17%  ovs-vswitchd  [kernel.kallsyms]   [k] memcpy
 9.05%  ovs-vswitchd  [kernel.kallsyms]   [k] ovs_nla_put_flow
 9.01%  ovs-vswitchd  [kernel.kallsyms]   [k] memset
 3.94%  ovs-vswitchd  [kernel.kallsyms]   [k] __nla_reserve
 3.86%  ovs-vswitchd  ovs-vswitchd        [.] revalidate.isra.12
 2.56%  ovs-vswitchd  [kernel.kallsyms]   [k] _raw_spin_lock_bh
 2.50%  ovs-vswitchd  [kernel.kallsyms]   [k] __nla_put
 2.19%  ovs-vswitchd  [kernel.kallsyms]   [k] ovs_flow_cmd_fill_info
 2.04%  ovs-vswitchd  [kernel.kallsyms]   [k] ovs_flow_tbl_dump_next
 1.44%  ovs-vswitchd  [kernel.kallsyms]   [k] _raw_read_lock
 1.29%  ovs-vswitchd  [kernel.kallsyms]   [k] mutex_unlock
 1.25%  ovs-vswitchd  [kernel.kallsyms]   [k] skb_put
 1.17%  ovs-vswitchd  [kernel.kallsyms]   [k] nla_put
 1.14%  ovs-vswitchd  libpthread-2.19.so  [.] 0x000000000000f8b1
 1.12%  ovs-vswitchd  ovs-vswitchd        [.] revalidator_sweep__.constprop.16

New revalidator thread perf:
 7.78%  ovs-vswitchd  [kernel.kallsyms]   [k] mspin_lock
 7.18%  ovs-vswitchd  [kernel.kallsyms]   [k] memcpy
 5.00%  ovs-vswitchd  ovs-vswitchd        [.] revalidate.isra.12
 3.84%  ovs-vswitchd  [kernel.kallsyms]   [k] ovs_flow_cmd_fill_info
 3.49%  ovs-vswitchd  [kernel.kallsyms]   [k] _raw_spin_lock_bh
 3.06%  ovs-vswitchd  [kernel.kallsyms]   [k] ovs_flow_cmd_fill_match
 2.79%  ovs-vswitchd  [kernel.kallsyms]   [k] ovs_flow_tbl_dump_next
 2.01%  ovs-vswitchd  [kernel.kallsyms]   [k] masked_flow_lookup
 2.01%  ovs-vswitchd  [kernel.kallsyms]   [k] _raw_read_lock
 1.67%  ovs-vswitchd  libpthread-2.19.so  [.] pthread_mutex_lock
 1.60%  ovs-vswitchd  [kernel.kallsyms]   [k] mutex_spin_on_owner
 1.56%  ovs-vswitchd  libpthread-2.19.so  [.] 0x000000000000f8ad
 1.53%  ovs-vswitchd  [kernel.kallsyms]   [k] mutex_unlock
 1.52%  ovs-vswitchd  ovs-vswitchd        [.] revalidator_sweep__.constprop.16

Suggested-by: Jesse Gross <jesse at nicira.com>
Signed-off-by: Joe Stringer <joestringer at nicira.com>
---
I'm operating under the assumption that we can't simply cache the
netlink key/mask exactly as userspace provided them, because of the
following comment in ofproto/ofproto-dpif-upcall.c:

/* Since the kernel is free to ignore wildcarded bits in the mask, we can't
 * directly check that the masks are the same.  Instead we check that the
 * mask in the kernel is more specific i.e. less wildcarded, than what
 * we've calculated here.  This guarantees we don't catch any packets we
 * shouldn't with the megaflow. */

I.e., the kernel may change the mask as it stores the flow, so caching
the userspace version here could cause flow dump to report the mask
incorrectly. The simple approach, which this patch takes, is to convert
the key and mask to the internal format and back. An alternative would
be to copy the userspace version, then iterate through the copy and
adjust it to match the current mask conversion behaviour. I wanted to
check whether the assumption above holds before trying that out.
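
For reference, the "more specific" check described in that comment is
essentially a byte-wise superset test between the two masks. A minimal
userspace sketch (the helper name and signature are illustrative, not
code from ofproto-dpif-upcall.c):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* Illustrative only: returns true if every bit that userspace
 * unwildcarded (set in 'user_mask') is also unwildcarded in the mask
 * reported by the kernel, i.e. the kernel's mask is at least as
 * specific as the one calculated in userspace. */
static bool
kernel_mask_is_more_specific(const uint8_t *kernel_mask,
                             const uint8_t *user_mask, size_t len)
{
    size_t i;

    for (i = 0; i < len; i++) {
        if ((kernel_mask[i] & user_mask[i]) != user_mask[i]) {
            return false;
        }
    }
    return true;
}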

Is there somewhere that lists which fields the datapath supports/doesn't
support wildcards for?
---
 datapath/datapath.c   |   30 +++++++++++++++++++++++++-----
 datapath/flow.h       |    1 +
 datapath/flow_table.c |   12 +++++++++++-
 datapath/flow_table.h |    2 +-
 4 files changed, 38 insertions(+), 7 deletions(-)

diff --git a/datapath/datapath.c b/datapath/datapath.c
index bdbcbd5..45586cf 100644
--- a/datapath/datapath.c
+++ b/datapath/datapath.c
@@ -566,7 +566,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
 		packet->protocol = htons(ETH_P_802_2);
 
 	/* Build an sw_flow for sending this packet. */
-	flow = ovs_flow_alloc();
+	flow = ovs_flow_alloc(0);
 	err = PTR_ERR(flow);
 	if (IS_ERR(flow))
 		goto err_kfree_skb;
@@ -684,11 +684,16 @@ static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats,
 	}
 }
 
+static size_t ovs_nl_match_size(void)
+{
+	return nla_total_size(key_attr_size()) /* OVS_FLOW_ATTR_KEY */
+		+ nla_total_size(key_attr_size()); /* OVS_FLOW_ATTR_MASK */
+}
+
 static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts)
 {
 	return NLMSG_ALIGN(sizeof(struct ovs_header))
-		+ nla_total_size(key_attr_size()) /* OVS_FLOW_ATTR_KEY */
-		+ nla_total_size(key_attr_size()) /* OVS_FLOW_ATTR_MASK */
+		+ ovs_nl_match_size()
 		+ nla_total_size(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */
 		+ nla_total_size(1) /* OVS_FLOW_ATTR_TCP_FLAGS */
 		+ nla_total_size(8) /* OVS_FLOW_ATTR_USED */
@@ -698,11 +703,19 @@ static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts)
 /* Called with ovs_mutex or RCU read lock. */
 static int ovs_flow_cmd_fill_match(struct datapath *dp,
 				   const struct sw_flow *flow,
+				   const struct sk_buff *nl_cache,
 				   struct sk_buff *skb)
 {
 	struct nlattr *nla;
 	int err;
 
+	if (skb_tailroom(skb) < ovs_nl_match_size())
+		goto nla_put_failure;
+
+	if (nl_cache)
+		return skb_copy_bits(nl_cache, 0, skb_put(skb, nl_cache->len),
+				     nl_cache->len);
+
 	/* Fill flow key. */
 	nla = nla_nest_start(skb, OVS_FLOW_ATTR_KEY);
 	if (!nla)
@@ -821,7 +834,7 @@ static int ovs_flow_cmd_fill_info(struct datapath *dp,
 		return -EMSGSIZE;
 	ovs_header->dp_ifindex = dp_ifindex;
 
-	err = ovs_flow_cmd_fill_match(dp, flow, skb);
+	err = ovs_flow_cmd_fill_match(dp, flow, flow->nl_match_cache, skb);
 	if (err)
 		goto error;
 
@@ -905,7 +918,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
 
 	/* Most of the time we need to allocate a new flow, do it before
 	 * locking. */
-	new_flow = ovs_flow_alloc();
+	new_flow = ovs_flow_alloc(ovs_nl_match_size());
 	if (IS_ERR(new_flow)) {
 		error = PTR_ERR(new_flow);
 		goto error;
@@ -957,6 +970,13 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
 			goto err_unlock_ovs;
 		}
 
+		/* Init netlink match cache. */
+		if (likely(new_flow->nl_match_cache)) {
+			error = ovs_flow_cmd_fill_match(dp, new_flow, NULL,
+							new_flow->nl_match_cache);
+			BUG_ON(error < 0);
+		}
+
 		if (unlikely(reply)) {
 			error = ovs_flow_cmd_fill_info(dp, new_flow,
 						       ovs_header->dp_ifindex,
diff --git a/datapath/flow.h b/datapath/flow.h
index f6afa48..f1bf289 100644
--- a/datapath/flow.h
+++ b/datapath/flow.h
@@ -189,6 +189,7 @@ struct sw_flow {
 	struct sw_flow_key unmasked_key;
 	struct sw_flow_mask *mask;
 	struct sw_flow_actions __rcu *sf_acts;
+	struct sk_buff *nl_match_cache;
 	struct flow_stats __rcu *stats[]; /* One for each NUMA node.  First one
 					   * is allocated at flow creation time,
 					   * the rest are allocated on demand
diff --git a/datapath/flow_table.c b/datapath/flow_table.c
index 9ab1020..879c4c3 100644
--- a/datapath/flow_table.c
+++ b/datapath/flow_table.c
@@ -80,7 +80,7 @@ void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src,
 		*d++ = *s++ & *m++;
 }
 
-struct sw_flow *ovs_flow_alloc(void)
+struct sw_flow *ovs_flow_alloc(size_t match_len)
 {
 	struct sw_flow *flow;
 	struct flow_stats *stats;
@@ -92,6 +92,7 @@ struct sw_flow *ovs_flow_alloc(void)
 
 	flow->sf_acts = NULL;
 	flow->mask = NULL;
+	flow->nl_match_cache = NULL;
 	flow->stats_last_writer = NUMA_NO_NODE;
 
 	/* Initialize the default stat node. */
@@ -100,6 +101,12 @@ struct sw_flow *ovs_flow_alloc(void)
 	if (!stats)
 		goto err;
 
+	if (match_len) {
+		flow->nl_match_cache = alloc_skb(match_len, GFP_KERNEL);
+		if (!flow->nl_match_cache)
+			goto err_free_stats;
+	}
+
 	spin_lock_init(&stats->lock);
 
 	RCU_INIT_POINTER(flow->stats[0], stats);
@@ -109,6 +116,8 @@ struct sw_flow *ovs_flow_alloc(void)
 			RCU_INIT_POINTER(flow->stats[node], NULL);
 
 	return flow;
+err_free_stats:
+	kmem_cache_free(flow_stats_cache, stats);
 err:
 	kmem_cache_free(flow_cache, flow);
 	return ERR_PTR(-ENOMEM);
@@ -146,6 +155,7 @@ static void flow_free(struct sw_flow *flow)
 {
 	int node;
 
+	kfree_skb(flow->nl_match_cache);
 	kfree((struct sw_flow_actions __force *)flow->sf_acts);
 	for_each_node(node)
 		if (flow->stats[node])
diff --git a/datapath/flow_table.h b/datapath/flow_table.h
index a05d36a..5edf1c5 100644
--- a/datapath/flow_table.h
+++ b/datapath/flow_table.h
@@ -69,7 +69,7 @@ extern struct kmem_cache *flow_stats_cache;
 int ovs_flow_init(void);
 void ovs_flow_exit(void);
 
-struct sw_flow *ovs_flow_alloc(void);
+struct sw_flow *ovs_flow_alloc(size_t match_len);
 void ovs_flow_free(struct sw_flow *, bool deferred);
 
 int ovs_flow_tbl_init(struct flow_table *);
-- 
1.7.10.4