[ovs-dev] [RFC PATCH 2/4] datapath: NUMA node flow stats.

Jarno Rajahalme jrajahalme at nicira.com
Thu Jan 9 00:15:46 UTC 2014


Keep kernel flow stats for each NUMA node rather than for each (logical)
CPU.  This nearly eliminates the kernel-side OVS locking overhead that
otherwise sits at the top of the perf report, and improves TCP_CRR test
scores by roughly 50% (with 4 handler and 2 revalidator threads, and a
TCP port flow forcing all connection setups to userspace).  My limited
testing revealed no performance regression in tests where only two
wildcarded kernel flows are used.
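
For readers skimming the thread, a minimal sketch of the per-NUMA-node
update path this patch introduces is below.  The names follow the diff;
the struct layout is simplified for illustration and the helper
numa_stats_update() is not part of the patch itself.

    /*
     * Sketch only: one flow_stats slot per NUMA node, indexed by
     * numa_node_id(), each protected by its own spinlock.
     */
    #include <linux/jiffies.h>
    #include <linux/skbuff.h>
    #include <linux/spinlock.h>
    #include <linux/topology.h>
    #include <linux/types.h>

    struct flow_stats {
        u64 packet_count;       /* Number of packets matched. */
        u64 byte_count;         /* Number of bytes matched. */
        unsigned long used;     /* Last-used time (in jiffies). */
        spinlock_t lock;        /* Protects the fields above. */
        __be16 tcp_flags;       /* Union of TCP flags seen. */
    };

    /* Contention is limited to CPUs on the same NUMA node instead of
     * all CPUs sharing one flow-wide lock, while stats readers only
     * need to walk ovs_numa_nodes entries rather than every possible
     * CPU. */
    static void numa_stats_update(struct flow_stats *numa_stats,
                                  const struct sk_buff *skb,
                                  __be16 tcp_flags)
    {
        struct flow_stats *stats = &numa_stats[numa_node_id()];

        spin_lock_bh(&stats->lock);
        stats->used = jiffies;
        stats->packet_count++;
        stats->byte_count += skb->len;
        stats->tcp_flags |= tcp_flags;
        spin_unlock_bh(&stats->lock);
    }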

Signed-off-by: Jarno Rajahalme <jrajahalme at nicira.com>
---
 datapath/flow.c       |   71 +++++++++++++++----------------------------------
 datapath/flow.h       |    4 +--
 datapath/flow_table.c |   41 ++++++++++++++++++----------
 datapath/flow_table.h |    2 ++
 4 files changed, 52 insertions(+), 66 deletions(-)

diff --git a/datapath/flow.c b/datapath/flow.c
index b1b2596..0002759 100644
--- a/datapath/flow.c
+++ b/datapath/flow.c
@@ -67,10 +67,10 @@ void ovs_flow_stats_update(struct sw_flow *flow, struct sk_buff *skb)
 	struct flow_stats *stats;
 	__be16 tcp_flags = 0;
 
-	if (!flow->stats.is_percpu)
+	if (!flow->stats.per_numa_mem)
 		stats = flow->stats.stat;
 	else
-		stats = this_cpu_ptr(flow->stats.cpu_stats);
+		stats = &flow->stats.numa_stats[numa_node_id()];
 
 	if ((flow->key.eth.type == htons(ETH_P_IP) ||
 	     flow->key.eth.type == htons(ETH_P_IPV6)) &&
@@ -79,23 +79,20 @@ void ovs_flow_stats_update(struct sw_flow *flow, struct sk_buff *skb)
 		tcp_flags = TCP_FLAGS_BE16(tcp_hdr(skb));
 	}
 
-	spin_lock(&stats->lock);
+	spin_lock_bh(&stats->lock);
 	stats->used = jiffies;
 	stats->packet_count++;
 	stats->byte_count += skb->len;
 	stats->tcp_flags |= tcp_flags;
-	spin_unlock(&stats->lock);
+	spin_unlock_bh(&stats->lock);
 }
 
-static void stats_read(struct flow_stats *stats, bool lock_bh,
+static void stats_read(struct flow_stats *stats,
 		       struct ovs_flow_stats *ovs_stats,
 		       unsigned long *used, __be16 *tcp_flags)
 {
 	if (stats->packet_count) {
-		if (lock_bh)
-			spin_lock_bh(&stats->lock);
-		else
-			spin_lock(&stats->lock);
+		spin_lock_bh(&stats->lock);
 
 		if (time_after(stats->used, *used))
 			*used = stats->used;
@@ -103,73 +100,47 @@ static void stats_read(struct flow_stats *stats, bool lock_bh,
 		ovs_stats->n_packets += stats->packet_count;
 		ovs_stats->n_bytes += stats->byte_count;
 
-		if (lock_bh)
-			spin_unlock_bh(&stats->lock);
-		else
-			spin_unlock(&stats->lock);
+		spin_unlock_bh(&stats->lock);
 	}
 }
 
 void ovs_flow_stats_get(struct sw_flow *flow, struct ovs_flow_stats *ovs_stats,
 			unsigned long *used, __be16 *tcp_flags)
 {
-	int cpu, cur_cpu;
-
 	*used = 0;
 	*tcp_flags = 0;
 	memset(ovs_stats, 0, sizeof(*ovs_stats));
 
-	if (!flow->stats.is_percpu) {
-		stats_read(flow->stats.stat, true, ovs_stats, used, tcp_flags);
+	if (!flow->stats.per_numa_mem) {
+		stats_read(flow->stats.stat, ovs_stats, used, tcp_flags);
 	} else {
-		cur_cpu = get_cpu();
-
-		for_each_possible_cpu(cpu) {
-			struct flow_stats *stats;
-			bool lock_bh;
-
-			stats = per_cpu_ptr(flow->stats.cpu_stats, cpu);
-			lock_bh = (cpu == cur_cpu);
-			stats_read(stats, lock_bh, ovs_stats, used, tcp_flags);
-		}
-		put_cpu();
+		int node;
+		for (node = 0; node < ovs_numa_nodes; node++)
+			stats_read(&flow->stats.numa_stats[node],
+				   ovs_stats, used, tcp_flags);
 	}
 }
 
-static void stats_reset(struct flow_stats *stats, bool lock_bh)
+static void stats_reset(struct flow_stats *stats)
 {
-	if (lock_bh)
-		spin_lock_bh(&stats->lock);
-	else
-		spin_lock(&stats->lock);
+	spin_lock_bh(&stats->lock);
 
 	stats->used = 0;
 	stats->packet_count = 0;
 	stats->byte_count = 0;
 	stats->tcp_flags = 0;
 
-	if (lock_bh)
-		spin_unlock_bh(&stats->lock);
-	else
-		spin_unlock(&stats->lock);
+	spin_unlock_bh(&stats->lock);
 }
 
 void ovs_flow_stats_clear(struct sw_flow *flow)
 {
-	int cpu, cur_cpu;
-
-	if (!flow->stats.is_percpu) {
-		stats_reset(flow->stats.stat, true);
+	if (!flow->stats.per_numa_mem) {
+		stats_reset(flow->stats.stat);
 	} else {
-		cur_cpu = get_cpu();
-
-		for_each_possible_cpu(cpu) {
-			bool lock_bh;
-
-			lock_bh = (cpu == cur_cpu);
-			stats_reset(per_cpu_ptr(flow->stats.cpu_stats, cpu), lock_bh);
-		}
-		put_cpu();
+		int node;
+		for (node = 0; node < ovs_numa_nodes; node++)
+			stats_reset(&flow->stats.numa_stats[node]);
 	}
 }
 
diff --git a/datapath/flow.h b/datapath/flow.h
index eafcfd8..14ea93f 100644
--- a/datapath/flow.h
+++ b/datapath/flow.h
@@ -158,10 +158,10 @@ struct flow_stats {
 };
 
 struct sw_flow_stats {
-	bool is_percpu;
+	char * per_numa_mem;		/* NULL for shared stats on 'stat'. */
 	union {
 		struct flow_stats *stat;
-		struct flow_stats __percpu *cpu_stats;
+		struct flow_stats *numa_stats;
 	};
 };
 
diff --git a/datapath/flow_table.c b/datapath/flow_table.c
index 4232b82..ec8e5a4 100644
--- a/datapath/flow_table.c
+++ b/datapath/flow_table.c
@@ -72,10 +72,9 @@ void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src,
 		*d++ = *s++ & *m++;
 }
 
-struct sw_flow *ovs_flow_alloc(bool percpu_stats)
+struct sw_flow *ovs_flow_alloc(bool pernumanode_stats)
 {
 	struct sw_flow *flow;
-	int cpu;
 
 	flow = kmem_cache_alloc(flow_cache, GFP_KERNEL);
 	if (!flow)
@@ -84,25 +83,28 @@ struct sw_flow *ovs_flow_alloc(bool percpu_stats)
 	flow->sf_acts = NULL;
 	flow->mask = NULL;
 
-	flow->stats.is_percpu = percpu_stats;
+	if (!pernumanode_stats || !ovs_numa_nodes) {
+		flow->stats.per_numa_mem = NULL;
 
-	if (!percpu_stats) {
 		flow->stats.stat = kzalloc(sizeof(*flow->stats.stat), GFP_KERNEL);
 		if (!flow->stats.stat)
 			goto err;
 
 		spin_lock_init(&flow->stats.stat->lock);
 	} else {
-		flow->stats.cpu_stats = alloc_percpu(struct flow_stats);
-		if (!flow->stats.cpu_stats)
-			goto err;
+		int node;
 
-		for_each_possible_cpu(cpu) {
-			struct flow_stats *cpu_stats;
+		flow->stats.per_numa_mem
+			= kzalloc(sizeof(struct flow_stats) * ovs_numa_nodes
+				  + (SMP_CACHE_BYTES - 1), GFP_KERNEL);
 
-			cpu_stats = per_cpu_ptr(flow->stats.cpu_stats, cpu);
-			spin_lock_init(&cpu_stats->lock);
-		}
+		if (!flow->stats.per_numa_mem)
+			goto err;
+
+		flow->stats.numa_stats
+			= (struct flow_stats *)L1_CACHE_ALIGN((u64)flow->stats.per_numa_mem);
+		for (node = 0; node < ovs_numa_nodes; node++)
+			spin_lock_init(&flow->stats.numa_stats[node].lock);
 	}
 	return flow;
 err:
@@ -141,8 +143,8 @@ static struct flex_array *alloc_buckets(unsigned int n_buckets)
 static void flow_free(struct sw_flow *flow)
 {
 	kfree((struct sf_flow_acts __force *)flow->sf_acts);
-	if (flow->stats.is_percpu)
-		free_percpu(flow->stats.cpu_stats);
+	if (flow->stats.per_numa_mem)
+		kfree(flow->stats.per_numa_mem);
 	else
 		kfree(flow->stats.stat);
 	kmem_cache_free(flow_cache, flow);
@@ -601,10 +603,14 @@ int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,
 	return 0;
 }
 
+int ovs_numa_nodes = 0;
+
 /* Initializes the flow module.
  * Returns zero if successful or a negative error code. */
 int ovs_flow_init(void)
 {
+	int node;
+
 	BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long));
 	BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long));
 
@@ -613,6 +619,13 @@ int ovs_flow_init(void)
 	if (flow_cache == NULL)
 		return -ENOMEM;
 
+	/* Count the NUMA nodes. */
+	for_each_node_with_cpus(node) {
+		if (node >= ovs_numa_nodes)
+			ovs_numa_nodes = node + 1;
+	}
+	pr_info("ovs_flow_init: ovs_numa_nodes: %d.\n", ovs_numa_nodes);
+
 	return 0;
 }
 
diff --git a/datapath/flow_table.h b/datapath/flow_table.h
index 1996e34..3ff1077 100644
--- a/datapath/flow_table.h
+++ b/datapath/flow_table.h
@@ -52,6 +52,8 @@ struct flow_table {
 	unsigned int count;
 };
 
+extern int ovs_numa_nodes;
+
 int ovs_flow_init(void);
 void ovs_flow_exit(void);
 
-- 
1.7.10.4



