[ovs-dev] [PATCH v5 2/2] datapath: Per NUMA node flow stats.

Jarno Rajahalme jrajahalme at nicira.com
Thu Feb 6 23:13:53 UTC 2014


    Keep kernel flow stats for each NUMA node rather than for each
    (logical) CPU.  This avoids using the per-CPU allocator, removes
    most of the kernel-side OVS locking overhead that otherwise tops
    the perf reports, and allows OVS to scale better with a higher
    number of threads.

    With 9 handlers and 4 revalidators, the netperf TCP_CRR flow setup
    rate doubles on a server with two hyper-threaded physical CPUs (16
    logical cores each) compared to the current OVS master.  Tested
    with a non-trivial flow table containing a TCP port match rule that
    forces all new connections with unique port numbers to OVS
    userspace.  The IP addresses are still wildcarded, so the kernel
    flows are not considered exact-match 5-tuple flows.  Flows of this
    type can be expected to appear in large numbers as a result of the
    more effective wildcarding made possible by improvements in the OVS
    userspace flow classifier.

    Perf results for this test (master):

    Events: 305K cycles
    +   8.43%     ovs-vswitchd  [kernel.kallsyms]   [k] mutex_spin_on_owner
    +   5.64%     ovs-vswitchd  [kernel.kallsyms]   [k] __ticket_spin_lock
    +   4.75%     ovs-vswitchd  ovs-vswitchd        [.] find_match_wc
    +   3.32%     ovs-vswitchd  libpthread-2.15.so  [.] pthread_mutex_lock
    +   2.61%     ovs-vswitchd  [kernel.kallsyms]   [k] pcpu_alloc_area
    +   2.19%     ovs-vswitchd  ovs-vswitchd        [.] flow_hash_in_minimask_range
    +   2.03%          swapper  [kernel.kallsyms]   [k] intel_idle
    +   1.84%     ovs-vswitchd  libpthread-2.15.so  [.] pthread_mutex_unlock
    +   1.64%     ovs-vswitchd  ovs-vswitchd        [.] classifier_lookup
    +   1.58%     ovs-vswitchd  libc-2.15.so        [.] 0x7f4e6
    +   1.07%     ovs-vswitchd  [kernel.kallsyms]   [k] memset
    +   1.03%          netperf  [kernel.kallsyms]   [k] __ticket_spin_lock
    +   0.92%          swapper  [kernel.kallsyms]   [k] __ticket_spin_lock
    ...

    And after this patch:

    Events: 356K cycles
    +   6.85%     ovs-vswitchd  ovs-vswitchd        [.] find_match_wc
    +   4.63%     ovs-vswitchd  libpthread-2.15.so  [.] pthread_mutex_lock
    +   3.06%     ovs-vswitchd  [kernel.kallsyms]   [k] __ticket_spin_lock
    +   2.81%     ovs-vswitchd  ovs-vswitchd        [.] flow_hash_in_minimask_range
    +   2.51%     ovs-vswitchd  libpthread-2.15.so  [.] pthread_mutex_unlock
    +   2.27%     ovs-vswitchd  ovs-vswitchd        [.] classifier_lookup
    +   1.84%     ovs-vswitchd  libc-2.15.so        [.] 0x15d30f
    +   1.74%     ovs-vswitchd  [kernel.kallsyms]   [k] mutex_spin_on_owner
    +   1.47%          swapper  [kernel.kallsyms]   [k] intel_idle
    +   1.34%     ovs-vswitchd  ovs-vswitchd        [.] flow_hash_in_minimask
    +   1.33%     ovs-vswitchd  ovs-vswitchd        [.] rule_actions_unref
    +   1.16%     ovs-vswitchd  ovs-vswitchd        [.] hindex_node_with_hash
    +   1.16%     ovs-vswitchd  ovs-vswitchd        [.] do_xlate_actions
    +   1.09%     ovs-vswitchd  ovs-vswitchd        [.] ofproto_rule_ref
    +   1.01%          netperf  [kernel.kallsyms]   [k] __ticket_spin_lock
    ...

    There is a small increase in kernel spinlock overhead due to the
    same spinlock being shared between multiple cores of the same
    physical CPU, but this is barely visible in the netperf TCP_CRR
    results (maybe a ~1% performance drop, hard to tell exactly due to
    variance in the test results) when testing kernel module throughput
    (no userspace activity, only a handful of kernel flows).

    On flow setup, a single stats instance is allocated (for NUMA node
    0).  As CPUs from other NUMA nodes start updating stats, new
    node-specific stats instances are allocated.  The allocation on the
    packet processing code path never sleeps and does not dip into
    emergency memory pools, minimizing the allocation latency.  If the
    allocation fails, the existing preallocated stats instance is used
    instead.  Also, as long as only CPUs from one NUMA node are
    updating the preallocated stats instance, no additional stats
    instances are allocated.  This eliminates the need to pre-allocate
    stats instances that will never be used and relieves the stats
    reader of the burden of reading stats that are never updated.
    Finally, this allocation strategy allows the removal of the
    existing exact-5-tuple heuristics.

    Signed-off-by: Jarno Rajahalme <jrajahalme at nicira.com>
---
    v5: Addressed more comments by Pravin:
    - Removed prefetching as it may decrease performance when action execution
      takes more time (as with tunnels).
    - Disable preemption when using spin_lock() to avoid potential deadlocks.
    - Mark the flow stats cache as __read_mostly.

    v4: Addressed comments by Pravin:
    - Do not use __GFP_NOTRACK
    - Use spin_lock() instead of spin_lock_bh() when locking stats on remote nodes.
    - Be hotplug compatible by using num_possible_nodes() when allocating.
    - Remove unnecessary struct alignment attributes.

 datapath/datapath.c     |   13 ++--
 datapath/flow.c         |  170 +++++++++++++++++++++++++++--------------------
 datapath/flow.h         |   17 +++--
 datapath/flow_netlink.c |   58 ++--------------
 datapath/flow_netlink.h |    1 -
 datapath/flow_table.c   |   57 +++++++++-------
 datapath/flow_table.h   |    4 +-
 7 files changed, 151 insertions(+), 169 deletions(-)

diff --git a/datapath/datapath.c b/datapath/datapath.c
index d528ba0..cda91a6 100644
--- a/datapath/datapath.c
+++ b/datapath/datapath.c
@@ -252,7 +252,7 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb)
 	OVS_CB(skb)->flow = flow;
 	OVS_CB(skb)->pkt_key = &key;
 
-	ovs_flow_stats_update(OVS_CB(skb)->flow, skb);
+	ovs_flow_stats_update(flow, skb);
 	ovs_execute_actions(dp, skb);
 	stats_counter = &stats->n_hit;
 
@@ -525,7 +525,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
 		packet->protocol = htons(ETH_P_802_2);
 
 	/* Build an sw_flow for sending this packet. */
-	flow = ovs_flow_alloc(false);
+	flow = ovs_flow_alloc();
 	err = PTR_ERR(flow);
 	if (IS_ERR(flow))
 		goto err_kfree_skb;
@@ -783,7 +783,6 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
 	struct datapath *dp;
 	struct sw_flow_actions *acts = NULL;
 	struct sw_flow_match match;
-	bool exact_5tuple;
 	int error;
 
 	/* Extract key. */
@@ -792,7 +791,7 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
 		goto error;
 
 	ovs_match_init(&match, &key, &mask);
-	error = ovs_nla_get_match(&match, &exact_5tuple,
+	error = ovs_nla_get_match(&match,
 				  a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK]);
 	if (error)
 		goto error;
@@ -831,7 +830,7 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
 			goto err_unlock_ovs;
 
 		/* Allocate flow. */
-		flow = ovs_flow_alloc(!exact_5tuple);
+		flow = ovs_flow_alloc();
 		if (IS_ERR(flow)) {
 			error = PTR_ERR(flow);
 			goto err_unlock_ovs;
@@ -915,7 +914,7 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
 	}
 
 	ovs_match_init(&match, &key, NULL);
-	err = ovs_nla_get_match(&match, NULL, a[OVS_FLOW_ATTR_KEY], NULL);
+	err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL);
 	if (err)
 		return err;
 
@@ -969,7 +968,7 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
 	}
 
 	ovs_match_init(&match, &key, NULL);
-	err = ovs_nla_get_match(&match, NULL, a[OVS_FLOW_ATTR_KEY], NULL);
+	err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL);
 	if (err)
 		goto unlock;
 
diff --git a/datapath/flow.c b/datapath/flow.c
index abe6789..e86034e 100644
--- a/datapath/flow.c
+++ b/datapath/flow.c
@@ -64,14 +64,10 @@ u64 ovs_flow_used_time(unsigned long flow_jiffies)
 
 void ovs_flow_stats_update(struct sw_flow *flow, struct sk_buff *skb)
 {
-	struct flow_stats *stats;
+	int node = numa_node_id();
+	struct flow_stats *stats = flow->stats[node];
 	__be16 tcp_flags = 0;
 
-	if (!flow->stats.is_percpu)
-		stats = flow->stats.stat;
-	else
-		stats = this_cpu_ptr(flow->stats.cpu_stats);
-
 	if ((flow->key.eth.type == htons(ETH_P_IP) ||
 	     flow->key.eth.type == htons(ETH_P_IPV6)) &&
 	    flow->key.ip.frag != OVS_FRAG_TYPE_LATER &&
@@ -80,96 +76,126 @@ void ovs_flow_stats_update(struct sw_flow *flow, struct sk_buff *skb)
 		tcp_flags = TCP_FLAGS_BE16(tcp_hdr(skb));
 	}
 
-	spin_lock(&stats->lock);
+	/* Check if already have node-specific stats. */
+	if (likely(stats))
+		spin_lock(&stats->lock);
+	else {
+		stats = flow->stats[0]; /* Pre-allocated. */
+		spin_lock(&stats->lock);
+
+		/* If the current NUMA-node is the only writer on the
+		 * pre-allocated stats keep using them.
+		 * A previous locker may have already allocated the stats,
+		 * so we need to check again.  If the node-specific stats
+		 * were already allocated, we update the pre-allocated stats
+		 * as we have already locked them. */
+		if (likely(stats->last_writer != node && stats->last_writer >= 0
+			   && !flow->stats[node])) {
+			/* Try to allocate node-specific stats. */
+			struct flow_stats *new_stats;
+
+			new_stats = kmem_cache_alloc_node(flow_stats_cache,
+							  GFP_THISNODE |
+							  __GFP_NOMEMALLOC,
+							  node);
+			if (likely(new_stats)) {
+				new_stats->used = jiffies;
+				new_stats->packet_count = 1;
+				new_stats->byte_count = skb->len;
+				new_stats->tcp_flags = tcp_flags;
+				new_stats->last_writer = node;
+				spin_lock_init(&new_stats->lock);
+
+				flow->stats[node] = new_stats;
+				goto unlock; /* Unlock the pre-allocated stats. */
+			}
+		}
+	}
+
 	stats->used = jiffies;
 	stats->packet_count++;
 	stats->byte_count += skb->len;
 	stats->tcp_flags |= tcp_flags;
+	stats->last_writer = node;
+unlock:
 	spin_unlock(&stats->lock);
 }
 
-static void stats_read(struct flow_stats *stats, bool lock_bh,
-		       struct ovs_flow_stats *ovs_stats,
-		       unsigned long *used, __be16 *tcp_flags)
-{
-	if (lock_bh)
-		spin_lock_bh(&stats->lock);
-	else
-		spin_lock(&stats->lock);
-
-	if (time_after(stats->used, *used))
-		*used = stats->used;
-	*tcp_flags |= stats->tcp_flags;
-	ovs_stats->n_packets += stats->packet_count;
-	ovs_stats->n_bytes += stats->byte_count;
-
-	if (lock_bh)
-		spin_unlock_bh(&stats->lock);
-	else
-		spin_unlock(&stats->lock);
-}
-
 void ovs_flow_stats_get(struct sw_flow *flow, struct ovs_flow_stats *ovs_stats,
 			unsigned long *used, __be16 *tcp_flags)
 {
-	int cpu, cur_cpu;
+	int node, local_node;
+	struct flow_stats *stats;
 
-	*used = 0;
-	*tcp_flags = 0;
-	memset(ovs_stats, 0, sizeof(*ovs_stats));
+	preempt_disable();
+	local_node = numa_node_id();
 
-	if (!flow->stats.is_percpu) {
-		stats_read(flow->stats.stat, true, ovs_stats, used, tcp_flags);
+	/* Start from the local node. */
+	stats = flow->stats[local_node];
+	if (stats && likely(stats->packet_count)) {
+		spin_lock_bh(&stats->lock);
+		*used = stats->used;
+		*tcp_flags = stats->tcp_flags;
+		ovs_stats->n_packets = stats->packet_count;
+		ovs_stats->n_bytes = stats->byte_count;
+		spin_unlock_bh(&stats->lock);
 	} else {
-		cur_cpu = get_cpu();
-
-		for_each_possible_cpu(cpu) {
-			struct flow_stats *stats;
-			bool lock_bh;
+		*used = 0;
+		*tcp_flags = 0;
+		ovs_stats->n_packets = 0;
+		ovs_stats->n_bytes = 0;
+	}
 
-			stats = per_cpu_ptr(flow->stats.cpu_stats, cpu);
-			lock_bh = (cpu == cur_cpu);
-			stats_read(stats, lock_bh, ovs_stats, used, tcp_flags);
+	/* Collect stats from other nodes. */
+	for_each_node(node) {
+		if (node == local_node)
+			continue; /* Done already. */
+		stats = flow->stats[node];
+		if (stats && likely(stats->packet_count)) {
+			spin_lock(&stats->lock);
+			if (time_after(stats->used, *used))
+				*used = stats->used;
+			*tcp_flags |= stats->tcp_flags;
+			ovs_stats->n_packets += stats->packet_count;
+			ovs_stats->n_bytes += stats->byte_count;
+			spin_unlock(&stats->lock);
 		}
-		put_cpu();
 	}
-}
-
-static void stats_reset(struct flow_stats *stats, bool lock_bh)
-{
-	if (lock_bh)
-		spin_lock_bh(&stats->lock);
-	else
-		spin_lock(&stats->lock);
-
-	stats->used = 0;
-	stats->packet_count = 0;
-	stats->byte_count = 0;
-	stats->tcp_flags = 0;
-
-	if (lock_bh)
-		spin_unlock_bh(&stats->lock);
-	else
-		spin_unlock(&stats->lock);
+	preempt_enable();
 }
 
 void ovs_flow_stats_clear(struct sw_flow *flow)
 {
-	int cpu, cur_cpu;
-
-	if (!flow->stats.is_percpu) {
-		stats_reset(flow->stats.stat, true);
-	} else {
-		cur_cpu = get_cpu();
+	int node, local_node;
+	struct flow_stats *stats;
 
-		for_each_possible_cpu(cpu) {
-			bool lock_bh;
+	preempt_disable();
+	local_node = numa_node_id();
 
-			lock_bh = (cpu == cur_cpu);
-			stats_reset(per_cpu_ptr(flow->stats.cpu_stats, cpu), lock_bh);
+	/* Start from the current node. */
+	stats = flow->stats[local_node];
+	if (stats) {
+		spin_lock_bh(&stats->lock);
+		stats->used = 0;
+		stats->packet_count = 0;
+		stats->byte_count = 0;
+		stats->tcp_flags = 0;
+		spin_unlock_bh(&stats->lock);
+	}
+	for_each_node(node) {
+		if (node == local_node)
+			continue; /* Done already. */
+		stats = flow->stats[node];
+		if (stats) {
+			spin_lock(&stats->lock);
+			stats->used = 0;
+			stats->packet_count = 0;
+			stats->byte_count = 0;
+			stats->tcp_flags = 0;
+			spin_unlock(&stats->lock);
 		}
-		put_cpu();
 	}
+	preempt_enable();
 }
 
 static int check_header(struct sk_buff *skb, int len)
diff --git a/datapath/flow.h b/datapath/flow.h
index eafcfd8..f6cce35 100644
--- a/datapath/flow.h
+++ b/datapath/flow.h
@@ -155,14 +155,9 @@ struct flow_stats {
 	unsigned long used;		/* Last used time (in jiffies). */
 	spinlock_t lock;		/* Lock for atomic stats update. */
 	__be16 tcp_flags;		/* Union of seen TCP flags. */
-};
-
-struct sw_flow_stats {
-	bool is_percpu;
-	union {
-		struct flow_stats *stat;
-		struct flow_stats __percpu *cpu_stats;
-	};
+	int last_writer;		/* NUMA-node id of the last writer or
+					 * -1. Meaningful for 'stats[0]' only.
+					 */
 };
 
 struct sw_flow {
@@ -174,7 +169,11 @@ struct sw_flow {
 	struct sw_flow_key unmasked_key;
 	struct sw_flow_mask *mask;
 	struct sw_flow_actions __rcu *sf_acts;
-	struct sw_flow_stats stats;
+	struct flow_stats *stats[];	/* One for each NUMA node.  First one
+					 * is allocated at flow creation time,
+					 * the rest are allocated on demand
+					 * while holding the 'stats[0].lock'.
+					 */
 };
 
 struct arp_eth_header {
diff --git a/datapath/flow_netlink.c b/datapath/flow_netlink.c
index 39fe4bf..3024a61 100644
--- a/datapath/flow_netlink.c
+++ b/datapath/flow_netlink.c
@@ -268,20 +268,6 @@ static bool is_all_zero(const u8 *fp, size_t size)
 	return true;
 }
 
-static bool is_all_set(const u8 *fp, size_t size)
-{
-	int i;
-
-	if (!fp)
-		return false;
-
-	for (i = 0; i < size; i++)
-		if (fp[i] != 0xff)
-			return false;
-
-	return true;
-}
-
 static int __parse_flow_nlattrs(const struct nlattr *attr,
 				const struct nlattr *a[],
 				u64 *attrsp, bool nz)
@@ -503,9 +489,8 @@ static int metadata_from_nlattrs(struct sw_flow_match *match,  u64 *attrs,
 	return 0;
 }
 
-static int ovs_key_from_nlattrs(struct sw_flow_match *match,  bool *exact_5tuple,
-				u64 attrs, const struct nlattr **a,
-				bool is_mask)
+static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs,
+				const struct nlattr **a, bool is_mask)
 {
 	int err;
 	u64 orig_attrs = attrs;
@@ -562,11 +547,6 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match,  bool *exact_5tuple
 		SW_FLOW_KEY_PUT(match, eth.type, htons(ETH_P_802_2), is_mask);
 	}
 
-	if (is_mask && exact_5tuple) {
-		if (match->mask->key.eth.type != htons(0xffff))
-			*exact_5tuple = false;
-	}
-
 	if (attrs & (1ULL << OVS_KEY_ATTR_IPV4)) {
 		const struct ovs_key_ipv4 *ipv4_key;
 
@@ -589,13 +569,6 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match,  bool *exact_5tuple
 		SW_FLOW_KEY_PUT(match, ipv4.addr.dst,
 				ipv4_key->ipv4_dst, is_mask);
 		attrs &= ~(1ULL << OVS_KEY_ATTR_IPV4);
-
-		if (is_mask && exact_5tuple && *exact_5tuple) {
-			if (ipv4_key->ipv4_proto != 0xff ||
-			    ipv4_key->ipv4_src != htonl(0xffffffff) ||
-			    ipv4_key->ipv4_dst != htonl(0xffffffff))
-				*exact_5tuple = false;
-		}
 	}
 
 	if (attrs & (1ULL << OVS_KEY_ATTR_IPV6)) {
@@ -627,15 +600,6 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match,  bool *exact_5tuple
 				is_mask);
 
 		attrs &= ~(1ULL << OVS_KEY_ATTR_IPV6);
-
-		if (is_mask && exact_5tuple && *exact_5tuple) {
-			if (ipv6_key->ipv6_proto != 0xff ||
-			    !is_all_set((const u8 *)ipv6_key->ipv6_src,
-					sizeof(match->key->ipv6.addr.src)) ||
-			    !is_all_set((const u8 *)ipv6_key->ipv6_dst,
-					sizeof(match->key->ipv6.addr.dst)))
-				*exact_5tuple = false;
-		}
 	}
 
 	if (attrs & (1ULL << OVS_KEY_ATTR_ARP)) {
@@ -678,11 +642,6 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match,  bool *exact_5tuple
 					tcp_key->tcp_dst, is_mask);
 		}
 		attrs &= ~(1ULL << OVS_KEY_ATTR_TCP);
-
-		if (is_mask && exact_5tuple && *exact_5tuple &&
-		    (tcp_key->tcp_src != htons(0xffff) ||
-		     tcp_key->tcp_dst != htons(0xffff)))
-			*exact_5tuple = false;
 	}
 
 	if (attrs & (1ULL << OVS_KEY_ATTR_TCP_FLAGS)) {
@@ -714,11 +673,6 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match,  bool *exact_5tuple
 					udp_key->udp_dst, is_mask);
 		}
 		attrs &= ~(1ULL << OVS_KEY_ATTR_UDP);
-
-		if (is_mask && exact_5tuple && *exact_5tuple &&
-		    (udp_key->udp_src != htons(0xffff) ||
-		     udp_key->udp_dst != htons(0xffff)))
-			*exact_5tuple = false;
 	}
 
 	if (attrs & (1ULL << OVS_KEY_ATTR_SCTP)) {
@@ -804,7 +758,6 @@ static void sw_flow_mask_set(struct sw_flow_mask *mask,
  * attribute specifies the mask field of the wildcarded flow.
  */
 int ovs_nla_get_match(struct sw_flow_match *match,
-		      bool *exact_5tuple,
 		      const struct nlattr *key,
 		      const struct nlattr *mask)
 {
@@ -852,13 +805,10 @@ int ovs_nla_get_match(struct sw_flow_match *match,
 		}
 	}
 
-	err = ovs_key_from_nlattrs(match, NULL, key_attrs, a, false);
+	err = ovs_key_from_nlattrs(match, key_attrs, a, false);
 	if (err)
 		return err;
 
-	if (exact_5tuple)
-		*exact_5tuple = true;
-
 	if (mask) {
 		err = parse_flow_mask_nlattrs(mask, a, &mask_attrs);
 		if (err)
@@ -896,7 +846,7 @@ int ovs_nla_get_match(struct sw_flow_match *match,
 			}
 		}
 
-		err = ovs_key_from_nlattrs(match, exact_5tuple, mask_attrs, a, true);
+		err = ovs_key_from_nlattrs(match, mask_attrs, a, true);
 		if (err)
 			return err;
 	} else {
diff --git a/datapath/flow_netlink.h b/datapath/flow_netlink.h
index b31fbe2..4401510 100644
--- a/datapath/flow_netlink.h
+++ b/datapath/flow_netlink.h
@@ -45,7 +45,6 @@ int ovs_nla_put_flow(const struct sw_flow_key *,
 int ovs_nla_get_flow_metadata(struct sw_flow *flow,
 			      const struct nlattr *attr);
 int ovs_nla_get_match(struct sw_flow_match *match,
-		      bool *exact_5tuple,
 		      const struct nlattr *,
 		      const struct nlattr *);
 
diff --git a/datapath/flow_table.c b/datapath/flow_table.c
index 4e6b1c0..3f0829c 100644
--- a/datapath/flow_table.c
+++ b/datapath/flow_table.c
@@ -50,6 +50,7 @@
 #define REHASH_INTERVAL		(10 * 60 * HZ)
 
 static struct kmem_cache *flow_cache;
+struct kmem_cache *flow_stats_cache __read_mostly;
 
 static u16 range_n_bytes(const struct sw_flow_key_range *range)
 {
@@ -74,10 +75,10 @@ void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src,
 		*d++ = *s++ & *m++;
 }
 
-struct sw_flow *ovs_flow_alloc(bool percpu_stats)
+struct sw_flow *ovs_flow_alloc(void)
 {
 	struct sw_flow *flow;
-	int cpu;
+	int node;
 
 	flow = kmem_cache_alloc(flow_cache, GFP_KERNEL);
 	if (!flow)
@@ -86,26 +87,19 @@ struct sw_flow *ovs_flow_alloc(bool percpu_stats)
 	flow->sf_acts = NULL;
 	flow->mask = NULL;
 
-	flow->stats.is_percpu = percpu_stats;
+	/* Initialize the default stat node. */
+	flow->stats[0] = kmem_cache_alloc_node(flow_stats_cache,
+					       GFP_KERNEL | __GFP_ZERO, 0);
+	if (!flow->stats[0])
+		goto err;
 
-	if (!percpu_stats) {
-		flow->stats.stat = kzalloc(sizeof(*flow->stats.stat), GFP_KERNEL);
-		if (!flow->stats.stat)
-			goto err;
+	spin_lock_init(&flow->stats[0]->lock);
+	flow->stats[0]->last_writer = -1;
 
-		spin_lock_init(&flow->stats.stat->lock);
-	} else {
-		flow->stats.cpu_stats = alloc_percpu(struct flow_stats);
-		if (!flow->stats.cpu_stats)
-			goto err;
-
-		for_each_possible_cpu(cpu) {
-			struct flow_stats *cpu_stats;
+	for_each_node(node)
+		if (node > 0)
+			flow->stats[node] = NULL;
 
-			cpu_stats = per_cpu_ptr(flow->stats.cpu_stats, cpu);
-			spin_lock_init(&cpu_stats->lock);
-		}
-	}
 	return flow;
 err:
 	kmem_cache_free(flow_cache, flow);
@@ -142,11 +136,12 @@ static struct flex_array *alloc_buckets(unsigned int n_buckets)
 
 static void flow_free(struct sw_flow *flow)
 {
+	int node;
+
 	kfree((struct sf_flow_acts __force *)flow->sf_acts);
-	if (flow->stats.is_percpu)
-		free_percpu(flow->stats.cpu_stats);
-	else
-		kfree(flow->stats.stat);
+	for_each_node(node)
+		if (flow->stats[node])
+			kmem_cache_free(flow_stats_cache, flow->stats[node]);
 	kmem_cache_free(flow_cache, flow);
 }
 
@@ -608,16 +603,28 @@ int ovs_flow_init(void)
 	BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long));
 	BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long));
 
-	flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow), 0,
-					0, NULL);
+	flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow)
+				       + (num_possible_nodes()
+					  * sizeof(struct flow_stats *)),
+				       0, SLAB_HWCACHE_ALIGN, NULL);
 	if (flow_cache == NULL)
 		return -ENOMEM;
 
+	flow_stats_cache
+		= kmem_cache_create("sw_flow_stats", sizeof(struct flow_stats),
+				    0, SLAB_HWCACHE_ALIGN, NULL);
+	if (flow_stats_cache == NULL) {
+		kmem_cache_destroy(flow_cache);
+		flow_cache = NULL;
+		return -ENOMEM;
+	}
+
 	return 0;
 }
 
 /* Uninitializes the flow module. */
 void ovs_flow_exit(void)
 {
+	kmem_cache_destroy(flow_stats_cache);
 	kmem_cache_destroy(flow_cache);
 }
diff --git a/datapath/flow_table.h b/datapath/flow_table.h
index baaeb10..ca8a582 100644
--- a/datapath/flow_table.h
+++ b/datapath/flow_table.h
@@ -52,10 +52,12 @@ struct flow_table {
 	unsigned int count;
 };
 
+extern struct kmem_cache *flow_stats_cache;
+
 int ovs_flow_init(void);
 void ovs_flow_exit(void);
 
-struct sw_flow *ovs_flow_alloc(bool percpu_stats);
+struct sw_flow *ovs_flow_alloc(void);
 void ovs_flow_free(struct sw_flow *, bool deferred);
 
 int ovs_flow_tbl_init(struct flow_table *);
-- 
1.7.10.4



