[ovs-dev] [RFC PATCH 2/4] datapath: NUMA node flow stats.
Jarno Rajahalme
jrajahalme at nicira.com
Thu Jan 9 00:15:46 UTC 2014
Keep kernel flow stats for each NUMA node rather than each (logical)
CPU. This almost removes the kernel-side OVS locking overhead that
otherwise appears at the top of the perf report, and improves TCP_CRR
test scores by roughly 50% (with 4 handler and 2 revalidator threads,
and a TCP port flow forcing all connection setups to userspace). My
limited testing revealed no performance regression for tests where
only two wildcarded kernel flows are used.
Signed-off-by: Jarno Rajahalme <jrajahalme at nicira.com>
---
datapath/flow.c | 71 +++++++++++++++----------------------------------
datapath/flow.h | 4 +--
datapath/flow_table.c | 41 ++++++++++++++++++----------
datapath/flow_table.h | 2 ++
4 files changed, 52 insertions(+), 66 deletions(-)
diff --git a/datapath/flow.c b/datapath/flow.c
index b1b2596..0002759 100644
--- a/datapath/flow.c
+++ b/datapath/flow.c
@@ -67,10 +67,10 @@ void ovs_flow_stats_update(struct sw_flow *flow, struct sk_buff *skb)
struct flow_stats *stats;
__be16 tcp_flags = 0;
- if (!flow->stats.is_percpu)
+ if (!flow->stats.per_numa_mem)
stats = flow->stats.stat;
else
- stats = this_cpu_ptr(flow->stats.cpu_stats);
+ stats = &flow->stats.numa_stats[numa_node_id()];
if ((flow->key.eth.type == htons(ETH_P_IP) ||
flow->key.eth.type == htons(ETH_P_IPV6)) &&
@@ -79,23 +79,20 @@ void ovs_flow_stats_update(struct sw_flow *flow, struct sk_buff *skb)
tcp_flags = TCP_FLAGS_BE16(tcp_hdr(skb));
}
- spin_lock(&stats->lock);
+ spin_lock_bh(&stats->lock);
stats->used = jiffies;
stats->packet_count++;
stats->byte_count += skb->len;
stats->tcp_flags |= tcp_flags;
- spin_unlock(&stats->lock);
+ spin_unlock_bh(&stats->lock);
}
-static void stats_read(struct flow_stats *stats, bool lock_bh,
+static void stats_read(struct flow_stats *stats,
struct ovs_flow_stats *ovs_stats,
unsigned long *used, __be16 *tcp_flags)
{
if (stats->packet_count) {
- if (lock_bh)
- spin_lock_bh(&stats->lock);
- else
- spin_lock(&stats->lock);
+ spin_lock_bh(&stats->lock);
if (time_after(stats->used, *used))
*used = stats->used;
@@ -103,73 +100,47 @@ static void stats_read(struct flow_stats *stats, bool lock_bh,
ovs_stats->n_packets += stats->packet_count;
ovs_stats->n_bytes += stats->byte_count;
- if (lock_bh)
- spin_unlock_bh(&stats->lock);
- else
- spin_unlock(&stats->lock);
+ spin_unlock_bh(&stats->lock);
}
}
void ovs_flow_stats_get(struct sw_flow *flow, struct ovs_flow_stats *ovs_stats,
unsigned long *used, __be16 *tcp_flags)
{
- int cpu, cur_cpu;
-
*used = 0;
*tcp_flags = 0;
memset(ovs_stats, 0, sizeof(*ovs_stats));
- if (!flow->stats.is_percpu) {
- stats_read(flow->stats.stat, true, ovs_stats, used, tcp_flags);
+ if (!flow->stats.per_numa_mem) {
+ stats_read(flow->stats.stat, ovs_stats, used, tcp_flags);
} else {
- cur_cpu = get_cpu();
-
- for_each_possible_cpu(cpu) {
- struct flow_stats *stats;
- bool lock_bh;
-
- stats = per_cpu_ptr(flow->stats.cpu_stats, cpu);
- lock_bh = (cpu == cur_cpu);
- stats_read(stats, lock_bh, ovs_stats, used, tcp_flags);
- }
- put_cpu();
+ int node;
+ for (node = 0; node < ovs_numa_nodes; node++)
+ stats_read(&flow->stats.numa_stats[node],
+ ovs_stats, used, tcp_flags);
}
}
-static void stats_reset(struct flow_stats *stats, bool lock_bh)
+static void stats_reset(struct flow_stats *stats)
{
- if (lock_bh)
- spin_lock_bh(&stats->lock);
- else
- spin_lock(&stats->lock);
+ spin_lock_bh(&stats->lock);
stats->used = 0;
stats->packet_count = 0;
stats->byte_count = 0;
stats->tcp_flags = 0;
- if (lock_bh)
- spin_unlock_bh(&stats->lock);
- else
- spin_unlock(&stats->lock);
+ spin_unlock_bh(&stats->lock);
}
void ovs_flow_stats_clear(struct sw_flow *flow)
{
- int cpu, cur_cpu;
-
- if (!flow->stats.is_percpu) {
- stats_reset(flow->stats.stat, true);
+ if (!flow->stats.per_numa_mem) {
+ stats_reset(flow->stats.stat);
} else {
- cur_cpu = get_cpu();
-
- for_each_possible_cpu(cpu) {
- bool lock_bh;
-
- lock_bh = (cpu == cur_cpu);
- stats_reset(per_cpu_ptr(flow->stats.cpu_stats, cpu), lock_bh);
- }
- put_cpu();
+ int node;
+ for (node = 0; node < ovs_numa_nodes; node++)
+ stats_reset(&flow->stats.numa_stats[node]);
}
}
diff --git a/datapath/flow.h b/datapath/flow.h
index eafcfd8..14ea93f 100644
--- a/datapath/flow.h
+++ b/datapath/flow.h
@@ -158,10 +158,10 @@ struct flow_stats {
};
struct sw_flow_stats {
- bool is_percpu;
+ char * per_numa_mem; /* NULL for shared stats on 'stat'. */
union {
struct flow_stats *stat;
- struct flow_stats __percpu *cpu_stats;
+ struct flow_stats *numa_stats;
};
};
diff --git a/datapath/flow_table.c b/datapath/flow_table.c
index 4232b82..ec8e5a4 100644
--- a/datapath/flow_table.c
+++ b/datapath/flow_table.c
@@ -72,10 +72,9 @@ void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src,
*d++ = *s++ & *m++;
}
-struct sw_flow *ovs_flow_alloc(bool percpu_stats)
+struct sw_flow *ovs_flow_alloc(bool pernumanode_stats)
{
struct sw_flow *flow;
- int cpu;
flow = kmem_cache_alloc(flow_cache, GFP_KERNEL);
if (!flow)
@@ -84,25 +83,28 @@ struct sw_flow *ovs_flow_alloc(bool percpu_stats)
flow->sf_acts = NULL;
flow->mask = NULL;
- flow->stats.is_percpu = percpu_stats;
+ if (!pernumanode_stats || !ovs_numa_nodes) {
+ flow->stats.per_numa_mem = NULL;
- if (!percpu_stats) {
flow->stats.stat = kzalloc(sizeof(*flow->stats.stat), GFP_KERNEL);
if (!flow->stats.stat)
goto err;
spin_lock_init(&flow->stats.stat->lock);
} else {
- flow->stats.cpu_stats = alloc_percpu(struct flow_stats);
- if (!flow->stats.cpu_stats)
- goto err;
+ int node;
- for_each_possible_cpu(cpu) {
- struct flow_stats *cpu_stats;
+ flow->stats.per_numa_mem
+ = kzalloc(sizeof(struct flow_stats) * ovs_numa_nodes
+ + (SMP_CACHE_BYTES - 1), GFP_KERNEL);
- cpu_stats = per_cpu_ptr(flow->stats.cpu_stats, cpu);
- spin_lock_init(&cpu_stats->lock);
- }
+ if (!flow->stats.per_numa_mem)
+ goto err;
+
+ flow->stats.numa_stats
+ = (struct flow_stats *)L1_CACHE_ALIGN((u64)flow->stats.per_numa_mem);
+ for (node = 0; node < ovs_numa_nodes; node++)
+ spin_lock_init(&flow->stats.numa_stats[node].lock);
}
return flow;
err:
@@ -141,8 +143,8 @@ static struct flex_array *alloc_buckets(unsigned int n_buckets)
static void flow_free(struct sw_flow *flow)
{
kfree((struct sf_flow_acts __force *)flow->sf_acts);
- if (flow->stats.is_percpu)
- free_percpu(flow->stats.cpu_stats);
+ if (flow->stats.per_numa_mem)
+ kfree(flow->stats.per_numa_mem);
else
kfree(flow->stats.stat);
kmem_cache_free(flow_cache, flow);
@@ -601,10 +603,14 @@ int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,
return 0;
}
+int ovs_numa_nodes = 0;
+
/* Initializes the flow module.
* Returns zero if successful or a negative error code. */
int ovs_flow_init(void)
{
+ int node;
+
BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long));
BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long));
@@ -613,6 +619,13 @@ int ovs_flow_init(void)
if (flow_cache == NULL)
return -ENOMEM;
+ /* Count the NUMA nodes. */
+ for_each_node_with_cpus(node) {
+ if (node >= ovs_numa_nodes)
+ ovs_numa_nodes = node + 1;
+ }
+ pr_info("ovs_flow_init: ovs_numa_nodes: %d.\n", ovs_numa_nodes);
+
return 0;
}
diff --git a/datapath/flow_table.h b/datapath/flow_table.h
index 1996e34..3ff1077 100644
--- a/datapath/flow_table.h
+++ b/datapath/flow_table.h
@@ -52,6 +52,8 @@ struct flow_table {
unsigned int count;
};
+extern int ovs_numa_nodes;
+
int ovs_flow_init(void);
void ovs_flow_exit(void);
--
1.7.10.4
More information about the dev
mailing list