[ovs-dev] [PATCH v5 2/2] datapath: Per NUMA node flow stats.

Jarno Rajahalme jrajahalme at nicira.com
Mon Feb 10 19:31:26 UTC 2014


On Feb 8, 2014, at 7:01 PM, Jesse Gross <jesse at nicira.com> wrote:

> On Thu, Feb 6, 2014 at 3:13 PM, Jarno Rajahalme <jrajahalme at nicira.com> wrote:
>> diff --git a/datapath/flow.c b/datapath/flow.c
>> index abe6789..e86034e 100644
>> --- a/datapath/flow.c
>> +++ b/datapath/flow.c
>> @@ -80,96 +76,126 @@ void ovs_flow_stats_update(struct sw_flow *flow, struct sk_buff *skb)
>>                tcp_flags = TCP_FLAGS_BE16(tcp_hdr(skb));
>>        }
>> 
>> -       spin_lock(&stats->lock);
>> +       /* Check if already have node-specific stats. */
>> +       if (likely(stats))
>> +               spin_lock(&stats->lock);
>> +       else {
>> +               stats = flow->stats[0]; /* Pre-allocated. */
>> +               spin_lock(&stats->lock);
>> +
>> +               /* If the current NUMA-node is the only writer on the
>> +                * pre-allocated stats keep using them.
>> +                * A previous locker may have already allocated the stats,
>> +                * so we need to check again.  If the node-specific stats
>> +                * were already allocated, we update the pre-allocated stats
>> +                * as we have already locked them. */
>> +               if (likely(stats->last_writer != node && stats->last_writer >= 0
>> +                          && !flow->stats[node])) {
> 
> I don't know if this likely() annotation is right - it seems that it
> would be more common that we are using node 0's stats than allocating.

I changed this to:

		if (unlikely(stats->last_writer != node)
		    && likely(stats->last_writer >= 0)
		    && likely(!flow->stats[node])) {

> 
>> static int check_header(struct sk_buff *skb, int len)
>> diff --git a/datapath/flow.h b/datapath/flow.h
>> index eafcfd8..f6cce35 100644
>> --- a/datapath/flow.h
>> +++ b/datapath/flow.h
>> @@ -155,14 +155,9 @@ struct flow_stats {
>>        unsigned long used;             /* Last used time (in jiffies). */
>>        spinlock_t lock;                /* Lock for atomic stats update. */
>>        __be16 tcp_flags;               /* Union of seen TCP flags. */
>> -};
>> -
>> -struct sw_flow_stats {
>> -       bool is_percpu;
>> -       union {
>> -               struct flow_stats *stat;
>> -               struct flow_stats __percpu *cpu_stats;
>> -       };
>> +       int last_writer;                /* NUMA-node id of the last writer or
>> +                                        * -1. Meaningful for 'stats[0]' only.
>> +                                        */
> 
> Should we put last_writer directly in struct sw_flow stats? It would
> reduce the size a little bit and might even be a little clearer.
> 

This patch actually removes the struct sw_flow_stats, and the ‘last_writer’ is part of the struct flow_stats. The alternative of putting it in struct sw_flow would use more space (flow_stats are allocated cache line aligned, so there is some slack), and would make the struct sw_flow a bit more volatile than it needs to be. To be clear, the definitions now look like this:

struct flow_stats {
	u64 packet_count;		/* Number of packets matched. */
	u64 byte_count;			/* Number of bytes matched. */
	unsigned long used;		/* Last used time (in jiffies). */
	spinlock_t lock;		/* Lock for atomic stats update. */
	__be16 tcp_flags;		/* Union of seen TCP flags. */
	int last_writer;		/* NUMA-node id of the last writer or
					 * -1. Meaningful for 'stats[0]' only.
					 */
};

struct sw_flow {
	struct rcu_head rcu;
	struct hlist_node hash_node[2];
	u32 hash;

	struct sw_flow_key key;
	struct sw_flow_key unmasked_key;
	struct sw_flow_mask *mask;
	struct sw_flow_actions __rcu *sf_acts;
	struct flow_stats *stats[];	/* One for each NUMA node.  First one
					 * is allocated at flow creation time,
					 * the rest are allocated on demand
					 * while holding the 'stats[0].lock'.
					 */
};


>> diff --git a/datapath/flow_table.c b/datapath/flow_table.c
>> index 4e6b1c0..3f0829c 100644
>> --- a/datapath/flow_table.c
>> +++ b/datapath/flow_table.c
>> @@ -608,16 +603,28 @@ int ovs_flow_init(void)
>>        BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long));
>>        BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long));
>> 
>> -       flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow), 0,
>> -                                       0, NULL);
>> +       flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow)
>> +                                      + (num_possible_nodes()
>> +                                         * sizeof(struct flow_stats *)),
>> +                                      0, SLAB_HWCACHE_ALIGN, NULL);
> 
> Why do we need to cache line align the flows? We shouldn't be writing
> to them very often.

I’ll revert this.

> 
> One other thing, it might be good to do a revert of the 5-tuple patch
> first and then layer this on top as it would probably make it clearer
> what is going on.

OK, v6 coming up!

  Jarno




More information about the dev mailing list