[ovs-dev] [PATCH v3] datapath: NUMA node flow stats.

Pravin Shelar pshelar at nicira.com
Tue Feb 4 17:32:29 UTC 2014


On Tue, Feb 4, 2014 at 9:03 AM, Jarno Rajahalme <jrajahalme at nicira.com> wrote:
> Pravin,
>
> Thanks for your thorough review. Some comments below,
>
>   Jarno
>
> On Feb 3, 2014, at 9:14 AM, Pravin Shelar <pshelar at nicira.com> wrote:
>
> On Thu, Jan 23, 2014 at 4:43 PM, Jarno Rajahalme <jrajahalme at nicira.com>
> wrote:
>
>
> ...
>
> +static void get_skb_stats(struct flow_stats *stat, const struct sw_flow *flow,
> +                         const struct sk_buff *skb)
> +{
> +       int node = numa_node_id();
> +
> +       /* Prepare for writing later. */
> +       if (unlikely(!flow->stats[node]))
> +               node = 0;
> +       spin_lock_prefetch(&flow->stats[node]->lock);
> +
>
> The lock prefetch looks pretty aggressive, especially when the path between
> the prefetch and the actual use is not pre-defined; it depends on the actions
> list. Executing the actions has a much larger memory footprint when it comes
> to outputting to a tunnel port.
>
>
> The risk being that prefetch is too early in that case? Do you have a setup
> where you could test that case?
>
Yes, it seems a bit early. I will give it a try on my setup.

>
> +       stat->used = jiffies;
> +       stat->packet_count = 1;
> +       stat->byte_count = skb->len;
> +       stat->tcp_flags = 0;
> +
> +       if ((flow->key.eth.type == htons(ETH_P_IP) ||
> +            flow->key.eth.type == htons(ETH_P_IPV6)) &&
> +           flow->key.ip.proto == IPPROTO_TCP &&
> +           likely(skb->len >= skb_transport_offset(skb) + sizeof(struct tcphdr))) {
> +               stat->tcp_flags = TCP_FLAGS_BE16(tcp_hdr(skb));
> +       }
> +}
> +
> /* Must be called with rcu_read_lock. */
> void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb)
> {
> @@ -221,6 +244,7 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb)
>        struct sw_flow *flow;
>        struct dp_stats_percpu *stats;
>        struct sw_flow_key key;
> +       struct flow_stats stat;
>        u64 *stats_counter;
>        u32 n_mask_hit;
>        int error;
> @@ -252,8 +276,9 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb)
>        OVS_CB(skb)->flow = flow;
>        OVS_CB(skb)->pkt_key = &key;
>
> -       ovs_flow_stats_update(OVS_CB(skb)->flow, skb);
> +       get_skb_stats(&stat, flow, skb);
>        ovs_execute_actions(dp, skb);
> +       ovs_flow_stats_update(flow, &stat);
>        stats_counter = &stats->n_hit;
>
> I am not sure why you would divide the stats update into two steps. We
> could have just done the prefetch.
>
>
> Applying the actions can change the skb, and we want to record the size
> before any modifications.
>
ok.
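
For reference, the deferred update side could then look roughly like this
(illustrative sketch only, not the exact patch code; the hypothetical
flow_stats_apply() stands in for ovs_flow_stats_update(), and the node-0
fallback mirrors get_skb_stats() above):

static void flow_stats_apply(struct sw_flow *flow, const struct flow_stats *stat)
{
        int node = numa_node_id();
        struct flow_stats *stats = flow->stats[node];

        /* Fall back to the node-0 stats if this node has none allocated. */
        if (unlikely(!stats))
                stats = flow->stats[0];

        spin_lock(&stats->lock);
        stats->used = stat->used;
        stats->packet_count += stat->packet_count;
        /* byte_count was captured before the actions could modify the skb. */
        stats->byte_count += stat->byte_count;
        stats->tcp_flags |= stat->tcp_flags;
        spin_unlock(&stats->lock);
}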

> ...
>
>
> +static inline bool alloc_stats(struct sw_flow *flow,
> +                              const struct flow_stats *stat, int node)
> +{
> +       struct flow_stats *stats;
> +       /* Alloc stats for the 'node', but with minimal latency. */
> +       stats = kmem_cache_alloc_node(flow_stats_cache, GFP_THISNODE |
> +                                     __GFP_NOTRACK | __GFP_NOMEMALLOC, node);
>
>
> __GFP_NOTRACK is hardly used outside of the memory allocator. Why would
> we need it in OVS? It only affects memory allocation if debugging
> (CONFIG_KMEMCHECK) is on, and CONFIG_KMEMCHECK is pretty heavyweight
> tracing, so it is not usually enabled.
>
>
> OK, thanks, will take __GFP_NOTRACK off then.
>
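Right, so the allocation would then be just something like:

        stats = kmem_cache_alloc_node(flow_stats_cache,
                                      GFP_THISNODE | __GFP_NOMEMALLOC, node);
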
> ...
>
> -                       stats = per_cpu_ptr(flow->stats.cpu_stats, cpu);
> -                       lock_bh = (cpu == cur_cpu);
> -                       stats_read(stats, lock_bh, ovs_stats, used, tcp_flags);
> +       /* Collect per NUMA-node stats from other nodes. */
> +       for (node = 0; node < ovs_numa_nodes; node++) {
> +               if (node == current_node)
> +                       continue; /* Done already. */
> +               stats = flow->stats[node];
> +               if (stats && likely(stats->packet_count)) {
> +                       spin_lock_bh(&stats->lock);
> +                       if (time_after(stats->used, *used))
> +                               *used = stats->used;
> +                       *tcp_flags |= stats->tcp_flags;
> +                       ovs_stats->n_packets += stats->packet_count;
> +                       ovs_stats->n_bytes += stats->byte_count;
> +                       spin_unlock_bh(&stats->lock);
>                }
> -               put_cpu();
>
>
> We can use spin_lock_bh() selectively, only for locking the stats on the
> local node, to improve interrupt latency.
>
>
> I.e., use spin_lock() for nodes other than the local node? OK.
>
As done currently, you need to disable preemption using get_cpu()/put_cpu().
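
Roughly what I mean is the following (untested sketch): take the local node's
stats with spin_lock_bh(), since softirqs on this CPU may write them, and take
the remote nodes' stats with a plain spin_lock(); numa_node_id() has to be
read with preemption disabled, e.g. between get_cpu()/put_cpu(). The node-0
fallback in the update path would still need some care with this scheme.

        int node, cur_node;
        struct flow_stats *stats;

        cur_node = numa_node_id();      /* preemption disabled via get_cpu() */

        /* Local node first, with BHs disabled. */
        stats = flow->stats[cur_node];
        if (stats) {
                spin_lock_bh(&stats->lock);
                /* ... read the local counters ... */
                spin_unlock_bh(&stats->lock);
        }

        /* Remote nodes: their softirq writers run on other CPUs. */
        for_each_node(node) {
                if (node == cur_node || !flow->stats[node])
                        continue;
                spin_lock(&flow->stats[node]->lock);
                /* ... accumulate the remote counters ... */
                spin_unlock(&flow->stats[node]->lock);
        }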

> ...
>
> static struct kmem_cache *flow_cache;
> +struct kmem_cache *flow_stats_cache;
> +
>
> Can you mark it __read_mostly?
>
> +int ovs_numa_nodes = 0;
>
>
> I think I'll get rid of this. Or did you mean I should mark the
> flow_stats_cache __read_mostly?
>
ok.
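
Yes, i.e.:

        struct kmem_cache *flow_stats_cache __read_mostly;

(flow_cache could get the same treatment.)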

> ...
>
> -               spin_lock_init(&flow->stats.stat->lock);
> -       } else {
> -               flow->stats.cpu_stats = alloc_percpu(struct flow_stats);
> -               if (!flow->stats.cpu_stats)
> -                       goto err;
> +       spin_lock_init(&flow->stats[0]->lock);
> +       flow->stats[0]->last_writer = -1;
>
> -               for_each_possible_cpu(cpu) {
> -                       struct flow_stats *cpu_stats;
> +       for (node = 1; node < ovs_numa_nodes; node++)
> +               flow->stats[node] = NULL;
>
> -                       cpu_stats = per_cpu_ptr(flow->stats.cpu_stats, cpu);
> -                       spin_lock_init(&cpu_stats->lock);
> -               }
> -       }
>
> Can you use for_each_node() here? It works better when NUMA nodes are
> sparse.
>
>
> OK.
>
>
>        return flow;
> err:
>        kmem_cache_free(flow_cache, flow);
> @@ -142,11 +137,12 @@ static struct flex_array *alloc_buckets(unsigned int n_buckets)
>
> static void flow_free(struct sw_flow *flow)
> {
> +       int node;
> +
>        kfree((struct sf_flow_acts __force *)flow->sf_acts);
> -       if (flow->stats.is_percpu)
> -               free_percpu(flow->stats.cpu_stats);
> -       else
> -               kfree(flow->stats.stat);
> +       for (node = 0; node < ovs_numa_nodes; node++)
> +               if (flow->stats[node])
> +                       kmem_cache_free(flow_stats_cache, flow->stats[node]);
>
> same as above.
>
>
> OK.
>
>        kmem_cache_free(flow_cache, flow);
> }
>
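To be concrete, with for_each_node() the free loop would be roughly:

        int node;

        for_each_node(node)
                if (flow->stats[node])
                        kmem_cache_free(flow_stats_cache, flow->stats[node]);

for_each_node() walks the possible-node mask, so sparse node numbering is
handled as well.
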
> @@ -603,19 +599,41 @@ int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,
>  * Returns zero if successful or a negative error code. */
> int ovs_flow_init(void)
> {
> +       int node;
> +
>        BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long));
>        BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long));
>
> -       flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow), 0,
> -                                       0, NULL);
> +       /* Count the NUMA nodes. */
> +       for_each_node_with_cpus(node) {
> +               if (node >= ovs_numa_nodes)
> +                       ovs_numa_nodes = node + 1;
> +       }
> +       pr_info("ovs_flow_init: ovs_numa_nodes: %d.\n", ovs_numa_nodes);
> +
>
>
> This does not handle CPU hotplug. A CPU/node can come online after the OVS
> kernel module is loaded, which can cause a segmentation fault on stats access.
> We can directly use num_possible_nodes().
>
>
> OK.
>
>
> +       flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow)
> +                                      + (ovs_numa_nodes
> +                                         * sizeof(struct flow_stats *)),
> +                                      __alignof__(struct sw_flow),
> +                                      SLAB_HWCACHE_ALIGN, NULL);
>
>
> Without any explicit alignment, struct sw_flow gets the platform alignment
> from the compiler, which is 8 bytes on x86-64. When the SLAB_HWCACHE_ALIGN
> flag is specified, the alignment arg does not help kmem_cache_create(), since
> SLAB_HWCACHE_ALIGN means L1 cache alignment, which is larger than the default
> alignment.
>
>
> So I should use a 0 align arg if I know that the compiler alignment for the
> struct is less than the L1 cache alignment?
>
That should be fine.
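
For the flow cache that would be roughly (untested; using num_possible_nodes()
also addresses the hotplug concern above, and the 0 align arg lets
SLAB_HWCACHE_ALIGN provide the L1 cache alignment):

        flow_cache = kmem_cache_create("sw_flow",
                                       sizeof(struct sw_flow)
                                       + num_possible_nodes()
                                         * sizeof(struct flow_stats *),
                                       0, SLAB_HWCACHE_ALIGN, NULL);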

>
>        if (flow_cache == NULL)
>                return -ENOMEM;
>
> +       flow_stats_cache
> +               = kmem_cache_create("sw_flow_stats", sizeof(struct flow_stats),
> +                                   L1_CACHE_BYTES, SLAB_HWCACHE_ALIGN, NULL);
>
> Same as above; I am not sure how specifying both the alignment and the
> alignment flag helps.
>
> If you want to align struct flow_stats to the L1 cache, then maybe we can
> get rid of the explicit alignment done for it at its definition.
> In general, we should get rid of aligning the structure and instead create
> the cache with the correct alignment here.
>
>
> OK.
>
>   Jarno
>
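
For the stats cache, dropping the alignment attribute from the struct
flow_stats definition and creating the cache with the alignment would then
look roughly like (passing L1_CACHE_BYTES as the align arg with no flag would
be equivalent):

        flow_stats_cache = kmem_cache_create("sw_flow_stats",
                                             sizeof(struct flow_stats),
                                             0, SLAB_HWCACHE_ALIGN, NULL);
        if (!flow_stats_cache) {
                kmem_cache_destroy(flow_cache);
                return -ENOMEM;
        }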


