[ovs-dev] [PATCH] Improve kernel hash table

Jesse Gross jesse at nicira.com
Sat Sep 3 00:35:36 UTC 2011


On Tue, Aug 23, 2011 at 3:34 PM, Pravin Shelar <pshelar at nicira.com> wrote:
>     Currently OVS uses its own hashing implementation for hash tables,
> which has some problems, e.g. an error case in the deletion code. The
> following patch replaces it with an hlist-based hash table that is
> consistent with other kernel hash tables. As Jesse suggested, a
> flex-array is used for allocating hash buckets, so that we can have a
> large hash table without large contiguous kernel memory.
>     struct sw_flow is rearranged according to the frequency of writes
> to each struct member.

Please put the struct rearrangement in a separate patch.

This is missing a signed-off-by line.

This doesn't compile on older kernels:
/home/jesse/openvswitch/datapath/linux/flow.c: In function ‘flow_destroy_table’:
/home/jesse/openvswitch/datapath/linux/flow.c:288: error: implicit
declaration of function ‘hlist_del_init_rcu’
make[5]: *** [/home/jesse/openvswitch/datapath/linux/flow.o] Error 1
make[5]: *** Waiting for unfinished jobs....
/home/jesse/openvswitch/datapath/linux/tunnel.c: In function ‘move_port’:
/home/jesse/openvswitch/datapath/linux/tunnel.c:268: error: implicit
declaration of function ‘hlist_del_init_rcu’
make[5]: *** [/home/jesse/openvswitch/datapath/linux/tunnel.o] Error 1

It also causes a sparse warning:
/home/jesse/openvswitch/datapath/linux/tunnel.c:1583:15: warning:
non-ANSI function declaration of function 'tnl_exit'
/home/jesse/openvswitch/datapath/linux/tunnel.c:289:18: warning:
symbol 'port_table_lookup' was not declared. Should it be static?

> diff --git a/datapath/datapath.c b/datapath/datapath.c
> index 0b6e2e5..ba0b05c 100644
> --- a/datapath/datapath.c
> +++ b/datapath/datapath.c
> @@ -693,7 +673,6 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
>        err = flow_extract(packet, -1, &flow->key, &key_len, &is_frag);
>        if (err)
>                goto err_flow_put;
> -       flow->tbl_node.hash = flow_hash(&flow->key, key_len);

You should still actually initialize the hash in the flow.  We want
packets that are executed from userspace to look the same as those
that are processed entirely in the kernel.
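
Something like this (sketch only; assuming the helper keeps the
flow_hash() name and the hash now lives directly in struct sw_flow):

        err = flow_extract(packet, -1, &flow->key, &key_len, &is_frag);
        if (err)
                goto err_flow_put;
        /* Keep userspace-executed packets consistent with the kernel path. */
        flow->hash = flow_hash(&flow->key, key_len);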

> diff --git a/datapath/flow.c b/datapath/flow.c
> index 9d7fbc9..7e83d53 100644
> --- a/datapath/flow.c
> +++ b/datapath/flow.c
> +struct sw_flow *flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *last)
> +{
> +       struct sw_flow *flow = NULL;
> +       struct hlist_head *head;
> +       struct hlist_node *n;
> +       int i;
> +
> +       while (*bucket < table->n_buckets) {
> +               i = 0;
> +               head = flex_array_get(table->buckets, *bucket);
> +               hlist_for_each_entry_rcu(flow, n, head, hash_node) {
> +                       if (i < *last) {
> +                               i++;
> +                               continue;
> +                       }
> +                       *last = i + 1;
> +                       return flow;
> +               }
> +               (*bucket)++;
> +               *last = 0;
> +               flow = NULL;
> +       }
> +
> +       return flow;

Can't we just return NULL down here instead of zeroing out flow above?
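For example, the same loop could simply fall through (and then flow
doesn't need the NULL initialization either):

        while (*bucket < table->n_buckets) {
                i = 0;
                head = flex_array_get(table->buckets, *bucket);
                hlist_for_each_entry_rcu(flow, n, head, hash_node) {
                        if (i < *last) {
                                i++;
                                continue;
                        }
                        *last = i + 1;
                        return flow;
                }
                (*bucket)++;
                *last = 0;
        }

        return NULL;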

> +struct flow_table *flow_hash_expand(struct flow_table *table)
> +{
> +       struct flow_table *new_table;
> +       struct sw_flow *flow;
> +       struct hlist_head *head;
> +       struct hlist_node *n, *pos;
> +       int n_buckets = table->n_buckets << 2;

Left shift by 2 multiplies by 4, which I don't think is what you want.
 I would just use a normal multiply.
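i.e. (assuming the intent is to double the table):

        int n_buckets = table->n_buckets * 2;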

> +void flow_insert(struct flow_table *table, struct sw_flow *flow)
> +{
> +       struct hlist_head *head;
> +       struct hlist_node *n;
> +       struct sw_flow *cflow;
> +
> +       head = find_bucket(table, flow->hash);
> +
> +       hlist_for_each_entry_rcu(cflow, n, head, hash_node) {
> +               if (!memcmp(&cflow->key, &flow->key, sizeof(cflow->key))) {
> +                       BUG();
> +                       return;
> +               }
> +       }

This check seems a little heavyweight for something that should never
happen.  At the very least, we should compare the hash before doing the
full key comparison, as we do in flow lookup, but I'm not sure that it's
really worth doing at all.
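
If the check stays, something like this would at least avoid the
memcmp() on every entry:

        hlist_for_each_entry_rcu(cflow, n, head, hash_node) {
                if (cflow->hash == flow->hash &&
                    !memcmp(&cflow->key, &flow->key, sizeof(cflow->key))) {
                        BUG();
                        return;
                }
        }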

> diff --git a/datapath/flow.h b/datapath/flow.h
> index 997692c..b304c60 100644
> --- a/datapath/flow.h
> +++ b/datapath/flow.h
> +#define TBL_MIN_BUCKETS                1024
> +#define TBL_MAX_BUCKETS (TBL_MIN_BUCKETS * TBL_MIN_BUCKETS)

There's no real need for a max size - it's really governed by the flex
array implementation and on 64-bit machines the actual limit is 256k
buckets.
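
Rough arithmetic for the 64-bit case (back-of-the-envelope, assuming
4 KB pages and an 8-byte struct hlist_head):

        part pointers in the base page  ~= (4096 - header) / 8  ~= 511
        buckets per part                 =  4096 / 8             =  512
        max buckets                     ~=  511 * 512            ~= 256k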

> +static inline int flow_tbl_count(struct flow_table *table)
>  {
> -       return container_of(node, struct sw_flow, tbl_node);
> +       return table->count;
>  }
>
> +static inline int flow_tbl_need_to_expand(struct flow_table *table)
> +{
> +       return (table->count > table->n_buckets);
> +}
[...]
> +int flow_tbl_need_to_expand(struct flow_table *table);
> +struct sw_flow *flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *idx);

No need to both define and prototype these functions.

Also, can we pick a consistent prefix for the functions that deal with
the flow table and keep them separate from the ones that deal with a
single flow?  Currently we have flow_tbl_*, flow_*, flow_hash_*, TBL,
and flow_*_table.  Personally I would vote for flow_tbl_*.

> diff --git a/datapath/tunnel.c b/datapath/tunnel.c
> index 67556d7..8121b02 100644
> --- a/datapath/tunnel.c
> +++ b/datapath/tunnel.c
> -static int add_port(struct vport *vport)
> +static int alloc_port_table(void)
>  {
> -       struct tbl *cur_table = rtnl_dereference(port_table);
> -       struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
> -       int err;
> +       unsigned int i;
> +       static struct hlist_head *head;
>
> -       if (!port_table) {
> -               struct tbl *new_table;
> +       head = kmalloc(PORT_TABLE_SIZE * sizeof(struct hlist_head *),
> +                       GFP_KERNEL);
> +       if (!head)
> +               return -ENOMEM;
>
> -               new_table = tbl_create(TBL_MIN_BUCKETS);
> -               if (!new_table)
> -                       return -ENOMEM;
> +       for (i = 0; i < PORT_TABLE_SIZE; i++)
> +               INIT_HLIST_HEAD(&head[i]);
>
> -               rcu_assign_pointer(port_table, new_table);
> -               schedule_cache_cleaner();
> +       smp_wmb();
> +       port_table = head;

There's no corresponding read memory barrier to go with this.  You
could insert a smp_read_barrier_depends() but I think using the RCU
primitives is a little clearer.
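
E.g. (sketch) publish the table with rcu_assign_pointer() and have
readers use rcu_dereference() / rtnl_dereference():

        for (i = 0; i < PORT_TABLE_SIZE; i++)
                INIT_HLIST_HEAD(&head[i]);

        rcu_assign_pointer(port_table, head);   /* includes the write barrier */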

> +static int add_port(struct vport *vport)
> +{
> +       struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
> +       u32 hash = mutable_hash(rtnl_dereference(tnl_vport->mutable));
> +       int err;
>
> -               new_table = tbl_expand(cur_table);
> -               if (IS_ERR(new_table)) {
> -                       if (PTR_ERR(new_table) != -ENOSPC)
> -                               return PTR_ERR(new_table);
> -               } else {
> -                       rcu_assign_pointer(port_table, new_table);
> -                       tbl_deferred_destroy(cur_table, NULL);
> -               }
> +       if (unlikely(!port_table)) {
> +               err = alloc_port_table();
> +               if (err)
> +                       return err;
>        }
> +       if (port_table_count == 0)
> +               schedule_cache_cleaner();

Can't we just move this into the !port_table condition above?
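i.e.:

        if (unlikely(!port_table)) {
                err = alloc_port_table();
                if (err)
                        return err;
                schedule_cache_cleaner();
        }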

> +struct tnl_vport *port_table_lookup(struct port_lookup_key *lookup)
> +{
> +       struct hlist_node *n;
> +       struct hlist_head *bucket;
> +       u32 hash = port_hash(lookup);
> +       struct tnl_vport * tnl_vport;
> +
> +       bucket = find_bucket(hash);
> +
> +       hlist_for_each_entry_rcu(tnl_vport, n, bucket, hash_node) {
> +               if (!port_cmp(tnl_vport, lookup)) {
> +                       return tnl_vport;
> +               }

Don't need the braces here.

>  static void cache_cleaner(struct work_struct *work)
>  {
> +       struct hlist_node *n;
> +       struct hlist_head *bucket;
> +       struct tnl_vport * tnl_vport;
> +       int i;
> +
>        schedule_cache_cleaner();
>
>        rcu_read_lock();
> -       tbl_foreach(rcu_dereference(port_table), cache_cleaner_cb, NULL);
> +       for (i = 0; i < PORT_TABLE_SIZE; i++) {
> +               bucket = &port_table[i];
> +               hlist_for_each_entry_rcu(tnl_vport, n, bucket, hash_node) {
> +                       __cache_cleaner(tnl_vport);
> +               }

Extra braces here as well.

> +void tnl_exit()
> +{
> +       struct hlist_node *n, *pos;
> +       struct hlist_head *hash_head;
> +       struct tnl_vport * tnl_vport;
> +       int i;
> +
> +       if (!port_table)
> +               return;
> +
> +       cancel_delayed_work_sync(&cache_cleaner_wq);
> +       for (i = 0; i < PORT_TABLE_SIZE; i++) {
> +               hash_head = &port_table[i];
> +               hlist_for_each_entry_safe(tnl_vport, pos, n, hash_head,
> +                                         hash_node) {
> +                       BUG();
> +                       goto out;
> +               }
> +       }
> +out:
> +       kfree(port_table);
> +}

The only part of this that is actually necessary is freeing of
port_table, right?  Wouldn't it be easier to just do that when we
delete the last port and then we can drop this entire function?  It's
somewhat weirdly asymmetric now.
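
Roughly, in the delete path (hypothetical del_port() tail; names beyond
those already in the patch are illustrative):

        hlist_del_init_rcu(&tnl_vport->hash_node);

        if (--port_table_count == 0) {
                cancel_delayed_work_sync(&cache_cleaner_wq);
                kfree(port_table);      /* probably after an RCU grace period */
                port_table = NULL;
        }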

Did you see my comments before about tnl_destroy() no longer needing
to call tnl_find_port()?  Also, were you going to look into the return
codes for close/destroy?


