[ovs-dev] [PATCH] Improve kernel hash table

Pravin Shelar pshelar at nicira.com
Wed Aug 24 07:12:09 UTC 2011


Jesse,

I forgot to include `git rm table.c table.h` in this patch. I will do
it when I push the final patch.

Thanks,
Pravin.


On Tue, Aug 23, 2011 at 3:34 PM, Pravin Shelar <pshelar at nicira.com> wrote:
>     Currently OVS uses its own hash table implementation, which has some
> problems, e.g. an error case in the deletion code. The following patch
> replaces it with an hlist-based hash table that is consistent with other
> kernel hash tables. As Jesse suggested, flex-array is used for allocating
> the hash buckets, so that we can have a large hash table without a large
> contiguous chunk of kernel memory.
>     struct sw_flow is rearranged according to the frequency of writes to
> each struct member.
> ---
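>
> For reference, the lookup path now reduces to a flex-array bucket fetch
> plus an RCU hlist walk.  The sketch below is only an illustration that
> restates what the patch implements; it assumes the struct flow_table,
> struct sw_flow, and flow_hash() definitions from flow.h below, and the
> four-argument hlist_for_each_entry_rcu() form that pre-3.9 kernels
> provide:
>
>     #include <linux/rculist.h>
>     #include "flow.h"  /* struct flow_table, struct sw_flow, flow_hash() */
>
>     /* n_buckets is a power of two, so masking the hash selects a bucket.
>      * Buckets live in a flex_array, so a large table does not require a
>      * large contiguous allocation. */
>     static struct hlist_head *find_bucket(struct flow_table *table,
>                                           u32 hash)
>     {
>             return flex_array_get(table->buckets,
>                                   hash & (table->n_buckets - 1));
>     }
>
>     /* RCU readers walk a single bucket; comparing the full key resolves
>      * hash collisions. */
>     static struct sw_flow *lookup(struct flow_table *table,
>                                   const struct sw_flow_key *key,
>                                   int key_len)
>     {
>             u32 hash = flow_hash(key, key_len);
>             struct hlist_head *head = find_bucket(table, hash);
>             struct hlist_node *n;
>             struct sw_flow *flow;
>
>             hlist_for_each_entry_rcu(flow, n, head, hash_node)
>                     if (flow->hash == hash &&
>                         !memcmp(&flow->key, key, key_len))
>                             return flow;
>             return NULL;
>     }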
>  datapath/Modules.mk |    2 -
>  datapath/datapath.c |  111 +++++++++----------------
>  datapath/datapath.h |    2 +-
>  datapath/flow.c     |  190 ++++++++++++++++++++++++++++++++++++++++--
>  datapath/flow.h     |   59 +++++++++----
>  datapath/tunnel.c   |  230 ++++++++++++++++++++++++++-------------------------
>  datapath/tunnel.h   |    4 +-
>  7 files changed, 383 insertions(+), 215 deletions(-)
>
> diff --git a/datapath/Modules.mk b/datapath/Modules.mk
> index 587569f..8eb4e91 100644
> --- a/datapath/Modules.mk
> +++ b/datapath/Modules.mk
> @@ -18,7 +18,6 @@ openvswitch_sources = \
>        dp_sysfs_if.c \
>        flow.c \
>        loop_counter.c \
> -       table.c \
>        tunnel.c \
>        vlan.c \
>        vport.c \
> @@ -37,7 +36,6 @@ openvswitch_headers = \
>        dp_sysfs.h \
>        flow.h \
>        loop_counter.h \
> -       table.h \
>        tunnel.h \
>        vlan.h \
>        vport.h \
> diff --git a/datapath/datapath.c b/datapath/datapath.c
> index 0b6e2e5..ba0b05c 100644
> --- a/datapath/datapath.c
> +++ b/datapath/datapath.c
> @@ -49,8 +49,8 @@
>  #include "datapath.h"
>  #include "actions.h"
>  #include "flow.h"
> -#include "table.h"
>  #include "vlan.h"
> +#include "tunnel.h"
>  #include "vport-internal_dev.h"
>
>  #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,18) || \
> @@ -107,7 +107,7 @@ struct datapath *get_dp(int dp_ifindex)
>  EXPORT_SYMBOL_GPL(get_dp);
>
>  /* Must be called with genl_mutex. */
> -static struct tbl *get_table_protected(struct datapath *dp)
> +static struct flow_table *get_table_protected(struct datapath *dp)
>  {
>        return rcu_dereference_protected(dp->table, lockdep_genl_is_held());
>  }
> @@ -225,7 +225,7 @@ static void destroy_dp_rcu(struct rcu_head *rcu)
>  {
>        struct datapath *dp = container_of(rcu, struct datapath, rcu);
>
> -       tbl_destroy((struct tbl __force *)dp->table, flow_free_tbl);
> +       flow_destroy_table(dp->table);
>        free_percpu(dp->stats_percpu);
>        kobject_put(&dp->ifobj);
>  }
> @@ -269,6 +269,7 @@ int dp_detach_port(struct vport *p)
>  void dp_process_received_packet(struct vport *p, struct sk_buff *skb)
>  {
>        struct datapath *dp = p->dp;
> +       struct sw_flow *flow;
>        struct dp_stats_percpu *stats;
>        int stats_counter_off;
>        int error;
> @@ -277,7 +278,6 @@ void dp_process_received_packet(struct vport *p, struct sk_buff *skb)
>
>        if (!OVS_CB(skb)->flow) {
>                struct sw_flow_key key;
> -               struct tbl_node *flow_node;
>                int key_len;
>                bool is_frag;
>
> @@ -295,9 +295,8 @@ void dp_process_received_packet(struct vport *p, struct sk_buff *skb)
>                }
>
>                /* Look up flow. */
> -               flow_node = tbl_lookup(rcu_dereference(dp->table), &key, key_len,
> -                                      flow_hash(&key, key_len), flow_cmp);
> -               if (unlikely(!flow_node)) {
> +               flow = flow_lookup(rcu_dereference(dp->table), &key, key_len);
> +               if (unlikely(!flow)) {
>                        struct dp_upcall_info upcall;
>
>                        upcall.cmd = OVS_PACKET_CMD_MISS;
> @@ -311,7 +310,7 @@ void dp_process_received_packet(struct vport *p, struct sk_buff *skb)
>                        goto out;
>                }
>
> -               OVS_CB(skb)->flow = flow_cast(flow_node);
> +               OVS_CB(skb)->flow = flow;
>        }
>
>        stats_counter_off = offsetof(struct dp_stats_percpu, n_hit);
> @@ -524,8 +523,8 @@ err_kfree_skbs:
>  /* Called with genl_mutex. */
>  static int flush_flows(int dp_ifindex)
>  {
> -       struct tbl *old_table;
> -       struct tbl *new_table;
> +       struct flow_table *old_table;
> +       struct flow_table *new_table;
>        struct datapath *dp;
>
>        dp = get_dp(dp_ifindex);
> @@ -533,14 +532,13 @@ static int flush_flows(int dp_ifindex)
>                return -ENODEV;
>
>        old_table = get_table_protected(dp);
> -       new_table = tbl_create(TBL_MIN_BUCKETS);
> +       new_table = flow_hash_alloc(TBL_MIN_BUCKETS);
>        if (!new_table)
>                return -ENOMEM;
>
>        rcu_assign_pointer(dp->table, new_table);
>
> -       tbl_deferred_destroy(old_table, flow_free_tbl);
> -
> +       flow_deferred_destroy_table(old_table);
>        return 0;
>  }
>
> @@ -622,24 +620,6 @@ static void clear_stats(struct sw_flow *flow)
>        flow->byte_count = 0;
>  }
>
> -/* Called with genl_mutex. */
> -static int expand_table(struct datapath *dp)
> -{
> -       struct tbl *old_table = get_table_protected(dp);
> -       struct tbl *new_table;
> -
> -       new_table = tbl_expand(old_table);
> -       if (IS_ERR(new_table)) {
> -               if (PTR_ERR(new_table) != -ENOSPC)
> -                       return PTR_ERR(new_table);
> -       } else {
> -               rcu_assign_pointer(dp->table, new_table);
> -               tbl_deferred_destroy(old_table, NULL);
> -       }
> -
> -       return 0;
> -}
> -
>  static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
>  {
>        struct ovs_header *ovs_header = info->userhdr;
> @@ -693,7 +673,6 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
>        err = flow_extract(packet, -1, &flow->key, &key_len, &is_frag);
>        if (err)
>                goto err_flow_put;
> -       flow->tbl_node.hash = flow_hash(&flow->key, key_len);
>
>        err = flow_metadata_from_nlattrs(&flow->key.eth.in_port,
>                                         &flow->key.eth.tun_id,
> @@ -747,9 +726,9 @@ static struct genl_ops dp_packet_genl_ops[] = {
>  static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats)
>  {
>        int i;
> -       struct tbl *table = get_table_protected(dp);
> +       struct flow_table *table = get_table_protected(dp);
>
> -       stats->n_flows = tbl_count(table);
> +       stats->n_flows = flow_tbl_count(table);
>
>        stats->n_frags = stats->n_hit = stats->n_missed = stats->n_lost = 0;
>        for_each_possible_cpu(i) {
> @@ -940,13 +919,11 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
>  {
>        struct nlattr **a = info->attrs;
>        struct ovs_header *ovs_header = info->userhdr;
> -       struct tbl_node *flow_node;
>        struct sw_flow_key key;
>        struct sw_flow *flow;
>        struct sk_buff *reply;
>        struct datapath *dp;
> -       struct tbl *table;
> -       u32 hash;
> +       struct flow_table *table;
>        int error;
>        int key_len;
>
> @@ -973,10 +950,9 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
>        if (!dp)
>                goto error;
>
> -       hash = flow_hash(&key, key_len);
>        table = get_table_protected(dp);
> -       flow_node = tbl_lookup(table, &key, key_len, hash, flow_cmp);
> -       if (!flow_node) {
> +       flow = flow_lookup(table, &key, key_len);
> +       if (!flow) {
>                struct sw_flow_actions *acts;
>
>                /* Bail out if we're not allowed to create a new flow. */
> @@ -985,11 +961,15 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
>                        goto error;
>
>                /* Expand table, if necessary, to make room. */
> -               if (tbl_count(table) >= tbl_n_buckets(table)) {
> -                       error = expand_table(dp);
> -                       if (error)
> -                               goto error;
> -                       table = get_table_protected(dp);
> +               if (flow_tbl_need_to_expand(table)) {
> +                       struct flow_table *new_table;
> +
> +                       new_table = flow_hash_expand(table);
> +                       if (!IS_ERR(new_table)) {
> +                               rcu_assign_pointer(dp->table, new_table);
> +                               flow_deferred_destroy_table(table);
> +                               table = get_table_protected(dp);
> +                       }
>                }
>
>                /* Allocate flow. */
> @@ -1009,9 +989,8 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
>                rcu_assign_pointer(flow->sf_acts, acts);
>
>                /* Put flow in bucket. */
> -               error = tbl_insert(table, &flow->tbl_node, hash);
> -               if (error)
> -                       goto error_free_flow;
> +               flow->hash = flow_hash(&key, key_len);
> +               flow_insert(table, flow);
>
>                reply = ovs_flow_cmd_build_info(flow, dp, info->snd_pid,
>                                                info->snd_seq, OVS_FLOW_CMD_NEW);
> @@ -1031,7 +1010,6 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
>                        goto error;
>
>                /* Update actions. */
> -               flow = flow_cast(flow_node);
>                old_acts = rcu_dereference_protected(flow->sf_acts,
>                                                     lockdep_genl_is_held());
>                if (a[OVS_FLOW_ATTR_ACTIONS] &&
> @@ -1079,11 +1057,10 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
>        struct nlattr **a = info->attrs;
>        struct ovs_header *ovs_header = info->userhdr;
>        struct sw_flow_key key;
> -       struct tbl_node *flow_node;
>        struct sk_buff *reply;
>        struct sw_flow *flow;
>        struct datapath *dp;
> -       struct tbl *table;
> +       struct flow_table *table;
>        int err;
>        int key_len;
>
> @@ -1098,12 +1075,10 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
>                return -ENODEV;
>
>        table = get_table_protected(dp);
> -       flow_node = tbl_lookup(table, &key, key_len, flow_hash(&key, key_len),
> -                              flow_cmp);
> -       if (!flow_node)
> +       flow = flow_lookup(table, &key, key_len);
> +       if (!flow)
>                return -ENOENT;
>
> -       flow = flow_cast(flow_node);
>        reply = ovs_flow_cmd_build_info(flow, dp, info->snd_pid, info->snd_seq, OVS_FLOW_CMD_NEW);
>        if (IS_ERR(reply))
>                return PTR_ERR(reply);
> @@ -1116,11 +1091,10 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
>        struct nlattr **a = info->attrs;
>        struct ovs_header *ovs_header = info->userhdr;
>        struct sw_flow_key key;
> -       struct tbl_node *flow_node;
>        struct sk_buff *reply;
>        struct sw_flow *flow;
>        struct datapath *dp;
> -       struct tbl *table;
> +       struct flow_table *table;
>        int err;
>        int key_len;
>
> @@ -1135,21 +1109,15 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
>                return -ENODEV;
>
>        table = get_table_protected(dp);
> -       flow_node = tbl_lookup(table, &key, key_len, flow_hash(&key, key_len),
> -                              flow_cmp);
> -       if (!flow_node)
> +       flow = flow_lookup(table, &key, key_len);
> +       if (!flow)
>                return -ENOENT;
> -       flow = flow_cast(flow_node);
>
>        reply = ovs_flow_cmd_alloc_info(flow);
>        if (!reply)
>                return -ENOMEM;
>
> -       err = tbl_remove(table, flow_node);
> -       if (err) {
> -               kfree_skb(reply);
> -               return err;
> -       }
> +       flow_remove(table, flow);
>
>        err = ovs_flow_cmd_fill_info(flow, dp, reply, info->snd_pid,
>                                     info->snd_seq, 0, OVS_FLOW_CMD_DEL);
> @@ -1172,17 +1140,15 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
>                return -ENODEV;
>
>        for (;;) {
> -               struct tbl_node *flow_node;
>                struct sw_flow *flow;
>                u32 bucket, obj;
>
>                bucket = cb->args[0];
>                obj = cb->args[1];
> -               flow_node = tbl_next(get_table_protected(dp), &bucket, &obj);
> -               if (!flow_node)
> +               flow = flow_tbl_next(get_table_protected(dp), &bucket, &obj);
> +               if (!flow)
>                        break;
>
> -               flow = flow_cast(flow_node);
>                if (ovs_flow_cmd_fill_info(flow, dp, skb, NETLINK_CB(cb->skb).pid,
>                                           cb->nlh->nlmsg_seq, NLM_F_MULTI,
>                                           OVS_FLOW_CMD_NEW) < 0)
> @@ -1377,7 +1343,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
>
>        /* Allocate table. */
>        err = -ENOMEM;
> -       rcu_assign_pointer(dp->table, tbl_create(TBL_MIN_BUCKETS));
> +       rcu_assign_pointer(dp->table, flow_hash_alloc(TBL_MIN_BUCKETS));
>        if (!dp->table)
>                goto err_free_dp;
>
> @@ -1423,7 +1389,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
>  err_destroy_local_port:
>        dp_detach_port(get_vport_protected(dp, OVSP_LOCAL));
>  err_destroy_table:
> -       tbl_destroy(get_table_protected(dp), NULL);
> +       flow_destroy_table(get_table_protected(dp));
>  err_free_dp:
>        kfree(dp);
>  err_put_module:
> @@ -2088,6 +2054,7 @@ static void dp_cleanup(void)
>        unregister_netdevice_notifier(&dp_device_notifier);
>        vport_exit();
>        flow_exit();
> +       tnl_exit();
>  }
>
>  module_init(dp_init);
> diff --git a/datapath/datapath.h b/datapath/datapath.h
> index d14f974..b4f08ac 100644
> --- a/datapath/datapath.h
> +++ b/datapath/datapath.h
> @@ -85,7 +85,7 @@ struct datapath {
>        int drop_frags;
>
>        /* Flow table. */
> -       struct tbl __rcu *table;
> +       struct flow_table __rcu *table;
>
>        /* Switch ports. */
>        struct vport __rcu *ports[DP_MAX_PORTS];
> diff --git a/datapath/flow.c b/datapath/flow.c
> index 9d7fbc9..7e83d53 100644
> --- a/datapath/flow.c
> +++ b/datapath/flow.c
> @@ -216,14 +216,153 @@ struct sw_flow *flow_alloc(void)
>        return flow;
>  }
>
> -void flow_free_tbl(struct tbl_node *node)
> +static struct hlist_head __rcu *find_bucket(struct flow_table *table, u32 hash)
>  {
> -       struct sw_flow *flow = flow_cast(node);
> +       return flex_array_get(table->buckets,
> +                               (hash & (table->n_buckets - 1)));
> +}
> +
> +static struct flex_array __rcu *alloc_buckets(unsigned int n_buckets)
> +{
> +       struct flex_array __rcu *buckets;
> +       int i, err;
> +
> +       buckets = flex_array_alloc(sizeof(struct hlist_head),
> +                                  n_buckets, GFP_KERNEL);
> +       if (!buckets)
> +               return NULL;
> +
> +       err = flex_array_prealloc(buckets, 0, n_buckets, GFP_KERNEL);
> +       if (err) {
> +               flex_array_free(buckets);
> +               return NULL;
> +       }
> +
> +       for (i = 0; i < n_buckets; i++)
> +               INIT_HLIST_HEAD((struct hlist_head *)
> +                                       flex_array_get(buckets, i));
> +
> +       return buckets;
> +}
> +
> +static void free_buckets(struct flex_array *buckets)
> +{
> +       flex_array_free(buckets);
> +}
> +
> +struct flow_table *flow_hash_alloc(int new_size)
> +{
> +       struct flow_table *table = kmalloc(sizeof(*table), GFP_KERNEL);
> +
> +       if (!table)
> +               return NULL;
> +
> +       table->buckets = alloc_buckets(new_size);
> +
> +       if (!table->buckets) {
> +               kfree(table);
> +               return NULL;
> +       }
> +       table->n_buckets = new_size;
> +       table->count = 0;
> +
> +       return table;
> +}
>
> +static void flow_free(struct sw_flow *flow)
> +{
>        flow->dead = true;
>        flow_put(flow);
>  }
>
> +void flow_destroy_table(struct flow_table *table)
> +{
> +       int i;
> +
> +       for (i = 0; i < table->n_buckets; i++) {
> +               struct sw_flow *flow;
> +               struct hlist_head *head = flex_array_get(table->buckets, i);
> +               struct hlist_node *node, *n;
> +
> +               hlist_for_each_entry_safe(flow, node, n, head, hash_node) {
> +                       hlist_del_init_rcu(&flow->hash_node);
> +                       flow_free(flow);
> +               }
> +       }
> +
> +       free_buckets(table->buckets);
> +       kfree(table);
> +}
> +
> +static void flow_destroy_table_rcu_cb(struct rcu_head *rcu)
> +{
> +       struct flow_table *table = container_of(rcu, struct flow_table, rcu);
> +
> +       flow_destroy_table(table);
> +}
> +
> +void flow_deferred_destroy_table(struct flow_table *table)
> +{
> +       if (!table)
> +               return;
> +
> +       call_rcu(&table->rcu, flow_destroy_table_rcu_cb);
> +}
> +
> +struct sw_flow *flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *last)
> +{
> +       struct sw_flow *flow = NULL;
> +       struct hlist_head *head;
> +       struct hlist_node *n;
> +       int i;
> +
> +       while (*bucket < table->n_buckets) {
> +               i = 0;
> +               head = flex_array_get(table->buckets, *bucket);
> +               hlist_for_each_entry_rcu(flow, n, head, hash_node) {
> +                       if (i < *last) {
> +                               i++;
> +                               continue;
> +                       }
> +                       *last = i + 1;
> +                       return flow;
> +               }
> +               (*bucket)++;
> +               *last = 0;
> +               flow = NULL;
> +       }
> +
> +       return flow;
> +}
> +
> +struct flow_table *flow_hash_expand(struct flow_table *table)
> +{
> +       struct flow_table *new_table;
> +       struct sw_flow *flow;
> +       struct hlist_head *head;
> +       struct hlist_node *n, *pos;
> +       int n_buckets = table->n_buckets << 2;
> +       int i;
> +
> +       if (n_buckets > TBL_MAX_BUCKETS)
> +               return ERR_PTR(-ENOSPC);
> +
> +       new_table = flow_hash_alloc(n_buckets);
> +       if (!new_table)
> +               return ERR_PTR(-ENOMEM);
> +
> +       for (i = 0; i < table->n_buckets; i++) {
> +               head = flex_array_get(table->buckets, i);
> +
> +               hlist_for_each_entry_safe(flow, n, pos, head, hash_node) {
> +                       hlist_del_init_rcu(&flow->hash_node);
> +                       flow_insert(new_table, flow);
> +               }
> +       }
> +
> +       return new_table;
> +}
> +
>  /* RCU callback used by flow_deferred_free. */
>  static void rcu_free_flow_callback(struct rcu_head *rcu)
>  {
> @@ -589,12 +728,51 @@ u32 flow_hash(const struct sw_flow_key *key, int key_len)
>        return jhash2((u32*)key, DIV_ROUND_UP(key_len, sizeof(u32)), hash_seed);
>  }
>
> -int flow_cmp(const struct tbl_node *node, void *key2_, int len)
> +struct sw_flow *flow_lookup(struct flow_table *table, struct sw_flow_key *key,
> +                            int key_len)
>  {
> -       const struct sw_flow_key *key1 = &flow_cast(node)->key;
> -       const struct sw_flow_key *key2 = key2_;
> +       struct sw_flow *flow;
> +       struct hlist_node *n;
> +       struct hlist_head *head;
> +       u32 hash;
>
> -       return !memcmp(key1, key2, len);
> +       hash = flow_hash(key, key_len);
> +
> +       head = find_bucket(table, hash);
> +       hlist_for_each_entry_rcu(flow, n, head, hash_node) {
> +               if (flow->hash == hash &&
> +                   !memcmp(&flow->key, key, key_len)) {
> +                       return flow;
> +               }
> +       }
> +       return NULL;
> +}
> +
> +void flow_insert(struct flow_table *table, struct sw_flow *flow)
> +{
> +       struct hlist_head *head;
> +       struct hlist_node *n;
> +       struct sw_flow *cflow;
> +
> +       head = find_bucket(table, flow->hash);
> +
> +       hlist_for_each_entry_rcu(cflow, n, head, hash_node) {
> +               if (!memcmp(&cflow->key, &flow->key, sizeof(cflow->key))) {
> +                       BUG();
> +                       return;
> +               }
> +       }
> +       hlist_add_head_rcu(&flow->hash_node, head);
> +       table->count++;
> +}
> +
> +void flow_remove(struct flow_table *table, struct sw_flow *flow)
> +{
> +       if (!hlist_unhashed(&flow->hash_node)) {
> +               hlist_del_init_rcu(&flow->hash_node);
> +               table->count--;
> +               BUG_ON(table->count < 0);
> +       }
>  }
>
>  /* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute.  */
> diff --git a/datapath/flow.h b/datapath/flow.h
> index 997692c..b304c60 100644
> --- a/datapath/flow.h
> +++ b/datapath/flow.h
> @@ -18,9 +18,8 @@
>  #include <linux/in6.h>
>  #include <linux/jiffies.h>
>  #include <linux/time.h>
> -
> +#include <linux/flex_array.h>
>  #include "openvswitch/datapath-protocol.h"
> -#include "table.h"
>
>  struct sk_buff;
>
> @@ -78,21 +77,21 @@ struct sw_flow_key {
>        };
>  };
>
> -struct sw_flow {
> -       struct rcu_head rcu;
> -       struct tbl_node tbl_node;
>
> -       struct sw_flow_key key;
> +struct sw_flow {
>        struct sw_flow_actions __rcu *sf_acts;
> -
> -       atomic_t refcnt;
> +       struct hlist_node hash_node;
> +       struct sw_flow_key key;
> +       u32 hash;
> +       u8 tcp_flags;           /* Union of seen TCP flags. */
>        bool dead;
>
> -       spinlock_t lock;        /* Lock for values below. */
> -       unsigned long used;     /* Last used time (in jiffies). */
> -       u64 packet_count;       /* Number of packets matched. */
> -       u64 byte_count;         /* Number of bytes matched. */
> -       u8 tcp_flags;           /* Union of seen TCP flags. */
> +       spinlock_t lock;        /* Lock for values below. */
> +       unsigned long used;     /* Last used time (in jiffies). */
> +       u64 packet_count;       /* Number of packets matched. */
> +       u64 byte_count;         /* Number of bytes matched. */
> +       atomic_t refcnt;
> +       struct rcu_head rcu;
>  };
>
>  struct arp_eth_header
> @@ -115,7 +114,6 @@ void flow_exit(void);
>
>  struct sw_flow *flow_alloc(void);
>  void flow_deferred_free(struct sw_flow *);
> -void flow_free_tbl(struct tbl_node *);
>
>  struct sw_flow_actions *flow_actions_alloc(const struct nlattr *);
>  void flow_deferred_free_acts(struct sw_flow_actions *);
> @@ -128,9 +126,6 @@ int flow_extract(struct sk_buff *, u16 in_port, struct sw_flow_key *,
>  void flow_used(struct sw_flow *, struct sk_buff *);
>  u64 flow_used_time(unsigned long flow_jiffies);
>
> -u32 flow_hash(const struct sw_flow_key *, int key_lenp);
> -int flow_cmp(const struct tbl_node *, void *target, int len);
> -
>  /* Upper bound on the length of a nlattr-formatted flow key.  The longest
>  * nlattr-formatted flow key would be:
>  *
> @@ -155,9 +150,35 @@ int flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp,
>  int flow_metadata_from_nlattrs(u16 *in_port, __be64 *tun_id,
>                               const struct nlattr *);
>
> -static inline struct sw_flow *flow_cast(const struct tbl_node *node)
> +#define TBL_MIN_BUCKETS                1024
> +#define TBL_MAX_BUCKETS (TBL_MIN_BUCKETS * TBL_MIN_BUCKETS)
> +
> +struct flow_table {
> +       struct flex_array *buckets;
> +       unsigned int count, n_buckets;
> +       struct rcu_head rcu;
> +};
> +
> +static inline int flow_tbl_count(struct flow_table *table)
>  {
> -       return container_of(node, struct sw_flow, tbl_node);
> +       return table->count;
>  }
>
> +static inline int flow_tbl_need_to_expand(struct flow_table *table)
> +{
> +       return (table->count > table->n_buckets);
> +}
> +
> +struct sw_flow *flow_lookup(struct flow_table *table, struct sw_flow_key *key,
> +                           int len);
> +void flow_destroy_table(struct flow_table *table);
> +void flow_deferred_destroy_table(struct flow_table *table);
> +struct flow_table *flow_hash_alloc(int new_size);
> +struct flow_table *flow_hash_expand(struct flow_table *table);
> +void flow_insert(struct flow_table *table, struct sw_flow *flow);
> +void flow_remove(struct flow_table *table, struct sw_flow *flow);
> +u32 flow_hash(const struct sw_flow_key *key, int key_len);
> +
> +struct sw_flow *flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *idx);
> +
>  #endif /* flow.h */
> diff --git a/datapath/tunnel.c b/datapath/tunnel.c
> index 67556d7..8121b02 100644
> --- a/datapath/tunnel.c
> +++ b/datapath/tunnel.c
> @@ -13,6 +13,7 @@
>  #include <linux/in.h>
>  #include <linux/in_route.h>
>  #include <linux/jhash.h>
> +#include <linux/list.h>
>  #include <linux/kernel.h>
>  #include <linux/version.h>
>  #include <linux/workqueue.h>
> @@ -31,7 +32,6 @@
>  #include "actions.h"
>  #include "checksum.h"
>  #include "datapath.h"
> -#include "table.h"
>  #include "tunnel.h"
>  #include "vlan.h"
>  #include "vport.h"
> @@ -68,8 +68,10 @@
>  #define CACHE_CLEANER_INTERVAL (5 * HZ)
>
>  #define CACHE_DATA_ALIGN 16
> +#define PORT_TABLE_SIZE  1024
>
> -static struct tbl __rcu *port_table __read_mostly;
> +static struct hlist_head *port_table __read_mostly;
> +static int port_table_count;
>
>  static void cache_cleaner(struct work_struct *work);
>  static DECLARE_DELAYED_WORK(cache_cleaner_wq, cache_cleaner);
> @@ -95,11 +97,6 @@ static inline struct vport *tnl_vport_to_vport(const struct tnl_vport *tnl_vport
>        return vport_from_priv(tnl_vport);
>  }
>
> -static inline struct tnl_vport *tnl_vport_table_cast(const struct tbl_node *node)
> -{
> -       return container_of(node, struct tnl_vport, tbl_node);
> -}
> -
>  /* This is analogous to rtnl_dereference for the tunnel cache.  It checks that
>  * cache_lock is held, so it is only for update side code.
>  */
> @@ -186,11 +183,9 @@ struct port_lookup_key {
>  * Modifies 'target' to store the rcu_dereferenced pointer that was used to do
>  * the comparision.
>  */
> -static int port_cmp(const struct tbl_node *node, void *target, int unused)
> +static int port_cmp(const struct tnl_vport *tnl_vport,
> +                    struct port_lookup_key *lookup)
>  {
> -       const struct tnl_vport *tnl_vport = tnl_vport_table_cast(node);
> -       struct port_lookup_key *lookup = target;
> -
>        lookup->mutable = rcu_dereference_rtnl(tnl_vport->mutable);
>
>        return (lookup->mutable->tunnel_type == lookup->tunnel_type &&
> @@ -218,106 +213,95 @@ static u32 mutable_hash(const struct tnl_mutable_config *mutable)
>        return port_hash(&lookup);
>  }
>
> -static void check_table_empty(void)
> -{
> -       struct tbl *old_table = rtnl_dereference(port_table);
>
> -       if (tbl_count(old_table) == 0) {
> -               cancel_delayed_work_sync(&cache_cleaner_wq);
> -               rcu_assign_pointer(port_table, NULL);
> -               tbl_deferred_destroy(old_table, NULL);
> -       }
> +static inline struct hlist_head *find_bucket(u32 hash)
> +{
> +       return &port_table[(hash & (PORT_TABLE_SIZE - 1))];
>  }
>
> -static int add_port(struct vport *vport)
> +static int alloc_port_table(void)
>  {
> -       struct tbl *cur_table = rtnl_dereference(port_table);
> -       struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
> -       int err;
> +       unsigned int i;
> +       struct hlist_head *head;
>
> -       if (!port_table) {
> -               struct tbl *new_table;
> +       head = kmalloc(PORT_TABLE_SIZE * sizeof(struct hlist_head),
> +                       GFP_KERNEL);
> +       if (!head)
> +               return -ENOMEM;
>
> -               new_table = tbl_create(TBL_MIN_BUCKETS);
> -               if (!new_table)
> -                       return -ENOMEM;
> +       for (i = 0; i < PORT_TABLE_SIZE; i++)
> +               INIT_HLIST_HEAD(&head[i]);
>
> -               rcu_assign_pointer(port_table, new_table);
> -               schedule_cache_cleaner();
> +       smp_wmb();
> +       port_table = head;
> +       return 0;
> +}
>
> -       } else if (tbl_count(cur_table) > tbl_n_buckets(cur_table)) {
> -               struct tbl *new_table;
> +static int add_port(struct vport *vport)
> +{
> +       struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
> +       u32 hash = mutable_hash(rtnl_dereference(tnl_vport->mutable));
> +       int err;
>
> -               new_table = tbl_expand(cur_table);
> -               if (IS_ERR(new_table)) {
> -                       if (PTR_ERR(new_table) != -ENOSPC)
> -                               return PTR_ERR(new_table);
> -               } else {
> -                       rcu_assign_pointer(port_table, new_table);
> -                       tbl_deferred_destroy(cur_table, NULL);
> -               }
> +       if (unlikely(!port_table)) {
> +               err = alloc_port_table();
> +               if (err)
> +                       return err;
>        }
> +       if (port_table_count == 0)
> +               schedule_cache_cleaner();
>
> -       err = tbl_insert(rtnl_dereference(port_table), &tnl_vport->tbl_node,
> -                        mutable_hash(rtnl_dereference(tnl_vport->mutable)));
> -       if (err) {
> -               check_table_empty();
> -               return err;
> -       }
> +       hlist_add_head_rcu(&tnl_vport->hash_node, find_bucket(hash));
> +       port_table_count++;
>
>        (*find_port_pool(rtnl_dereference(tnl_vport->mutable)))++;
> -
>        return 0;
>  }
>
> -static int move_port(struct vport *vport, struct tnl_mutable_config *new_mutable)
> +static void move_port(struct vport *vport,
> +                     struct tnl_mutable_config *new_mutable)
>  {
> -       int err;
> -       struct tbl *cur_table = rtnl_dereference(port_table);
>        struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
>        u32 hash;
>
>        hash = mutable_hash(new_mutable);
> -       if (hash == tnl_vport->tbl_node.hash)
> -               goto table_updated;
> -
> -       /*
> -        * Ideally we should make this move atomic to avoid having gaps in
> -        * finding tunnels or the possibility of failure.  However, if we do
> -        * find a tunnel it will always be consistent.
> -        */
> -       err = tbl_remove(cur_table, &tnl_vport->tbl_node);
> -       if (err)
> -               return err;
> +       hlist_del_init_rcu(&tnl_vport->hash_node);
> +       hlist_add_head_rcu(&tnl_vport->hash_node, find_bucket(hash));
>
> -       err = tbl_insert(cur_table, &tnl_vport->tbl_node, hash);
> -       if (err) {
> -               (*find_port_pool(rtnl_dereference(tnl_vport->mutable)))--;
> -               check_table_empty();
> -               return err;
> -       }
> -
> -table_updated:
>        (*find_port_pool(rtnl_dereference(tnl_vport->mutable)))--;
>        assign_config_rcu(vport, new_mutable);
>        (*find_port_pool(rtnl_dereference(tnl_vport->mutable)))++;
> -
> -       return 0;
>  }
>
> -static int del_port(struct vport *vport)
> +static void del_port(struct vport *vport)
>  {
>        struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
> -       int err;
>
> -       err = tbl_remove(rtnl_dereference(port_table), &tnl_vport->tbl_node);
> -       if (err)
> -               return err;
> +       hlist_del_init_rcu(&tnl_vport->hash_node);
> +
> +       port_table_count--;
> +       if (port_table_count == 0)
> +               cancel_delayed_work_sync(&cache_cleaner_wq);
>
> -       check_table_empty();
>        (*find_port_pool(rtnl_dereference(tnl_vport->mutable)))--;
> +}
>
> -       return 0;
> +static struct tnl_vport *port_table_lookup(struct port_lookup_key *lookup)
> +{
> +       struct hlist_node *n;
> +       struct hlist_head *bucket;
> +       u32 hash = port_hash(lookup);
> +       struct tnl_vport *tnl_vport;
> +
> +       bucket = find_bucket(hash);
> +
> +       hlist_for_each_entry_rcu(tnl_vport, n, bucket, hash_node) {
> +               if (!port_cmp(tnl_vport, lookup)) {
> +                       return tnl_vport;
> +               }
> +       }
> +
> +       return NULL;
>  }
>
>  struct vport *tnl_find_port(__be32 saddr, __be32 daddr, __be64 key,
> @@ -325,10 +309,9 @@ struct vport *tnl_find_port(__be32 saddr, __be32 daddr, __be64 key,
>                            const struct tnl_mutable_config **mutable)
>  {
>        struct port_lookup_key lookup;
> -       struct tbl *table = rcu_dereference_rtnl(port_table);
> -       struct tbl_node *tbl_node;
> +       struct tnl_vport *tnl_vport;
>
> -       if (unlikely(!table))
> +       if (unlikely(!port_table))
>                return NULL;
>
>        lookup.saddr = saddr;
> @@ -339,18 +322,15 @@ struct vport *tnl_find_port(__be32 saddr, __be32 daddr, __be64 key,
>                lookup.tunnel_type = tunnel_type & ~TNL_T_KEY_MATCH;
>
>                if (key_local_remote_ports) {
> -                       tbl_node = tbl_lookup(table, &lookup, sizeof(lookup),
> -                                             port_hash(&lookup), port_cmp);
> -                       if (tbl_node)
> +                       tnl_vport = port_table_lookup(&lookup);
> +                       if (tnl_vport)
>                                goto found;
>                }
>
>                if (key_remote_ports) {
>                        lookup.saddr = 0;
> -
> -                       tbl_node = tbl_lookup(table, &lookup, sizeof(lookup),
> -                                             port_hash(&lookup), port_cmp);
> -                       if (tbl_node)
> +                       tnl_vport = port_table_lookup(&lookup);
> +                       if (tnl_vport)
>                                goto found;
>
>                        lookup.saddr = saddr;
> @@ -362,18 +342,15 @@ struct vport *tnl_find_port(__be32 saddr, __be32 daddr, __be64 key,
>                lookup.tunnel_type = tunnel_type & ~TNL_T_KEY_EXACT;
>
>                if (local_remote_ports) {
> -                       tbl_node = tbl_lookup(table, &lookup, sizeof(lookup),
> -                                             port_hash(&lookup), port_cmp);
> -                       if (tbl_node)
> +                       tnl_vport = port_table_lookup(&lookup);
> +                       if (tnl_vport)
>                                goto found;
>                }
>
>                if (remote_ports) {
>                        lookup.saddr = 0;
> -
> -                       tbl_node = tbl_lookup(table, &lookup, sizeof(lookup),
> -                                             port_hash(&lookup), port_cmp);
> -                       if (tbl_node)
> +                       tnl_vport = port_table_lookup(&lookup);
> +                       if (tnl_vport)
>                                goto found;
>                }
>        }
> @@ -382,7 +359,7 @@ struct vport *tnl_find_port(__be32 saddr, __be32 daddr, __be64 key,
>
>  found:
>        *mutable = lookup.mutable;
> -       return tnl_vport_to_vport(tnl_vport_table_cast(tbl_node));
> +       return tnl_vport_to_vport(tnl_vport);
>  }
>
>  static void ecn_decapsulate(struct sk_buff *skb, u8 tos)
> @@ -848,10 +825,10 @@ static inline bool check_cache_valid(const struct tnl_cache *cache,
>                (cache->flow && !cache->flow->dead));
>  }
>
> -static int cache_cleaner_cb(struct tbl_node *tbl_node, void *aux)
> +static void __cache_cleaner(struct tnl_vport *tnl_vport)
>  {
> -       struct tnl_vport *tnl_vport = tnl_vport_table_cast(tbl_node);
> -       const struct tnl_mutable_config *mutable = rcu_dereference(tnl_vport->mutable);
> +       const struct tnl_mutable_config *mutable =
> +                       rcu_dereference(tnl_vport->mutable);
>        const struct tnl_cache *cache = rcu_dereference(tnl_vport->cache);
>
>        if (cache && !check_cache_valid(cache, mutable) &&
> @@ -859,16 +836,24 @@ static int cache_cleaner_cb(struct tbl_node *tbl_node, void *aux)
>                assign_cache_rcu(tnl_vport_to_vport(tnl_vport), NULL);
>                spin_unlock_bh(&tnl_vport->cache_lock);
>        }
> -
> -       return 0;
>  }
>
>  static void cache_cleaner(struct work_struct *work)
>  {
> +       struct hlist_node *n;
> +       struct hlist_head *bucket;
> +       struct tnl_vport *tnl_vport;
> +       int i;
> +
>        schedule_cache_cleaner();
>
>        rcu_read_lock();
> -       tbl_foreach(rcu_dereference(port_table), cache_cleaner_cb, NULL);
> +       for (i = 0; i < PORT_TABLE_SIZE; i++) {
> +               bucket = &port_table[i];
> +               hlist_for_each_entry_rcu(tnl_vport, n, bucket, hash_node) {
> +                       __cache_cleaner(tnl_vport);
> +               }
> +       }
>        rcu_read_unlock();
>  }
>
> @@ -949,12 +934,12 @@ static struct tnl_cache *build_cache(struct vport *vport,
>
>        if (is_internal_dev(rt_dst(rt).dev)) {
>                struct sw_flow_key flow_key;
> -               struct tbl_node *flow_node;
>                struct vport *dst_vport;
>                struct sk_buff *skb;
>                bool is_frag;
>                int err;
>                int flow_key_len;
> +               struct sw_flow *flow;
>
>                dst_vport = internal_dev_get_vport(rt_dst(rt).dev);
>                if (!dst_vport)
> @@ -974,13 +959,9 @@ static struct tnl_cache *build_cache(struct vport *vport,
>                if (err || is_frag)
>                        goto done;
>
> -               flow_node = tbl_lookup(rcu_dereference(dst_vport->dp->table),
> -                                      &flow_key, flow_key_len,
> -                                      flow_hash(&flow_key, flow_key_len),
> -                                      flow_cmp);
> -               if (flow_node) {
> -                       struct sw_flow *flow = flow_cast(flow_node);
> -
> +               flow = flow_lookup(rcu_dereference(dst_vport->dp->table),
> +                                        &flow_key, flow_key_len);
> +               if (flow) {
>                        cache->flow = flow;
>                        flow_hold(flow);
>                }
> @@ -1498,9 +1479,8 @@ int tnl_set_options(struct vport *vport, struct nlattr *options)
>        if (err)
>                goto error_free;
>
> -       err = move_port(vport, mutable);
> -       if (err)
> -               goto error_free;
> +       if (mutable_hash(mutable) != mutable_hash(old_mutable))
> +               move_port(vport, mutable);
>
>        return 0;
>
> @@ -1598,3 +1578,27 @@ void tnl_free_linked_skbs(struct sk_buff *skb)
>                skb = next;
>        }
>  }
> +
> +
> +void tnl_exit(void)
> +{
> +       struct hlist_node *n, *pos;
> +       struct hlist_head *hash_head;
> +       struct tnl_vport *tnl_vport;
> +       int i;
> +
> +       if (!port_table)
> +               return;
> +
> +       cancel_delayed_work_sync(&cache_cleaner_wq);
> +       for (i = 0; i < PORT_TABLE_SIZE; i++) {
> +               hash_head = &port_table[i];
> +               hlist_for_each_entry_safe(tnl_vport, pos, n, hash_head,
> +                                         hash_node) {
> +                       BUG();
> +                       goto out;
> +               }
> +       }
> +out:
> +       kfree(port_table);
> +}
> diff --git a/datapath/tunnel.h b/datapath/tunnel.h
> index b488e46..169c408 100644
> --- a/datapath/tunnel.h
> +++ b/datapath/tunnel.h
> @@ -13,7 +13,6 @@
>
>  #include "flow.h"
>  #include "openvswitch/tunnel.h"
> -#include "table.h"
>  #include "vport.h"
>
>  /*
> @@ -187,7 +186,7 @@ struct tnl_cache {
>
>  struct tnl_vport {
>        struct rcu_head rcu;
> -       struct tbl_node tbl_node;
> +       struct hlist_node hash_node;
>
>        char name[IFNAMSIZ];
>        const struct tnl_ops *tnl_ops;
> @@ -236,6 +235,7 @@ bool tnl_frag_needed(struct vport *vport,
>                     struct sk_buff *skb, unsigned int mtu, __be64 flow_key);
>  void tnl_free_linked_skbs(struct sk_buff *skb);
>
> +void tnl_exit(void);
>  static inline struct tnl_vport *tnl_vport_priv(const struct vport *vport)
>  {
>        return vport_priv(vport);
> --
> 1.7.1
>
>


