[ovs-dev] [PATCH] Improve kernel hash table
Pravin Shelar
pshelar at nicira.com
Wed Aug 24 07:12:09 UTC 2011
Jesse,
I forgot to commit `git rm table.c table.h` to this patch. I will do
it when I push final patch.
Thanks,
Pravin.
On Tue, Aug 23, 2011 at 3:34 PM, Pravin Shelar <pshelar at nicira.com> wrote:
> Currently OVS uses its own hashing implementation for hash tables which
> has some problems, e.g. error case on deletion code. Following patch
> replaces that with hlist based hash table which is consistent with other
> kernel hash tables. As Jesse suggested, flex-array is used for allocating
> hash buckets, so that we can have a large hash table without large contiguous
> kernel memory.
> struct sw_flow is rearranged according to frequency of writes to a given
> struct member.
> ---
> datapath/Modules.mk | 2 -
> datapath/datapath.c | 111 +++++++++----------------
> datapath/datapath.h | 2 +-
> datapath/flow.c | 190 ++++++++++++++++++++++++++++++++++++++++--
> datapath/flow.h | 61 +++++++++----
> datapath/tunnel.c | 230 ++++++++++++++++++++++++++-------------------------
> datapath/tunnel.h | 4 +-
> 7 files changed, 385 insertions(+), 215 deletions(-)
>
> diff --git a/datapath/Modules.mk b/datapath/Modules.mk
> index 587569f..8eb4e91 100644
> --- a/datapath/Modules.mk
> +++ b/datapath/Modules.mk
> @@ -18,7 +18,6 @@ openvswitch_sources = \
> dp_sysfs_if.c \
> flow.c \
> loop_counter.c \
> - table.c \
> tunnel.c \
> vlan.c \
> vport.c \
> @@ -37,7 +36,6 @@ openvswitch_headers = \
> dp_sysfs.h \
> flow.h \
> loop_counter.h \
> - table.h \
> tunnel.h \
> vlan.h \
> vport.h \
> diff --git a/datapath/datapath.c b/datapath/datapath.c
> index 0b6e2e5..ba0b05c 100644
> --- a/datapath/datapath.c
> +++ b/datapath/datapath.c
> @@ -49,8 +49,8 @@
> #include "datapath.h"
> #include "actions.h"
> #include "flow.h"
> -#include "table.h"
> #include "vlan.h"
> +#include "tunnel.h"
> #include "vport-internal_dev.h"
>
> #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,18) || \
> @@ -107,7 +107,7 @@ struct datapath *get_dp(int dp_ifindex)
> EXPORT_SYMBOL_GPL(get_dp);
>
> /* Must be called with genl_mutex. */
> -static struct tbl *get_table_protected(struct datapath *dp)
> +static struct flow_table *get_table_protected(struct datapath *dp)
> {
> return rcu_dereference_protected(dp->table, lockdep_genl_is_held());
> }
> @@ -225,7 +225,7 @@ static void destroy_dp_rcu(struct rcu_head *rcu)
> {
> struct datapath *dp = container_of(rcu, struct datapath, rcu);
>
> - tbl_destroy((struct tbl __force *)dp->table, flow_free_tbl);
> + flow_destroy_table(dp->table);
> free_percpu(dp->stats_percpu);
> kobject_put(&dp->ifobj);
> }
> @@ -269,6 +269,7 @@ int dp_detach_port(struct vport *p)
> void dp_process_received_packet(struct vport *p, struct sk_buff *skb)
> {
> struct datapath *dp = p->dp;
> + struct sw_flow *flow;
> struct dp_stats_percpu *stats;
> int stats_counter_off;
> int error;
> @@ -277,7 +278,6 @@ void dp_process_received_packet(struct vport *p, struct sk_buff *skb)
>
> if (!OVS_CB(skb)->flow) {
> struct sw_flow_key key;
> - struct tbl_node *flow_node;
> int key_len;
> bool is_frag;
>
> @@ -295,9 +295,8 @@ void dp_process_received_packet(struct vport *p, struct sk_buff *skb)
> }
>
> /* Look up flow. */
> - flow_node = tbl_lookup(rcu_dereference(dp->table), &key, key_len,
> - flow_hash(&key, key_len), flow_cmp);
> - if (unlikely(!flow_node)) {
> + flow = flow_lookup(rcu_dereference(dp->table), &key, key_len);
> + if (unlikely(!flow)) {
> struct dp_upcall_info upcall;
>
> upcall.cmd = OVS_PACKET_CMD_MISS;
> @@ -311,7 +310,7 @@ void dp_process_received_packet(struct vport *p, struct sk_buff *skb)
> goto out;
> }
>
> - OVS_CB(skb)->flow = flow_cast(flow_node);
> + OVS_CB(skb)->flow = flow;
> }
>
> stats_counter_off = offsetof(struct dp_stats_percpu, n_hit);
> @@ -524,8 +523,8 @@ err_kfree_skbs:
> /* Called with genl_mutex. */
> static int flush_flows(int dp_ifindex)
> {
> - struct tbl *old_table;
> - struct tbl *new_table;
> + struct flow_table *old_table;
> + struct flow_table *new_table;
> struct datapath *dp;
>
> dp = get_dp(dp_ifindex);
> @@ -533,14 +532,13 @@ static int flush_flows(int dp_ifindex)
> return -ENODEV;
>
> old_table = get_table_protected(dp);
> - new_table = tbl_create(TBL_MIN_BUCKETS);
> + new_table = flow_hash_alloc(TBL_MIN_BUCKETS);
> if (!new_table)
> return -ENOMEM;
>
> rcu_assign_pointer(dp->table, new_table);
>
> - tbl_deferred_destroy(old_table, flow_free_tbl);
> -
> + flow_deferred_destroy_table(old_table);
> return 0;
> }
>
> @@ -622,24 +620,6 @@ static void clear_stats(struct sw_flow *flow)
> flow->byte_count = 0;
> }
>
> -/* Called with genl_mutex. */
> -static int expand_table(struct datapath *dp)
> -{
> - struct tbl *old_table = get_table_protected(dp);
> - struct tbl *new_table;
> -
> - new_table = tbl_expand(old_table);
> - if (IS_ERR(new_table)) {
> - if (PTR_ERR(new_table) != -ENOSPC)
> - return PTR_ERR(new_table);
> - } else {
> - rcu_assign_pointer(dp->table, new_table);
> - tbl_deferred_destroy(old_table, NULL);
> - }
> -
> - return 0;
> -}
> -
> static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
> {
> struct ovs_header *ovs_header = info->userhdr;
> @@ -693,7 +673,6 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
> err = flow_extract(packet, -1, &flow->key, &key_len, &is_frag);
> if (err)
> goto err_flow_put;
> - flow->tbl_node.hash = flow_hash(&flow->key, key_len);
>
> err = flow_metadata_from_nlattrs(&flow->key.eth.in_port,
> &flow->key.eth.tun_id,
> @@ -747,9 +726,9 @@ static struct genl_ops dp_packet_genl_ops[] = {
> static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats)
> {
> int i;
> - struct tbl *table = get_table_protected(dp);
> + struct flow_table *table = get_table_protected(dp);
>
> - stats->n_flows = tbl_count(table);
> + stats->n_flows = flow_tbl_count(table);
>
> stats->n_frags = stats->n_hit = stats->n_missed = stats->n_lost = 0;
> for_each_possible_cpu(i) {
> @@ -940,13 +919,11 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
> {
> struct nlattr **a = info->attrs;
> struct ovs_header *ovs_header = info->userhdr;
> - struct tbl_node *flow_node;
> struct sw_flow_key key;
> struct sw_flow *flow;
> struct sk_buff *reply;
> struct datapath *dp;
> - struct tbl *table;
> - u32 hash;
> + struct flow_table *table;
> int error;
> int key_len;
>
> @@ -973,10 +950,9 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
> if (!dp)
> goto error;
>
> - hash = flow_hash(&key, key_len);
> table = get_table_protected(dp);
> - flow_node = tbl_lookup(table, &key, key_len, hash, flow_cmp);
> - if (!flow_node) {
> + flow = flow_lookup(table, &key, key_len);
> + if (!flow) {
> struct sw_flow_actions *acts;
>
> /* Bail out if we're not allowed to create a new flow. */
> @@ -985,11 +961,15 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
> goto error;
>
> /* Expand table, if necessary, to make room. */
> - if (tbl_count(table) >= tbl_n_buckets(table)) {
> - error = expand_table(dp);
> - if (error)
> - goto error;
> - table = get_table_protected(dp);
> + if (flow_tbl_need_to_expand(table)) {
> + struct flow_table *new_table;
> +
> + new_table = flow_hash_expand(table);
> + if (!IS_ERR(new_table)) {
> + rcu_assign_pointer(dp->table, new_table);
> + flow_deferred_destroy_table(table);
> + table = get_table_protected(dp);
> + }
> }
>
> /* Allocate flow. */
> @@ -1009,9 +989,8 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
> rcu_assign_pointer(flow->sf_acts, acts);
>
> /* Put flow in bucket. */
> - error = tbl_insert(table, &flow->tbl_node, hash);
> - if (error)
> - goto error_free_flow;
> + flow->hash = flow_hash(&key, key_len);
> + flow_insert(table, flow);
>
> reply = ovs_flow_cmd_build_info(flow, dp, info->snd_pid,
> info->snd_seq, OVS_FLOW_CMD_NEW);
> @@ -1031,7 +1010,6 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
> goto error;
>
> /* Update actions. */
> - flow = flow_cast(flow_node);
> old_acts = rcu_dereference_protected(flow->sf_acts,
> lockdep_genl_is_held());
> if (a[OVS_FLOW_ATTR_ACTIONS] &&
> @@ -1079,11 +1057,10 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
> struct nlattr **a = info->attrs;
> struct ovs_header *ovs_header = info->userhdr;
> struct sw_flow_key key;
> - struct tbl_node *flow_node;
> struct sk_buff *reply;
> struct sw_flow *flow;
> struct datapath *dp;
> - struct tbl *table;
> + struct flow_table *table;
> int err;
> int key_len;
>
> @@ -1098,12 +1075,10 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
> return -ENODEV;
>
> table = get_table_protected(dp);
> - flow_node = tbl_lookup(table, &key, key_len, flow_hash(&key, key_len),
> - flow_cmp);
> - if (!flow_node)
> + flow = flow_lookup(table, &key, key_len);
> + if (!flow)
> return -ENOENT;
>
> - flow = flow_cast(flow_node);
> reply = ovs_flow_cmd_build_info(flow, dp, info->snd_pid, info->snd_seq, OVS_FLOW_CMD_NEW);
> if (IS_ERR(reply))
> return PTR_ERR(reply);
> @@ -1116,11 +1091,10 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
> struct nlattr **a = info->attrs;
> struct ovs_header *ovs_header = info->userhdr;
> struct sw_flow_key key;
> - struct tbl_node *flow_node;
> struct sk_buff *reply;
> struct sw_flow *flow;
> struct datapath *dp;
> - struct tbl *table;
> + struct flow_table *table;
> int err;
> int key_len;
>
> @@ -1135,21 +1109,15 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
> return -ENODEV;
>
> table = get_table_protected(dp);
> - flow_node = tbl_lookup(table, &key, key_len, flow_hash(&key, key_len),
> - flow_cmp);
> - if (!flow_node)
> + flow = flow_lookup(table, &key, key_len);
> + if (!flow)
> return -ENOENT;
> - flow = flow_cast(flow_node);
>
> reply = ovs_flow_cmd_alloc_info(flow);
> if (!reply)
> return -ENOMEM;
>
> - err = tbl_remove(table, flow_node);
> - if (err) {
> - kfree_skb(reply);
> - return err;
> - }
> + flow_remove(table, flow);
>
> err = ovs_flow_cmd_fill_info(flow, dp, reply, info->snd_pid,
> info->snd_seq, 0, OVS_FLOW_CMD_DEL);
> @@ -1172,17 +1140,15 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
> return -ENODEV;
>
> for (;;) {
> - struct tbl_node *flow_node;
> struct sw_flow *flow;
> u32 bucket, obj;
>
> bucket = cb->args[0];
> obj = cb->args[1];
> - flow_node = tbl_next(get_table_protected(dp), &bucket, &obj);
> - if (!flow_node)
> + flow = flow_tbl_next(get_table_protected(dp), &bucket, &obj);
> + if (!flow)
> break;
>
> - flow = flow_cast(flow_node);
> if (ovs_flow_cmd_fill_info(flow, dp, skb, NETLINK_CB(cb->skb).pid,
> cb->nlh->nlmsg_seq, NLM_F_MULTI,
> OVS_FLOW_CMD_NEW) < 0)
> @@ -1377,7 +1343,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
>
> /* Allocate table. */
> err = -ENOMEM;
> - rcu_assign_pointer(dp->table, tbl_create(TBL_MIN_BUCKETS));
> + rcu_assign_pointer(dp->table, flow_hash_alloc(TBL_MIN_BUCKETS));
> if (!dp->table)
> goto err_free_dp;
>
> @@ -1423,7 +1389,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
> err_destroy_local_port:
> dp_detach_port(get_vport_protected(dp, OVSP_LOCAL));
> err_destroy_table:
> - tbl_destroy(get_table_protected(dp), NULL);
> + flow_destroy_table(get_table_protected(dp));
> err_free_dp:
> kfree(dp);
> err_put_module:
> @@ -2088,6 +2054,7 @@ static void dp_cleanup(void)
> unregister_netdevice_notifier(&dp_device_notifier);
> vport_exit();
> flow_exit();
> + tnl_exit();
> }
>
> module_init(dp_init);
> diff --git a/datapath/datapath.h b/datapath/datapath.h
> index d14f974..b4f08ac 100644
> --- a/datapath/datapath.h
> +++ b/datapath/datapath.h
> @@ -85,7 +85,7 @@ struct datapath {
> int drop_frags;
>
> /* Flow table. */
> - struct tbl __rcu *table;
> + struct flow_table __rcu *table;
>
> /* Switch ports. */
> struct vport __rcu *ports[DP_MAX_PORTS];
> diff --git a/datapath/flow.c b/datapath/flow.c
> index 9d7fbc9..7e83d53 100644
> --- a/datapath/flow.c
> +++ b/datapath/flow.c
> @@ -216,14 +216,153 @@ struct sw_flow *flow_alloc(void)
> return flow;
> }
>
> -void flow_free_tbl(struct tbl_node *node)
> +static struct hlist_head __rcu *find_bucket(struct flow_table * table, u32 hash)
> {
> - struct sw_flow *flow = flow_cast(node);
> + return flex_array_get(table->buckets,
> + (hash & (table->n_buckets - 1)));
> +}
> +
> +static struct flex_array __rcu *alloc_buckets(unsigned int n_buckets)
> +{
> + struct flex_array __rcu * buckets;
> + int i, err;
> +
> + buckets = flex_array_alloc(sizeof(struct hlist_head *),
> + n_buckets, GFP_KERNEL);
> + if (!buckets)
> + return NULL;
> +
> + err = flex_array_prealloc(buckets, 0, n_buckets, GFP_KERNEL);
> + if (err) {
> + flex_array_free(buckets);
> + return NULL;
> + }
> +
> + for (i = 0; i < n_buckets; i++)
> + INIT_HLIST_HEAD((struct hlist_head *)
> + flex_array_get(buckets, i));
> +
> + return buckets;
> +}
> +
> +static void free_buckets(struct flex_array * buckets)
> +{
> + flex_array_free(buckets);
> +}
> +
> +struct flow_table *flow_hash_alloc(int new_size)
> +{
> + struct flow_table *table = kmalloc(sizeof(*table), GFP_KERNEL);
> +
> + if (!table)
> + return NULL;
> +
> + table->buckets = alloc_buckets(new_size);
> +
> + if (!table->buckets) {
> + kfree(table);
> + return NULL;
> + }
> + table->n_buckets = new_size;
> + table->count = 0;
> +
> + return table;
> +}
>
> +static void flow_free(struct sw_flow *flow)
> +{
> flow->dead = true;
> flow_put(flow);
> }
>
> +void flow_destroy_table(struct flow_table *table)
> +{
> + int i;
> +
> + for (i = 0; i < table->n_buckets; i++) {
> + struct sw_flow *flow;
> + struct hlist_head *head = flex_array_get(table->buckets, i);
> + struct hlist_node *node, *n;
> +
> + hlist_for_each_entry_safe(flow, node, n, head, hash_node) {
> + hlist_del_init_rcu(&flow->hash_node);
> + flow_free(flow);
> + }
> + }
> +
> + free_buckets(table->buckets);
> + kfree(table);
> +}
> +
> +static void flow_destroy_table_rcu_cb(struct rcu_head *rcu)
> +{
> + struct flow_table *table = container_of(rcu, struct flow_table, rcu);
> +
> + flow_destroy_table(table);
> +}
> +
> +void flow_deferred_destroy_table(struct flow_table *table)
> +{
> + if (!table)
> + return;
> +
> + call_rcu(&table->rcu, flow_destroy_table_rcu_cb);
> +}
> +
> +struct sw_flow *flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *last)
> +{
> + struct sw_flow *flow = NULL;
> + struct hlist_head *head;
> + struct hlist_node *n;
> + int i;
> +
> + while (*bucket < table->n_buckets) {
> + i = 0;
> + head = flex_array_get(table->buckets, *bucket);
> + hlist_for_each_entry_rcu(flow, n, head, hash_node) {
> + if (i < *last) {
> + i++;
> + continue;
> + }
> + *last = i + 1;
> + return flow;
> + }
> + (*bucket)++;
> + *last = 0;
> + flow = NULL;
> + }
> +
> + return flow;
> +}
> +
> +struct flow_table *flow_hash_expand(struct flow_table *table)
> +{
> + struct flow_table *new_table;
> + struct sw_flow *flow;
> + struct hlist_head *head;
> + struct hlist_node *n, *pos;
> + int n_buckets = table->n_buckets << 2;
> + int i;
> +
> + if (n_buckets > TBL_MAX_BUCKETS)
> + return ERR_PTR(-ENOSPC);
> +
> + new_table = flow_hash_alloc(n_buckets);
> + if (!new_table)
> + return ERR_PTR(-ENOMEM);
> +
> + for (i = 0; i < table->n_buckets; i++) {
> + head = flex_array_get(table->buckets, i);
> +
> + hlist_for_each_entry_safe(flow, n, pos, head, hash_node) {
> + hlist_del_init_rcu(&flow->hash_node);
> + flow_insert(new_table, flow);
> + }
> + }
> +
> + return new_table;
> +}
> +
> /* RCU callback used by flow_deferred_free. */
> static void rcu_free_flow_callback(struct rcu_head *rcu)
> {
> @@ -589,12 +728,51 @@ u32 flow_hash(const struct sw_flow_key *key, int key_len)
> return jhash2((u32*)key, DIV_ROUND_UP(key_len, sizeof(u32)), hash_seed);
> }
>
> -int flow_cmp(const struct tbl_node *node, void *key2_, int len)
> +struct sw_flow * flow_lookup(struct flow_table *table, struct sw_flow_key *key,
> + int key_len)
> {
> - const struct sw_flow_key *key1 = &flow_cast(node)->key;
> - const struct sw_flow_key *key2 = key2_;
> + struct sw_flow *flow;
> + struct hlist_node *n;
> + struct hlist_head *head;
> + u32 hash;
>
> - return !memcmp(key1, key2, len);
> + hash = flow_hash(key, key_len);
> +
> + head = find_bucket(table, hash);
> + hlist_for_each_entry_rcu(flow, n, head, hash_node) {
> + if (flow->hash == hash &&
> + !memcmp(&flow->key, key, key_len)) {
> + return flow;
> + }
> + }
> + return NULL;
> +}
> +
> +void flow_insert(struct flow_table *table, struct sw_flow *flow)
> +{
> + struct hlist_head *head;
> + struct hlist_node *n;
> + struct sw_flow *cflow;
> +
> + head = find_bucket(table, flow->hash);
> +
> + hlist_for_each_entry_rcu(cflow, n, head, hash_node) {
> + if (!memcmp(&cflow->key, &flow->key, sizeof(cflow->key))) {
> + BUG();
> + return;
> + }
> + }
> + hlist_add_head_rcu(&flow->hash_node, head);
> + table->count++;
> +}
> +
> +void flow_remove(struct flow_table *table, struct sw_flow *flow)
> +{
> + if (!hlist_unhashed(&flow->hash_node)) {
> + hlist_del_init_rcu(&flow->hash_node);
> + table->count--;
> + BUG_ON(table->count < 0);
> + }
> }
>
> /* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. */
> diff --git a/datapath/flow.h b/datapath/flow.h
> index 997692c..b304c60 100644
> --- a/datapath/flow.h
> +++ b/datapath/flow.h
> @@ -18,9 +18,8 @@
> #include <linux/in6.h>
> #include <linux/jiffies.h>
> #include <linux/time.h>
> -
> +#include <linux/flex_array.h>
> #include "openvswitch/datapath-protocol.h"
> -#include "table.h"
>
> struct sk_buff;
>
> @@ -78,21 +77,21 @@ struct sw_flow_key {
> };
> };
>
> -struct sw_flow {
> - struct rcu_head rcu;
> - struct tbl_node tbl_node;
>
> - struct sw_flow_key key;
> +struct sw_flow {
> struct sw_flow_actions __rcu *sf_acts;
> -
> - atomic_t refcnt;
> + struct hlist_node hash_node;
> + struct sw_flow_key key;
> + u32 hash;
> + u8 tcp_flags; /* Union of seen TCP flags. */
> bool dead;
>
> - spinlock_t lock; /* Lock for values below. */
> - unsigned long used; /* Last used time (in jiffies). */
> - u64 packet_count; /* Number of packets matched. */
> - u64 byte_count; /* Number of bytes matched. */
> - u8 tcp_flags; /* Union of seen TCP flags. */
> + spinlock_t lock; /* Lock for values below. */
> + unsigned long used; /* Last used time (in jiffies). */
> + u64 packet_count; /* Number of packets matched. */
> + u64 byte_count; /* Number of bytes matched. */
> + atomic_t refcnt;
> + struct rcu_head rcu;
> };
>
> struct arp_eth_header
> @@ -115,7 +114,6 @@ void flow_exit(void);
>
> struct sw_flow *flow_alloc(void);
> void flow_deferred_free(struct sw_flow *);
> -void flow_free_tbl(struct tbl_node *);
>
> struct sw_flow_actions *flow_actions_alloc(const struct nlattr *);
> void flow_deferred_free_acts(struct sw_flow_actions *);
> @@ -128,9 +126,6 @@ int flow_extract(struct sk_buff *, u16 in_port, struct sw_flow_key *,
> void flow_used(struct sw_flow *, struct sk_buff *);
> u64 flow_used_time(unsigned long flow_jiffies);
>
> -u32 flow_hash(const struct sw_flow_key *, int key_lenp);
> -int flow_cmp(const struct tbl_node *, void *target, int len);
> -
> /* Upper bound on the length of a nlattr-formatted flow key. The longest
> * nlattr-formatted flow key would be:
> *
> @@ -155,9 +150,37 @@ int flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp,
> int flow_metadata_from_nlattrs(u16 *in_port, __be64 *tun_id,
> const struct nlattr *);
>
> -static inline struct sw_flow *flow_cast(const struct tbl_node *node)
> +#define TBL_MIN_BUCKETS 1024
> +#define TBL_MAX_BUCKETS (TBL_MIN_BUCKETS * TBL_MIN_BUCKETS)
> +
> +struct flow_table {
> + struct flex_array *buckets;
> + unsigned int count, n_buckets;
> + struct rcu_head rcu;
> +};
> +
> +static inline int flow_tbl_count(struct flow_table *table)
> {
> - return container_of(node, struct sw_flow, tbl_node);
> + return table->count;
> }
>
> +static inline int flow_tbl_need_to_expand(struct flow_table *table)
> +{
> + return (table->count > table->n_buckets);
> +}
> +
> +struct sw_flow *flow_lookup(struct flow_table *table, struct sw_flow_key *key,
> + int len);
> +void flow_destroy_table(struct flow_table *table);
> +void flow_deferred_destroy_table(struct flow_table *table);
> +struct flow_table *flow_hash_alloc(int new_size);
> +struct flow_table *flow_hash_expand(struct flow_table *table);
> +void flow_insert(struct flow_table *table, struct sw_flow *flow);
> +void flow_remove(struct flow_table *table, struct sw_flow *flow);
> +u32 flow_hash(const struct sw_flow_key *key, int key_len);
> +
> +int flow_tbl_count(struct flow_table *table);
> +int flow_tbl_need_to_expand(struct flow_table *table);
> +struct sw_flow *flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *idx);
> +
> #endif /* flow.h */
> diff --git a/datapath/tunnel.c b/datapath/tunnel.c
> index 67556d7..8121b02 100644
> --- a/datapath/tunnel.c
> +++ b/datapath/tunnel.c
> @@ -13,6 +13,7 @@
> #include <linux/in.h>
> #include <linux/in_route.h>
> #include <linux/jhash.h>
> +#include <linux/list.h>
> #include <linux/kernel.h>
> #include <linux/version.h>
> #include <linux/workqueue.h>
> @@ -31,7 +32,6 @@
> #include "actions.h"
> #include "checksum.h"
> #include "datapath.h"
> -#include "table.h"
> #include "tunnel.h"
> #include "vlan.h"
> #include "vport.h"
> @@ -68,8 +68,10 @@
> #define CACHE_CLEANER_INTERVAL (5 * HZ)
>
> #define CACHE_DATA_ALIGN 16
> +#define PORT_TABLE_SIZE 1024
>
> -static struct tbl __rcu *port_table __read_mostly;
> +static struct hlist_head *port_table __read_mostly;
> +static int port_table_count;
>
> static void cache_cleaner(struct work_struct *work);
> static DECLARE_DELAYED_WORK(cache_cleaner_wq, cache_cleaner);
> @@ -95,11 +97,6 @@ static inline struct vport *tnl_vport_to_vport(const struct tnl_vport *tnl_vport
> return vport_from_priv(tnl_vport);
> }
>
> -static inline struct tnl_vport *tnl_vport_table_cast(const struct tbl_node *node)
> -{
> - return container_of(node, struct tnl_vport, tbl_node);
> -}
> -
> /* This is analogous to rtnl_dereference for the tunnel cache. It checks that
> * cache_lock is held, so it is only for update side code.
> */
> @@ -186,11 +183,9 @@ struct port_lookup_key {
> * Modifies 'target' to store the rcu_dereferenced pointer that was used to do
> * the comparision.
> */
> -static int port_cmp(const struct tbl_node *node, void *target, int unused)
> +static int port_cmp(const struct tnl_vport *tnl_vport,
> + struct port_lookup_key *lookup)
> {
> - const struct tnl_vport *tnl_vport = tnl_vport_table_cast(node);
> - struct port_lookup_key *lookup = target;
> -
> lookup->mutable = rcu_dereference_rtnl(tnl_vport->mutable);
>
> return (lookup->mutable->tunnel_type == lookup->tunnel_type &&
> @@ -218,106 +213,95 @@ static u32 mutable_hash(const struct tnl_mutable_config *mutable)
> return port_hash(&lookup);
> }
>
> -static void check_table_empty(void)
> -{
> - struct tbl *old_table = rtnl_dereference(port_table);
>
> - if (tbl_count(old_table) == 0) {
> - cancel_delayed_work_sync(&cache_cleaner_wq);
> - rcu_assign_pointer(port_table, NULL);
> - tbl_deferred_destroy(old_table, NULL);
> - }
> +static inline struct hlist_head *find_bucket(u32 hash)
> +{
> + return &port_table[(hash & (PORT_TABLE_SIZE - 1))];
> }
>
> -static int add_port(struct vport *vport)
> +static int alloc_port_table(void)
> {
> - struct tbl *cur_table = rtnl_dereference(port_table);
> - struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
> - int err;
> + unsigned int i;
> + static struct hlist_head *head;
>
> - if (!port_table) {
> - struct tbl *new_table;
> + head = kmalloc(PORT_TABLE_SIZE * sizeof(struct hlist_head *),
> + GFP_KERNEL);
> + if (!head)
> + return -ENOMEM;
>
> - new_table = tbl_create(TBL_MIN_BUCKETS);
> - if (!new_table)
> - return -ENOMEM;
> + for (i = 0; i < PORT_TABLE_SIZE; i++)
> + INIT_HLIST_HEAD(&head[i]);
>
> - rcu_assign_pointer(port_table, new_table);
> - schedule_cache_cleaner();
> + smp_wmb();
> + port_table = head;
> + return 0;
> +}
>
> - } else if (tbl_count(cur_table) > tbl_n_buckets(cur_table)) {
> - struct tbl *new_table;
> +static int add_port(struct vport *vport)
> +{
> + struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
> + u32 hash = mutable_hash(rtnl_dereference(tnl_vport->mutable));
> + int err;
>
> - new_table = tbl_expand(cur_table);
> - if (IS_ERR(new_table)) {
> - if (PTR_ERR(new_table) != -ENOSPC)
> - return PTR_ERR(new_table);
> - } else {
> - rcu_assign_pointer(port_table, new_table);
> - tbl_deferred_destroy(cur_table, NULL);
> - }
> + if (unlikely(!port_table)) {
> + err = alloc_port_table();
> + if (err)
> + return err;
> }
> + if (port_table_count == 0)
> + schedule_cache_cleaner();
>
> - err = tbl_insert(rtnl_dereference(port_table), &tnl_vport->tbl_node,
> - mutable_hash(rtnl_dereference(tnl_vport->mutable)));
> - if (err) {
> - check_table_empty();
> - return err;
> - }
> + hlist_add_head_rcu(&tnl_vport->hash_node, find_bucket(hash));
> + port_table_count++;
>
> (*find_port_pool(rtnl_dereference(tnl_vport->mutable)))++;
> -
> return 0;
> }
>
> -static int move_port(struct vport *vport, struct tnl_mutable_config *new_mutable)
> +static void move_port(struct vport *vport,
> + struct tnl_mutable_config *new_mutable)
> {
> - int err;
> - struct tbl *cur_table = rtnl_dereference(port_table);
> struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
> u32 hash;
>
> hash = mutable_hash(new_mutable);
> - if (hash == tnl_vport->tbl_node.hash)
> - goto table_updated;
> -
> - /*
> - * Ideally we should make this move atomic to avoid having gaps in
> - * finding tunnels or the possibility of failure. However, if we do
> - * find a tunnel it will always be consistent.
> - */
> - err = tbl_remove(cur_table, &tnl_vport->tbl_node);
> - if (err)
> - return err;
> + hlist_del_init_rcu(&tnl_vport->hash_node);
> + hlist_add_head_rcu(&tnl_vport->hash_node, find_bucket(hash));
>
> - err = tbl_insert(cur_table, &tnl_vport->tbl_node, hash);
> - if (err) {
> - (*find_port_pool(rtnl_dereference(tnl_vport->mutable)))--;
> - check_table_empty();
> - return err;
> - }
> -
> -table_updated:
> (*find_port_pool(rtnl_dereference(tnl_vport->mutable)))--;
> assign_config_rcu(vport, new_mutable);
> (*find_port_pool(rtnl_dereference(tnl_vport->mutable)))++;
> -
> - return 0;
> }
>
> -static int del_port(struct vport *vport)
> +static void del_port(struct vport *vport)
> {
> struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
> - int err;
>
> - err = tbl_remove(rtnl_dereference(port_table), &tnl_vport->tbl_node);
> - if (err)
> - return err;
> + hlist_del_init_rcu(&tnl_vport->hash_node);
> +
> + port_table_count--;
> + if (port_table_count == 0)
> + cancel_delayed_work_sync(&cache_cleaner_wq);
>
> - check_table_empty();
> (*find_port_pool(rtnl_dereference(tnl_vport->mutable)))--;
> +}
>
> - return 0;
> +struct tnl_vport *port_table_lookup(struct port_lookup_key *lookup)
> +{
> + struct hlist_node *n;
> + struct hlist_head *bucket;
> + u32 hash = port_hash(lookup);
> + struct tnl_vport * tnl_vport;
> +
> + bucket = find_bucket(hash);
> +
> + hlist_for_each_entry_rcu(tnl_vport, n, bucket, hash_node) {
> + if (!port_cmp(tnl_vport, lookup)) {
> + return tnl_vport;
> + }
> + }
> +
> + return NULL;
> }
>
> struct vport *tnl_find_port(__be32 saddr, __be32 daddr, __be64 key,
> @@ -325,10 +309,9 @@ struct vport *tnl_find_port(__be32 saddr, __be32 daddr, __be64 key,
> const struct tnl_mutable_config **mutable)
> {
> struct port_lookup_key lookup;
> - struct tbl *table = rcu_dereference_rtnl(port_table);
> - struct tbl_node *tbl_node;
> + struct tnl_vport * tnl_vport;
>
> - if (unlikely(!table))
> + if (unlikely(!port_table))
> return NULL;
>
> lookup.saddr = saddr;
> @@ -339,18 +322,15 @@ struct vport *tnl_find_port(__be32 saddr, __be32 daddr, __be64 key,
> lookup.tunnel_type = tunnel_type & ~TNL_T_KEY_MATCH;
>
> if (key_local_remote_ports) {
> - tbl_node = tbl_lookup(table, &lookup, sizeof(lookup),
> - port_hash(&lookup), port_cmp);
> - if (tbl_node)
> + tnl_vport = port_table_lookup(&lookup);
> + if (tnl_vport)
> goto found;
> }
>
> if (key_remote_ports) {
> lookup.saddr = 0;
> -
> - tbl_node = tbl_lookup(table, &lookup, sizeof(lookup),
> - port_hash(&lookup), port_cmp);
> - if (tbl_node)
> + tnl_vport = port_table_lookup(&lookup);
> + if (tnl_vport)
> goto found;
>
> lookup.saddr = saddr;
> @@ -362,18 +342,15 @@ struct vport *tnl_find_port(__be32 saddr, __be32 daddr, __be64 key,
> lookup.tunnel_type = tunnel_type & ~TNL_T_KEY_EXACT;
>
> if (local_remote_ports) {
> - tbl_node = tbl_lookup(table, &lookup, sizeof(lookup),
> - port_hash(&lookup), port_cmp);
> - if (tbl_node)
> + tnl_vport = port_table_lookup(&lookup);
> + if (tnl_vport)
> goto found;
> }
>
> if (remote_ports) {
> lookup.saddr = 0;
> -
> - tbl_node = tbl_lookup(table, &lookup, sizeof(lookup),
> - port_hash(&lookup), port_cmp);
> - if (tbl_node)
> + tnl_vport = port_table_lookup(&lookup);
> + if (tnl_vport)
> goto found;
> }
> }
> @@ -382,7 +359,7 @@ struct vport *tnl_find_port(__be32 saddr, __be32 daddr, __be64 key,
>
> found:
> *mutable = lookup.mutable;
> - return tnl_vport_to_vport(tnl_vport_table_cast(tbl_node));
> + return tnl_vport_to_vport(tnl_vport);
> }
>
> static void ecn_decapsulate(struct sk_buff *skb, u8 tos)
> @@ -848,10 +825,10 @@ static inline bool check_cache_valid(const struct tnl_cache *cache,
> (cache->flow && !cache->flow->dead));
> }
>
> -static int cache_cleaner_cb(struct tbl_node *tbl_node, void *aux)
> +static void __cache_cleaner(struct tnl_vport *tnl_vport)
> {
> - struct tnl_vport *tnl_vport = tnl_vport_table_cast(tbl_node);
> - const struct tnl_mutable_config *mutable = rcu_dereference(tnl_vport->mutable);
> + const struct tnl_mutable_config *mutable =
> + rcu_dereference(tnl_vport->mutable);
> const struct tnl_cache *cache = rcu_dereference(tnl_vport->cache);
>
> if (cache && !check_cache_valid(cache, mutable) &&
> @@ -859,16 +836,24 @@ static int cache_cleaner_cb(struct tbl_node *tbl_node, void *aux)
> assign_cache_rcu(tnl_vport_to_vport(tnl_vport), NULL);
> spin_unlock_bh(&tnl_vport->cache_lock);
> }
> -
> - return 0;
> }
>
> static void cache_cleaner(struct work_struct *work)
> {
> + struct hlist_node *n;
> + struct hlist_head *bucket;
> + struct tnl_vport * tnl_vport;
> + int i;
> +
> schedule_cache_cleaner();
>
> rcu_read_lock();
> - tbl_foreach(rcu_dereference(port_table), cache_cleaner_cb, NULL);
> + for (i = 0; i < PORT_TABLE_SIZE; i++) {
> + bucket = &port_table[i];
> + hlist_for_each_entry_rcu(tnl_vport, n, bucket, hash_node) {
> + __cache_cleaner(tnl_vport);
> + }
> + }
> rcu_read_unlock();
> }
>
> @@ -949,12 +934,12 @@ static struct tnl_cache *build_cache(struct vport *vport,
>
> if (is_internal_dev(rt_dst(rt).dev)) {
> struct sw_flow_key flow_key;
> - struct tbl_node *flow_node;
> struct vport *dst_vport;
> struct sk_buff *skb;
> bool is_frag;
> int err;
> int flow_key_len;
> + struct sw_flow *flow;
>
> dst_vport = internal_dev_get_vport(rt_dst(rt).dev);
> if (!dst_vport)
> @@ -974,13 +959,9 @@ static struct tnl_cache *build_cache(struct vport *vport,
> if (err || is_frag)
> goto done;
>
> - flow_node = tbl_lookup(rcu_dereference(dst_vport->dp->table),
> - &flow_key, flow_key_len,
> - flow_hash(&flow_key, flow_key_len),
> - flow_cmp);
> - if (flow_node) {
> - struct sw_flow *flow = flow_cast(flow_node);
> -
> + flow = flow_lookup(rcu_dereference(dst_vport->dp->table),
> + &flow_key, flow_key_len);
> + if (flow) {
> cache->flow = flow;
> flow_hold(flow);
> }
> @@ -1498,9 +1479,8 @@ int tnl_set_options(struct vport *vport, struct nlattr *options)
> if (err)
> goto error_free;
>
> - err = move_port(vport, mutable);
> - if (err)
> - goto error_free;
> + if (mutable_hash(mutable) != mutable_hash(old_mutable))
> + move_port(vport, mutable);
>
> return 0;
>
> @@ -1598,3 +1578,27 @@ void tnl_free_linked_skbs(struct sk_buff *skb)
> skb = next;
> }
> }
> +
> +
> +void tnl_exit()
> +{
> + struct hlist_node *n, *pos;
> + struct hlist_head *hash_head;
> + struct tnl_vport * tnl_vport;
> + int i;
> +
> + if (!port_table)
> + return;
> +
> + cancel_delayed_work_sync(&cache_cleaner_wq);
> + for (i = 0; i < PORT_TABLE_SIZE; i++) {
> + hash_head = &port_table[i];
> + hlist_for_each_entry_safe(tnl_vport, pos, n, hash_head,
> + hash_node) {
> + BUG();
> + goto out;
> + }
> + }
> +out:
> + kfree(port_table);
> +}
> diff --git a/datapath/tunnel.h b/datapath/tunnel.h
> index b488e46..169c408 100644
> --- a/datapath/tunnel.h
> +++ b/datapath/tunnel.h
> @@ -13,7 +13,6 @@
>
> #include "flow.h"
> #include "openvswitch/tunnel.h"
> -#include "table.h"
> #include "vport.h"
>
> /*
> @@ -187,7 +186,7 @@ struct tnl_cache {
>
> struct tnl_vport {
> struct rcu_head rcu;
> - struct tbl_node tbl_node;
> + struct hlist_node hash_node;
>
> char name[IFNAMSIZ];
> const struct tnl_ops *tnl_ops;
> @@ -236,6 +235,7 @@ bool tnl_frag_needed(struct vport *vport,
> struct sk_buff *skb, unsigned int mtu, __be64 flow_key);
> void tnl_free_linked_skbs(struct sk_buff *skb);
>
> +void tnl_exit(void);
> static inline struct tnl_vport *tnl_vport_priv(const struct vport *vport)
> {
> return vport_priv(vport);
> --
> 1.7.1
>
>
More information about the dev
mailing list