[ovs-dev] [PATCH 3/3] datapath: Implement flow table re-hashing.
Pravin B Shelar
pshelar at nicira.com
Fri Dec 2 19:00:48 UTC 2011
This patch creates an issue where dump-flow might return missing or
duplicate flows. I will post follow-up patches to fix it.
--8<--------------------------cut here-------------------------->8--
The following patch introduces a timer-based event to rehash the flow-hash
table. It makes finding collisions difficult for an attacker.
Suggested-by: Herbert Xu <herbert at gondor.apana.org.au>
Signed-off-by: Pravin B Shelar <pshelar at nicira.com>
---
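[Editor's note, not part of the patch: for readers new to seed-based
rehashing, below is a minimal userspace sketch of the idea. A table whose
bucket placement depends on a random seed is periodically rebuilt with a
fresh seed, so precomputed collisions stop lining up. All names here
(toy_table, toy_rehash, and so on) are hypothetical; the kernel code in the
diff uses jhash_1word(), get_random_bytes(), and a timer instead.

/* Illustrative userspace sketch of periodic, seed-based rehashing. */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define N_BUCKETS 16

struct toy_entry {
	unsigned int key;
	struct toy_entry *next;
};

struct toy_table {
	unsigned int seed;              /* mixed into every bucket index */
	struct toy_entry *buckets[N_BUCKETS];
};

static unsigned int toy_hash(unsigned int key, unsigned int seed)
{
	unsigned int h = key ^ seed;    /* toy mix; the patch uses jhash_1word() */

	h ^= h >> 16;
	h *= 0x45d9f3b;
	return h & (N_BUCKETS - 1);
}

static void toy_insert(struct toy_table *t, struct toy_entry *e)
{
	unsigned int b = toy_hash(e->key, t->seed);

	e->next = t->buckets[b];
	t->buckets[b] = e;
}

/* Build a new table with a fresh seed and move every entry across,
 * mirroring what the periodic timer does for the flow table. */
static struct toy_table *toy_rehash(struct toy_table *old)
{
	struct toy_table *new = calloc(1, sizeof(*new));
	int i;

	if (!new)
		return NULL;
	new->seed = (unsigned int)rand();
	for (i = 0; i < N_BUCKETS; i++) {
		struct toy_entry *e = old->buckets[i];

		while (e) {
			struct toy_entry *next = e->next;

			toy_insert(new, e);
			e = next;
		}
	}
	free(old);
	return new;
}

int main(void)
{
	struct toy_table *t = calloc(1, sizeof(*t));
	struct toy_entry *e = malloc(sizeof(*e));
	int i;

	if (!t || !e)
		return 1;
	srand((unsigned int)time(NULL));
	t->seed = (unsigned int)rand();
	e->key = 42;
	toy_insert(t, e);

	/* In the patch this runs from a timer every REBUILD_INTERVAL; here we
	 * just call it a few times to show the bucket moving around. */
	for (i = 0; i < 3; i++) {
		struct toy_table *nt = toy_rehash(t);

		if (!nt)
			break;
		t = nt;
		printf("seed %#x -> key 42 lands in bucket %u\n",
		       t->seed, toy_hash(42, t->seed));
	}
	free(e);
	free(t);
	return 0;
}

In the actual patch, each flow carries two hash_node links (hash_node[2])
and the table records a node_ver, so the old and new tables can temporarily
reference the same flows under RCU while readers drain; the old table is
then freed with keep_flows set so the shared flows survive.]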
datapath/datapath.c | 171 ++++++++++++++++++++++++++++++++++----------------
datapath/datapath.h | 2 +
datapath/flow.c | 84 +++++++++++++++++--------
datapath/flow.h | 7 ++-
4 files changed, 180 insertions(+), 84 deletions(-)
diff --git a/datapath/datapath.c b/datapath/datapath.c
index acbd3bf..ae6b39a 100644
--- a/datapath/datapath.c
+++ b/datapath/datapath.c
@@ -63,6 +63,10 @@
#error Kernels before 2.6.18 or after 3.2 are not supported by this version of Open vSwitch.
#endif
+#define REBUILD_INTERVAL (HZ * 10 * 60)
+
+static struct timer_list ovs_table_rebuild_timer;
+
int (*ovs_dp_ioctl_hook)(struct net_device *dev, struct ifreq *rq, int cmd);
EXPORT_SYMBOL(ovs_dp_ioctl_hook);
@@ -313,7 +317,6 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb)
stats_counter = &stats->n_missed;
goto out;
}
-
OVS_CB(skb)->flow = flow;
}
@@ -493,14 +496,16 @@ static int flush_flows(int dp_ifindex)
if (!dp)
return -ENODEV;
- old_table = genl_dereference(dp->table);
new_table = ovs_flow_tbl_alloc(TBL_MIN_BUCKETS);
if (!new_table)
return -ENOMEM;
+ spin_lock_bh(&dp->table_lock);
+ old_table = rcu_dereference(dp->table);
rcu_assign_pointer(dp->table, new_table);
ovs_flow_tbl_deferred_destroy(old_table);
+ spin_unlock_bh(&dp->table_lock);
return 0;
}
@@ -815,9 +820,12 @@ static struct genl_ops dp_packet_genl_ops[] = {
static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats)
{
int i;
- struct flow_table *table = genl_dereference(dp->table);
+ struct flow_table *table;
+ rcu_read_lock();
+ table = rcu_dereference(dp->table);
stats->n_flows = ovs_flow_tbl_count(table);
+ rcu_read_unlock();
stats->n_hit = stats->n_missed = stats->n_lost = 0;
for_each_possible_cpu(i) {
@@ -949,7 +957,7 @@ static struct sk_buff *ovs_flow_cmd_alloc_info(struct sw_flow *flow)
len += NLMSG_ALIGN(sizeof(struct ovs_header));
- return genlmsg_new(len, GFP_KERNEL);
+ return genlmsg_new(len, GFP_ATOMIC);
}
static struct sk_buff *ovs_flow_cmd_build_info(struct sw_flow *flow,
@@ -977,6 +985,8 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
struct sk_buff *reply;
struct datapath *dp;
struct flow_table *table;
+ struct sw_flow_actions *old_acts;
+ struct nlattr *acts_attrs;
int error;
int key_len;
@@ -1003,16 +1013,27 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
if (!dp)
goto error;
- table = genl_dereference(dp->table);
+ rcu_read_lock();
+ table = rcu_dereference(dp->table);
flow = ovs_flow_tbl_lookup(table, &key, key_len);
+ rcu_read_unlock();
if (!flow) {
struct sw_flow_actions *acts;
+ spin_lock_bh(&dp->table_lock);
+ /* Retry with lock. */
+ table = rcu_dereference(dp->table);
+ flow = ovs_flow_tbl_lookup(table, &key, key_len);
+ if (flow) {
+ spin_unlock_bh(&dp->table_lock);
+ goto found;
+ }
/* Bail out if we're not allowed to create a new flow. */
error = -ENOENT;
- if (info->genlhdr->cmd == OVS_FLOW_CMD_SET)
+ if (info->genlhdr->cmd == OVS_FLOW_CMD_SET) {
+ spin_unlock_bh(&dp->table_lock);
goto error;
-
+ }
/* Expand table, if necessary, to make room. */
if (ovs_flow_tbl_need_to_expand(table)) {
struct flow_table *new_table;
@@ -1021,13 +1042,14 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
if (!IS_ERR(new_table)) {
rcu_assign_pointer(dp->table, new_table);
ovs_flow_tbl_deferred_destroy(table);
- table = genl_dereference(dp->table);
+ table = rcu_dereference(dp->table);
}
}
/* Allocate flow. */
flow = ovs_flow_alloc();
if (IS_ERR(flow)) {
+ spin_unlock_bh(&dp->table_lock);
error = PTR_ERR(flow);
goto error;
}
@@ -1037,63 +1059,65 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
/* Obtain actions. */
acts = ovs_flow_actions_alloc(a[OVS_FLOW_ATTR_ACTIONS]);
error = PTR_ERR(acts);
- if (IS_ERR(acts))
+ if (IS_ERR(acts)) {
+ spin_unlock_bh(&dp->table_lock);
goto error_free_flow;
+ }
rcu_assign_pointer(flow->sf_acts, acts);
/* Put flow in bucket. */
flow->hash = ovs_flow_hash(&key, key_len);
ovs_flow_tbl_insert(table, flow);
+ spin_unlock_bh(&dp->table_lock);
reply = ovs_flow_cmd_build_info(flow, dp, info->snd_pid,
info->snd_seq,
OVS_FLOW_CMD_NEW);
- } else {
- /* We found a matching flow. */
- struct sw_flow_actions *old_acts;
- struct nlattr *acts_attrs;
-
- /* Bail out if we're not allowed to modify an existing flow.
- * We accept NLM_F_CREATE in place of the intended NLM_F_EXCL
- * because Generic Netlink treats the latter as a dump
- * request. We also accept NLM_F_EXCL in case that bug ever
- * gets fixed.
- */
- error = -EEXIST;
- if (info->genlhdr->cmd == OVS_FLOW_CMD_NEW &&
- info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL))
- goto error;
+ goto reply;
+ }
+found:
- /* Update actions. */
- old_acts = rcu_dereference_protected(flow->sf_acts,
- lockdep_genl_is_held());
- acts_attrs = a[OVS_FLOW_ATTR_ACTIONS];
- if (acts_attrs &&
- (old_acts->actions_len != nla_len(acts_attrs) ||
- memcmp(old_acts->actions, nla_data(acts_attrs),
- old_acts->actions_len))) {
- struct sw_flow_actions *new_acts;
-
- new_acts = ovs_flow_actions_alloc(acts_attrs);
- error = PTR_ERR(new_acts);
- if (IS_ERR(new_acts))
- goto error;
+ /* Bail out if we're not allowed to modify an existing flow.
+ * We accept NLM_F_CREATE in place of the intended NLM_F_EXCL
+ * because Generic Netlink treats the latter as a dump
+ * request. We also accept NLM_F_EXCL in case that bug ever
+ * gets fixed.
+ */
+ error = -EEXIST;
+ if (info->genlhdr->cmd == OVS_FLOW_CMD_NEW &&
+ info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL))
+ goto error;
- rcu_assign_pointer(flow->sf_acts, new_acts);
- ovs_flow_deferred_free_acts(old_acts);
- }
+ /* Update actions. */
+ old_acts = rcu_dereference_protected(flow->sf_acts,
+ lockdep_genl_is_held());
+ acts_attrs = a[OVS_FLOW_ATTR_ACTIONS];
+ if (acts_attrs &&
+ (old_acts->actions_len != nla_len(acts_attrs) ||
+ memcmp(old_acts->actions, nla_data(acts_attrs), old_acts->actions_len))) {
+ struct sw_flow_actions *new_acts;
+
+ new_acts = ovs_flow_actions_alloc(acts_attrs);
+ error = PTR_ERR(new_acts);
+ if (IS_ERR(new_acts))
+ goto error;
- reply = ovs_flow_cmd_build_info(flow, dp, info->snd_pid,
- info->snd_seq, OVS_FLOW_CMD_NEW);
+ rcu_assign_pointer(flow->sf_acts, new_acts);
+ ovs_flow_deferred_free_acts(old_acts);
+ }
- /* Clear stats. */
- if (a[OVS_FLOW_ATTR_CLEAR]) {
- spin_lock_bh(&flow->lock);
- clear_stats(flow);
- spin_unlock_bh(&flow->lock);
- }
+ reply = ovs_flow_cmd_build_info(flow, dp, info->snd_pid,
+ info->snd_seq, OVS_FLOW_CMD_NEW);
+
+ /* Clear stats. */
+ if (a[OVS_FLOW_ATTR_CLEAR]) {
+ spin_lock_bh(&flow->lock);
+ clear_stats(flow);
+ spin_unlock_bh(&flow->lock);
}
+
+reply:
if (!IS_ERR(reply))
genl_notify(reply, genl_info_net(info), info->snd_pid,
ovs_dp_flow_multicast_group.id, info->nlhdr,
@@ -1132,8 +1156,10 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
if (!dp)
return -ENODEV;
- table = genl_dereference(dp->table);
+ rcu_read_lock();
+ table = rcu_dereference(dp->table);
flow = ovs_flow_tbl_lookup(table, &key, key_len);
+ rcu_read_unlock();
if (!flow)
return -ENOENT;
@@ -1167,16 +1193,22 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
if (!dp)
return -ENODEV;
- table = genl_dereference(dp->table);
+ spin_lock_bh(&dp->table_lock);
+ table = rcu_dereference(dp->table);
flow = ovs_flow_tbl_lookup(table, &key, key_len);
- if (!flow)
+ if (!flow) {
+ spin_unlock_bh(&dp->table_lock);
return -ENOENT;
+ }
reply = ovs_flow_cmd_alloc_info(flow);
- if (!reply)
+ if (!reply) {
+ spin_unlock_bh(&dp->table_lock);
return -ENOMEM;
+ }
ovs_flow_tbl_remove(table, flow);
+ spin_unlock_bh(&dp->table_lock);
err = ovs_flow_cmd_fill_info(flow, dp, reply, info->snd_pid,
info->snd_seq, 0, OVS_FLOW_CMD_DEL);
@@ -1199,7 +1231,8 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
if (!dp)
return -ENODEV;
- table = genl_dereference(dp->table);
+ rcu_read_lock();
+ table = rcu_dereference(dp->table);
for (;;) {
struct sw_flow *flow;
@@ -1220,6 +1253,7 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
cb->args[0] = bucket;
cb->args[1] = obj;
}
+ rcu_read_unlock();
return skb->len;
}
@@ -1371,6 +1405,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
* /sys/class/net/<devname>/brif later, if sysfs is enabled. */
dp->ifobj.kset = NULL;
kobject_init(&dp->ifobj, &dp_ktype);
+ spin_lock_init(&dp->table_lock);
/* Allocate table. */
err = -ENOMEM;
@@ -1422,7 +1457,7 @@ err_destroy_local_port:
err_destroy_percpu:
free_percpu(dp->stats_percpu);
err_destroy_table:
- ovs_flow_tbl_destroy(genl_dereference(dp->table));
+ ovs_flow_tbl_destroy(rcu_dereference(dp->table));
err_free_dp:
kfree(dp);
err_put_module:
@@ -2039,6 +2074,27 @@ error:
return err;
}
+static void ovs_table_rebuild(unsigned long dummy)
+{
+ struct datapath *dp;
+ unsigned long now = jiffies;
+
+ list_for_each_entry_rcu(dp, &dps, list_node) {
+ struct flow_table *new_table;
+
+ spin_lock(&dp->table_lock);
+ new_table = ovs_flow_tbl_rehash(rcu_dereference(dp->table));
+
+ if (!IS_ERR(new_table)) {
+ ovs_flow_tbl_deferred_destroy(rcu_dereference(dp->table));
+ rcu_assign_pointer(dp->table, new_table);
+ }
+ spin_unlock(&dp->table_lock);
+ }
+
+ mod_timer(&ovs_table_rebuild_timer, now + REBUILD_INTERVAL);
+}
+
static int __init dp_init(void)
{
struct sk_buff *dummy_skb;
@@ -2069,6 +2125,10 @@ static int __init dp_init(void)
if (err < 0)
goto error_unreg_notifier;
+ setup_timer(&ovs_table_rebuild_timer, ovs_table_rebuild, 0);
+ ovs_table_rebuild_timer.expires = jiffies + REBUILD_INTERVAL;
+ add_timer(&ovs_table_rebuild_timer);
+
return 0;
error_unreg_notifier:
@@ -2086,6 +2146,7 @@ error:
static void dp_cleanup(void)
{
rcu_barrier();
+ del_timer(&ovs_table_rebuild_timer);
dp_unregister_genl(ARRAY_SIZE(dp_genl_families));
unregister_netdevice_notifier(&ovs_dp_device_notifier);
ovs_vport_exit();
diff --git a/datapath/datapath.h b/datapath/datapath.h
index 27151b9..76487f6 100644
--- a/datapath/datapath.h
+++ b/datapath/datapath.h
@@ -80,6 +80,8 @@ struct datapath {
/* Flow table. */
struct flow_table __rcu *table;
+ /* Protects table and table updates. */
+ spinlock_t table_lock;
/* Switch ports. */
struct vport __rcu *ports[DP_MAX_PORTS];
diff --git a/datapath/flow.c b/datapath/flow.c
index a357077..9e64458 100644
--- a/datapath/flow.c
+++ b/datapath/flow.c
@@ -47,7 +47,6 @@
#include "vlan.h"
static struct kmem_cache *flow_cache;
-static unsigned int hash_seed __read_mostly;
static int check_header(struct sk_buff *skb, int len)
{
@@ -263,7 +262,7 @@ struct sw_flow_actions *ovs_flow_actions_alloc(const struct nlattr *actions)
if (actions_len > 2 * DP_MAX_PORTS * nla_total_size(4))
return ERR_PTR(-EINVAL);
- sfa = kmalloc(sizeof(*sfa) + actions_len, GFP_KERNEL);
+ sfa = kmalloc(sizeof(*sfa) + actions_len, GFP_ATOMIC);
if (!sfa)
return ERR_PTR(-ENOMEM);
@@ -276,7 +275,7 @@ struct sw_flow *ovs_flow_alloc(void)
{
struct sw_flow *flow;
- flow = kmem_cache_alloc(flow_cache, GFP_KERNEL);
+ flow = kmem_cache_alloc(flow_cache, GFP_ATOMIC);
if (!flow)
return ERR_PTR(-ENOMEM);
@@ -290,7 +289,7 @@ struct sw_flow *ovs_flow_alloc(void)
static struct hlist_head *find_bucket(struct flow_table *table, u32 hash)
{
- hash = jhash_1word(hash, hash_seed);
+ hash = jhash_1word(hash, table->hash_seed);
return flex_array_get(table->buckets,
(hash & (table->n_buckets - 1)));
}
@@ -301,11 +300,11 @@ static struct flex_array *alloc_buckets(unsigned int n_buckets)
int i, err;
buckets = flex_array_alloc(sizeof(struct hlist_head *),
- n_buckets, GFP_KERNEL);
+ n_buckets, GFP_ATOMIC);
if (!buckets)
return NULL;
- err = flex_array_prealloc(buckets, 0, n_buckets, GFP_KERNEL);
+ err = flex_array_prealloc(buckets, 0, n_buckets, GFP_ATOMIC);
if (err) {
flex_array_free(buckets);
return NULL;
@@ -325,7 +324,7 @@ static void free_buckets(struct flex_array *buckets)
struct flow_table *ovs_flow_tbl_alloc(int new_size)
{
- struct flow_table *table = kmalloc(sizeof(*table), GFP_KERNEL);
+ struct flow_table *table = kmalloc(sizeof(*table), GFP_ATOMIC);
if (!table)
return NULL;
@@ -338,6 +337,9 @@ struct flow_table *ovs_flow_tbl_alloc(int new_size)
}
table->n_buckets = new_size;
table->count = 0;
+ table->node_ver = 0;
+ table->keep_flows = 0;
+ get_random_bytes(&table->hash_seed, sizeof(u32));
return table;
}
@@ -355,17 +357,22 @@ void ovs_flow_tbl_destroy(struct flow_table *table)
if (!table)
return;
+ if (table->keep_flows)
+ goto skip_flows;
+
for (i = 0; i < table->n_buckets; i++) {
struct sw_flow *flow;
struct hlist_head *head = flex_array_get(table->buckets, i);
struct hlist_node *node, *n;
+ int ver = table->node_ver;
- hlist_for_each_entry_safe(flow, node, n, head, hash_node) {
- hlist_del_init_rcu(&flow->hash_node);
+ hlist_for_each_entry_safe(flow, node, n, head, hash_node[ver]) {
+ hlist_del_rcu(&flow->hash_node[ver]);
flow_free(flow);
}
}
+skip_flows:
free_buckets(table->buckets);
kfree(table);
}
@@ -390,12 +397,14 @@ struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *la
struct sw_flow *flow;
struct hlist_head *head;
struct hlist_node *n;
+ int ver;
int i;
+ ver = table->node_ver;
while (*bucket < table->n_buckets) {
i = 0;
head = flex_array_get(table->buckets, *bucket);
- hlist_for_each_entry_rcu(flow, n, head, hash_node) {
+ hlist_for_each_entry_rcu(flow, n, head, hash_node[ver]) {
if (i < *last) {
i++;
continue;
@@ -410,32 +419,53 @@ struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *la
return NULL;
}
-struct flow_table *ovs_flow_tbl_expand(struct flow_table *table)
+static void flow_table_copy_flows(struct flow_table *old, struct flow_table *new)
{
- struct flow_table *new_table;
- int n_buckets = table->n_buckets * 2;
+ int old_ver;
int i;
- new_table = ovs_flow_tbl_alloc(n_buckets);
- if (!new_table)
- return ERR_PTR(-ENOMEM);
+ old_ver = old->node_ver;
+ new->node_ver = !old_ver;
- for (i = 0; i < table->n_buckets; i++) {
+ /* Insert in new table. */
+ for (i = 0; i < old->n_buckets; i++) {
struct sw_flow *flow;
struct hlist_head *head;
- struct hlist_node *n, *pos;
+ struct hlist_node *n;
- head = flex_array_get(table->buckets, i);
+ head = flex_array_get(old->buckets, i);
- hlist_for_each_entry_safe(flow, n, pos, head, hash_node) {
- hlist_del_init_rcu(&flow->hash_node);
- ovs_flow_tbl_insert(new_table, flow);
+ hlist_for_each_entry(flow, n, head, hash_node[old_ver]) {
+ ovs_flow_tbl_insert(new, flow);
}
}
+ old->keep_flows = 1;
+}
+
+static struct flow_table *__flow_tbl_rehash(struct flow_table *table, int n_buckets)
+{
+ struct flow_table *new_table;
+
+ new_table = ovs_flow_tbl_alloc(n_buckets);
+ if (!new_table)
+ return ERR_PTR(-ENOMEM);
+
+ flow_table_copy_flows(table, new_table);
return new_table;
}
+struct flow_table *ovs_flow_tbl_rehash(struct flow_table *table)
+{
+ return __flow_tbl_rehash(table, table->n_buckets);
+}
+
+struct flow_table *ovs_flow_tbl_expand(struct flow_table *table)
+{
+ return __flow_tbl_rehash(table, (table->n_buckets * 2));
+}
+
+
/* RCU callback used by ovs_flow_deferred_free. */
static void rcu_free_flow_callback(struct rcu_head *rcu)
{
@@ -828,7 +858,7 @@ struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *table,
hash = ovs_flow_hash(key, key_len);
head = find_bucket(table, hash);
- hlist_for_each_entry_rcu(flow, n, head, hash_node) {
+ hlist_for_each_entry_rcu(flow, n, head, hash_node[table->node_ver]) {
if (flow->hash == hash &&
!memcmp(&flow->key, key, key_len)) {
@@ -843,14 +873,14 @@ void ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow)
struct hlist_head *head;
head = find_bucket(table, flow->hash);
- hlist_add_head_rcu(&flow->hash_node, head);
+ hlist_add_head_rcu(&flow->hash_node[table->node_ver], head);
table->count++;
}
void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow)
{
- if (!hlist_unhashed(&flow->hash_node)) {
- hlist_del_init_rcu(&flow->hash_node);
+ if (!hlist_unhashed(&flow->hash_node[table->node_ver])) {
+ hlist_del_init_rcu(&flow->hash_node[table->node_ver]);
table->count--;
BUG_ON(table->count < 0);
}
@@ -1398,8 +1428,6 @@ int ovs_flow_init(void)
if (flow_cache == NULL)
return -ENOMEM;
- get_random_bytes(&hash_seed, sizeof(hash_seed));
-
return 0;
}
diff --git a/datapath/flow.h b/datapath/flow.h
index 36e738d..16e3310 100644
--- a/datapath/flow.h
+++ b/datapath/flow.h
@@ -96,7 +96,7 @@ struct sw_flow_key {
struct sw_flow {
struct rcu_head rcu;
- struct hlist_node hash_node;
+ struct hlist_node hash_node[2];
u32 hash;
struct sw_flow_key key;
@@ -174,6 +174,9 @@ struct flow_table {
struct flex_array *buckets;
unsigned int count, n_buckets;
struct rcu_head rcu;
+ int node_ver;
+ u32 hash_seed;
+ bool keep_flows;
};
static inline int ovs_flow_tbl_count(struct flow_table *table)
@@ -186,6 +189,8 @@ static inline int ovs_flow_tbl_need_to_expand(struct flow_table *table)
return (table->count > table->n_buckets);
}
+struct flow_table *ovs_flow_tbl_rehash(struct flow_table *table);
+void flow_table_clear_flows(struct flow_table *old);
struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *table,
struct sw_flow_key *key, int len);
void ovs_flow_tbl_destroy(struct flow_table *table);
--
1.7.1