[ovs-dev] [PATCH 2/2] datapath: Add loop detection for RT kernels.

Jesse Gross jesse at nicira.com
Fri Oct 22 23:34:14 UTC 2010


Our normal loop detection is fairly lightweight, but it requires
disabling preemption while packet processing takes place.  On
RT kernels this isn't acceptable and interacts badly with spinlocks,
so we can't use it.  This patch implements a form of pseudo
thread-local storage and uses it instead, since it is capable of
tracking a thread across CPU migrations.  It is more expensive than
the preemption-disabled version, so we continue to use the original
mechanism on non-RT kernels.

Signed-off-by: Jesse Gross <jesse at nicira.com>
---
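
For reviewers, here is a condensed sketch of the caller contract,
adapted from the dp_process_received_packet() hunk below (error and
stats handling are simplified for illustration):

	struct loop_counter *loop;

	loop = loop_get_counter();
	if (unlikely(IS_ERR(loop))) {
		/* Only possible on RT, where the first packet from a
		 * thread allocates its counter entry. */
		kfree_skb(skb);
		return;
	}

	if (unlikely(++loop->count > MAX_LOOPS))
		loop->looping = true;	/* drop rather than recurse deeper */

	/* ... execute actions, possibly re-entering this function ... */

	if (!--loop->count)
		loop->looping = false;
	loop_put_counter(loop);	/* put_cpu_var() on non-RT; refcount drop on RT */

On RT kernels the counter lives in a refcounted hash table entry keyed
on the 'current' task pointer, so it follows the thread across CPU
migrations; idle entries are reclaimed by a per-entry timer after
TASK_TIMEOUT (5 seconds) without use.
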
 datapath/Modules.mk     |    2 +
 datapath/datapath.c     |   50 +++++--------
 datapath/loop_counter.c |  174 +++++++++++++++++++++++++++++++++++++++++++++++
 datapath/loop_counter.h |   30 ++++++++
 4 files changed, 225 insertions(+), 31 deletions(-)
 create mode 100644 datapath/loop_counter.c
 create mode 100644 datapath/loop_counter.h

diff --git a/datapath/Modules.mk b/datapath/Modules.mk
index b632297..cbf65a6 100644
--- a/datapath/Modules.mk
+++ b/datapath/Modules.mk
@@ -16,6 +16,7 @@ openvswitch_sources = \
 	dp_sysfs_dp.c \
 	dp_sysfs_if.c \
 	flow.c \
+	loop_counter.c \
 	table.c \
 	tunnel.c \
 	vport.c \
@@ -32,6 +33,7 @@ openvswitch_headers = \
 	datapath.h \
 	dp_sysfs.h \
 	flow.h \
+	loop_counter.h \
 	odp-compat.h \
 	table.h \
 	tunnel.h \
diff --git a/datapath/datapath.c b/datapath/datapath.c
index b41110d..c87f552 100644
--- a/datapath/datapath.c
+++ b/datapath/datapath.c
@@ -47,6 +47,7 @@
 #include "datapath.h"
 #include "actions.h"
 #include "flow.h"
+#include "loop_counter.h"
 #include "odp-compat.h"
 #include "table.h"
 #include "vport-internal_dev.h"
@@ -69,23 +70,6 @@ EXPORT_SYMBOL(dp_ioctl_hook);
 static struct datapath *dps[ODP_MAX];
 static DEFINE_MUTEX(dp_mutex);
 
-/* We limit the number of times that we pass into dp_process_received_packet()
- * to avoid blowing out the stack in the event that we have a loop. */
-struct loop_counter {
-	int count;		/* Count. */
-	bool looping;		/* Loop detected? */
-};
-
-#define DP_MAX_LOOPS 5
-
-/* We use a separate counter for each CPU for both interrupt and non-interrupt
- * context in order to keep the limit deterministic for a given packet. */
-struct percpu_loop_counters {
-	struct loop_counter counters[2];
-};
-
-static DEFINE_PER_CPU(struct percpu_loop_counters, dp_loop_counters);
-
 static int new_dp_port(struct datapath *, struct odp_port *, int port_no);
 
 /* Must be called with rcu_read_lock or dp_mutex. */
@@ -526,14 +510,6 @@ out:
 	return err;
 }
 
-static void suppress_loop(struct datapath *dp, struct sw_flow_actions *actions)
-{
-	if (net_ratelimit())
-		pr_warn("%s: flow looped %d times, dropping\n",
-			dp_name(dp), DP_MAX_LOOPS);
-	actions->n_actions = 0;
-}
-
 /* Must be called with rcu_read_lock. */
 void dp_process_received_packet(struct dp_port *p, struct sk_buff *skb)
 {
@@ -581,11 +557,16 @@ void dp_process_received_packet(struct dp_port *p, struct sk_buff *skb)
 	acts = rcu_dereference(OVS_CB(skb)->flow->sf_acts);
 
 	/* Check whether we've looped too much. */
-	loop = &get_cpu_var(dp_loop_counters).counters[!!in_interrupt()];
-	if (unlikely(++loop->count > DP_MAX_LOOPS))
+	loop = loop_get_counter();
+	if (unlikely(IS_ERR(loop))) {
+		kfree_skb(skb);
+		stats_counter_off = offsetof(struct dp_stats_percpu, n_lost);
+		goto out;
+	}
+	if (unlikely(++loop->count > MAX_LOOPS))
 		loop->looping = true;
 	if (unlikely(loop->looping)) {
-		suppress_loop(dp, acts);
+		loop_suppress(dp, acts);
 		goto out_loop;
 	}
 
@@ -596,13 +577,13 @@ void dp_process_received_packet(struct dp_port *p, struct sk_buff *skb)
 
 	/* Check whether sub-actions looped too much. */
 	if (unlikely(loop->looping))
-		suppress_loop(dp, acts);
+		loop_suppress(dp, acts);
 
 out_loop:
 	/* Decrement loop counter. */
 	if (!--loop->count)
 		loop->looping = false;
-	put_cpu_var(dp_loop_counters);
+	loop_put_counter(loop);
 
 out:
 	/* Update datapath statistics. */
@@ -2234,10 +2215,14 @@ static int __init dp_init(void)
 	if (err)
 		goto error_flow_exit;
 
-	err = register_netdevice_notifier(&dp_device_notifier);
+	err = loop_init();
 	if (err)
 		goto error_vport_exit;
 
+	err = register_netdevice_notifier(&dp_device_notifier);
+	if (err)
+		goto error_loop_exit;
+
 	major = register_chrdev(0, "openvswitch", &openvswitch_fops);
 	if (err < 0)
 		goto error_unreg_notifier;
@@ -2246,6 +2231,8 @@ static int __init dp_init(void)
 
 error_unreg_notifier:
 	unregister_netdevice_notifier(&dp_device_notifier);
+error_loop_exit:
+	loop_exit();
 error_vport_exit:
 	vport_exit();
 error_flow_exit:
@@ -2259,6 +2246,7 @@ static void dp_cleanup(void)
 	rcu_barrier();
 	unregister_chrdev(major, "openvswitch");
 	unregister_netdevice_notifier(&dp_device_notifier);
+	loop_exit();
 	vport_exit();
 	flow_exit();
 }
diff --git a/datapath/loop_counter.c b/datapath/loop_counter.c
new file mode 100644
index 0000000..bbae7d1
--- /dev/null
+++ b/datapath/loop_counter.c
@@ -0,0 +1,174 @@
+/*
+ * Distributed under the terms of the GNU GPL version 2.
+ * Copyright (c) 2010 Nicira Networks.
+ *
+ * Significant portions of this file may be copied from parts of the Linux
+ * kernel, by Linus Torvalds and others.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/hardirq.h>
+#include <linux/jhash.h>
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/rculist.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+
+#include "loop_counter.h"
+
+void loop_suppress(struct datapath *dp, struct sw_flow_actions *actions)
+{
+	if (net_ratelimit())
+		pr_warn("%s: flow looped %d times, dropping\n",
+			dp_name(dp), MAX_LOOPS);
+	actions->n_actions = 0;
+}
+
+#ifndef CONFIG_PREEMPT_RT
+/* We use a separate counter for each CPU for both interrupt and non-interrupt
+ * context in order to keep the limit deterministic for a given packet. */
+struct percpu_loop_counters {
+	struct loop_counter counters[2];
+};
+static DEFINE_PER_CPU(struct percpu_loop_counters, loop_counters);
+
+int loop_init(void)
+{
+	return 0;
+}
+
+void loop_exit(void) { }
+
+struct loop_counter *loop_get_counter(void)
+{
+	return &get_cpu_var(loop_counters).counters[!!in_interrupt()];
+}
+
+void loop_put_counter(struct loop_counter *counter)
+{
+	put_cpu_var(loop_counters);
+}
+
+#else /* !CONFIG_PREEMPT_RT */
+
+struct task_entry {
+	struct loop_counter counter ____cacheline_aligned_in_smp;
+
+	atomic_t refcnt;
+	unsigned long last_used;
+	struct timer_list timer;
+	struct rcu_head rcu;
+
+	struct hlist_node node;
+	unsigned long task;
+};
+
+#define TASK_TIMEOUT (5 * HZ)
+#define TASK_HASH_BUCKETS NR_CPUS
+static struct hlist_head *task_hash;
+static DEFINE_SPINLOCK(task_hash_lock);
+
+int loop_init(void)
+{
+	task_hash = kzalloc(sizeof(struct hlist_head) * TASK_HASH_BUCKETS,
+				GFP_KERNEL);
+	if (!task_hash)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void loop_exit(void)
+{
+	kfree(task_hash);
+}
+
+static void free_task_rcu(struct rcu_head *rcu)
+{
+	struct task_entry *task = container_of(rcu, struct task_entry, rcu);
+	kfree(task);
+}
+
+static void task_put(struct task_entry *task)
+{
+	if (atomic_dec_and_test(&task->refcnt)) {
+		spin_lock(&task_hash_lock);
+		hlist_del_rcu(&task->node);
+		spin_unlock(&task_hash_lock);
+
+		call_rcu(&task->rcu, free_task_rcu);
+	}
+}
+
+static void task_expire(unsigned long task_)
+{
+	struct task_entry *task = (struct task_entry *)task_;
+
+	if (time_before(jiffies, task->last_used + TASK_TIMEOUT))
+		mod_timer(&task->timer, jiffies + TASK_TIMEOUT);
+	else
+		task_put(task);
+}
+
+static inline struct task_entry *find_task(unsigned long cur_task, u32 hash)
+{
+	struct task_entry *task;
+	struct hlist_node *node;
+
+	hlist_for_each_entry_rcu(task, node, &task_hash[hash], node)
+		if (likely(cur_task == task->task &&
+		    atomic_inc_not_zero(&task->refcnt))) {
+			task->last_used = jiffies;
+			return task;
+		}
+
+	return NULL;
+}
+
+/* Caller must hold rcu_read_lock. */
+struct loop_counter *loop_get_counter(void)
+{
+	unsigned long cur_task = (unsigned long)current;
+	u32 hash = jhash_1word(cur_task, 0) % TASK_HASH_BUCKETS;
+	struct task_entry *task;
+
+	WARN_ON(in_interrupt());
+
+	task = find_task(cur_task, hash);
+	if (likely(task))
+		return &task->counter;
+
+	/* If we didn't find an entry, allocate and add a new one.  No one else
+	 * can be trying to add the same entry at this time since this is
+	 * thread local.
+	 */
+
+	task = kzalloc(sizeof(struct task_entry), GFP_ATOMIC);
+	if (!task)
+		return ERR_PTR(-ENOMEM);
+
+	task->task = cur_task;
+	task->last_used = jiffies;
+	atomic_set(&task->refcnt, 2);	/* One for us, one for the timer. */
+
+	spin_lock(&task_hash_lock);
+	hlist_add_head_rcu(&task->node, &task_hash[hash]);
+	spin_unlock(&task_hash_lock);
+
+	setup_timer(&task->timer, task_expire, (unsigned long)task);
+	mod_timer(&task->timer, jiffies + TASK_TIMEOUT);
+
+	return &task->counter;
+}
+
+void loop_put_counter(struct loop_counter *counter)
+{
+	struct task_entry *task = container_of(counter, struct task_entry,
+					       counter);
+	task_put(task);
+}
+
+#endif /* CONFIG_PREEMPT_RT */
diff --git a/datapath/loop_counter.h b/datapath/loop_counter.h
new file mode 100644
index 0000000..3dd855f
--- /dev/null
+++ b/datapath/loop_counter.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2010 Nicira Networks.
+ * Distributed under the terms of the GNU GPL version 2.
+ *
+ * Significant portions of this file may be copied from parts of the Linux
+ * kernel, by Linus Torvalds and others.
+ */
+
+#ifndef LOOP_COUNTER_H
+#define LOOP_COUNTER_H 1
+
+#include "datapath.h"
+#include "flow.h"
+
+/* We limit the number of times that we pass into dp_process_received_packet()
+ * to avoid blowing out the stack in the event that we have a loop. */
+#define MAX_LOOPS 5
+
+struct loop_counter {
+	int count;		/* Count. */
+	bool looping;		/* Loop detected? */
+};
+
+int loop_init(void);
+void loop_exit(void);
+struct loop_counter *loop_get_counter(void);
+void loop_put_counter(struct loop_counter *counter);
+void loop_suppress(struct datapath *, struct sw_flow_actions *);
+
+#endif /* LOOP_COUNTER_H */
-- 
1.7.1