[ovs-dev] [PATCH 2/6] compat: Simplify inet_fragment backports.

Joe Stringer joe at ovn.org
Tue Jul 12 22:26:19 UTC 2016


The core fragmentation handling logic is exported on all supported
kernels, so it's not necessary to backport the latest version of it.
This greatly simplifies the code, which previously had to bridge the
inconsistencies between the old per-lookup garbage collection and the
newer workqueue-based garbage collection.

As a result of simplifying and removing unnecessary backport code, a
few corner-case bugs are fixed, such as fragments remaining in the
fragment cache when the openvswitch module is unloaded.

Due to this, some backported IP functions need a little more logic
than what is seen in the latest code; for instance, on kernels <3.17
they must (as sketched below):
* Call inet_frag_evictor() before defrag
* Limit hashsize in ip{,6}_fragment logic
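
In sketch form, mirroring the ip_fragment.c hunks below (the IPv6 path
in nf_conntrack_reasm.c is analogous):

/* Trim the fragment cache back under its memory threshold before
 * defragmenting.  Only needed on kernels that still export
 * inet_frag_evictor(); newer kernels evict from a workqueue. */
static void ip_evictor(struct net *net)
{
#ifdef HAVE_INET_FRAG_EVICTOR
	int evicted;

	evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags, false);
	if (evicted)
		IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted);
#endif
}

/* Older kernels also expect callers to mask the hash to the table
 * size themselves; fb3cfe6e75b9 moved this into inet_fragment. */
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,17,0)
#define ipqhashfn(a, b, c, d) (ipqhashfn(a, b, c, d) & (INETFRAGS_HASHSZ - 1))
#endif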

The pernet init/exit logic also differs a little from upstream. Upstream
ipv[46]_defrag logic initializes the various pernet fragment parameters
and its own global fragments cache. In the OVS backport, the pernet
parameters are shared while the fragments cache is separate. The
backport relies upon upstream pernet initialization to perform the
shared setup, and performs no pernet initialization of its own. When it
comes to pernet exit, however, the backport must ensure that all
OVS-specific fragment state is cleared while the shared state remains
untouched, so that the regular ipv[46] logic may do its own cleanup.
In practice this means that OVS must carry its own divergent
implementation of inet_frags_exit_net(), sketched below.
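
A simplified sketch of that exit path (the full version in
inet_fragment.c below also retries when the hash rebuild seqlock
changes, and on pre-workqueue kernels calls inet_frag_evictor()
instead of walking the buckets):

void rpl_inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
{
	int thresh = nf->low_thresh;
	int i;

	/* Make every OVS-owned queue look evictable... */
	nf->low_thresh = 0;

	local_bh_disable();
	for (i = 0; i < INETFRAGS_HASHSZ; i++)
		inet_evict_bucket(f, &f->hash[i]);
	local_bh_enable();

	/* ...then restore the shared pernet threshold so the regular
	 * ipv[46] exit logic can do its own cleanup. */
	nf->low_thresh = thresh;
}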

Fixes the following crash:

Call Trace:
 <IRQ>
 [<ffffffff810744f6>] ? call_timer_fn+0x36/0x100
 [<ffffffff8107548f>] run_timer_softirq+0x1ef/0x2f0
 [<ffffffff8106cccc>] __do_softirq+0xec/0x2c0
 [<ffffffff8106d215>] irq_exit+0x105/0x110
 [<ffffffff81737095>] smp_apic_timer_interrupt+0x45/0x60
 [<ffffffff81735a1d>] apic_timer_interrupt+0x6d/0x80
 <EOI>
 [<ffffffff8104f596>] ? native_safe_halt+0x6/0x10
 [<ffffffff8101cb2f>] default_idle+0x1f/0xc0
 [<ffffffff8101d406>] arch_cpu_idle+0x26/0x30
 [<ffffffff810bf3a5>] cpu_startup_entry+0xc5/0x290
 [<ffffffff810415ed>] start_secondary+0x21d/0x2d0
Code:  Bad RIP value.
RIP  [<ffffffffa0177480>] 0xffffffffa0177480
 RSP <ffff88003f703e78>
CR2: ffffffffa0177480
---[ end trace eb98ca80ba07bd9c ]---
Kernel panic - not syncing: Fatal exception in interrupt

Signed-off-by: Joe Stringer <joe at ovn.org>
---
I've tested this on CentOS kernel 3.10.0-327 and Ubuntu kernels
3.13.0-68, 3.16.0-70, 3.19.0-58, and 4.2.0-35. Some earlier kernel
versions may still trigger fragmentation-related crashes due to upstream
bugs, for instance on Ubuntu's 3.13.0-24.
---
 acinclude.m4                                  |   1 +
 datapath/linux/compat/include/net/inet_frag.h |  58 ++-
 datapath/linux/compat/inet_fragment.c         | 486 ++------------------------
 datapath/linux/compat/ip_fragment.c           |  36 +-
 datapath/linux/compat/nf_conntrack_reasm.c    |  17 +
 5 files changed, 91 insertions(+), 507 deletions(-)

diff --git a/acinclude.m4 b/acinclude.m4
index bb0d90a77d60..fc2595506152 100644
--- a/acinclude.m4
+++ b/acinclude.m4
@@ -424,6 +424,7 @@ AC_DEFUN([OVS_CHECK_LINUX_COMPAT], [
   OVS_GREP_IFELSE([$KSRC/include/net/inet_frag.h], [last_in],
                   [OVS_DEFINE([HAVE_INET_FRAGS_LAST_IN])])
   OVS_GREP_IFELSE([$KSRC/include/net/inet_frag.h], [inet_frag_evicting])
+  OVS_GREP_IFELSE([$KSRC/include/net/inet_frag.h], [inet_frag_evictor])
   OVS_FIND_FIELD_IFELSE([$KSRC/include/net/inet_frag.h], [inet_frags],
                         [frags_work])
   OVS_FIND_FIELD_IFELSE([$KSRC/include/net/inet_frag.h], [inet_frags],
diff --git a/datapath/linux/compat/include/net/inet_frag.h b/datapath/linux/compat/include/net/inet_frag.h
index aa9a019c6fcc..49c1bceb695a 100644
--- a/datapath/linux/compat/include/net/inet_frag.h
+++ b/datapath/linux/compat/include/net/inet_frag.h
@@ -21,43 +21,18 @@
 #define qp_flags(qp) (qp->q.flags)
 #endif
 
-#ifndef HAVE_INET_FRAG_QUEUE_WITH_LIST_EVICTOR
-/**
- * struct ovs_inet_frag_queue - fragment queue
- *
- * Wrap the system inet_frag_queue to provide a list evictor.
- *
- * @list_evictor: list of queues to forcefully evict (e.g. due to low memory)
- */
-struct ovs_inet_frag_queue {
-	struct inet_frag_queue	fq;
-	struct hlist_node	list_evictor;
-};
-
-static inline bool rpl_inet_frag_evicting(struct inet_frag_queue *q)
-{
-#ifdef HAVE_INET_FRAGS_WITH_FRAGS_WORK
-	struct ovs_inet_frag_queue *ofq = (struct ovs_inet_frag_queue *)q;
-	return !hlist_unhashed(&ofq->list_evictor);
-#else
-	return (q_flags(q) & INET_FRAG_FIRST_IN) && q->fragments != NULL;
-#endif
-}
-#define inet_frag_evicting rpl_inet_frag_evicting
-#else /* HAVE_INET_FRAG_QUEUE_WITH_LIST_EVICTOR */
 #ifndef HAVE_INET_FRAG_EVICTING
-static inline bool rpl_inet_frag_evicting(struct inet_frag_queue *q)
+static inline bool inet_frag_evicting(struct inet_frag_queue *q)
 {
+#ifdef HAVE_INET_FRAG_QUEUE_WITH_LIST_EVICTOR
 	return !hlist_unhashed(&q->list_evictor);
+#else
+	return (q_flags(q) & INET_FRAG_FIRST_IN) && q->fragments != NULL;
+#endif /* HAVE_INET_FRAG_QUEUE_WITH_LIST_EVICTOR */
 }
-#define inet_frag_evicting rpl_inet_frag_evicting
-#endif
-#endif
+#endif /* HAVE_INET_FRAG_EVICTING */
 
 #ifndef HAVE_CORRECT_MRU_HANDLING
-static unsigned int rpl_frag_percpu_counter_batch = 130000;
-#define frag_percpu_counter_batch rpl_frag_percpu_counter_batch
-
 static inline void rpl_sub_frag_mem_limit(struct netns_frags *nf, int i)
 {
 	__percpu_counter_add(&nf->mem, -i, frag_percpu_counter_batch);
@@ -70,14 +45,29 @@ static inline void rpl_add_frag_mem_limit(struct netns_frags *nf, int i)
 }
 #define add_frag_mem_limit rpl_add_frag_mem_limit
 
-int rpl_inet_frags_init(struct inet_frags *f);
+static inline int rpl_inet_frags_init(struct inet_frags *frags)
+{
+	inet_frags_init(frags);
+	return 0;
+}
 #define inet_frags_init rpl_inet_frags_init
 
+/* We reuse the upstream inet_fragment.c common code for managing fragment
+ * stores. However, we actually store the fragments within our own 'inet_frags'
+ * structures (in {ip_fragment,nf_conntrack_reasm}.c). When unloading the OVS
+ * kernel module, we need to flush all of the remaining fragments from these
+ * caches, or else we will panic with the following sequence of events:
+ *
+ * 1) A fragment for a packet arrives and is cached in inet_frags. This
+ *    starts a timer to ensure the fragment does not hang around forever.
+ * 2) openvswitch module is unloaded.
+ * 3) The timer for the fragment fires, calling into backported OVS code
+ *    to free the fragment.
+ * 4) BUG: unable to handle kernel paging request at ffffffffc03c01e0
+ */
 void rpl_inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f);
 #define inet_frags_exit_net rpl_inet_frags_exit_net
 
-void rpl_inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f);
-#define inet_frag_destroy(q, f, work) rpl_inet_frag_destroy(q, f)
 #endif /* !HAVE_CORRECT_MRU_HANDLING */
 
 #endif /* inet_frag.h */
diff --git a/datapath/linux/compat/inet_fragment.c b/datapath/linux/compat/inet_fragment.c
index 4479450c3737..f05e6177bfb3 100644
--- a/datapath/linux/compat/inet_fragment.c
+++ b/datapath/linux/compat/inet_fragment.c
@@ -11,8 +11,6 @@
  *				ipv6/reassembly. and ipv6 nf conntrack reassembly
  */
 
-#include <linux/version.h>
-
 #ifndef HAVE_CORRECT_MRU_HANDLING
 
 #include <linux/list.h>
@@ -29,98 +27,7 @@
 #include <net/inet_frag.h>
 #include <net/inet_ecn.h>
 
-#define INETFRAGS_EVICT_BUCKETS   128
-#define INETFRAGS_EVICT_MAX	  512
-
-/* don't rebuild inetfrag table with new secret more often than this */
-#define INETFRAGS_MIN_REBUILD_INTERVAL (5 * HZ)
-
-/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
- * Value : 0xff if frame should be dropped.
- *         0 or INET_ECN_CE value, to be ORed in to final iph->tos field
- */
-const u8 ip_frag_ecn_table[16] = {
-	/* at least one fragment had CE, and others ECT_0 or ECT_1 */
-	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0]			= INET_ECN_CE,
-	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1]			= INET_ECN_CE,
-	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1]	= INET_ECN_CE,
-
-	/* invalid combinations : drop frame */
-	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
-	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
-	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
-	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
-	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
-	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
-	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
-};
-
-static unsigned int
-inet_frag_hashfn(const struct inet_frags *f, struct inet_frag_queue *q)
-{
-	return f->hashfn(q) & (INETFRAGS_HASHSZ - 1);
-}
-
 #ifdef HAVE_INET_FRAGS_WITH_FRAGS_WORK
-static bool inet_frag_may_rebuild(struct inet_frags *f)
-{
-	return time_after(jiffies,
-	       f->last_rebuild_jiffies + INETFRAGS_MIN_REBUILD_INTERVAL);
-}
-
-static void inet_frag_secret_rebuild(struct inet_frags *f)
-{
-	int i;
-
-	write_seqlock_bh(&f->rnd_seqlock);
-
-	if (!inet_frag_may_rebuild(f))
-		goto out;
-
-	get_random_bytes(&f->rnd, sizeof(u32));
-
-	for (i = 0; i < INETFRAGS_HASHSZ; i++) {
-		struct inet_frag_bucket *hb;
-		struct inet_frag_queue *q;
-		struct hlist_node *n;
-
-		hb = &f->hash[i];
-		spin_lock(&hb->chain_lock);
-
-		hlist_for_each_entry_safe(q, n, &hb->chain, list) {
-			unsigned int hval = inet_frag_hashfn(f, q);
-
-			if (hval != i) {
-				struct inet_frag_bucket *hb_dest;
-
-				hlist_del(&q->list);
-
-				/* Relink to new hash chain. */
-				hb_dest = &f->hash[hval];
-
-				/* This is the only place where we take
-				 * another chain_lock while already holding
-				 * one.  As this will not run concurrently,
-				 * we cannot deadlock on hb_dest lock below, if its
-				 * already locked it will be released soon since
-				 * other caller cannot be waiting for hb lock
-				 * that we've taken above.
-				 */
-				spin_lock_nested(&hb_dest->chain_lock,
-						 SINGLE_DEPTH_NESTING);
-				hlist_add_head(&q->list, &hb_dest->chain);
-				spin_unlock(&hb_dest->chain_lock);
-			}
-		}
-		spin_unlock(&hb->chain_lock);
-	}
-
-	f->rebuild = false;
-	f->last_rebuild_jiffies = jiffies;
-out:
-	write_sequnlock_bh(&f->rnd_seqlock);
-}
-
 static bool inet_fragq_should_evict(const struct inet_frag_queue *q)
 {
 	return q->net->low_thresh == 0 ||
@@ -130,9 +37,6 @@ static bool inet_fragq_should_evict(const struct inet_frag_queue *q)
 static unsigned int
 inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
 {
-#ifndef HAVE_INET_FRAG_QUEUE_WITH_LIST_EVICTOR
-	struct ovs_inet_frag_queue *ofq;
-#endif
 	struct inet_frag_queue *fq;
 	struct hlist_node *n;
 	unsigned int evicted = 0;
@@ -150,8 +54,8 @@ inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
 #ifdef HAVE_INET_FRAG_QUEUE_WITH_LIST_EVICTOR
 		hlist_add_head(&fq->list_evictor, &expired);
 #else
-		ofq = (struct ovs_inet_frag_queue *)fq;
-		hlist_add_head(&ofq->list_evictor, &expired);
+		hlist_del(&fq->list);
+		hlist_add_head(&fq->list, &expired);
 #endif
 		++evicted;
 	}
@@ -160,99 +64,28 @@ inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
 
 #ifdef HAVE_INET_FRAG_QUEUE_WITH_LIST_EVICTOR
 	hlist_for_each_entry_safe(fq, n, &expired, list_evictor)
-		f->frag_expire((unsigned long) fq);
 #else
-	hlist_for_each_entry_safe(ofq, n, &expired, list_evictor)
-		f->frag_expire((unsigned long) &ofq->fq);
+	hlist_for_each_entry_safe(fq, n, &expired, list)
 #endif
+		f->frag_expire((unsigned long) fq);
 
 	return evicted;
 }
 
-static void inet_frag_worker(struct work_struct *work)
-{
-	unsigned int budget = INETFRAGS_EVICT_BUCKETS;
-	unsigned int i, evicted = 0;
-	struct inet_frags *f;
-
-	f = container_of(work, struct inet_frags, frags_work);
-
-	BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ);
-
-	local_bh_disable();
-
-	for (i = ACCESS_ONCE(f->next_bucket); budget; --budget) {
-		evicted += inet_evict_bucket(f, &f->hash[i]);
-		i = (i + 1) & (INETFRAGS_HASHSZ - 1);
-		if (evicted > INETFRAGS_EVICT_MAX)
-			break;
-	}
-
-	f->next_bucket = i;
-
-	local_bh_enable();
-
-	if (f->rebuild && inet_frag_may_rebuild(f))
-		inet_frag_secret_rebuild(f);
-}
-
-static void inet_frag_schedule_worker(struct inet_frags *f)
-{
-	if (unlikely(!work_pending(&f->frags_work)))
-		schedule_work(&f->frags_work);
-}
-#endif /* >= 3.17 */
-
-int inet_frags_init(struct inet_frags *f)
-{
-	int i;
-
-#ifdef HAVE_INET_FRAGS_WITH_FRAGS_WORK
-	INIT_WORK(&f->frags_work, inet_frag_worker);
-#endif
-
-	for (i = 0; i < INETFRAGS_HASHSZ; i++) {
-		struct inet_frag_bucket *hb = &f->hash[i];
-
-		spin_lock_init(&hb->chain_lock);
-		INIT_HLIST_HEAD(&hb->chain);
-	}
-
-#ifdef HAVE_INET_FRAGS_WITH_FRAGS_WORK
-	seqlock_init(&f->rnd_seqlock);
-	f->last_rebuild_jiffies = 0;
-	f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0,
-					    NULL);
-	if (!f->frags_cachep)
-		return -ENOMEM;
-#else
-	rwlock_init(&f->lock);
-	f->secret_timer.expires = jiffies + f->secret_interval;
-#endif
-
-	return 0;
-}
-
-void inet_frags_fini(struct inet_frags *f)
-{
-#ifdef HAVE_INET_FRAGS_WITH_FRAGS_WORK
-	cancel_work_sync(&f->frags_work);
-	kmem_cache_destroy(f->frags_cachep);
-#endif
-}
-
-int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force);
-
-#ifdef HAVE_INET_FRAGS_WITH_FRAGS_WORK
 void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
 {
+	int thresh = nf->low_thresh;
 	unsigned int seq;
+	int i;
+
+	nf->low_thresh = 0;
 
 evict_again:
 	local_bh_disable();
 	seq = read_seqbegin(&f->rnd_seqlock);
 
-	inet_frag_evictor(nf, f, true);
+	for (i = 0; i < INETFRAGS_HASHSZ ; i++)
+		inet_evict_bucket(f, &f->hash[i]);
 
 	local_bh_enable();
 	cond_resched();
@@ -260,301 +93,22 @@ evict_again:
 	if (read_seqretry(&f->rnd_seqlock, seq) ||
 	    percpu_counter_sum(&nf->mem))
 		goto evict_again;
-}
-#else
-void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
-{
-	read_lock_bh(&f->lock);
-	inet_frag_evictor(nf, f, true);
-	read_unlock_bh(&f->lock);
-}
-#endif
-
-static struct inet_frag_bucket *
-get_frag_bucket_locked(struct inet_frag_queue *fq, struct inet_frags *f)
-#ifdef HAVE_INET_FRAGS_WITH_RWLOCK
-__acquires(f->lock)
-#endif
-__acquires(hb->chain_lock)
-{
-	struct inet_frag_bucket *hb;
-	unsigned int hash;
-
-#ifdef HAVE_INET_FRAGS_WITH_RWLOCK
-	read_lock(&f->lock);
-#else
-	unsigned int seq;
- restart:
-	seq = read_seqbegin(&f->rnd_seqlock);
-#endif
-
-	hash = inet_frag_hashfn(f, fq);
-	hb = &f->hash[hash];
-
-	spin_lock(&hb->chain_lock);
-
-#ifndef HAVE_INET_FRAGS_WITH_RWLOCK
-	if (read_seqretry(&f->rnd_seqlock, seq)) {
-		spin_unlock(&hb->chain_lock);
-		goto restart;
-	}
-#endif
 
-	return hb;
+	nf->low_thresh = thresh;
 }
-
-static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
-#ifdef HAVE_INET_FRAGS_WITH_RWLOCK
-__releases(f->lock)
-#endif
-__releases(hb->chain_lock)
-{
-	struct inet_frag_bucket *hb;
-
-	hb = get_frag_bucket_locked(fq, f);
-	hlist_del(&fq->list);
-	q_flags(fq) |= INET_FRAG_COMPLETE;
-	spin_unlock(&hb->chain_lock);
-
-#ifdef HAVE_INET_FRAGS_WITH_RWLOCK
-	read_unlock(&f->lock);
-#endif
-}
-
-void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
-{
-	if (del_timer(&fq->timer))
-		atomic_dec(&fq->refcnt);
-
-	if (!(q_flags(fq) & INET_FRAG_COMPLETE)) {
-		fq_unlink(fq, f);
-		atomic_dec(&fq->refcnt);
-	}
-}
-
-static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f,
-				  struct sk_buff *skb)
-{
-	if (f->skb_free)
-		f->skb_free(skb);
-	kfree_skb(skb);
-}
-
-void rpl_inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f)
-{
-	struct sk_buff *fp;
-	struct netns_frags *nf;
-	unsigned int sum, sum_truesize = 0;
-
-	WARN_ON(!(q_flags(q) & INET_FRAG_COMPLETE));
-	WARN_ON(del_timer(&q->timer) != 0);
-
-	/* Release all fragment data. */
-	fp = q->fragments;
-	nf = q->net;
-	while (fp) {
-		struct sk_buff *xp = fp->next;
-
-		sum_truesize += fp->truesize;
-		frag_kfree_skb(nf, f, fp);
-		fp = xp;
-	}
-	sum = sum_truesize + f->qsize;
-
-	if (f->destructor)
-		f->destructor(q);
-#ifdef HAVE_INET_FRAGS_WITH_FRAGS_WORK
-	kmem_cache_free(f->frags_cachep, q);
-#else
-	kfree(q);
-#endif
-
-	sub_frag_mem_limit(nf, sum);
-}
-
-int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force)
-{
-#ifdef HAVE_INET_FRAGS_WITH_FRAGS_WORK
-	int i;
-
-	for (i = 0; i < INETFRAGS_HASHSZ ; i++)
-		inet_evict_bucket(f, &f->hash[i]);
-
-	return 0;
-#else
-	struct inet_frag_queue *q;
-	int work, evicted = 0;
-
-	work = frag_mem_limit(nf) - nf->low_thresh;
-	while (work > 0 || force) {
-		spin_lock(&nf->lru_lock);
-
-		if (list_empty(&nf->lru_list)) {
-			spin_unlock(&nf->lru_lock);
-			break;
-		}
-
-		q = list_first_entry(&nf->lru_list,
-				     struct inet_frag_queue, lru_list);
-		atomic_inc(&q->refcnt);
-		/* Remove q from list to avoid several CPUs grabbing it */
-		list_del_init(&q->lru_list);
-
-		spin_unlock(&nf->lru_lock);
-
-		spin_lock(&q->lock);
-		if (!(q->last_in & INET_FRAG_COMPLETE))
-			inet_frag_kill(q, f);
-		spin_unlock(&q->lock);
-
-		if (atomic_dec_and_test(&q->refcnt))
-			inet_frag_destroy(q, f, &work);
-		evicted++;
-	}
-
-	return evicted;
-#endif
-}
-
-static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
-						struct inet_frag_queue *qp_in,
-						struct inet_frags *f,
-						void *arg)
-{
-	struct inet_frag_bucket *hb = get_frag_bucket_locked(qp_in, f);
-	struct inet_frag_queue *qp;
-
-#ifdef CONFIG_SMP
-	/* With SMP race we have to recheck hash table, because
-	 * such entry could have been created on other cpu before
-	 * we acquired hash bucket lock.
-	 */
-	hlist_for_each_entry(qp, &hb->chain, list) {
-		if (qp->net == nf && f->match(qp, arg)) {
-			atomic_inc(&qp->refcnt);
-			spin_unlock(&hb->chain_lock);
-#ifdef HAVE_INET_FRAGS_WITH_RWLOCK
-			read_unlock(&f->lock);
-#endif
-			q_flags(qp_in) |= INET_FRAG_COMPLETE;
-			inet_frag_put(qp_in, f);
-			return qp;
-		}
-	}
-#endif /* CONFIG_SMP */
-	qp = qp_in;
-	if (!mod_timer(&qp->timer, jiffies + nf->timeout))
-		atomic_inc(&qp->refcnt);
-
-	atomic_inc(&qp->refcnt);
-	hlist_add_head(&qp->list, &hb->chain);
-
-	spin_unlock(&hb->chain_lock);
-#ifdef HAVE_INET_FRAGS_WITH_RWLOCK
-	read_unlock(&f->lock);
-#endif
-
-	return qp;
-}
-
-static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
-					       struct inet_frags *f,
-					       void *arg)
-{
-	struct inet_frag_queue *q;
-
-	if (frag_mem_limit(nf) > nf->high_thresh) {
-#ifdef HAVE_INET_FRAGS_WITH_FRAGS_WORK
-		inet_frag_schedule_worker(f);
-#endif
-		return NULL;
-	}
-
-#ifdef HAVE_INET_FRAGS_WITH_FRAGS_WORK
-	q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
-#else
-	q = kzalloc(f->qsize, GFP_ATOMIC);
-#endif
-	if (!q)
-		return NULL;
-
-	q->net = nf;
-	f->constructor(q, arg);
-	add_frag_mem_limit(nf, f->qsize);
-
-	setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
-	spin_lock_init(&q->lock);
-	atomic_set(&q->refcnt, 1);
-
-	return q;
-}
-
-static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
-						struct inet_frags *f,
-						void *arg)
-{
-	struct inet_frag_queue *q;
-
-	q = inet_frag_alloc(nf, f, arg);
-	if (!q)
-		return NULL;
-
-	return inet_frag_intern(nf, q, f, arg);
-}
-
-struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
-				       struct inet_frags *f, void *key,
-				       unsigned int hash)
+#else /* HAVE_INET_FRAGS_WITH_FRAGS_WORK */
+void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
 {
-	struct inet_frag_bucket *hb;
-	struct inet_frag_queue *q;
-	int depth = 0;
-
-#ifdef HAVE_INET_FRAGS_WITH_FRAGS_WORK
-	if (frag_mem_limit(nf) > nf->low_thresh)
-		inet_frag_schedule_worker(f);
-#else
-	if (frag_mem_limit(nf) > nf->high_thresh)
-		inet_frag_evictor(nf, f, false);
-#endif
-
-	hash &= (INETFRAGS_HASHSZ - 1);
-	hb = &f->hash[hash];
-
-	spin_lock(&hb->chain_lock);
-	hlist_for_each_entry(q, &hb->chain, list) {
-		if (q->net == nf && f->match(q, key)) {
-			atomic_inc(&q->refcnt);
-			spin_unlock(&hb->chain_lock);
-			return q;
-		}
-		depth++;
-	}
-	spin_unlock(&hb->chain_lock);
-
-	if (depth <= INETFRAGS_MAXDEPTH)
-		return inet_frag_create(nf, f, key);
-
-#ifdef HAVE_INET_FRAGS_WITH_FRAGS_WORK
-	if (inet_frag_may_rebuild(f)) {
-		if (!f->rebuild)
-			f->rebuild = true;
-		inet_frag_schedule_worker(f);
-	}
-#endif
+	int thresh = nf->low_thresh;
 
-	return ERR_PTR(-ENOBUFS);
-}
+	nf->low_thresh = 0;
 
-void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
-				   const char *prefix)
-{
-	static const char msg[] = "inet_frag_find: Fragment hash bucket"
-		" list length grew over limit " __stringify(INETFRAGS_MAXDEPTH)
-		". Dropping fragment.\n";
+	local_bh_disable();
+	inet_frag_evictor(nf, f, true);
+	local_bh_enable();
 
-	if (PTR_ERR(q) == -ENOBUFS)
-		net_dbg_ratelimited("%s%s", prefix, msg);
+	nf->low_thresh = thresh;
 }
+#endif /* HAVE_INET_FRAGS_WITH_FRAGS_WORK */
 
 #endif /* !HAVE_CORRECT_MRU_HANDLING */
diff --git a/datapath/linux/compat/ip_fragment.c b/datapath/linux/compat/ip_fragment.c
index 8d01088abc0a..64e2cf23c327 100644
--- a/datapath/linux/compat/ip_fragment.c
+++ b/datapath/linux/compat/ip_fragment.c
@@ -76,12 +76,7 @@ struct ipfrag_skb_cb
 
 /* Describe an entry in the "incomplete datagrams" queue. */
 struct ipq {
-	union {
-		struct inet_frag_queue q;
-#ifndef HAVE_INET_FRAG_QUEUE_WITH_LIST_EVICTOR
-		struct ovs_inet_frag_queue oq;
-#endif
-	};
+	struct inet_frag_queue q;
 
 	u32		user;
 	__be32		saddr;
@@ -119,6 +114,12 @@ static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot)
 			    (__force u32)saddr, (__force u32)daddr,
 			    ip4_frags.rnd);
 }
+/* fb3cfe6e75b9 ("inet: frag: remove hash size assumptions from callers")
+ * shifted this logic into inet_fragment, but prior kernels still need this.
+ */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,17,0)
+#define ipqhashfn(a, b, c, d) (ipqhashfn(a, b, c, d) & (INETFRAGS_HASHSZ - 1))
+#endif
 
 #ifdef HAVE_INET_FRAGS_CONST
 static unsigned int ip4_hashfn(const struct inet_frag_queue *q)
@@ -267,6 +268,23 @@ out:
 	ipq_put(qp);
 }
 
+/* Memory limiting on fragments.  Evictor trashes the oldest
+ * fragment queue until we are back under the threshold.
+ *
+ * Necessary for kernels earlier than v3.17. Replaced in commit
+ * b13d3cbfb8e8 ("inet: frag: move eviction of queues to work queue").
+ */
+static void ip_evictor(struct net *net)
+{
+#ifdef HAVE_INET_FRAG_EVICTOR
+	int evicted;
+
+	evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags, false);
+	if (evicted)
+		IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted);
+#endif
+}
+
 /* Find the correct entry in the "incomplete datagrams" queue for
  * this IP datagram, and create new one, if nothing is found.
  */
@@ -281,6 +299,11 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph,
 	arg.user = user;
 	arg.vif = vif;
 
+	ip_evictor(net);
+
+#ifdef HAVE_INET_FRAGS_WITH_RWLOCK
+	read_lock(&ip4_frags.lock);
+#endif
 	hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
 
 	q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash);
@@ -701,7 +724,6 @@ int rpl_ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
 	kfree_skb(skb);
 	return -ENOMEM;
 }
-EXPORT_SYMBOL_GPL(rpl_ip_defrag);
 
 static int __net_init ipv4_frags_init_net(struct net *net)
 {
diff --git a/datapath/linux/compat/nf_conntrack_reasm.c b/datapath/linux/compat/nf_conntrack_reasm.c
index ca19a9ff9d56..2024f1f590a0 100644
--- a/datapath/linux/compat/nf_conntrack_reasm.c
+++ b/datapath/linux/compat/nf_conntrack_reasm.c
@@ -80,6 +80,12 @@ static unsigned int nf_hash_frag(__be32 id, const struct in6_addr *saddr,
 	return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr),
 			    (__force u32)id, nf_frags.rnd);
 }
+/* fb3cfe6e75b9 ("inet: frag: remove hash size assumptions from callers")
+ * shifted this logic into inet_fragment, but prior kernels still need this.
+ */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,17,0)
+#define nf_hash_frag(a, b, c) (nf_hash_frag(a, b, c) & (INETFRAGS_HASHSZ - 1))
+#endif
 
 #ifdef HAVE_INET_FRAGS_CONST
 static unsigned int nf_hashfn(const struct inet_frag_queue *q)
@@ -119,7 +125,11 @@ static inline struct frag_queue *fq_find(struct net *net, __be32 id,
 	arg.dst = dst;
 	arg.ecn = ecn;
 
+#ifdef HAVE_INET_FRAGS_WITH_RWLOCK
+	read_lock_bh(&nf_frags.lock);
+#else
 	local_bh_disable();
+#endif
 	hash = nf_hash_frag(id, src, dst);
 
 	q = inet_frag_find(&net->nf_frag.frags, &nf_frags, &arg, hash);
@@ -512,6 +522,13 @@ int rpl_nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
 	hdr = ipv6_hdr(skb);
 	fhdr = (struct frag_hdr *)skb_transport_header(skb);
 
+/* See ip_evictor(). */
+#ifdef HAVE_INET_FRAG_EVICTOR
+	local_bh_disable();
+	inet_frag_evictor(&net->nf_frag.frags, &nf_frags, false);
+	local_bh_enable();
+#endif
+
 	fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr,
 		     ip6_frag_ecn(hdr));
 	if (fq == NULL)
-- 
2.9.0