[ovs-dev] [PATCH] pkt reassemble: fix kernel panic for ovs reassemble

Ben Pfaff blp at ovn.org
Thu Jul 6 20:57:34 UTC 2017


From: wangzhike <wangzhike at jd.com>

Ovs and kernel stack would add frag_queue to same netns_frags list.
As result, ovs and kernel may access the fraq_queue without correct
lock. Also the struct ipq may be different on kernel(older than 4.3),
which leads to invalid pointer access.

The fix creates specific netns_frags for ovs.

Signed-off-by: wangzhike <wangzhike at jd.com>
---
This was originally posted at:
https://github.com/openvswitch/ovs/pull/187
I'm reposting it to ovs-dev to make sure that it gets attention.

 datapath/datapath.c                                | 22 +++---
 datapath/datapath.h                                |  6 ++
 datapath/linux/compat/include/net/inet_frag.h      | 18 -----
 datapath/linux/compat/include/net/ip.h             |  4 ++
 .../include/net/netfilter/ipv6/nf_defrag_ipv6.h    |  4 ++
 datapath/linux/compat/inet_fragment.c              | 83 ----------------------
 datapath/linux/compat/ip_fragment.c                | 66 ++++++++++++++---
 datapath/linux/compat/nf_conntrack_reasm.c         | 58 +++++++++++++--
 8 files changed, 138 insertions(+), 123 deletions(-)

diff --git a/datapath/datapath.c b/datapath/datapath.c
index c85029c067ca..82cad74b7972 100644
--- a/datapath/datapath.c
+++ b/datapath/datapath.c
@@ -2297,6 +2297,8 @@ static int __net_init ovs_init_net(struct net *net)
 	INIT_LIST_HEAD(&ovs_net->dps);
 	INIT_WORK(&ovs_net->dp_notify_work, ovs_dp_notify_wq);
 	ovs_ct_init(net);
+	ovs_netns_frags_init(net);
+	ovs_netns_frags6_init(net);
 	return 0;
 }
 
@@ -2332,6 +2334,8 @@ static void __net_exit ovs_exit_net(struct net *dnet)
 	struct net *net;
 	LIST_HEAD(head);
 
+	ovs_netns_frags6_exit(dnet);
+	ovs_netns_frags_exit(dnet);
 	ovs_ct_exit(dnet);
 	ovs_lock();
 	list_for_each_entry_safe(dp, dp_next, &ovs_net->dps, list_node)
@@ -2368,13 +2372,9 @@ static int __init dp_init(void)
 
 	pr_info("Open vSwitch switching datapath %s\n", VERSION);
 
-	err = compat_init();
-	if (err)
-		goto error;
-
 	err = action_fifos_init();
 	if (err)
-		goto error_compat_exit;
+		goto error;
 
 	err = ovs_internal_dev_rtnl_link_register();
 	if (err)
@@ -2392,10 +2392,14 @@ static int __init dp_init(void)
 	if (err)
 		goto error_vport_exit;
 
-	err = register_netdevice_notifier(&ovs_dp_device_notifier);
+	err = compat_init();
 	if (err)
 		goto error_netns_exit;
 
+	err = register_netdevice_notifier(&ovs_dp_device_notifier);
+	if (err)
+		goto error_compat_exit;
+
 	err = ovs_netdev_init();
 	if (err)
 		goto error_unreg_notifier;
@@ -2410,6 +2414,8 @@ error_unreg_netdev:
 	ovs_netdev_exit();
 error_unreg_notifier:
 	unregister_netdevice_notifier(&ovs_dp_device_notifier);
+error_compat_exit:
+	compat_exit();
 error_netns_exit:
 	unregister_pernet_device(&ovs_net_ops);
 error_vport_exit:
@@ -2420,8 +2426,6 @@ error_unreg_rtnl_link:
 	ovs_internal_dev_rtnl_link_unregister();
 error_action_fifos_exit:
 	action_fifos_exit();
-error_compat_exit:
-	compat_exit();
 error:
 	return err;
 }
@@ -2431,13 +2435,13 @@ static void dp_cleanup(void)
 	dp_unregister_genl(ARRAY_SIZE(dp_genl_families));
 	ovs_netdev_exit();
 	unregister_netdevice_notifier(&ovs_dp_device_notifier);
+	compat_exit();
 	unregister_pernet_device(&ovs_net_ops);
 	rcu_barrier();
 	ovs_vport_exit();
 	ovs_flow_exit();
 	ovs_internal_dev_rtnl_link_unregister();
 	action_fifos_exit();
-	compat_exit();
 }
 
 module_init(dp_init);
diff --git a/datapath/datapath.h b/datapath/datapath.h
index b835adac5bf9..88496257d486 100644
--- a/datapath/datapath.h
+++ b/datapath/datapath.h
@@ -141,6 +141,12 @@ struct ovs_net {
 
 	/* Module reference for configuring conntrack. */
 	bool xt_label;
+
+#ifdef HAVE_INET_FRAG_LRU_MOVE
+	struct net *net;
+	struct netns_frags ipv4_frags;
+	struct netns_frags nf_frags;
+#endif
 };
 
 extern unsigned int ovs_net_id;
diff --git a/datapath/linux/compat/include/net/inet_frag.h b/datapath/linux/compat/include/net/inet_frag.h
index 01d79ad8147b..34078c80db0f 100644
--- a/datapath/linux/compat/include/net/inet_frag.h
+++ b/datapath/linux/compat/include/net/inet_frag.h
@@ -52,22 +52,4 @@ static inline int rpl_inet_frags_init(struct inet_frags *frags)
 #define inet_frags_init rpl_inet_frags_init
 #endif
 
-#ifndef HAVE_CORRECT_MRU_HANDLING
-/* We reuse the upstream inet_fragment.c common code for managing fragment
- * stores, However we actually store the fragments within our own 'inet_frags'
- * structures (in {ip_fragment,nf_conntrack_reasm}.c). When unloading the OVS
- * kernel module, we need to flush all of the remaining fragments from these
- * caches, or else we will panic with the following sequence of events:
- *
- * 1) A fragment for a packet arrives and is cached in inet_frags. This
- *    starts a timer to ensure the fragment does not hang around forever.
- * 2) openvswitch module is unloaded.
- * 3) The timer for the fragment fires, calling into backported OVS code
- *    to free the fragment.
- * 4) BUG: unable to handle kernel paging request at ffffffffc03c01e0
- */
-void rpl_inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f);
-#define inet_frags_exit_net rpl_inet_frags_exit_net
-#endif
-
 #endif /* inet_frag.h */
diff --git a/datapath/linux/compat/include/net/ip.h b/datapath/linux/compat/include/net/ip.h
index b188996606ff..ad5ac33eec04 100644
--- a/datapath/linux/compat/include/net/ip.h
+++ b/datapath/linux/compat/include/net/ip.h
@@ -97,6 +97,8 @@ int rpl_ip_defrag(struct net *net, struct sk_buff *skb, u32 user);
 #define ip_defrag rpl_ip_defrag
 int __init rpl_ipfrag_init(void);
 void rpl_ipfrag_fini(void);
+void ovs_netns_frags_init(struct net *net);
+void ovs_netns_frags_exit(struct net *net);
 
 #else /* HAVE_CORRECT_MRU_HANDLING */
 
@@ -131,6 +133,8 @@ static inline int rpl_ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
  * compat_{in,ex}it() can be no-ops. */
 static inline int rpl_ipfrag_init(void) { return 0; }
 static inline void rpl_ipfrag_fini(void) { }
+static inline void ovs_netns_frags_init(struct net *net) { }
+static inline void ovs_netns_frags_exit(struct net *net) { }
 #endif /* HAVE_CORRECT_MRU_HANDLING */
 
 #define ipfrag_init rpl_ipfrag_init
diff --git a/datapath/linux/compat/include/net/netfilter/ipv6/nf_defrag_ipv6.h b/datapath/linux/compat/include/net/netfilter/ipv6/nf_defrag_ipv6.h
index 2ab6c0aa79a1..c4c0f79ab330 100644
--- a/datapath/linux/compat/include/net/netfilter/ipv6/nf_defrag_ipv6.h
+++ b/datapath/linux/compat/include/net/netfilter/ipv6/nf_defrag_ipv6.h
@@ -28,9 +28,13 @@ int rpl_nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user);
  */
 int __init rpl_nf_ct_frag6_init(void);
 void rpl_nf_ct_frag6_cleanup(void);
+void ovs_netns_frags6_init(struct net *net);
+void ovs_netns_frags6_exit(struct net *net);
 #else /* !OVS_NF_DEFRAG6_BACKPORT */
 static inline int __init rpl_nf_ct_frag6_init(void) { return 0; }
 static inline void rpl_nf_ct_frag6_cleanup(void) { }
+static inline void ovs_netns_frags6_init(struct net *net) { }
+static inline void ovs_netns_frags6_exit(struct net *net) { }
 #endif /* OVS_NF_DEFRAG6_BACKPORT */
 #define nf_ct_frag6_init rpl_nf_ct_frag6_init
 #define nf_ct_frag6_cleanup rpl_nf_ct_frag6_cleanup
diff --git a/datapath/linux/compat/inet_fragment.c b/datapath/linux/compat/inet_fragment.c
index f05e6177bfb3..21736e61a696 100644
--- a/datapath/linux/compat/inet_fragment.c
+++ b/datapath/linux/compat/inet_fragment.c
@@ -27,88 +27,5 @@
 #include <net/inet_frag.h>
 #include <net/inet_ecn.h>
 
-#ifdef HAVE_INET_FRAGS_WITH_FRAGS_WORK
-static bool inet_fragq_should_evict(const struct inet_frag_queue *q)
-{
-	return q->net->low_thresh == 0 ||
-	       frag_mem_limit(q->net) >= q->net->low_thresh;
-}
-
-static unsigned int
-inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
-{
-	struct inet_frag_queue *fq;
-	struct hlist_node *n;
-	unsigned int evicted = 0;
-	HLIST_HEAD(expired);
-
-	spin_lock(&hb->chain_lock);
-
-	hlist_for_each_entry_safe(fq, n, &hb->chain, list) {
-		if (!inet_fragq_should_evict(fq))
-			continue;
-
-		if (!del_timer(&fq->timer))
-			continue;
-
-#ifdef HAVE_INET_FRAG_QUEUE_WITH_LIST_EVICTOR
-		hlist_add_head(&fq->list_evictor, &expired);
-#else
-		hlist_del(&fq->list);
-		hlist_add_head(&fq->list, &expired);
-#endif
-		++evicted;
-	}
-
-	spin_unlock(&hb->chain_lock);
-
-#ifdef HAVE_INET_FRAG_QUEUE_WITH_LIST_EVICTOR
-	hlist_for_each_entry_safe(fq, n, &expired, list_evictor)
-#else
-	hlist_for_each_entry_safe(fq, n, &expired, list)
-#endif
-		f->frag_expire((unsigned long) fq);
-
-	return evicted;
-}
-
-void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
-{
-	int thresh = nf->low_thresh;
-	unsigned int seq;
-	int i;
-
-	nf->low_thresh = 0;
-
-evict_again:
-	local_bh_disable();
-	seq = read_seqbegin(&f->rnd_seqlock);
-
-	for (i = 0; i < INETFRAGS_HASHSZ ; i++)
-		inet_evict_bucket(f, &f->hash[i]);
-
-	local_bh_enable();
-	cond_resched();
-
-	if (read_seqretry(&f->rnd_seqlock, seq) ||
-	    percpu_counter_sum(&nf->mem))
-		goto evict_again;
-
-	nf->low_thresh = thresh;
-}
-#else /* HAVE_INET_FRAGS_WITH_FRAGS_WORK */
-void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
-{
-	int thresh = nf->low_thresh;
-
-	nf->low_thresh = 0;
-
-	local_bh_disable();
-	inet_frag_evictor(nf, f, true);
-	local_bh_enable();
-
-	nf->low_thresh = thresh;
-}
-#endif /* HAVE_INET_FRAGS_WITH_FRAGS_WORK */
 
 #endif /* !HAVE_CORRECT_MRU_HANDLING */
diff --git a/datapath/linux/compat/ip_fragment.c b/datapath/linux/compat/ip_fragment.c
index 47b51b5793b9..8f2012b731ea 100644
--- a/datapath/linux/compat/ip_fragment.c
+++ b/datapath/linux/compat/ip_fragment.c
@@ -57,6 +57,8 @@
 #include <net/inet_ecn.h>
 #include <net/vrf.h>
 #include <net/netfilter/ipv4/nf_defrag_ipv4.h>
+#include <net/netns/generic.h>
+#include "datapath.h"
 
 /* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6
  * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
@@ -107,6 +109,51 @@ struct ip4_create_arg {
 	int vif;
 };
 
+static struct netns_frags *get_netns_frags_from_net(struct net *net)
+{
+#ifdef HAVE_INET_FRAG_LRU_MOVE
+	struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
+	return &(ovs_net->ipv4_frags);
+#else
+	return &(net->ipv4.frags);
+#endif
+}
+
+static struct net *get_net_from_netns_frags(struct netns_frags *frags)
+{
+	struct net *net;
+#ifdef HAVE_INET_FRAG_LRU_MOVE
+	struct ovs_net *ovs_net;
+
+	ovs_net = container_of(frags, struct ovs_net, ipv4_frags);
+	net = ovs_net->net;
+#else
+	net = container_of(frags, struct net, ipv4.frags);
+#endif
+	return net;
+}
+
+void ovs_netns_frags_init(struct net *net)
+{
+#ifdef HAVE_INET_FRAG_LRU_MOVE
+	struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
+
+	ovs_net->ipv4_frags.high_thresh = 4 * 1024 * 1024;
+	ovs_net->ipv4_frags.low_thresh = 3 * 1024 * 1024;
+	ovs_net->ipv4_frags.timeout = IP_FRAG_TIME;
+	inet_frags_init_net(&(ovs_net->ipv4_frags));
+	ovs_net->net = net;
+#endif
+}
+
+void ovs_netns_frags_exit(struct net *net)
+{
+	struct netns_frags *frags;
+
+	frags = get_netns_frags_from_net(net);
+	inet_frags_exit_net(frags, &ip4_frags);
+}
+
 static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot)
 {
 	net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd));
@@ -158,9 +205,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, void *a)
 #endif
 {
 	struct ipq *qp = container_of(q, struct ipq, q);
-	struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4,
-					       frags);
-	struct net *net = container_of(ipv4, struct net, ipv4);
+	struct net *net = get_net_from_netns_frags(q->net);
 
 	const struct ip4_create_arg *arg = a;
 
@@ -219,7 +264,7 @@ static void ip_expire(unsigned long arg)
 	struct net *net;
 
 	qp = container_of((struct inet_frag_queue *) arg, struct ipq, q);
-	net = container_of(qp->q.net, struct net, ipv4.frags);
+	net = get_net_from_netns_frags(qp->q.net);
 
 	spin_lock(&qp->q.lock);
 
@@ -278,8 +323,10 @@ out:
 static void ip_evictor(struct net *net)
 {
 	int evicted;
+	struct netns_frags *frags;
 
-	evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags, false);
+	frags = get_netns_frags_from_net(net);
+	evicted = inet_frag_evictor(frags, &ip4_frags, false);
 	if (evicted)
 		IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted);
 }
@@ -294,6 +341,7 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph,
 	struct inet_frag_queue *q;
 	struct ip4_create_arg arg;
 	unsigned int hash;
+	struct netns_frags *frags;
 
 	arg.iph = iph;
 	arg.user = user;
@@ -304,7 +352,8 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph,
 #endif
 	hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
 
-	q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash);
+	frags = get_netns_frags_from_net(net);
+	q = inet_frag_find(frags, &ip4_frags, &arg, hash);
 	if (IS_ERR_OR_NULL(q)) {
 		inet_frag_maybe_warn_overflow(q, pr_fmt());
 		return NULL;
@@ -333,7 +382,7 @@ static int ip_frag_too_far(struct ipq *qp)
 	if (rc) {
 		struct net *net;
 
-		net = container_of(qp->q.net, struct net, ipv4.frags);
+		net = get_net_from_netns_frags(qp->q.net);
 		IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
 	}
 
@@ -566,7 +615,7 @@ err:
 static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 			 struct net_device *dev)
 {
-	struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
+	struct net *net = get_net_from_netns_frags(qp->q.net);
 	struct iphdr *iph;
 	struct sk_buff *fp, *head = qp->q.fragments;
 	int len;
@@ -738,7 +787,6 @@ static int __net_init ipv4_frags_init_net(struct net *net)
 
 static void __net_exit ipv4_frags_exit_net(struct net *net)
 {
-	inet_frags_exit_net(&net->ipv4.frags, &ip4_frags);
 }
 
 static struct pernet_operations ip4_frags_ops = {
diff --git a/datapath/linux/compat/nf_conntrack_reasm.c b/datapath/linux/compat/nf_conntrack_reasm.c
index 0da94635b496..ea153c3c526f 100644
--- a/datapath/linux/compat/nf_conntrack_reasm.c
+++ b/datapath/linux/compat/nf_conntrack_reasm.c
@@ -53,6 +53,8 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#include <net/netns/generic.h>
+#include "datapath.h"
 
 #ifdef OVS_NF_DEFRAG6_BACKPORT
 
@@ -68,6 +70,30 @@ struct nf_ct_frag6_skb_cb
 
 static struct inet_frags nf_frags;
 
+static struct netns_frags *get_netns_frags6_from_net(struct net *net)
+{
+#ifdef HAVE_INET_FRAG_LRU_MOVE
+	struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
+	return &(ovs_net->nf_frags);
+#else
+	return &(net->nf_frag.frags);
+#endif
+}
+
+static struct net *get_net_from_netns_frags6(struct netns_frags *frags)
+{
+	struct net *net;
+#ifdef HAVE_INET_FRAG_LRU_MOVE
+	struct ovs_net *ovs_net;
+
+	ovs_net = container_of(frags, struct ovs_net, nf_frags);
+	net = ovs_net->net;
+#else
+	net = container_of(frags, struct net, nf_frag.frags);
+#endif
+	return net;
+}
+
 static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h)
 {
 	return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK);
@@ -105,7 +131,7 @@ static void nf_ct_frag6_expire(unsigned long data)
 	struct net *net;
 
 	fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q);
-	net = container_of(fq->q.net, struct net, nf_frag.frags);
+	net = get_net_from_netns_frags6(fq->q.net);
 
 	ip6_expire_frag_queue(net, fq, &nf_frags);
 }
@@ -118,6 +144,7 @@ static inline struct frag_queue *fq_find(struct net *net, __be32 id,
 	struct inet_frag_queue *q;
 	struct ip6_create_arg arg;
 	unsigned int hash;
+	struct netns_frags *frags;
 
 	arg.id = id;
 	arg.user = user;
@@ -132,7 +159,8 @@ static inline struct frag_queue *fq_find(struct net *net, __be32 id,
 #endif
 	hash = nf_hash_frag(id, src, dst);
 
-	q = inet_frag_find(&net->nf_frag.frags, &nf_frags, &arg, hash);
+	frags = get_netns_frags6_from_net(net);
+	q = inet_frag_find(frags, &nf_frags, &arg, hash);
 	local_bh_enable();
 	if (IS_ERR_OR_NULL(q)) {
 		inet_frag_maybe_warn_overflow(q, pr_fmt());
@@ -506,6 +534,7 @@ int rpl_nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
 	struct frag_queue *fq;
 	struct ipv6hdr *hdr;
 	u8 prevhdr;
+	struct netns_frags *frags;
 
 	/* Jumbo payload inhibits frag. header */
 	if (ipv6_hdr(skb)->payload_len == 0) {
@@ -524,9 +553,10 @@ int rpl_nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
 	fhdr = (struct frag_hdr *)skb_transport_header(skb);
 
 /* See ip_evictor(). */
+	frags = get_netns_frags6_from_net(net);
 #ifdef HAVE_INET_FRAG_EVICTOR
 	local_bh_disable();
-	inet_frag_evictor(&net->nf_frag.frags, &nf_frags, false);
+	inet_frag_evictor(frags, &nf_frags, false);
 	local_bh_enable();
 #endif
 
@@ -567,7 +597,27 @@ static int nf_ct_net_init(struct net *net)
 
 static void nf_ct_net_exit(struct net *net)
 {
-	inet_frags_exit_net(&net->nf_frag.frags, &nf_frags);
+}
+
+void ovs_netns_frags6_init(struct net *net)
+{
+#ifdef HAVE_INET_FRAG_LRU_MOVE
+	struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
+
+	ovs_net->nf_frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
+	ovs_net->nf_frags.low_thresh = IPV6_FRAG_LOW_THRESH;
+	ovs_net->nf_frags.timeout = IPV6_FRAG_TIMEOUT;
+
+	inet_frags_init_net(&(ovs_net->nf_frags));
+#endif
+}
+
+void ovs_netns_frags6_exit(struct net *net)
+{
+	struct netns_frags *frags;
+
+	frags = get_netns_frags6_from_net(net);
+	inet_frags_exit_net(frags, &nf_frags);
 }
 
 static struct pernet_operations nf_ct_net_ops = {
-- 
2.10.2



More information about the dev mailing list