[ovs-dev] [PATCHv2] datapath: Add support for kernel 4.19.x and 4.20.x

Yifeng Sun pkusunyifeng at gmail.com
Fri Apr 12 17:22:59 UTC 2019


This patch introduces changes needed by OVS to support latest
Linux kernels (4.19.x and 4.20.x). Recent kernels changed many
APIs that are being used by OVS. One major change is that
struct nf_conntrack_l3proto became invisible outside of kernel, so
get_l4proto function is added in file compact/nf_conntrack_core.c to
accommodate this issue.

In addition, if kernel is not compiled with CONFIG_NF_NAT_IPV4
or CONFIG_NF_NAT_IPV6, flow action 'ct(nat)' can cause kernel
to crash. This patch handles this condition.

This patch doesn't introduce new failed tests when running
'make check-kmod' for kernels listed below:
    3.10.0-957.5.1.el7.x86_64
    4.4.0-142-generic
    4.17.14
    4.18.0-16-generic
    4.19.34
    4.20.17

Travis passed at
https://travis-ci.org/yifsun/ovs-travis/builds/519011670

Signed-off-by: Yifeng Sun <pkusunyifeng at gmail.com>
v1->v2: Fixed the CONFIG_NF_NAT_IPV4 bug by using Greg's config
        file. Thanks Greg!
---
 .travis.yml                                        | 20 ++---
 NEWS                                               |  2 +
 acinclude.m4                                       | 20 ++++-
 datapath/conntrack.c                               | 86 +++++++++++++++++++++-
 .../include/net/netfilter/nf_conntrack_core.h      |  6 ++
 .../include/net/netfilter/nf_conntrack_count.h     |  2 +
 datapath/linux/compat/nf_conncount.c               |  6 +-
 datapath/linux/compat/nf_conntrack_core.c          | 80 ++++++++++++++++++++
 datapath/linux/compat/nf_conntrack_proto.c         |  3 +
 9 files changed, 209 insertions(+), 16 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 32d5f1918495..8578d1497f6c 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -28,22 +28,24 @@ before_script: export PATH=$PATH:$HOME/bin
 
 env:
   - OPTS="--disable-ssl"
-  - TESTSUITE=1 KERNEL=3.16.54
+  - TESTSUITE=1 KERNEL=3.16.65
   - TESTSUITE=1 OPTS="--enable-shared"
   - BUILD_ENV="-m32" OPTS="--disable-ssl"
-  - KERNEL=3.16.54 DPDK=1 OPTS="--enable-shared"
-  - KERNEL=3.16.54 TESTSUITE=1 DPDK=1
-  - KERNEL=3.16.54 DPDK_SHARED=1
-  - KERNEL=3.16.54 DPDK_SHARED=1 OPTS="--enable-shared"
+  - KERNEL=3.16.65 DPDK=1 OPTS="--enable-shared"
+  - KERNEL=3.16.65 TESTSUITE=1 DPDK=1
+  - KERNEL=3.16.65 DPDK_SHARED=1
+  - KERNEL=3.16.65 DPDK_SHARED=1 OPTS="--enable-shared"
+  - KERNEL=4.20.17
+  - KERNEL=4.19.34
   - KERNEL=4.18.20
   - KERNEL=4.17.19
   - KERNEL=4.16.18
   - KERNEL=4.15.18
-  - KERNEL=4.14.63
-  - KERNEL=4.9.149
-  - KERNEL=4.4.148
+  - KERNEL=4.14.111
+  - KERNEL=4.9.168
+  - KERNEL=4.4.178
   - KERNEL=3.19.8
-  - KERNEL=3.16.57
+  - KERNEL=3.16.65
   - TESTSUITE=1 LIBS=-ljemalloc
 
 matrix:
diff --git a/NEWS b/NEWS
index 1e4744dbd244..af5b5222f78f 100644
--- a/NEWS
+++ b/NEWS
@@ -25,6 +25,8 @@ Post-v2.11.0
    - OVN:
      * Select IPAM mac_prefix in a random manner if not provided by the user
    - New QoS type "linux-netem" on Linux.
+   - Linux datapath:
+     * Support for the kernel versions 4.19.x, 4.20.x.
 
 v2.11.0 - 19 Feb 2019
 ---------------------
diff --git a/acinclude.m4 b/acinclude.m4
index cfc8bcd06397..a2e1f9500955 100644
--- a/acinclude.m4
+++ b/acinclude.m4
@@ -151,10 +151,10 @@ AC_DEFUN([OVS_CHECK_LINUX], [
     AC_MSG_RESULT([$kversion])
 
     if test "$version" -ge 4; then
-       if test "$version" = 4 && test "$patchlevel" -le 18; then
+       if test "$version" = 4 && test "$patchlevel" -le 20; then
           : # Linux 4.x
        else
-          AC_ERROR([Linux kernel in $KBUILD is version $kversion, but version newer than 4.18.x is not supported (please refer to the FAQ for advice)])
+          AC_ERROR([Linux kernel in $KBUILD is version $kversion, but version newer than 4.20.x is not supported (please refer to the FAQ for advice)])
        fi
     elif test "$version" = 3 && test "$patchlevel" -ge 10; then
        : # Linux 3.x
@@ -590,6 +590,22 @@ AC_DEFUN([OVS_CHECK_LINUX_COMPAT], [
                   [OVS_DEFINE([HAVE_VOID_INET_FRAGS_INIT])])
   OVS_GREP_IFELSE([$KSRC/include/net/inetpeer.h], [vif],
                   [OVS_DEFINE([HAVE_INETPEER_VIF_SUPPORT])])
+  OVS_FIND_PARAM_IFELSE([$KSRC/include/net/netfilter/nf_conntrack_helper.h],
+                        [nf_ct_helper_ext_add], [nf_conntrack_helper],
+                        [OVS_DEFINE([HAVE_NF_CT_HELPER_EXT_ADD_TAKES_HELPER])])
+  OVS_FIND_PARAM_IFELSE([$KSRC/include/net/netfilter/nf_conntrack_core.h],
+                        [nf_ct_invert_tuple], [l3proto],
+                        [OVS_DEFINE([HAVE_NF_CT_INVERT_TUPLE_TAKES_L3PROTO])])
+  OVS_GREP_IFELSE([$KSRC/include/net/netfilter/nf_conntrack_core.h], [nf_ct_get_tuple],
+                  [OVS_DEFINE([HAVE_NF_CT_GET_TUPLE])])
+  OVS_FIND_PARAM_IFELSE([$KSRC/include/net/netfilter/nf_conntrack_core.h],
+                        [nf_conntrack_in], [net],
+                        [OVS_DEFINE([HAVE_NF_CONNTRACK_IN_TAKES_NET])])
+  OVS_GREP_IFELSE([$KSRC/include/net/ipv6_frag.h], [IP6_DEFRAG_CONNTRACK_IN],
+                  [OVS_DEFINE([HAVE_IPV6_FRAG_H])])
+  OVS_FIND_PARAM_IFELSE([$KSRC/include/net/netfilter/nf_conntrack_l4proto.h],
+                        [__nf_ct_l4proto_find], [l3proto],
+                        [OVS_DEFINE([HAVE_NF_CT_L4PROTO_FIND_TAKES_L3PROTO])])
 
   dnl Check for dst_cache and ipv6 lable to use backported tunnel infrastructure.
   dnl OVS does not really need ipv6 label field, but its presence signifies that
diff --git a/datapath/conntrack.c b/datapath/conntrack.c
index 52208bad3029..ce36a8ddea50 100644
--- a/datapath/conntrack.c
+++ b/datapath/conntrack.c
@@ -38,6 +38,10 @@
 #include <net/netfilter/nf_nat_l3proto.h>
 #endif
 
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) && defined(HAVE_IPV6_FRAG_H)
+#include <net/ipv6_frag.h>
+#endif
+
 #include "datapath.h"
 #include "conntrack.h"
 #include "flow.h"
@@ -645,32 +649,62 @@ static struct nf_conn *
 ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
 		     u8 l3num, struct sk_buff *skb, bool natted)
 {
-	const struct nf_conntrack_l3proto *l3proto;
 	const struct nf_conntrack_l4proto *l4proto;
 	struct nf_conntrack_tuple tuple;
 	struct nf_conntrack_tuple_hash *h;
 	struct nf_conn *ct;
-	unsigned int dataoff;
 	u8 protonum;
 
+#ifdef HAVE_NF_CT_INVERT_TUPLE_TAKES_L3PROTO
+	const struct nf_conntrack_l3proto *l3proto;
+	unsigned int dataoff;
+
 	l3proto = __nf_ct_l3proto_find(l3num);
 	if (l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff,
 				 &protonum) <= 0) {
 		pr_debug("ovs_ct_find_existing: Can't get protonum\n");
 		return NULL;
 	}
+#else
+	int protooff;
+
+	protooff = get_l4proto(skb, skb_network_offset(skb),
+			       l3num, &protonum);
+	if (protooff <= 0) {
+		pr_warn("ovs_ct_find_existing: Can't get protonum\n");
+		return NULL;
+	}
+#endif
+
+#ifdef HAVE_NF_CT_L4PROTO_FIND_TAKES_L3PROTO
 	l4proto = __nf_ct_l4proto_find(l3num, protonum);
-	if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num,
-			     protonum, net, &tuple, l3proto, l4proto)) {
+#else
+	l4proto = __nf_ct_l4proto_find(protonum);
+#endif
+
+#ifdef HAVE_NF_CT_GET_TUPLE
+	if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
+				       l3num, net, &tuple)) {
+		pr_debug("ovs_ct_find_existing: Can't get tuple\n");
+		return NULL;
+	}
+#else
+	if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
+				       l3num, net, &tuple)) {
 		pr_debug("ovs_ct_find_existing: Can't get tuple\n");
 		return NULL;
 	}
+#endif
 
 	/* Must invert the tuple if skb has been transformed by NAT. */
 	if (natted) {
 		struct nf_conntrack_tuple inverse;
 
+#ifdef HAVE_NF_CT_INVERT_TUPLE_TAKES_L3PROTO
 		if (!nf_ct_invert_tuple(&inverse, &tuple, l3proto, l4proto)) {
+#else
+		if (!nf_ct_invert_tuple(&inverse, &tuple, l4proto)) {
+#endif
 			pr_debug("ovs_ct_find_existing: Inversion failed!\n");
 			return NULL;
 		}
@@ -989,6 +1023,9 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
 	if (!cached) {
 		struct nf_conn *tmpl = info->ct;
 		int err;
+#ifndef HAVE_NF_CONNTRACK_IN_TAKES_NET
+		struct nf_hook_state state = {};
+#endif
 
 		/* Associate skb with specified zone. */
 		if (tmpl) {
@@ -998,8 +1035,15 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
 			nf_ct_set(skb, tmpl, IP_CT_NEW);
 		}
 
+#ifdef HAVE_NF_CONNTRACK_IN_TAKES_NET
 		err = nf_conntrack_in(net, info->family,
 				      NF_INET_PRE_ROUTING, skb);
+#else
+		state.hook = NF_INET_PRE_ROUTING,
+		state.pf = info->family,
+		state.net = net,
+		err = nf_conntrack_in(skb, &state);
+#endif
 		if (err != NF_ACCEPT)
 			return -ENOENT;
 
@@ -1307,9 +1351,17 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb,
 {
 	int nh_ofs;
 	int err;
+	/* From kernel 4.19.0+, Function handle_fragments may shrink skb's
+	 * headroom, which will result in loss of ethernet header data.
+	 * This buf is used to backup the header data before calling
+	 * handle_fragments. */
+	char buf[32];
 
 	/* The conntrack module expects to be working at L3. */
 	nh_ofs = skb_network_offset(skb);
+	if (nh_ofs > sizeof(buf))
+		return -EINVAL;
+	memcpy(buf, skb->data, nh_ofs);
 	skb_pull_rcsum(skb, nh_ofs);
 
 	err = ovs_skb_network_trim(skb);
@@ -1326,8 +1378,16 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb,
 		err = ovs_ct_commit(net, key, info, skb);
 	else
 		err = ovs_ct_lookup(net, key, info, skb);
+	if (err)
+		return err;
 
+	if (skb_headroom(skb) < nh_ofs) {
+		err = pskb_expand_head(skb, nh_ofs, 0, GFP_ATOMIC);
+		if (err)
+			return err;
+	}
 	skb_push(skb, nh_ofs);
+	memcpy(skb->data, buf, nh_ofs);
 	skb_postpush_rcsum(skb, skb->data, nh_ofs);
 	if (err)
 		kfree_skb(skb);
@@ -1362,7 +1422,11 @@ static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name,
 		return -EINVAL;
 	}
 
+#ifdef HAVE_NF_CT_HELPER_EXT_ADD_TAKES_HELPER
 	help = nf_ct_helper_ext_add(info->ct, helper, GFP_KERNEL);
+#else
+	help = nf_ct_helper_ext_add(info->ct, GFP_KERNEL);
+#endif
 	if (!help) {
 		nf_conntrack_helper_put(helper);
 		return -ENOMEM;
@@ -1387,6 +1451,20 @@ static int parse_nat(const struct nlattr *attr,
 	bool have_proto_max = false;
 	bool ip_vers = (info->family == NFPROTO_IPV6);
 
+#ifndef CONFIG_NF_NAT_IPV4
+	if (info->family == NFPROTO_IPV4) {
+		OVS_NLERR(log, "Flow action ct(nat) not supported without nf_nat_ipv4");
+		return -ENOTSUPP;
+	}
+#endif
+
+#ifndef CONFIG_NF_NAT_IPV6
+	if (info->family == NFPROTO_IPV6) {
+		OVS_NLERR(log, "Flow action ct(nat) not supported without nf_nat_ipv6");
+		return -ENOTSUPP;
+        }
+#endif
+
 	nla_for_each_nested(a, attr, rem) {
 		static const int ovs_nat_attr_lens[OVS_NAT_ATTR_MAX + 1][2] = {
 			[OVS_NAT_ATTR_SRC] = {0, 0},
diff --git a/datapath/linux/compat/include/net/netfilter/nf_conntrack_core.h b/datapath/linux/compat/include/net/netfilter/nf_conntrack_core.h
index 7834c8c25f79..7fca7dc551c8 100644
--- a/datapath/linux/compat/include/net/netfilter/nf_conntrack_core.h
+++ b/datapath/linux/compat/include/net/netfilter/nf_conntrack_core.h
@@ -104,4 +104,10 @@ static inline bool rpl_nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
 #define nf_ct_delete rpl_nf_ct_delete
 #endif /* HAVE_NF_CONN_TIMER */
 
+#ifndef HAVE_NF_CT_INVERT_TUPLE_TAKES_L3PROTO
+int rpl_get_l4proto(const struct sk_buff *skb,
+		    unsigned int nhoff, u8 pf, u8 *l4num);
+#define get_l4proto rpl_get_l4proto
+#endif
+
 #endif /* _NF_CONNTRACK_CORE_WRAPPER_H */
diff --git a/datapath/linux/compat/include/net/netfilter/nf_conntrack_count.h b/datapath/linux/compat/include/net/netfilter/nf_conntrack_count.h
index fd536f3e1854..a26eb9f87971 100644
--- a/datapath/linux/compat/include/net/netfilter/nf_conntrack_count.h
+++ b/datapath/linux/compat/include/net/netfilter/nf_conntrack_count.h
@@ -4,6 +4,8 @@
 #include <linux/list.h>
 
 #ifdef HAVE_UPSTREAM_NF_CONNCOUNT
+#include <net/netfilter/nf_conntrack_tuple.h>
+#include <net/netfilter/nf_conntrack_zones.h>
 #include_next <net/netfilter/nf_conntrack_count.h>
 
 static inline int rpl_nf_conncount_modinit(void)
diff --git a/datapath/linux/compat/nf_conncount.c b/datapath/linux/compat/nf_conncount.c
index 0bee96274b00..eeae440f872d 100644
--- a/datapath/linux/compat/nf_conncount.c
+++ b/datapath/linux/compat/nf_conncount.c
@@ -13,6 +13,8 @@
  *		only ignore TIME_WAIT or gone connections
  *   (C) CC Computer Consultants GmbH, 2007
  */
+#ifndef HAVE_UPSTREAM_NF_CONNCOUNT
+
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 #include <linux/in.h>
 #include <linux/in6.h>
@@ -138,7 +140,7 @@ static bool conn_free(struct nf_conncount_list *list,
 
 	if (list->count == 0) {
 		spin_unlock(&list->list_lock);
-                return free_entry;
+		return free_entry;
 	}
 
 	list->count--;
@@ -635,3 +637,5 @@ void rpl_nf_conncount_modexit(void)
 	kmem_cache_destroy(conncount_conn_cachep);
 	kmem_cache_destroy(conncount_rb_cachep);
 }
+
+#endif /* HAVE_UPSTREAM_NF_CONNCOUNT */
diff --git a/datapath/linux/compat/nf_conntrack_core.c b/datapath/linux/compat/nf_conntrack_core.c
index a7d3d4331e4a..ecfeae1a037e 100644
--- a/datapath/linux/compat/nf_conntrack_core.c
+++ b/datapath/linux/compat/nf_conntrack_core.c
@@ -1,4 +1,7 @@
+#include <linux/types.h>
 #include <linux/version.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
 
 #ifndef HAVE_NF_CT_ZONE_INIT
 
@@ -11,3 +14,80 @@ const struct nf_conntrack_zone nf_ct_zone_dflt = {
 };
 
 #endif /* HAVE_NF_CT_ZONE_INIT */
+
+#ifndef HAVE_NF_CT_INVERT_TUPLE_TAKES_L3PROTO
+static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
+			    u_int8_t *protonum)
+{
+	int dataoff = -1;
+	const struct iphdr *iph;
+	struct iphdr _iph;
+
+	iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
+	if (!iph)
+		return -1;
+
+	/* Conntrack defragments packets, we might still see fragments
+	 * inside ICMP packets though.
+	 */
+	if (iph->frag_off & htons(IP_OFFSET))
+		return -1;
+
+	dataoff = nhoff + (iph->ihl << 2);
+	*protonum = iph->protocol;
+
+	/* Check bogus IP headers */
+	if (dataoff > skb->len) {
+		pr_debug("bogus IPv4 packet: nhoff %u, ihl %u, skblen %u\n",
+			 nhoff, iph->ihl << 2, skb->len);
+		return -1;
+	}
+	return dataoff;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
+			    u8 *protonum)
+{
+	int protoff = -1;
+	unsigned int extoff = nhoff + sizeof(struct ipv6hdr);
+	__be16 frag_off;
+	u8 nexthdr;
+
+	if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr),
+			  &nexthdr, sizeof(nexthdr)) != 0) {
+		pr_debug("can't get nexthdr\n");
+		return -1;
+	}
+	protoff = ipv6_skip_exthdr(skb, extoff, &nexthdr, &frag_off);
+	/*
+	 * (protoff == skb->len) means the packet has not data, just
+	 * IPv6 and possibly extensions headers, but it is tracked anyway
+	 */
+	if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
+		pr_debug("can't find proto in pkt\n");
+		return -1;
+	}
+
+	*protonum = nexthdr;
+	return protoff;
+}
+#endif
+
+int rpl_get_l4proto(const struct sk_buff *skb,
+		    unsigned int nhoff, u8 pf, u8 *l4num)
+{
+	switch (pf) {
+	case NFPROTO_IPV4:
+		return ipv4_get_l4proto(skb, nhoff, l4num);
+#if IS_ENABLED(CONFIG_IPV6)
+	case NFPROTO_IPV6:
+		return ipv6_get_l4proto(skb, nhoff, l4num);
+#endif
+	default:
+		*l4num = 0;
+		break;
+	}
+	return -1;
+}
+#endif /* HAVE_NF_CT_INVERT_TUPLE_TAKES_L3PROTO */
diff --git a/datapath/linux/compat/nf_conntrack_proto.c b/datapath/linux/compat/nf_conntrack_proto.c
index 4ac66f61c70d..89c2f542247b 100644
--- a/datapath/linux/compat/nf_conntrack_proto.c
+++ b/datapath/linux/compat/nf_conntrack_proto.c
@@ -1,7 +1,10 @@
 #include <linux/types.h>
 
 #include <net/netfilter/nf_conntrack.h>
+
+#ifdef HAVE_NF_CT_INVERT_TUPLE_TAKES_L3PROTO
 #include <net/netfilter/nf_conntrack_l3proto.h>
+#endif
 
 /*
  * Upstream net-next commmit 7e35ec0e8044
-- 
2.7.4



More information about the dev mailing list