[ovs-dev] [ERSPAN RFC 01/25] gre: introduce native tunnel support for ERSPAN

Greg Rose gvrose8192 at gmail.com
Thu Mar 22 22:07:17 UTC 2018


From: William Tu <u9012063 at gmail.com>

Upstream commit:
    commit 84e54fe0a5eaed696dee4019c396f8396f5a908b
    Author: William Tu <u9012063 at gmail.com>
    Date:   Tue Aug 22 09:40:28 2017 -0700

    gre: introduce native tunnel support for ERSPAN

    The patch adds ERSPAN type II tunnel support.  The implementation
    is based on the draft at [1].  One of the purposes is to enable a
    Linux box to receive ERSPAN monitoring traffic sent from a Cisco
    switch, by creating an ERSPAN tunnel device.
    In addition, the patch adds ERSPAN TX, so a Linux virtual switch
    can redirect monitored traffic to the ERSPAN tunnel device.  The
    traffic is encapsulated into ERSPAN and sent out.

    The implementation reuses the tunnel key as the ERSPAN session ID,
    and the field 'erspan' as the ERSPAN index:
    ./ip link add dev ers11 type erspan seq key 100 erspan 123 \
    			local 172.16.1.200 remote 172.16.1.100

    To use the above device as an ERSPAN receiver, configure a
    Nexus 5000 switch as below:

    monitor session 100 type erspan-source
      erspan-id 123
      vrf default
      destination ip 172.16.1.200
      source interface Ethernet1/11 both
      source interface Ethernet1/12 both
      no shut
    monitor erspan origin ip-address 172.16.1.100 global

    [1] https://tools.ietf.org/html/draft-foschiano-erspan-01
    [2] iproute2 patch: http://marc.info/?l=linux-netdev&m=150306086924951&w=2
    [3] test script: http://marc.info/?l=linux-netdev&m=150231021807304&w=2

    Signed-off-by: William Tu <u9012063 at gmail.com>
    Signed-off-by: Meenakshi Vohra <mvohra at vmware.com>
    Cc: Alexey Kuznetsov <kuznet at ms2.inr.ac.ru>
    Cc: Hideaki YOSHIFUJI <yoshfuji at linux-ipv6.org>
    Signed-off-by: David S. Miller <davem at davemloft.net>

This commit also backports heavily from the upstream gre, ip_gre and
ip_tunnel modules to support the necessary ERSPAN IP GRE
infrastructure, as well as implementing a variety of compatibility
layer changes for the same support.

Cc: William Tu <u9012063 at gmail.com>
Signed-off-by: Greg Rose <gvrose8192 at gmail.com>
---
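A note on the compat pattern used throughout this backport: each
backported function is renamed with an rpl_ ("replacement") prefix and
mapped back to its upstream name by a macro, so the module never
collides with symbols a newer kernel already exports.  A minimal
sketch of the shape, restating the declarations added to net/gre.h
below:

    /* compat header: only define the replacement when the kernel's
     * own tunnel code is not usable (USE_UPSTREAM_TUNNEL unset). */
    #ifndef USE_UPSTREAM_TUNNEL
    #define gre_parse_header rpl_gre_parse_header
    int rpl_gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
                             bool *csum_err, __be16 proto, int nhs);
    #endif /* USE_UPSTREAM_TUNNEL */
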
 acinclude.m4                                   |  10 +-
 datapath/linux/Modules.mk                      |   3 +-
 datapath/linux/compat/gre.c                    | 202 ++++++------
 datapath/linux/compat/include/linux/if_ether.h |   4 +
 datapath/linux/compat/include/linux/skbuff.h   |  29 ++
 datapath/linux/compat/include/net/erspan.h     |  65 ++++
 datapath/linux/compat/include/net/gre.h        |  13 +-
 datapath/linux/compat/include/net/ip_tunnels.h | 159 ++++++++-
 datapath/linux/compat/ip_gre.c                 | 339 ++++++++++++++++++-
 datapath/linux/compat/ip_tunnel.c              | 439 ++++++++++++++++++++++++-
 datapath/linux/compat/ip_tunnels_core.c        |  41 +++
 11 files changed, 1173 insertions(+), 131 deletions(-)
 create mode 100644 datapath/linux/compat/include/net/erspan.h

diff --git a/acinclude.m4 b/acinclude.m4
index d61e37a..6ef244d 100644
--- a/acinclude.m4
+++ b/acinclude.m4
@@ -814,7 +814,15 @@ AC_DEFUN([OVS_CHECK_LINUX_COMPAT], [
   OVS_GREP_IFELSE([$KSRC/include/net/inet_frag.h],
                   frag_percpu_counter_batch[],
                   [OVS_DEFINE([HAVE_FRAG_PERCPU_COUNTER_BATCH])])
-
+  OVS_FIND_FIELD_IFELSE([$KSRC/include/net/ip_tunnels.h], [tnl_ptk_info],
+                        [hdr_len],
+                        [OVS_DEFINE([HAVE_TNL_PTK_INFO_HDR_LEN])])
+  OVS_GREP_IFELSE([$KSRC/include/linux/skbuff.h],
+                  [null_compute_pseudo],
+                  [OVS_DEFINE([HAVE_NULL_COMPUTE_PSEUDO])])
+  OVS_GREP_IFELSE([$KSRC/include/linux/skbuff.h],
+                  [__skb_checksum_convert],
+                  [OVS_DEFINE([HAVE_SKB_CHECKSUM_CONVERT])])
 
   if cmp -s datapath/linux/kcompat.h.new \
             datapath/linux/kcompat.h >/dev/null 2>&1; then
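
The configure checks above emit HAVE_* defines into kcompat.h, and the
compat headers key their fallbacks off those defines.  For example,
mirroring the skbuff.h hunk below, a helper is supplied only when the
running kernel lacks it:

    #ifndef HAVE_NULL_COMPUTE_PSEUDO
    static inline __wsum null_compute_pseudo(struct sk_buff *skb, int proto)
    {
            return 0;       /* GRE has no pseudo-header contribution */
    }
    #endif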
diff --git a/datapath/linux/Modules.mk b/datapath/linux/Modules.mk
index 0dbc1ed..e0a90c3 100644
--- a/datapath/linux/Modules.mk
+++ b/datapath/linux/Modules.mk
@@ -104,5 +104,6 @@ openvswitch_headers += \
 	linux/compat/include/net/netfilter/nf_conntrack_zones.h \
 	linux/compat/include/net/netfilter/nf_nat.h \
 	linux/compat/include/net/netfilter/ipv6/nf_defrag_ipv6.h \
-	linux/compat/include/net/sctp/checksum.h
+	linux/compat/include/net/sctp/checksum.h \
+	linux/compat/include/net/erspan.h
 EXTRA_DIST += linux/compat/build-aux/export-check-whitelist
diff --git a/datapath/linux/compat/gre.c b/datapath/linux/compat/gre.c
index a341fa3..98924c9 100644
--- a/datapath/linux/compat/gre.c
+++ b/datapath/linux/compat/gre.c
@@ -41,90 +41,24 @@
 #ifndef USE_UPSTREAM_TUNNEL
 #if IS_ENABLED(CONFIG_NET_IPGRE_DEMUX)
 
-#ifndef HAVE_GRE_HANDLE_OFFLOADS
-
-#ifndef HAVE_GRE_CISCO_REGISTER
-
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37)
-
-#define GREPROTO_CISCO		0
-#define GREPROTO_MAX		1
-
-struct gre_protocol {
-	int  (*handler)(struct sk_buff *skb);
-};
-static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly;
-
-static int gre_rcv(struct sk_buff *skb)
-{
-	const struct gre_protocol *proto;
-	u8 ver;
-	int ret;
-
-	if (!pskb_may_pull(skb, 12))
-		goto drop;
-
-	ver = skb->data[1] & 0x7f;
-	if (ver >= GREPROTO_MAX)
-		goto drop;
-
-	rcu_read_lock();
-	proto = rcu_dereference(gre_proto[ver]);
-	if (!proto || !proto->handler)
-		goto drop_unlock;
-	ret = proto->handler(skb);
-	rcu_read_unlock();
-	return ret;
-
-drop_unlock:
-	rcu_read_unlock();
-drop:
-	kfree_skb(skb);
-	return NET_RX_DROP;
-}
-
-static const struct net_protocol net_gre_protocol = {
-	.handler     = gre_rcv,
-	.netns_ok    = 1,
-};
-
-static int gre_add_protocol(const struct gre_protocol *proto, u8 version)
+#define ip_gre_calc_hlen rpl_ip_gre_calc_hlen
+#define gre_calc_hlen rpl_ip_gre_calc_hlen
+static int rpl_ip_gre_calc_hlen(__be16 o_flags)
 {
-	if (version >= GREPROTO_MAX)
-		return -EINVAL;
-
-	if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) {
-		pr_err("%s: cannot register gre protocol handler\n", __func__);
-		return -EAGAIN;
-	}
+	int addend = 4;
 
-	return (cmpxchg((const struct gre_protocol **)&gre_proto[version], NULL, proto) == NULL) ?
-		0 : -EBUSY;
+	if (o_flags & TUNNEL_CSUM)
+		addend += 4;
+	if (o_flags & TUNNEL_KEY)
+		addend += 4;
+	if (o_flags & TUNNEL_SEQ)
+		addend += 4;
+	return addend;
 }
 
-static int gre_del_protocol(const struct gre_protocol *proto, u8 version)
-{
-	int ret;
-
-	if (version >= GREPROTO_MAX)
-		return -EINVAL;
-
-	ret = (cmpxchg((const struct gre_protocol **)&gre_proto[version], proto, NULL) == proto) ?
-		0 : -EBUSY;
-
-	if (ret)
-		return ret;
-
-	synchronize_net();
-
-	ret = inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
-	if (ret)
-		return ret;
-
-	return 0;
-}
+#ifndef HAVE_GRE_HANDLE_OFFLOADS
 
-#endif
+#ifndef HAVE_GRE_CISCO_REGISTER
 
 static __sum16 check_checksum(struct sk_buff *skb)
 {
@@ -148,20 +82,6 @@ static __sum16 check_checksum(struct sk_buff *skb)
 	return csum;
 }
 
-#define ip_gre_calc_hlen rpl_ip_gre_calc_hlen
-static int ip_gre_calc_hlen(__be16 o_flags)
-{
-	int addend = 4;
-
-	if (o_flags & TUNNEL_CSUM)
-		addend += 4;
-	if (o_flags & TUNNEL_KEY)
-		addend += 4;
-	if (o_flags & TUNNEL_SEQ)
-		addend += 4;
-	return addend;
-}
-
 #define gre_flags_to_tnl_flags rpl_gre_flags_to_tnl_flags
 static __be16 gre_flags_to_tnl_flags(__be16 flags)
 {
@@ -309,5 +229,101 @@ EXPORT_SYMBOL_GPL(rpl_gre_cisco_unregister);
 #endif /* !HAVE_GRE_CISCO_REGISTER */
 #endif
 
+void rpl_gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
+			  int hdr_len)
+{
+	struct gre_base_hdr *greh;
+
+	skb_push(skb, hdr_len);
+
+	skb_reset_transport_header(skb);
+	greh = (struct gre_base_hdr *)skb->data;
+	greh->flags = tnl_flags_to_gre_flags(tpi->flags);
+	greh->protocol = tpi->proto;
+
+	if (tpi->flags&(TUNNEL_KEY|TUNNEL_CSUM|TUNNEL_SEQ)) {
+		__be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4);
+
+		if (tpi->flags&TUNNEL_SEQ) {
+			*ptr = tpi->seq;
+			ptr--;
+		}
+		if (tpi->flags&TUNNEL_KEY) {
+			*ptr = tpi->key;
+			ptr--;
+		}
+		if (tpi->flags&TUNNEL_CSUM &&
+		    !(skb_shinfo(skb)->gso_type &
+		      (SKB_GSO_GRE|SKB_GSO_GRE_CSUM))) {
+			*ptr = 0;
+			*(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0,
+								 skb->len, 0));
+		}
+	}
+}
+EXPORT_SYMBOL_GPL(rpl_gre_build_header);
+
+/* Fills in tpi and returns header length to be pulled. */
+int rpl_gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
+		     bool *csum_err, __be16 proto, int nhs)
+{
+	const struct gre_base_hdr *greh;
+	__be32 *options;
+	int hdr_len;
+
+	if (unlikely(!pskb_may_pull(skb, nhs + sizeof(struct gre_base_hdr))))
+		return -EINVAL;
+
+	greh = (struct gre_base_hdr *)(skb->data + nhs);
+	if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING)))
+		return -EINVAL;
+
+	tpi->flags = gre_flags_to_tnl_flags(greh->flags);
+	hdr_len = gre_calc_hlen(tpi->flags);
+
+	if (!pskb_may_pull(skb, nhs + hdr_len))
+		return -EINVAL;
+
+	greh = (struct gre_base_hdr *)(skb->data + nhs);
+	tpi->proto = greh->protocol;
+
+	options = (__be32 *)(greh + 1);
+	if (greh->flags & GRE_CSUM) {
+		if (skb_checksum_simple_validate(skb)) {
+			*csum_err = true;
+			return -EINVAL;
+		}
+
+		skb_checksum_try_convert(skb, IPPROTO_GRE, 0,
+					 null_compute_pseudo);
+		options++;
+	}
+
+	if (greh->flags & GRE_KEY) {
+		tpi->key = *options;
+		options++;
+	} else {
+		tpi->key = 0;
+	}
+	if (unlikely(greh->flags & GRE_SEQ)) {
+		tpi->seq = *options;
+		options++;
+	} else {
+		tpi->seq = 0;
+	}
+	/* WCCP version 1 and 2 protocol decoding.
+	 * - Change protocol to IPv4/IPv6
+	 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
+	 */
+	if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) {
+		tpi->proto = proto;
+		if ((*(u8 *)options & 0xF0) != 0x40)
+			hdr_len += 4;
+	}
+	tpi->hdr_len = hdr_len;
+	return hdr_len;
+}
+EXPORT_SYMBOL(rpl_gre_parse_header);
+
 #endif /* CONFIG_NET_IPGRE_DEMUX */
 #endif /* USE_UPSTREAM_TUNNEL */
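
As a worked check of rpl_ip_gre_calc_hlen() above (not part of the
patch): ERSPAN's GRE header sets only the sequence bit, so the helper
returns the 4-byte base plus 4 for the sequence number -- the "8
octets" shown in the erspan.h diagram added below.  With both key and
sequence flags present it would return 12:

    /* hypothetical callers; TUNNEL_* flags as in ip_tunnels.h */
    int hlen  = rpl_ip_gre_calc_hlen(TUNNEL_SEQ);               /* 4 + 4 = 8      */
    int hlen2 = rpl_ip_gre_calc_hlen(TUNNEL_KEY | TUNNEL_SEQ);  /* 4 + 4 + 4 = 12 */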
diff --git a/datapath/linux/compat/include/linux/if_ether.h b/datapath/linux/compat/include/linux/if_ether.h
index c989c6e..aaa88db 100644
--- a/datapath/linux/compat/include/linux/if_ether.h
+++ b/datapath/linux/compat/include/linux/if_ether.h
@@ -23,6 +23,10 @@
 #define ETH_P_NSH       0x894F          /* Network Service Header */
 #endif
 
+#ifndef ETH_P_ERSPAN
+#define ETH_P_ERSPAN	0x88BE		/* ERSPAN TYPE II */
+#endif
+
 #define inner_eth_hdr rpl_inner_eth_hdr
 static inline struct ethhdr *inner_eth_hdr(const struct sk_buff *skb)
 {
diff --git a/datapath/linux/compat/include/linux/skbuff.h b/datapath/linux/compat/include/linux/skbuff.h
index 2910f3f..f0665ed 100644
--- a/datapath/linux/compat/include/linux/skbuff.h
+++ b/datapath/linux/compat/include/linux/skbuff.h
@@ -21,6 +21,35 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 #define ignore_df local_df
 #endif
 
+
+#ifndef HAVE_NULL_COMPUTE_PSEUDO
+static inline __wsum null_compute_pseudo(struct sk_buff *skb, int proto)
+{
+	return 0;
+}
+#endif
+
+#ifndef HAVE_SKB_CHECKSUM_CONVERT
+static inline bool __skb_checksum_convert_check(struct sk_buff *skb)
+{
+	return (skb->ip_summed == CHECKSUM_NONE && skb->csum_valid);
+}
+
+static inline void __skb_checksum_convert(struct sk_buff *skb,
+					  __sum16 check, __wsum pseudo)
+{
+	skb->csum = ~pseudo;
+	skb->ip_summed = CHECKSUM_COMPLETE;
+}
+
+#define skb_checksum_try_convert(skb, proto, check, compute_pseudo)	\
+do {									\
+	if (__skb_checksum_convert_check(skb))				\
+		__skb_checksum_convert(skb, check,			\
+				       compute_pseudo(skb, proto));	\
+} while (0)
+
+#endif
 #ifndef HAVE_SKB_COPY_FROM_LINEAR_DATA_OFFSET
 static inline void skb_copy_from_linear_data_offset(const struct sk_buff *skb,
 						    const int offset, void *to,
diff --git a/datapath/linux/compat/include/net/erspan.h b/datapath/linux/compat/include/net/erspan.h
new file mode 100644
index 0000000..6c5d3a7
--- /dev/null
+++ b/datapath/linux/compat/include/net/erspan.h
@@ -0,0 +1,65 @@
+#ifndef USE_UPSTREAM_TUNNEL
+#ifndef __LINUX_ERSPAN_H
+#define __LINUX_ERSPAN_H
+
+/*
+ * GRE header for ERSPAN encapsulation (8 octets [34:41]) -- 8 bytes
+ *       0                   1                   2                   3
+ *      0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *     |0|0|0|1|0|00000|000000000|00000|    Protocol Type for ERSPAN   |
+ *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *     |      Sequence Number (increments per packet per session)      |
+ *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ *  Note that in the above GRE header [RFC1701] out of the C, R, K, S,
+ *  s, Recur, Flags, Version fields only S (bit 03) is set to 1. The
+ *  other fields are set to zero, so only a sequence number follows.
+ *
+ *  ERSPAN Type II header (8 octets [42:49])
+ *  0                   1                   2                   3
+ *  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |  Ver  |          VLAN         | COS | En|T|    Session ID     |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |      Reserved         |                  Index                |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * GRE proto ERSPAN type II = 0x88BE, type III = 0x22EB
+ */
+
+#define ERSPAN_VERSION	0x1
+
+#define VER_MASK	0xf000
+#define VLAN_MASK	0x0fff
+#define COS_MASK	0xe000
+#define EN_MASK		0x1800
+#define T_MASK		0x0400
+#define ID_MASK		0x03ff
+#define INDEX_MASK	0xfffff
+
+enum erspan_encap_type {
+	ERSPAN_ENCAP_NOVLAN = 0x0,	/* originally without VLAN tag */
+	ERSPAN_ENCAP_ISL = 0x1,		/* originally ISL encapsulated */
+	ERSPAN_ENCAP_8021Q = 0x2,	/* originally 802.1Q encapsulated */
+	ERSPAN_ENCAP_INFRAME = 0x3,	/* VLAN tag preserved in frame */
+};
+
+struct erspan_metadata {
+	__be32 index;   /* type II */
+};
+
+struct erspanhdr {
+	__be16 ver_vlan;
+#define VER_OFFSET  12
+	__be16 session_id;
+#define COS_OFFSET  13
+#define EN_OFFSET   11
+#define T_OFFSET    10
+	struct erspan_metadata md;
+};
+
+#endif
+#else
+#include_next <net/erspan.h>
+#endif
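
A small standalone sketch (not part of the patch) of how the masks and
offsets above compose the two type II header words, mirroring
erspan_build_header() in the ip_gre.c hunk further below; session ID
100 and index 123 match the commit-message example:

    #include <stdint.h>
    #include <stdio.h>
    #include <arpa/inet.h>

    #define ERSPAN_VERSION 0x1
    #define VER_OFFSET     12
    #define VLAN_MASK      0x0fff
    #define ID_MASK        0x03ff
    #define INDEX_MASK     0xfffff

    int main(void)
    {
            uint16_t vlan_tci = 0;                  /* ERSPAN_ENCAP_NOVLAN case */
            uint32_t session_id = 100, index = 123;

            uint16_t ver_vlan = htons((vlan_tci & VLAN_MASK) |
                                      (ERSPAN_VERSION << VER_OFFSET));
            uint16_t sid      = htons(session_id & ID_MASK);
            uint32_t md_index = htonl(index & INDEX_MASK);

            /* prints ver_vlan=0x1000 session_id=0x0064 index=0x0000007b */
            printf("ver_vlan=%#06x session_id=%#06x index=%#010x\n",
                   (unsigned)ntohs(ver_vlan), (unsigned)ntohs(sid),
                   (unsigned)ntohl(md_index));
            return 0;
    }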
diff --git a/datapath/linux/compat/include/net/gre.h b/datapath/linux/compat/include/net/gre.h
index 764b9f1..ead86f6 100644
--- a/datapath/linux/compat/include/net/gre.h
+++ b/datapath/linux/compat/include/net/gre.h
@@ -28,11 +28,7 @@ static inline struct net_device *rpl_gretap_fb_dev_create(
 #endif
 
 #else
-
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,37) || \
-   defined(HAVE_GRE_CISCO_REGISTER)
 #include_next <net/gre.h>
-#endif
 
 #ifndef HAVE_GRE_CISCO_REGISTER
 
@@ -62,6 +58,10 @@ struct gre_base_hdr {
 
 #endif /* HAVE_GRE_CISCO_REGISTER */
 
+#define gre_build_header rpl_gre_build_header
+void rpl_gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
+			  int hdr_len);
+
 int rpl_ipgre_init(void);
 void rpl_ipgre_fini(void);
 
@@ -69,6 +69,10 @@ void rpl_ipgre_fini(void);
 struct net_device *rpl_gretap_fb_dev_create(struct net *net, const char *name,
 					u8 name_assign_type);
 
+#define gre_parse_header rpl_gre_parse_header
+int rpl_gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
+			 bool *csum_err, __be16 proto, int nhs);
+
 #define gre_fb_xmit rpl_gre_fb_xmit
 netdev_tx_t rpl_gre_fb_xmit(struct sk_buff *skb);
 #endif /* USE_UPSTREAM_TUNNEL */
@@ -79,4 +83,5 @@ netdev_tx_t rpl_gre_fb_xmit(struct sk_buff *skb);
 #define gre_fill_metadata_dst ovs_gre_fill_metadata_dst
 int ovs_gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb);
 
+
 #endif
diff --git a/datapath/linux/compat/include/net/ip_tunnels.h b/datapath/linux/compat/include/net/ip_tunnels.h
index ae60f09..e3fb13b 100644
--- a/datapath/linux/compat/include/net/ip_tunnels.h
+++ b/datapath/linux/compat/include/net/ip_tunnels.h
@@ -8,6 +8,11 @@
  * Only function that do not depend on ip_tunnel structure can
  * be used. Those needs to be explicitly defined in this header file. */
 #include_next <net/ip_tunnels.h>
+
+#ifndef TUNNEL_ERSPAN_OPT
+#define TUNNEL_ERSPAN_OPT	__cpu_to_be16(0x4000)
+#endif
+#define ovs_ip_tunnel_encap ip_tunnel_encap
 #else
 
 #include <linux/if_tunnel.h>
@@ -18,6 +23,21 @@
 #include <net/inet_ecn.h>
 #include <net/ip.h>
 #include <net/rtnetlink.h>
+#include <net/gro_cells.h>
+
+#ifndef MAX_IPTUN_ENCAP_OPS
+#define MAX_IPTUN_ENCAP_OPS 8
+#endif
+
+#ifndef HAVE_TUNNEL_ENCAP_TYPES
+enum tunnel_encap_types {
+	TUNNEL_ENCAP_NONE,
+	TUNNEL_ENCAP_FOU,
+	TUNNEL_ENCAP_GUE,
+};
+
+#define HAVE_TUNNEL_ENCAP_TYPES 1
+#endif
 
 #define __iptunnel_pull_header rpl___iptunnel_pull_header
 int rpl___iptunnel_pull_header(struct sk_buff *skb, int hdr_len,
@@ -41,13 +61,17 @@ int ovs_iptunnel_handle_offloads(struct sk_buff *skb,
  */
 #define iptunnel_handle_offloads rpl_iptunnel_handle_offloads
 struct sk_buff *rpl_iptunnel_handle_offloads(struct sk_buff *skb,
-					 bool csum_help,
-					 int gso_type_mask);
+					     bool csum_help,
+					     int gso_type_mask);
 
 #define iptunnel_xmit rpl_iptunnel_xmit
 void rpl_iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
 		       __be32 src, __be32 dst, __u8 proto, __u8 tos, __u8 ttl,
 		       __be16 df, bool xnet);
+#define ip_tunnel_xmit rpl_ip_tunnel_xmit
+void rpl_ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
+			const struct iphdr *tnl_params, const u8 protocol);
+
 
 #ifndef TUNNEL_CSUM
 #define TUNNEL_CSUM	__cpu_to_be16(0x01)
@@ -64,12 +88,18 @@ struct tnl_ptk_info {
 	__be16 proto;
 	__be32 key;
 	__be32 seq;
+#ifndef HAVE_TNL_PTK_INFO_HDR_LEN
+	int hdr_len;
+#endif
 };
 
 #define PACKET_RCVD	0
 #define PACKET_REJECT	1
 #endif
 
+#define IP_TNL_HASH_BITS   7
+#define IP_TNL_HASH_SIZE   (1 << IP_TNL_HASH_BITS)
+
 #ifndef TUNNEL_DONT_FRAGMENT
 #define TUNNEL_DONT_FRAGMENT	__cpu_to_be16(0x0100)
 #endif
@@ -91,6 +121,9 @@ struct tnl_ptk_info {
 #undef TUNNEL_OPTIONS_PRESENT
 #define TUNNEL_OPTIONS_PRESENT (TUNNEL_GENEVE_OPT | TUNNEL_VXLAN_OPT)
 
+/* Keep error state on tunnel for 30 sec */
+#define IPTUNNEL_ERR_TIMEO	(30*HZ)
+
 /* Used to memset ip_tunnel padding. */
 #define IP_TUNNEL_KEY_SIZE	offsetofend(struct ip_tunnel_key, tp_dst)
 
@@ -131,6 +164,30 @@ struct ip_tunnel_info {
 	u8			mode;
 };
 
+/* 6rd prefix/relay information */
+#ifdef CONFIG_IPV6_SIT_6RD
+struct ip_tunnel_6rd_parm {
+	struct in6_addr		prefix;
+	__be32			relay_prefix;
+	u16			prefixlen;
+	u16			relay_prefixlen;
+};
+#endif
+
+struct ip_tunnel_encap {
+	u16			type;
+	u16			flags;
+	__be16			sport;
+	__be16			dport;
+};
+
+struct ip_tunnel_prl_entry {
+	struct ip_tunnel_prl_entry __rcu *next;
+	__be32				addr;
+	u16				flags;
+	struct rcu_head			rcu_head;
+};
+
 static inline unsigned short ip_tunnel_info_af(const struct ip_tunnel_info *tun_info)
 {
 	return tun_info->mode & IP_TUNNEL_INFO_IPV6 ? AF_INET6 : AF_INET;
@@ -203,39 +260,115 @@ ip_tunnel_dst_cache_usable(const struct sk_buff *skb,
 }
 #endif
 
-#define ip_tunnel rpl_ip_tunnel
+#define ip_tunnel_dst rpl_ip_tunnel_dst
+struct rpl_ip_tunnel_dst {
+	struct dst_entry __rcu		*dst;
+	__be32				saddr;
+};
 
+#define ip_tunnel rpl_ip_tunnel
 struct ip_tunnel {
+	struct ip_tunnel __rcu	*next;
+	struct hlist_node hash_node;
 	struct net_device	*dev;
 	struct net		*net;	/* netns for packet i/o */
 
-	int		err_count;	/* Number of arrived ICMP errors */
 	unsigned long	err_time;	/* Time when the last ICMP error
-					 * arrived
-					 */
+					 * arrived */
+	int		err_count;	/* Number of arrived ICMP errors */
 
 	/* These four fields used only by GRE */
 	u32		i_seqno;	/* The last seen seqno	*/
 	u32		o_seqno;	/* The last output seqno */
 	int		tun_hlen;	/* Precalculated header length */
-	int		mlink;
+
+	/* These four fields used only by ERSPAN */
+	u32		index;		/* ERSPAN type II index */
+	u8		erspan_ver;	/* ERSPAN version */
+	u8		dir;		/* ERSPAN direction */
+	u16		hwid;		/* ERSPAN hardware ID */
+
+	struct dst_cache dst_cache;
 
 	struct ip_tunnel_parm parms;
 
+	int		mlink;
 	int		encap_hlen;	/* Encap header length (FOU,GUE) */
 	int		hlen;		/* tun_hlen + encap_hlen */
+	struct ip_tunnel_encap encap;
 
-	int		ip_tnl_net_id;
-	bool		collect_md;
+	/* for SIT */
+#ifdef CONFIG_IPV6_SIT_6RD
+	struct ip_tunnel_6rd_parm ip6rd;
+#endif
+	struct ip_tunnel_prl_entry __rcu *prl;	/* potential router list */
+	unsigned int		prl_count;	/* # of entries in PRL */
+	unsigned int		ip_tnl_net_id;
+	struct gro_cells	gro_cells;
+	__u32			fwmark;
+	bool			collect_md;
+	bool			ignore_df;
 };
 
 #define ip_tunnel_net rpl_ip_tunnel_net
 struct ip_tunnel_net {
+	struct net_device *fb_tunnel_dev;
+	struct hlist_head tunnels[IP_TNL_HASH_SIZE];
 	struct ip_tunnel __rcu *collect_md_tun;
-	struct rtnl_link_ops *rtnl_ops;
 };
 
 
+struct ip_tunnel_encap_ops {
+	size_t (*encap_hlen)(struct ip_tunnel_encap *e);
+	int (*build_header)(struct sk_buff *skb, struct ip_tunnel_encap *e,
+			    const u8 *protocol, struct flowi4 *fl4);
+};
+
+extern const struct ip_tunnel_encap_ops __rcu *
+		rpl_iptun_encaps[MAX_IPTUN_ENCAP_OPS];
+
+#define ip_encap_hlen rpl_ip_encap_hlen
+static inline int rpl_ip_encap_hlen(struct ip_tunnel_encap *e)
+{
+	const struct ip_tunnel_encap_ops *ops;
+	int hlen = -EINVAL;
+
+	if (e->type == TUNNEL_ENCAP_NONE)
+		return 0;
+
+	if (e->type >= MAX_IPTUN_ENCAP_OPS)
+		return -EINVAL;
+
+	rcu_read_lock();
+	ops = rcu_dereference(rpl_iptun_encaps[e->type]);
+	if (likely(ops && ops->encap_hlen))
+		hlen = ops->encap_hlen(e);
+	rcu_read_unlock();
+
+	return hlen;
+}
+
+static inline int ovs_ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
+				      const u8 *protocol, struct flowi4 *fl4)
+{
+	const struct ip_tunnel_encap_ops *ops;
+	int ret = -EINVAL;
+
+	if (t->encap.type == TUNNEL_ENCAP_NONE)
+		return 0;
+
+	if (t->encap.type >= MAX_IPTUN_ENCAP_OPS)
+		return -EINVAL;
+
+	rcu_read_lock();
+	ops = rcu_dereference(rpl_iptun_encaps[t->encap.type]);
+	if (likely(ops && ops->build_header))
+		ret = ops->build_header(skb, &t->encap, protocol, fl4);
+	rcu_read_unlock();
+
+	return ret;
+}
+
 #ifndef HAVE_PCPU_SW_NETSTATS
 #define ip_tunnel_get_stats64 rpl_ip_tunnel_get_stats64
 #else
@@ -326,6 +459,12 @@ struct net *rpl_ip_tunnel_get_link_net(const struct net_device *dev);
 #define __ip_tunnel_change_mtu rpl___ip_tunnel_change_mtu
 int rpl___ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict);
 
+#define ip_tunnel_lookup rpl_ip_tunnel_lookup
+struct ip_tunnel *rpl_ip_tunnel_lookup(struct ip_tunnel_net *itn,
+				       int link, __be16 flags,
+				       __be32 remote, __be32 local,
+				       __be32 key);
+
 static inline int iptunnel_pull_offloads(struct sk_buff *skb)
 {
 	if (skb_is_gso(skb)) {
diff --git a/datapath/linux/compat/ip_gre.c b/datapath/linux/compat/ip_gre.c
index 4e32591..a82808a 100644
--- a/datapath/linux/compat/ip_gre.c
+++ b/datapath/linux/compat/ip_gre.c
@@ -52,6 +52,7 @@
 #include <net/rtnetlink.h>
 #include <net/gre.h>
 #include <net/dst_metadata.h>
+#include <net/erspan.h>
 
 #if IS_ENABLED(CONFIG_IPV6)
 #include <net/ipv6.h>
@@ -63,6 +64,7 @@
 #include "vport-netdev.h"
 
 static int gre_tap_net_id __read_mostly;
+static unsigned int erspan_net_id __read_mostly;
 
 #define ip_gre_calc_hlen rpl_ip_gre_calc_hlen
 static int ip_gre_calc_hlen(__be16 o_flags)
@@ -120,6 +122,63 @@ static __be32 tunnel_id_to_key(__be64 x)
 #endif
 }
 
+/* Called with rcu_read_lock and BH disabled. */
+static int gre_err(struct sk_buff *skb, u32 info,
+		   const struct tnl_ptk_info *tpi)
+{
+	return PACKET_REJECT;
+}
+
+static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
+		      int gre_hdr_len)
+{
+	struct net *net = dev_net(skb->dev);
+	struct metadata_dst *tun_dst = NULL;
+	struct ip_tunnel_net *itn;
+	struct ip_tunnel *tunnel;
+	struct erspanhdr *ershdr;
+	const struct iphdr *iph;
+	__be32 session_id;
+	__be32 index;
+	int len;
+
+	itn = net_generic(net, erspan_net_id);
+	iph = ip_hdr(skb);
+	len = gre_hdr_len + sizeof(*ershdr);
+
+	if (unlikely(!pskb_may_pull(skb, len)))
+		return -ENOMEM;
+
+	iph = ip_hdr(skb);
+	ershdr = (struct erspanhdr *)(skb->data + gre_hdr_len);
+
+	/* The original GRE header does not have a key field,
+	 * so use the ERSPAN 10-bit session ID as the key.
+	 */
+	session_id = cpu_to_be32(ntohs(ershdr->session_id));
+	tpi->key = session_id;
+	index = ershdr->md.index;
+	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex,
+				  tpi->flags | TUNNEL_KEY,
+				  iph->saddr, iph->daddr, tpi->key);
+
+	if (tunnel) {
+		if (__iptunnel_pull_header(skb,
+					   gre_hdr_len + sizeof(*ershdr),
+					   htons(ETH_P_TEB),
+					   false, false) < 0)
+			goto drop;
+
+		tunnel->index = ntohl(index);
+		skb_reset_mac_header(skb);
+		ovs_ip_tunnel_rcv(tunnel->dev, skb, tun_dst);
+		return PACKET_RCVD;
+	}
+drop:
+	kfree_skb(skb);
+	return PACKET_RCVD;
+}
+
 static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
 {
 	struct net *net = dev_net(skb->dev);
@@ -164,11 +223,46 @@ static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
 	return PACKET_REJECT;
 }
 
-static int gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
+static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
+		       const struct iphdr *tnl_params,
+		       __be16 proto)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	struct tnl_ptk_info tpi;
+
+	tpi.flags = tunnel->parms.o_flags;
+	tpi.proto = proto;
+	tpi.key = tunnel->parms.o_key;
+	if (tunnel->parms.o_flags & TUNNEL_SEQ)
+		tunnel->o_seqno++;
+	tpi.seq = htonl(tunnel->o_seqno);
+
+	/* Push GRE header. */
+	gre_build_header(skb, &tpi, tunnel->hlen);
+
+	ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
+}
+
+static int gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *unused_tpi)
 {
-	if (ipgre_rcv(skb, tpi) == PACKET_RCVD)
+	struct tnl_ptk_info tpi;
+	bool csum_err = false;
+	int hdr_len;
+
+	hdr_len = gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP), 0);
+	if (hdr_len < 0)
+		goto drop;
+
+	if (unlikely(tpi.proto == htons(ETH_P_ERSPAN))) {
+		if (erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
+			return 0;
+	}
+
+	if (ipgre_rcv(skb, &tpi) == PACKET_RCVD)
 		return 0;
 
+drop:
+
 	kfree_skb(skb);
 	return 0;
 }
@@ -382,13 +476,6 @@ static void __gre_tunnel_init(struct net_device *dev)
 	}
 }
 
-/* Called with rcu_read_lock and BH disabled. */
-static int gre_err(struct sk_buff *skb, u32 info,
-		   const struct tnl_ptk_info *tpi)
-{
-	return PACKET_REJECT;
-}
-
 static struct gre_cisco_protocol ipgre_protocol = {
 	.handler        = gre_rcv,
 	.err_handler    = gre_err,
@@ -437,6 +524,61 @@ out:
 	return ipgre_tunnel_validate(tb, data);
 }
 
+enum {
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0)
+	IFLA_GRE_ENCAP_TYPE = IFLA_GRE_FLAGS + 1,
+	IFLA_GRE_ENCAP_FLAGS,
+	IFLA_GRE_ENCAP_SPORT,
+	IFLA_GRE_ENCAP_DPORT,
+#endif
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4,3,0)
+	IFLA_GRE_COLLECT_METADATA = IFLA_GRE_ENCAP_DPORT + 1,
+#endif
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4,8,0)
+	IFLA_GRE_IGNORE_DF = IFLA_GRE_COLLECT_METADATA + 1,
+#endif
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4,12,0)
+	IFLA_GRE_FWMARK = IFLA_GRE_IGNORE_DF + 1,
+#endif
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4,15,0)
+	IFLA_GRE_ERSPAN_INDEX = IFLA_GRE_FWMARK + 1,
+#endif
+};
+
+#define RPL_IFLA_GRE_MAX (IFLA_GRE_ERSPAN_INDEX + 1)
+
+static int erspan_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+	__be16 flags = 0;
+	int ret;
+
+	if (!data)
+		return 0;
+
+	ret = ipgre_tap_validate(tb, data);
+	if (ret)
+		return ret;
+
+	/* ERSPAN should only have the GRE sequence and key flags */
+	flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
+	flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
+	if (flags != (GRE_SEQ | GRE_KEY))
+		return -EINVAL;
+
+	/* The ERSPAN session ID is only 10 bits.  Since we reuse
+	 * the 32-bit key field as the ID, check its range.
+	 */
+	if (data[IFLA_GRE_IKEY] &&
+	    (ntohl(nla_get_be32(data[IFLA_GRE_IKEY])) & ~ID_MASK))
+		return -EINVAL;
+
+	if (data[IFLA_GRE_OKEY] &&
+	    (ntohl(nla_get_be32(data[IFLA_GRE_OKEY])) & ~ID_MASK))
+		return -EINVAL;
+
+	return 0;
+}
+
 static void ipgre_netlink_parms(struct net_device *dev,
 				struct nlattr *data[],
 				struct nlattr *tb[],
@@ -466,6 +608,81 @@ static netdev_tx_t gre_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 	return NETDEV_TX_OK;
 }
 
+static inline u8 tos_to_cos(u8 tos)
+{
+	u8 dscp, cos;
+
+	dscp = tos >> 2;
+	cos = dscp >> 3;
+	return cos;
+}
+
+static void erspan_build_header(struct sk_buff *skb,
+				__be32 id, u32 index, bool truncate)
+{
+	struct iphdr *iphdr = ip_hdr(skb);
+	struct ethhdr *eth = eth_hdr(skb);
+	enum erspan_encap_type enc_type;
+	struct erspanhdr *ershdr;
+	struct qtag_prefix {
+		__be16 eth_type;
+		__be16 tci;
+	} *qp;
+	u16 vlan_tci = 0;
+
+	enc_type = ERSPAN_ENCAP_NOVLAN;
+
+	/* If the mirrored packet has a vlan tag, extract the tci and
+	 * preserve the vlan header in the mirrored frame.
+	 */
+	if (eth->h_proto == htons(ETH_P_8021Q)) {
+		qp = (struct qtag_prefix *)(skb->data + 2 * ETH_ALEN);
+		vlan_tci = ntohs(qp->tci);
+		enc_type = ERSPAN_ENCAP_INFRAME;
+	}
+
+	skb_push(skb, sizeof(*ershdr));
+	ershdr = (struct erspanhdr *)skb->data;
+	memset(ershdr, 0, sizeof(*ershdr));
+
+	ershdr->ver_vlan = htons((vlan_tci & VLAN_MASK) |
+				 (ERSPAN_VERSION << VER_OFFSET));
+	ershdr->session_id = htons((u16)(ntohl(id) & ID_MASK) |
+			   ((tos_to_cos(iphdr->tos) << COS_OFFSET) & COS_MASK) |
+			   (enc_type << EN_OFFSET & EN_MASK) |
+			   ((truncate << T_OFFSET) & T_MASK));
+	ershdr->md.index = htonl(index & INDEX_MASK);
+}
+
+static netdev_tx_t erspan_xmit(struct sk_buff *skb,
+			       struct net_device *dev)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	bool truncate = false;
+
+	if (gre_handle_offloads(skb, false))
+		goto free_skb;
+
+	if (skb_cow_head(skb, dev->needed_headroom))
+		goto free_skb;
+
+	if (skb->len > dev->mtu) {
+		pskb_trim(skb, dev->mtu);
+		truncate = true;
+	}
+
+	/* Push ERSPAN header */
+	erspan_build_header(skb, tunnel->parms.o_key, tunnel->index, truncate);
+	tunnel->parms.o_flags &= ~TUNNEL_KEY;
+	__gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_ERSPAN));
+	return NETDEV_TX_OK;
+
+free_skb:
+	kfree_skb(skb);
+	dev->stats.tx_dropped++;
+	return NETDEV_TX_OK;
+}
+
 int ovs_gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 {
 	struct ip_tunnel_info *info = skb_tunnel_info(skb);
@@ -503,6 +720,40 @@ static const struct net_device_ops gre_tap_netdev_ops = {
 #endif
 };
 
+static int erspan_tunnel_init(struct net_device *dev)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	int t_hlen;
+
+	tunnel->tun_hlen = 8;
+	tunnel->parms.iph.protocol = IPPROTO_GRE;
+	t_hlen = tunnel->hlen + sizeof(struct iphdr) + sizeof(struct erspanhdr);
+
+	dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4;
+	dev->mtu = ETH_DATA_LEN - t_hlen - 4;
+	dev->features		|= GRE_FEATURES;
+	dev->hw_features	|= GRE_FEATURES;
+	dev->priv_flags		|= IFF_LIVE_ADDR_CHANGE;
+
+	return ip_tunnel_init(dev);
+}
+
+static const struct net_device_ops erspan_netdev_ops = {
+	.ndo_init		= erspan_tunnel_init,
+	.ndo_uninit		= rpl_ip_tunnel_uninit,
+	.ndo_start_xmit		= erspan_xmit,
+	.ndo_set_mac_address	= eth_mac_addr,
+	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_change_mtu		= ip_tunnel_change_mtu,
+	.ndo_get_stats64	= rpl_ip_tunnel_get_stats64,
+#ifdef HAVE_NDO_GET_IFLINK
+	.ndo_get_iflink		= rpl_ip_tunnel_get_iflink,
+#endif
+#ifdef HAVE_NDO_FILL_METADATA_DST
+	.ndo_fill_metadata_dst	= gre_fill_metadata_dst,
+#endif
+};
+
 static void ipgre_tap_setup(struct net_device *dev)
 {
 	ether_setup(dev);
@@ -561,6 +812,8 @@ static size_t ipgre_get_size(const struct net_device *dev)
 		nla_total_size(2) +
 		/* IFLA_GRE_COLLECT_METADATA */
 		nla_total_size(0) +
+		/* IFLA_GRE_ERSPAN_INDEX */
+		nla_total_size(4) +
 		0;
 }
 
@@ -582,13 +835,26 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
 		       !!(p->iph.frag_off & htons(IP_DF))))
 		goto nla_put_failure;
 
+	if (t->index)
+		if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index))
+			goto nla_put_failure;
+
 	return 0;
 
 nla_put_failure:
 	return -EMSGSIZE;
 }
 
-static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
+static void erspan_setup(struct net_device *dev)
+{
+	ether_setup(dev);
+	dev->netdev_ops = &erspan_netdev_ops;
+	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
+	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+	ip_tunnel_setup(dev, erspan_net_id);
+}
+
+static const struct nla_policy ipgre_policy[RPL_IFLA_GRE_MAX + 1] = {
 	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
 	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
 	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
@@ -599,11 +865,12 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
 	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
 	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
 	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
+	[IFLA_GRE_ERSPAN_INDEX]	= { .type = NLA_U32 },
 };
 
 static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
 	.kind		= "ovs_gretap",
-	.maxtype	= IFLA_GRE_MAX,
+	.maxtype	= RPL_IFLA_GRE_MAX,
 	.policy		= ipgre_policy,
 	.priv_size	= sizeof(struct ip_tunnel),
 	.setup		= ipgre_tap_setup,
@@ -617,6 +884,22 @@ static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
 #endif
 };
 
+static struct rtnl_link_ops erspan_link_ops __read_mostly = {
+	.kind		= "erspan",
+	.maxtype	= RPL_IFLA_GRE_MAX,
+	.policy		= ipgre_policy,
+	.priv_size	= sizeof(struct ip_tunnel),
+	.setup		= erspan_setup,
+	.validate	= erspan_validate,
+	.newlink	= ipgre_newlink,
+	.dellink	= ip_tunnel_dellink,
+	.get_size	= ipgre_get_size,
+	.fill_info	= ipgre_fill_info,
+#ifdef HAVE_GET_LINK_NET
+	.get_link_net	= ip_tunnel_get_link_net,
+#endif
+};
+
 struct net_device *rpl_gretap_fb_dev_create(struct net *net, const char *name,
 					u8 name_assign_type)
 {
@@ -657,6 +940,26 @@ out:
 }
 EXPORT_SYMBOL_GPL(rpl_gretap_fb_dev_create);
 
+static int __net_init erspan_init_net(struct net *net)
+{
+	return ip_tunnel_init_net(net, erspan_net_id,
+				  &erspan_link_ops, "erspan0");
+}
+
+static void __net_exit erspan_exit_net(struct net *net)
+{
+	struct ip_tunnel_net *itn = net_generic(net, erspan_net_id);
+
+	ip_tunnel_delete_net(itn, &erspan_link_ops);
+}
+
+static struct pernet_operations erspan_net_ops = {
+	.init = erspan_init_net,
+	.exit = erspan_exit_net,
+	.id   = &erspan_net_id,
+	.size = sizeof(struct ip_tunnel_net),
+};
+
 static int __net_init ipgre_tap_init_net(struct net *net)
 {
 	return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0");
@@ -684,6 +987,10 @@ int rpl_ipgre_init(void)
 	if (err < 0)
 		goto pnet_tap_faied;
 
+	err = register_pernet_device(&erspan_net_ops);
+	if (err < 0)
+		goto pnet_erspan_failed;
+
 	err = gre_cisco_register(&ipgre_protocol);
 	if (err < 0) {
 		pr_info("%s: can't add protocol\n", __func__);
@@ -694,12 +1001,20 @@ int rpl_ipgre_init(void)
 	if (err < 0)
 		goto tap_ops_failed;
 
+	err = rtnl_link_register(&erspan_link_ops);
+	if (err < 0)
+		goto erspan_link_failed;
+
 	pr_info("GRE over IPv4 tunneling driver\n");
 	return 0;
 
+erspan_link_failed:
+	rtnl_link_unregister(&ipgre_tap_ops);
 tap_ops_failed:
 	gre_cisco_unregister(&ipgre_protocol);
 add_proto_failed:
+	unregister_pernet_device(&erspan_net_ops);
+pnet_erspan_failed:
 	unregister_pernet_device(&ipgre_tap_net_ops);
 pnet_tap_faied:
 	pr_err("Error while initializing GRE %d\n", err);
@@ -709,8 +1024,10 @@ pnet_tap_faied:
 void rpl_ipgre_fini(void)
 {
 	rtnl_link_unregister(&ipgre_tap_ops);
+	rtnl_link_unregister(&erspan_link_ops);
 	gre_cisco_unregister(&ipgre_protocol);
 	unregister_pernet_device(&ipgre_tap_net_ops);
+	unregister_pernet_device(&erspan_net_ops);
 }
 
 #endif
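
One consequence of erspan_validate() above worth noting: the session
ID rides in the 32-bit GRE key but must fit ERSPAN's 10-bit field, so
netlink configuration is rejected once the key exceeds ID_MASK.  An
illustrative boundary check (values are examples, not from the patch):

    u32 key = 100;          /* the commit-message example */
    if (key & ~ID_MASK)     /* 100 <= 0x3ff, so this is false: accepted */
            return -EINVAL;

    key = 2048;             /* wider than 10 bits */
    if (key & ~ID_MASK)     /* true: rejected with -EINVAL */
            return -EINVAL;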
diff --git a/datapath/linux/compat/ip_tunnel.c b/datapath/linux/compat/ip_tunnel.c
index be82b55..6e9fa95 100644
--- a/datapath/linux/compat/ip_tunnel.c
+++ b/datapath/linux/compat/ip_tunnel.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013 Nicira, Inc.
+ * Copyright (c) 2013,2018 Nicira, Inc.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
@@ -18,7 +18,6 @@
 #include <linux/module.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
-#include <linux/kconfig.h>
 #include <linux/slab.h>
 #include <linux/uaccess.h>
 #include <linux/skbuff.h>
@@ -52,7 +51,6 @@
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
 #include <net/rtnetlink.h>
-#include <net/udp.h>
 
 #if IS_ENABLED(CONFIG_IPV6)
 #include <net/ipv6.h>
@@ -63,6 +61,9 @@
 #include "compat.h"
 
 #ifndef USE_UPSTREAM_TUNNEL
+const struct ip_tunnel_encap_ops __rcu *
+		rpl_iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
+
 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
 {
 	if (t->collect_md)
@@ -77,6 +78,52 @@ static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
 		rcu_assign_pointer(itn->collect_md_tun, NULL);
 }
 
+static struct net_device *__ip_tunnel_create(struct net *net,
+					     const struct rtnl_link_ops *ops,
+					     struct ip_tunnel_parm *parms)
+{
+	int err;
+	struct ip_tunnel *tunnel;
+	struct net_device *dev;
+	char name[IFNAMSIZ];
+
+	if (parms->name[0])
+		strlcpy(name, parms->name, IFNAMSIZ);
+	else {
+		if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
+			err = -E2BIG;
+			goto failed;
+		}
+		strlcpy(name, ops->kind, IFNAMSIZ);
+		strncat(name, "%d", 2);
+	}
+
+	ASSERT_RTNL();
+	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
+	if (!dev) {
+		err = -ENOMEM;
+		goto failed;
+	}
+	dev_net_set(dev, net);
+
+	dev->rtnl_link_ops = ops;
+
+	tunnel = netdev_priv(dev);
+	tunnel->parms = *parms;
+	tunnel->net = net;
+
+	err = register_netdevice(dev);
+	if (err)
+		goto failed_free;
+
+	return dev;
+
+failed_free:
+	free_netdev(dev);
+failed:
+	return ERR_PTR(err);
+}
+
 static inline void init_tunnel_flow(struct flowi4 *fl4,
 				    int proto,
 				    __be32 daddr, __be32 saddr,
@@ -162,6 +209,222 @@ int rpl_ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
 	return rpl___ip_tunnel_change_mtu(dev, new_mtu, true);
 }
 
+static int rpl_tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
+			       struct rtable *rt, __be16 df,
+			       const struct iphdr *inner_iph)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
+	int mtu;
+
+	if (df)
+		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
+					- sizeof(struct iphdr) - tunnel->hlen;
+	else
+		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
+
+	if (skb_dst(skb))
+		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
+
+	if (skb->protocol == htons(ETH_P_IP)) {
+		if (!skb_is_gso(skb) &&
+		    (inner_iph->frag_off & htons(IP_DF)) &&
+		    mtu < pkt_size) {
+			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+			return -E2BIG;
+		}
+	}
+#if IS_ENABLED(CONFIG_IPV6)
+	else if (skb->protocol == htons(ETH_P_IPV6)) {
+		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
+
+		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
+			   mtu >= IPV6_MIN_MTU) {
+			if ((tunnel->parms.iph.daddr &&
+			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
+			    rt6->rt6i_dst.plen == 128) {
+				rt6->rt6i_flags |= RTF_MODIFIED;
+				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
+			}
+		}
+
+		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
+					mtu < pkt_size) {
+			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+			return -E2BIG;
+		}
+	}
+#endif
+	return 0;
+}
+
+void rpl_ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
+		        const struct iphdr *tnl_params, const u8 protocol)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	const struct iphdr *inner_iph;
+	struct flowi4 fl4;
+	u8     tos, ttl;
+	__be16 df;
+	struct rtable *rt;		/* Route to the other host */
+	unsigned int max_headroom;	/* The extra header space needed */
+	__be32 dst;
+	bool connected;
+
+	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
+	connected = (tunnel->parms.iph.daddr != 0);
+
+	dst = tnl_params->daddr;
+	if (dst == 0) {
+		/* NBMA tunnel */
+
+		if (skb_dst(skb) == NULL) {
+			dev->stats.tx_fifo_errors++;
+			goto tx_error;
+		}
+
+		if (skb->protocol == htons(ETH_P_IP)) {
+			rt = skb_rtable(skb);
+			dst = rt_nexthop(rt, inner_iph->daddr);
+		}
+#if IS_ENABLED(CONFIG_IPV6)
+		else if (skb->protocol == htons(ETH_P_IPV6)) {
+			const struct in6_addr *addr6;
+			struct neighbour *neigh;
+			bool do_tx_error_icmp;
+			int addr_type;
+
+			neigh = dst_neigh_lookup(skb_dst(skb),
+						 &ipv6_hdr(skb)->daddr);
+			if (neigh == NULL)
+				goto tx_error;
+
+			addr6 = (const struct in6_addr *)&neigh->primary_key;
+			addr_type = ipv6_addr_type(addr6);
+
+			if (addr_type == IPV6_ADDR_ANY) {
+				addr6 = &ipv6_hdr(skb)->daddr;
+				addr_type = ipv6_addr_type(addr6);
+			}
+
+			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
+				do_tx_error_icmp = true;
+			else {
+				do_tx_error_icmp = false;
+				dst = addr6->s6_addr32[3];
+			}
+			neigh_release(neigh);
+			if (do_tx_error_icmp)
+				goto tx_error_icmp;
+		}
+#endif
+		else
+			goto tx_error;
+
+		connected = false;
+	}
+
+	tos = tnl_params->tos;
+	if (tos & 0x1) {
+		tos &= ~0x1;
+		if (skb->protocol == htons(ETH_P_IP)) {
+			tos = inner_iph->tos;
+			connected = false;
+		} else if (skb->protocol == htons(ETH_P_IPV6)) {
+			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
+			connected = false;
+		}
+	}
+
+	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
+			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);
+
+	if (ovs_ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
+		goto tx_error;
+
+	rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) :
+			 NULL;
+
+	if (!rt) {
+		rt = ip_route_output_key(tunnel->net, &fl4);
+
+		if (IS_ERR(rt)) {
+			dev->stats.tx_carrier_errors++;
+			goto tx_error;
+		}
+		if (connected)
+			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
+					  fl4.saddr);
+	}
+
+	if (rt->dst.dev == dev) {
+		ip_rt_put(rt);
+		dev->stats.collisions++;
+		goto tx_error;
+	}
+
+	if (rpl_tnl_update_pmtu(dev, skb, rt,
+				tnl_params->frag_off, inner_iph)) {
+		ip_rt_put(rt);
+		goto tx_error;
+	}
+
+	if (tunnel->err_count > 0) {
+		if (time_before(jiffies,
+				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
+			tunnel->err_count--;
+
+			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+			dst_link_failure(skb);
+		} else
+			tunnel->err_count = 0;
+	}
+
+	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
+	ttl = tnl_params->ttl;
+	if (ttl == 0) {
+		if (skb->protocol == htons(ETH_P_IP))
+			ttl = inner_iph->ttl;
+#if IS_ENABLED(CONFIG_IPV6)
+		else if (skb->protocol == htons(ETH_P_IPV6))
+			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
+#endif
+		else
+			ttl = ip4_dst_hoplimit(&rt->dst);
+	}
+
+	df = tnl_params->frag_off;
+	if (skb->protocol == htons(ETH_P_IP))
+		df |= (inner_iph->frag_off&htons(IP_DF));
+
+	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
+			+ rt->dst.header_len;
+	if (max_headroom > dev->needed_headroom)
+		dev->needed_headroom = max_headroom;
+
+	if (skb_cow_head(skb, dev->needed_headroom)) {
+		ip_rt_put(rt);
+		dev->stats.tx_dropped++;
+		kfree_skb(skb);
+		return;
+	}
+
+	iptunnel_xmit(skb->sk, rt, skb, fl4.saddr, fl4.daddr, protocol,
+		      tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
+
+	return;
+
+#if IS_ENABLED(CONFIG_IPV6)
+tx_error_icmp:
+	dst_link_failure(skb);
+#endif
+tx_error:
+	dev->stats.tx_errors++;
+	kfree_skb(skb);
+}
+EXPORT_SYMBOL_GPL(rpl_ip_tunnel_xmit);
+
 static void ip_tunnel_dev_free(struct net_device *dev)
 {
 	free_percpu(dev->tstats);
@@ -181,7 +444,7 @@ void rpl_ip_tunnel_dellink(struct net_device *dev)
 
 	ip_tunnel_del(itn, netdev_priv(dev));
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39)
-	unregister_netdevice_queue(dev, head);
+        unregister_netdevice_queue(dev, head);
 #endif
 }
 
@@ -189,10 +452,34 @@ int rpl_ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
 				  struct rtnl_link_ops *ops, char *devname)
 {
 	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
+	struct ip_tunnel_parm parms;
+	unsigned int i;
 
-	itn->collect_md_tun = NULL;
-	itn->rtnl_ops = ops;
-	return 0;
+	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
+		INIT_HLIST_HEAD(&itn->tunnels[i]);
+
+	if (!ops) {
+		itn->fb_tunnel_dev = NULL;
+		return 0;
+	}
+
+	memset(&parms, 0, sizeof(parms));
+	if (devname)
+		strlcpy(parms.name, devname, IFNAMSIZ);
+
+	rtnl_lock();
+	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
+	/* FB netdevice is special: we have one, and only one per netns.
+	 * Allowing to move it to another netns is clearly unsafe.
+	 */
+	if (!IS_ERR(itn->fb_tunnel_dev)) {
+		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
+		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
+		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
+	}
+	rtnl_unlock();
+
+	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
 }
 
 static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
@@ -257,20 +544,41 @@ int rpl_ip_tunnel_init(struct net_device *dev)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
 	struct iphdr *iph = &tunnel->parms.iph;
+	int err;
 
-	dev->destructor	= ip_tunnel_dev_free;
-	dev->tstats = (typeof(dev->tstats)) netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
+#ifndef HAVE_NEEDS_FREE_NETDEV
+	dev->destructor = ip_tunnel_dev_free;
+#else
+	dev->needs_free_netdev = true;
+	dev->priv_destructor = ip_tunnel_dev_free;
+#endif
+	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
 	if (!dev->tstats)
 		return -ENOMEM;
+
+	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
+	if (err) {
+		free_percpu(dev->tstats);
+		return err;
+	}
+
+	err = gro_cells_init(&tunnel->gro_cells, dev);
+	if (err) {
+		dst_cache_destroy(&tunnel->dst_cache);
+		free_percpu(dev->tstats);
+		return err;
+	}
+
 	tunnel->dev = dev;
 	tunnel->net = dev_net(dev);
 	strcpy(tunnel->parms.name, dev->name);
 	iph->version		= 4;
 	iph->ihl		= 5;
 
-	if (tunnel->collect_md)
+	if (tunnel->collect_md) {
 		dev->features |= NETIF_F_NETNS_LOCAL;
-
+		netif_keep_dst(dev);
+	}
 	return 0;
 }
 
@@ -306,4 +614,113 @@ struct net *rpl_ip_tunnel_get_link_net(const struct net_device *dev)
 	return tunnel->net;
 }
 
+static unsigned int rpl_ip_tunnel_hash(__be32 key, __be32 remote)
+{
+	return hash_32((__force u32)key ^ (__force u32)remote,
+			 IP_TNL_HASH_BITS);
+}
+
+static bool rpl_ip_tunnel_key_match(const struct ip_tunnel_parm *p,
+				    __be16 flags, __be32 key)
+{
+	if (p->i_flags & TUNNEL_KEY) {
+		if (flags & TUNNEL_KEY)
+			return key == p->i_key;
+		else
+			/* key expected, none present */
+			return false;
+	} else
+		return !(flags & TUNNEL_KEY);
+}
+
+struct ip_tunnel *rpl_ip_tunnel_lookup(struct ip_tunnel_net *itn,
+				       int link, __be16 flags,
+				       __be32 remote, __be32 local,
+				       __be32 key)
+{
+	unsigned int hash;
+	struct ip_tunnel *t, *cand = NULL;
+	struct hlist_head *head;
+
+	hash = rpl_ip_tunnel_hash(key, remote);
+	head = &itn->tunnels[hash];
+
+	hlist_for_each_entry_rcu(t, head, hash_node) {
+		if (local != t->parms.iph.saddr ||
+		    remote != t->parms.iph.daddr ||
+		    !(t->dev->flags & IFF_UP))
+			continue;
+
+		if (!rpl_ip_tunnel_key_match(&t->parms, flags, key))
+			continue;
+
+		if (t->parms.link == link)
+			return t;
+		else
+			cand = t;
+	}
+
+	hlist_for_each_entry_rcu(t, head, hash_node) {
+		if (remote != t->parms.iph.daddr ||
+		    t->parms.iph.saddr != 0 ||
+		    !(t->dev->flags & IFF_UP))
+			continue;
+
+		if (!rpl_ip_tunnel_key_match(&t->parms, flags, key))
+			continue;
+
+		if (t->parms.link == link)
+			return t;
+		else if (!cand)
+			cand = t;
+	}
+
+	hash = rpl_ip_tunnel_hash(key, 0);
+	head = &itn->tunnels[hash];
+
+	hlist_for_each_entry_rcu(t, head, hash_node) {
+		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
+		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
+			continue;
+
+		if (!(t->dev->flags & IFF_UP))
+			continue;
+
+		if (!rpl_ip_tunnel_key_match(&t->parms, flags, key))
+			continue;
+
+		if (t->parms.link == link)
+			return t;
+		else if (!cand)
+			cand = t;
+	}
+
+	if (flags & TUNNEL_NO_KEY)
+		goto skip_key_lookup;
+
+	hlist_for_each_entry_rcu(t, head, hash_node) {
+		if (t->parms.i_key != key ||
+		    t->parms.iph.saddr != 0 ||
+		    t->parms.iph.daddr != 0 ||
+		    !(t->dev->flags & IFF_UP))
+			continue;
+
+		if (t->parms.link == link)
+			return t;
+		else if (!cand)
+			cand = t;
+	}
+
+skip_key_lookup:
+	if (cand)
+		return cand;
+
+	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
+		return netdev_priv(itn->fb_tunnel_dev);
+
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(rpl_ip_tunnel_lookup);
+
 #endif
diff --git a/datapath/linux/compat/ip_tunnels_core.c b/datapath/linux/compat/ip_tunnels_core.c
index 7ade6c1..90e838a 100644
--- a/datapath/linux/compat/ip_tunnels_core.c
+++ b/datapath/linux/compat/ip_tunnels_core.c
@@ -129,6 +129,47 @@ error:
 }
 EXPORT_SYMBOL_GPL(ovs_iptunnel_handle_offloads);
 
+struct sk_buff *rpl_iptunnel_handle_offloads(struct sk_buff *skb,
+					     bool csum_help,
+					     int gso_type_mask)
+{
+	int err;
+
+	if (likely(!skb->encapsulation)) {
+		skb_reset_inner_headers(skb);
+		skb->encapsulation = 1;
+	}
+
+	if (skb_is_gso(skb)) {
+		err = skb_unclone(skb, GFP_ATOMIC);
+		if (unlikely(err))
+			goto error;
+		skb_shinfo(skb)->gso_type |= gso_type_mask;
+		return skb;
+	}
+
+	/* If packet is not gso and we are resolving any partial checksum,
+	 * clear encapsulation flag. This allows setting CHECKSUM_PARTIAL
+	 * on the outer header without confusing devices that implement
+	 * NETIF_F_IP_CSUM with encapsulation.
+	 */
+	if (csum_help)
+		skb->encapsulation = 0;
+
+	if (skb->ip_summed == CHECKSUM_PARTIAL && csum_help) {
+		err = skb_checksum_help(skb);
+		if (unlikely(err))
+			goto error;
+	} else if (skb->ip_summed != CHECKSUM_PARTIAL)
+		skb->ip_summed = CHECKSUM_NONE;
+
+	return skb;
+error:
+	kfree_skb(skb);
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(rpl_iptunnel_handle_offloads);
+
 int rpl___iptunnel_pull_header(struct sk_buff *skb, int hdr_len,
 			       __be16 inner_proto, bool raw_proto, bool xnet)
 {
-- 
1.8.3.1
