[ovs-dev] [RFC upstream PATCH 1/4] GRE: Allow multiple GREPROTO_CISCO protocol handlers.

Pravin B Shelar pshelar at nicira.com
Wed Sep 26 23:34:35 UTC 2012


Currently only one handler can be registered for the GREPROTO_CISCO
protocol. Soon the OVS tunnel code will need to register for the same
protocol as the GRE device.
The following patch extends the GRE demultiplexer so that multiple GRE
modules can register protocol handlers.

Signed-off-by: Pravin B Shelar <pshelar at nicira.com>
---
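As an illustration (not part of this patch), a second module could hook
into the new v0 demultiplexer roughly as sketched below. The module and
function names are made up; only struct gre_protocol_v0 and the
gre_{add,del}_protocol_v0() calls come from this patch. Per
ipgre_rcv_v0()/ipgre_err_v0(), a handler returns <= 0 once it has
consumed the packet and > 0 to let the next registered handler try.
The priority-1 slot is assumed here to be the one left free for the OVS
handler; ip_gre registers at priority 0 in this series.

	/* Hypothetical example module; only the gre_protocol_v0 API is
	 * introduced by this patch.
	 */
	#include <linux/module.h>
	#include <linux/skbuff.h>
	#include <linux/if_tunnel.h>
	#include <net/gre.h>

	static int example_gre_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi)
	{
		/* Claim only keyed packets; returning > 0 hands the skb to
		 * the next handler (e.g. the ip_gre device at priority 0).
		 */
		if (!(tpi->flags & GRE_KEY))
			return 1;

		/* ... deliver skb to this module's tunnel ... */
		kfree_skb(skb);
		return 0;
	}

	static int example_gre_err(struct sk_buff *skb, u32 info,
				   struct tnl_ptk_info *tpi)
	{
		/* <= 0: error consumed; > 0: let the next handler see it. */
		return 1;
	}

	static const struct gre_protocol_v0 example_gre_proto = {
		.handler     = example_gre_rcv,
		.err_handler = example_gre_err,
	};

	static int __init example_init(void)
	{
		/* Priority 1; 0 is taken by ip_gre in this series. */
		return gre_add_protocol_v0(&example_gre_proto, 1);
	}

	static void __exit example_exit(void)
	{
		gre_del_protocol_v0(&example_gre_proto, 1);
	}

	module_init(example_init);
	module_exit(example_exit);
	MODULE_LICENSE("GPL");
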
 include/net/gre.h  |   12 +++
 include/net/ipip.h |    9 ++
 net/ipv4/gre.c     |  281 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/ip_gre.c  |  139 ++++----------------------
 4 files changed, 323 insertions(+), 118 deletions(-)

diff --git a/include/net/gre.h b/include/net/gre.h
index 8266547..82e9276 100644
--- a/include/net/gre.h
+++ b/include/net/gre.h
@@ -2,6 +2,7 @@
 #define __LINUX_GRE_H
 
 #include <linux/skbuff.h>
+#include <net/ipip.h>
 
 #define GREPROTO_CISCO		0
 #define GREPROTO_PPTP		1
@@ -15,4 +16,15 @@ struct gre_protocol {
 int gre_add_protocol(const struct gre_protocol *proto, u8 version);
 int gre_del_protocol(const struct gre_protocol *proto, u8 version);
 
+struct gre_protocol_v0 {
+	int (*handler)(struct sk_buff *skb, struct tnl_ptk_info *tpi);
+	int (*err_handler)(struct sk_buff *skb, u32 info,
+			   struct tnl_ptk_info *tpi);
+};
+
+int gre_add_protocol_v0(const struct gre_protocol_v0 *proto, u8 priority);
+int gre_del_protocol_v0(const struct gre_protocol_v0 *proto, u8 priority);
+void build_gre_header(struct sk_buff *skb, int hlen, __be16 flags,
+		      __be16 proto, __be32 key, u32 seqno);
+
 #endif
diff --git a/include/net/ipip.h b/include/net/ipip.h
index a93cf6d..4b77671 100644
--- a/include/net/ipip.h
+++ b/include/net/ipip.h
@@ -38,6 +38,15 @@ struct ip_tunnel {
 	unsigned int			prl_count;	/* # of entries in PRL */
 };
 
+struct tnl_ptk_info {
+	__be16 flags;
+	__be16 proto;
+	__be32 key;
+	__be32 seq;
+	int hdr_len;
+	__be16 csum;
+};
+
 struct ip_tunnel_prl_entry {
 	struct ip_tunnel_prl_entry __rcu *next;
 	__be32				addr;
diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c
index 42a4910..2c384c7 100644
--- a/net/ipv4/gre.c
+++ b/net/ipv4/gre.c
@@ -16,15 +16,27 @@
 #include <linux/kernel.h>
 #include <linux/kmod.h>
 #include <linux/skbuff.h>
+#include <linux/if.h>
+#include <linux/icmp.h>
 #include <linux/in.h>
 #include <linux/ip.h>
+#include <linux/if_tunnel.h>
 #include <linux/netdevice.h>
 #include <linux/spinlock.h>
 #include <net/protocol.h>
 #include <net/gre.h>
+#include <net/icmp.h>
 
+#define GREPROTO_V0_MAX 2
+#define GRE_HEADER_SECTION 4
+
+struct gre_base_hdr {
+	__be16 flags;
+	__be16 protocol;
+};
 
 static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly;
+static const struct gre_protocol_v0 __rcu *gre_proto_v0[GREPROTO_V0_MAX] __read_mostly;
 static DEFINE_SPINLOCK(gre_proto_lock);
 
 int gre_add_protocol(const struct gre_protocol *proto, u8 version)
@@ -112,12 +124,273 @@ static void gre_err(struct sk_buff *skb, u32 info)
 	rcu_read_unlock();
 }
 
+int gre_add_protocol_v0(const struct gre_protocol_v0 *proto, u8 priority)
+{
+	if (priority >= GREPROTO_V0_MAX)
+		goto err_out;
+
+	spin_lock(&gre_proto_lock);
+	if (gre_proto_v0[priority])
+		goto err_out_unlock;
+
+	RCU_INIT_POINTER(gre_proto_v0[priority], proto);
+	spin_unlock(&gre_proto_lock);
+	return 0;
+
+err_out_unlock:
+	spin_unlock(&gre_proto_lock);
+err_out:
+	return -1;
+}
+EXPORT_SYMBOL_GPL(gre_add_protocol_v0);
+
+int gre_del_protocol_v0(const struct gre_protocol_v0 *proto, u8 priority)
+{
+	if (priority >= GREPROTO_V0_MAX)
+		goto err_out;
+
+	spin_lock(&gre_proto_lock);
+	if (rcu_dereference_protected(gre_proto_v0[priority],
+			lockdep_is_held(&gre_proto_lock)) != proto)
+		goto err_out_unlock;
+	RCU_INIT_POINTER(gre_proto_v0[priority], NULL);
+	spin_unlock(&gre_proto_lock);
+	synchronize_rcu();
+	return 0;
+
+err_out_unlock:
+	spin_unlock(&gre_proto_lock);
+err_out:
+	return -1;
+}
+EXPORT_SYMBOL_GPL(gre_del_protocol_v0);
+
+void build_gre_header(struct sk_buff *skb, int hlen, __be16 flags,
+		      __be16 proto, __be32 key, u32 seqno)
+{
+	struct iphdr *iph = ip_hdr(skb);
+
+	((__be16 *)(iph + 1))[0] = flags;
+	((__be16 *)(iph + 1))[1] = proto;
+
+	if (flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
+		__be32 *ptr = (__be32 *)(((u8 *)iph) + hlen - 4);
+
+		if (flags&GRE_SEQ) {
+			*ptr = htonl(seqno);
+			ptr--;
+		}
+		if (flags&GRE_KEY) {
+			*ptr = key;
+			ptr--;
+		}
+		if (flags&GRE_CSUM) {
+			*ptr = 0;
+			*(__sum16 *)ptr = ip_compute_csum((void *)(iph+1),
+					   skb->len - sizeof(struct iphdr));
+		}
+	}
+}
+EXPORT_SYMBOL(build_gre_header);
+
+static __sum16 check_checksum(struct sk_buff *skb)
+{
+	struct iphdr *iph = ip_hdr(skb);
+	struct gre_base_hdr *greh = (struct gre_base_hdr *)(iph + 1);
+	__sum16 csum = 0;
+
+	if (greh->flags & GRE_CSUM) {
+		switch (skb->ip_summed) {
+		case CHECKSUM_COMPLETE:
+			csum = csum_fold(skb->csum);
+
+			if (!csum)
+				break;
+			/* Fall through. */
+
+		case CHECKSUM_NONE:
+			skb->csum = 0;
+			csum = __skb_checksum_complete(skb);
+			skb->ip_summed = CHECKSUM_COMPLETE;
+			break;
+		}
+	}
+
+	return csum;
+}
+
+static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi)
+{
+
+	/* IP and ICMP protocol handlers check that the IHL is valid. */
+	struct gre_base_hdr *greh = (struct gre_base_hdr *)skb->data;
+	__be32 *options = (__be32 *)(greh + 1);
+
+	if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING)))
+		return -EINVAL;
+
+	tpi->flags = greh->flags;
+	tpi->proto = greh->protocol;
+
+	tpi->hdr_len = GRE_HEADER_SECTION;
+	tpi->csum = check_checksum(skb);
+
+	if (greh->flags & GRE_CSUM) {
+		tpi->hdr_len += GRE_HEADER_SECTION;
+		options++;
+	}
+
+	if (greh->flags & GRE_KEY) {
+		if ((void *)(options + 1) > (void *)skb_tail_pointer(skb))
+			return -1;
+		tpi->hdr_len += GRE_HEADER_SECTION;
+		tpi->key = *options;
+		options++;
+	} else
+		tpi->key = 0;
+
+	if (unlikely(greh->flags & GRE_SEQ)) {
+		if ((void *) (options + 1) > (void *)skb_tail_pointer(skb))
+			return -1;
+
+		tpi->seq = *options;
+		tpi->hdr_len += GRE_HEADER_SECTION;
+		options++;
+	} else
+		tpi->seq = 0;
+
+	/* WCCP version 1 and 2 protocol decoding.
+	 * - Change protocol to IP
+	 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
+	 */
+	if (tpi->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) {
+		tpi->proto = htons(ETH_P_IP);
+		if ((*(u8 *)options & 0xF0) != 0x40)
+			tpi->hdr_len += 4;
+	}
+
+	return 0;
+}
+
+static int ipgre_rcv_v0(struct sk_buff *skb)
+{
+	struct tnl_ptk_info tpi;
+	int i;
+
+	if (!pskb_may_pull(skb, 16))
+		goto drop;
+
+	if (parse_gre_header(skb, &tpi) < 0)
+		goto drop;
+
+	rcu_read_lock();
+	for (i = 0; i < GREPROTO_V0_MAX; i++) {
+		if (gre_proto_v0[i]->handler) {
+			int ret;
+
+			ret = gre_proto_v0[i]->handler(skb, &tpi);
+			if (ret <= 0) {
+				rcu_read_unlock();
+				return ret;
+			}
+		}
+
+	}
+	rcu_read_unlock();
+	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+drop:
+	kfree_skb(skb);
+	return 0;
+}
+
+static void ipgre_err_v0(struct sk_buff *skb, u32 info)
+{
+
+	/* All the routers (except for Linux) return only
+	 * 8 bytes of packet payload. It means, that precise relaying of
+	 * ICMP in the real Internet is absolutely infeasible.
+	 *
+	 * Moreover, Cisco "wise men" put GRE key to the third word
+	 * in GRE header. It makes impossible maintaining even soft
+	 * state for keyed
+	 * GRE tunnels with enabled checksum. Tell them "thank you".
+	 *
+	 * Well, I wonder, rfc1812 was written by Cisco employee,
+	 * what the hell these idiots break standards established
+	 * by themselves???
+	 **/
+
+	const int type = icmp_hdr(skb)->type;
+	const int code = icmp_hdr(skb)->code;
+	struct tnl_ptk_info tpi;
+	int i;
+
+	if (!pskb_may_pull(skb, sizeof(struct gre_base_hdr) + ETH_HLEN))
+		return;
+
+	parse_gre_header(skb, &tpi);
+
+	/* If only 8 bytes returned, keyed message will be dropped here */
+	if (tpi.flags & GRE_KEY) {
+		if ((tpi.flags & GRE_CSUM) && (tpi.hdr_len < 12))
+			return;
+		if (tpi.hdr_len < 8)
+			return;
+	}
+
+	switch (type) {
+	default:
+	case ICMP_PARAMETERPROB:
+		return;
+
+	case ICMP_DEST_UNREACH:
+		switch (code) {
+		case ICMP_SR_FAILED:
+		case ICMP_PORT_UNREACH:
+			/* Impossible event. */
+		return;
+		default:
+			/* All others are translated to HOST_UNREACH.
+			   rfc2003 contains "deep thoughts" about NET_UNREACH,
+			   I believe they are just ether pollution. --ANK
+			 */
+		break;
+		}
+		break;
+	case ICMP_TIME_EXCEEDED:
+		if (code != ICMP_EXC_TTL)
+			return;
+		break;
+
+	case ICMP_REDIRECT:
+		break;
+	}
+
+	rcu_read_lock();
+	for (i = 0; i < GREPROTO_V0_MAX; i++) {
+		if (gre_proto_v0[i]->err_handler) {
+			if (gre_proto_v0[i]->err_handler(skb, info, &tpi) <= 0) {
+				rcu_read_unlock();
+				return;
+			}
+		}
+
+	}
+	rcu_read_unlock();
+}
+
 static const struct net_protocol net_gre_protocol = {
 	.handler     = gre_rcv,
 	.err_handler = gre_err,
 	.netns_ok    = 1,
 };
 
+static const struct gre_protocol ipgre_protocol = {
+	.handler     = ipgre_rcv_v0,
+	.err_handler = ipgre_err_v0,
+};
+
 static int __init gre_init(void)
 {
 	pr_info("GRE over IPv4 demultiplexor driver\n");
@@ -126,12 +399,20 @@ static int __init gre_init(void)
 		pr_err("can't add protocol\n");
 		return -EAGAIN;
 	}
+	if (gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0) {
+		pr_info("%s: can't add ipgre handler\n", __func__);
+		inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
+		return -EAGAIN;
+	}
 
 	return 0;
 }
 
 static void __exit gre_exit(void)
 {
+	if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
+		pr_info("%s: can't remove protocol\n", __func__);
+
 	inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
 }
 
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index f233c1d..3bda6e2 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -466,7 +466,7 @@ static void ipgre_tunnel_uninit(struct net_device *dev)
 }
 
 
-static void ipgre_err(struct sk_buff *skb, u32 info)
+static int ipgre_err(struct sk_buff *skb, u32 info, struct tnl_ptk_info *tnl_ptk_info)
 {
 
 /* All the routers (except for Linux) return only
@@ -483,63 +483,16 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
  */
 
 	const struct iphdr *iph = (const struct iphdr *)skb->data;
-	__be16	     *p = (__be16 *)(skb->data+(iph->ihl<<2));
-	int grehlen = (iph->ihl<<2) + 4;
 	const int type = icmp_hdr(skb)->type;
 	const int code = icmp_hdr(skb)->code;
 	struct ip_tunnel *t;
-	__be16 flags;
-
-	flags = p[0];
-	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
-		if (flags&(GRE_VERSION|GRE_ROUTING))
-			return;
-		if (flags&GRE_KEY) {
-			grehlen += 4;
-			if (flags&GRE_CSUM)
-				grehlen += 4;
-		}
-	}
-
-	/* If only 8 bytes returned, keyed message will be dropped here */
-	if (skb_headlen(skb) < grehlen)
-		return;
-
-	switch (type) {
-	default:
-	case ICMP_PARAMETERPROB:
-		return;
-
-	case ICMP_DEST_UNREACH:
-		switch (code) {
-		case ICMP_SR_FAILED:
-		case ICMP_PORT_UNREACH:
-			/* Impossible event. */
-			return;
-		default:
-			/* All others are translated to HOST_UNREACH.
-			   rfc2003 contains "deep thoughts" about NET_UNREACH,
-			   I believe they are just ether pollution. --ANK
-			 */
-			break;
-		}
-		break;
-	case ICMP_TIME_EXCEEDED:
-		if (code != ICMP_EXC_TTL)
-			return;
-		break;
-
-	case ICMP_REDIRECT:
-		break;
-	}
 
 	rcu_read_lock();
 	t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
-				flags & GRE_KEY ?
-				*(((__be32 *)p) + (grehlen / 4) - 1) : 0,
-				p[1]);
-	if (t == NULL)
-		goto out;
+				tnl_ptk_info->key, tnl_ptk_info->proto);
+	if (t == NULL) {
+		return 1;
+	}
 
 	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
 		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
@@ -565,6 +518,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
 	t->err_time = jiffies;
 out:
 	rcu_read_unlock();
+	return 0;
 }
 
 static inline void ipgre_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb)
@@ -589,80 +543,29 @@ ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb)
 	return INET_ECN_encapsulate(tos, inner);
 }
 
-static int ipgre_rcv(struct sk_buff *skb)
+static int ipgre_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi)
 {
 	const struct iphdr *iph;
-	u8     *h;
-	__be16    flags;
-	__sum16   csum = 0;
-	__be32 key = 0;
-	u32    seqno = 0;
 	struct ip_tunnel *tunnel;
-	int    offset = 4;
-	__be16 gre_proto;
 
 	if (!pskb_may_pull(skb, 16))
 		goto drop_nolock;
 
 	iph = ip_hdr(skb);
-	h = skb->data;
-	flags = *(__be16 *)h;
-
-	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
-		/* - Version must be 0.
-		   - We do not support routing headers.
-		 */
-		if (flags&(GRE_VERSION|GRE_ROUTING))
-			goto drop_nolock;
-
-		if (flags&GRE_CSUM) {
-			switch (skb->ip_summed) {
-			case CHECKSUM_COMPLETE:
-				csum = csum_fold(skb->csum);
-				if (!csum)
-					break;
-				/* fall through */
-			case CHECKSUM_NONE:
-				skb->csum = 0;
-				csum = __skb_checksum_complete(skb);
-				skb->ip_summed = CHECKSUM_COMPLETE;
-			}
-			offset += 4;
-		}
-		if (flags&GRE_KEY) {
-			key = *(__be32 *)(h + offset);
-			offset += 4;
-		}
-		if (flags&GRE_SEQ) {
-			seqno = ntohl(*(__be32 *)(h + offset));
-			offset += 4;
-		}
-	}
-
-	gre_proto = *(__be16 *)(h + 2);
 
 	rcu_read_lock();
 	if ((tunnel = ipgre_tunnel_lookup(skb->dev,
-					  iph->saddr, iph->daddr, key,
-					  gre_proto))) {
+					  iph->saddr, iph->daddr, tpi->key,
+					  tpi->proto))) {
 		struct pcpu_tstats *tstats;
 
 		secpath_reset(skb);
 
-		skb->protocol = gre_proto;
-		/* WCCP version 1 and 2 protocol decoding.
-		 * - Change protocol to IP
-		 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
-		 */
-		if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
-			skb->protocol = htons(ETH_P_IP);
-			if ((*(h + offset) & 0xF0) != 0x40)
-				offset += 4;
-		}
+		skb->protocol = tpi->proto;
 
 		skb->mac_header = skb->network_header;
-		__pskb_pull(skb, offset);
-		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
+		__pskb_pull(skb, tpi->hdr_len);
+		skb_postpull_rcsum(skb, skb_transport_header(skb), tpi->hdr_len);
 		skb->pkt_type = PACKET_HOST;
 #ifdef CONFIG_NET_IPGRE_BROADCAST
 		if (ipv4_is_multicast(iph->daddr)) {
@@ -674,20 +577,20 @@ static int ipgre_rcv(struct sk_buff *skb)
 		}
 #endif
 
-		if (((flags&GRE_CSUM) && csum) ||
-		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
+		if (((tpi->flags&GRE_CSUM) && tpi->csum) ||
+		    (!(tpi->flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
 			tunnel->dev->stats.rx_crc_errors++;
 			tunnel->dev->stats.rx_errors++;
 			goto drop;
 		}
 		if (tunnel->parms.i_flags&GRE_SEQ) {
-			if (!(flags&GRE_SEQ) ||
-			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
+			if (!(tpi->flags&GRE_SEQ) ||
+			    (tunnel->i_seqno && (s32)(tpi->seq - tunnel->i_seqno) < 0)) {
 				tunnel->dev->stats.rx_fifo_errors++;
 				tunnel->dev->stats.rx_errors++;
 				goto drop;
 			}
-			tunnel->i_seqno = seqno + 1;
+			tunnel->i_seqno = tpi->seq + 1;
 		}
 
 		/* Warning: All skb pointers will be invalidated! */
@@ -1373,7 +1276,7 @@ static void ipgre_fb_tunnel_init(struct net_device *dev)
 }
 
 
-static const struct gre_protocol ipgre_protocol = {
+static const struct gre_protocol_v0 ipgre_protocol = {
 	.handler     = ipgre_rcv,
 	.err_handler = ipgre_err,
 };
@@ -1771,7 +1674,7 @@ static int __init ipgre_init(void)
 	if (err < 0)
 		return err;
 
-	err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
+	err = gre_add_protocol_v0(&ipgre_protocol, 0);
 	if (err < 0) {
 		pr_info("%s: can't add protocol\n", __func__);
 		goto add_proto_failed;
@@ -1791,7 +1694,7 @@ out:
 tap_ops_failed:
 	rtnl_link_unregister(&ipgre_link_ops);
 rtnl_link_failed:
-	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
+	gre_del_protocol_v0(&ipgre_protocol, 0);
 add_proto_failed:
 	unregister_pernet_device(&ipgre_net_ops);
 	goto out;
@@ -1801,7 +1704,7 @@ static void __exit ipgre_fini(void)
 {
 	rtnl_link_unregister(&ipgre_tap_ops);
 	rtnl_link_unregister(&ipgre_link_ops);
-	if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
+	if (gre_del_protocol_v0(&ipgre_protocol, 0) < 0)
 		pr_info("%s: can't remove protocol\n", __func__);
 	unregister_pernet_device(&ipgre_net_ops);
 }
-- 
1.7.10