[ovs-dev] [PATCH v3] gre: Restructure tunneling.

Rajahalme, Jarno (NSN - FI/Espoo) jarno.rajahalme at nsn.com
Fri May 24 06:12:29 UTC 2013


Pravin,

Please find some review comments below,

  Jarno

On May 23, 2013, at 23:01 , ext Pravin B Shelar wrote:
...
> diff --git a/datapath/linux/compat/gre.c b/datapath/linux/compat/gre.c
> new file mode 100644
> index 0000000..c352ff8
> --- /dev/null
> +++ b/datapath/linux/compat/gre.c
> @@ -0,0 +1,352 @@
> +/*
> + * Copyright (c) 2007-2013 Nicira, Inc.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of version 2 of the GNU General Public
> + * License as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful, but
> + * WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> + * 02110-1301, USA
> + */
> +
> +#include <linux/module.h>
> +#include <linux/if.h>
> +#include <linux/if_tunnel.h>
> +#include <linux/icmp.h>
> +#include <linux/in.h>
> +#include <linux/ip.h>
> +#include <linux/kernel.h>
> +#include <linux/kmod.h>
> +#include <linux/netdevice.h>
> +#include <linux/skbuff.h>
> +#include <linux/spinlock.h>
> +
> +#include <net/gre.h>
> +#include <net/icmp.h>
> +#include <net/protocol.h>
> +#include <net/route.h>
> +#include <net/xfrm.h>
> +
> +#include "gso.h"
> +
> +static struct gre_cisco_protocol __rcu *gre_cisco_proto;
> +
> +static void gre_csum_fix(struct sk_buff *skb)
> +{
> +	struct gre_base_hdr *greh;
> +	__be32 *options;
> +	int gre_offset = skb_transport_offset(skb);
> +
> +	greh = (struct gre_base_hdr *)skb_transport_header(skb);
> +	options = ((__be32 *)greh + 1);
> +
> +	*options = 0;
> +	*(__sum16 *)options = csum_fold(skb_checksum(skb, gre_offset,
> +						     skb->len - gre_offset, 0));
> +}
> +
> +struct sk_buff *gre_handle_offloads(struct sk_buff *skb, bool gre_csum)
> +{
> +	int err;
> +
> +	skb_reset_inner_headers(skb);
> +
> +	if (skb_is_gso(skb)) {
> +		if (gre_csum)
> +			OVS_GSO_CB(skb)->fix_segment = gre_csum_fix;
> +	} else {
> +		if (skb->ip_summed == CHECKSUM_PARTIAL && gre_csum) {
> +			err = skb_checksum_help(skb);
> +			if (err)
> +				goto error;
> +
> +		} else if (skb->ip_summed != CHECKSUM_PARTIAL)
> +			skb->ip_summed = CHECKSUM_NONE;
> +	}
> +	return skb;
> +error:
> +	kfree_skb(skb);
> +	return ERR_PTR(err);
> +}
> +
> +static bool is_gre_gso(struct sk_buff *skb)
> +{
> +	return skb_is_gso(skb);
> +}

What is the value of this? You already use skb_is_gso() above.

> +
> +void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
> +		      int hdr_len)
> +{
> +	struct gre_base_hdr *greh;
> +
> +	__skb_push(skb, hdr_len);
> +
> +	greh = (struct gre_base_hdr *)skb->data;
> +	greh->flags = tnl_flags_to_gre_flags(tpi->flags);
> +	greh->protocol = tpi->proto;
> +
> +	if (tpi->flags&(TUNNEL_KEY|TUNNEL_CSUM|TUNNEL_SEQ)) {

Add some spaces? Also below.

> +		__be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4);
> +
> +		if (tpi->flags&TUNNEL_SEQ) {
> +			*ptr = tpi->seq;
> +			ptr--;
> +		}
> +		if (tpi->flags&TUNNEL_KEY) {
> +			*ptr = tpi->key;
> +			ptr--;
> +		}
> +		if (tpi->flags&TUNNEL_CSUM && !is_gre_gso(skb)) {
> +			*ptr = 0;
> +			*(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0,
> +						skb->len, 0));
> +		}
> +	}
> +}
> +
> +static __sum16 check_checksum(struct sk_buff *skb)
> +{
> +	__sum16 csum = 0;
> +
> +	switch (skb->ip_summed) {
> +	case CHECKSUM_COMPLETE:
> +		csum = csum_fold(skb->csum);
> +
> +		if (!csum)
> +			break;
> +		/* Fall through. */
> +
> +	case CHECKSUM_NONE:
> +		skb->csum = 0;
> +		csum = __skb_checksum_complete(skb);
> +		skb->ip_summed = CHECKSUM_COMPLETE;
> +		break;
> +	}
> +
> +	return csum;
> +}
> +
> +static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
> +			    bool *csum_err)

It seems the csum_err returned via the parameter pointer is ignored by the caller, hence it could be removed. 

> +{
> +	unsigned int ip_hlen = ip_hdrlen(skb);
> +	struct gre_base_hdr *greh;
> +	__be32 *options;
> +	int hdr_len;
> +
> +	if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr))))
> +		return -EINVAL;
> +
> +	greh = (struct gre_base_hdr *)(skb_network_header(skb) + ip_hlen);
> +	if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING)))
> +		return -EINVAL;
> +
> +	tpi->flags = gre_flags_to_tnl_flags(greh->flags);
> +	hdr_len = ip_gre_calc_hlen(tpi->flags);
> +
> +	if (!pskb_may_pull(skb, hdr_len))
> +		return -EINVAL;
> +
> +	greh = (struct gre_base_hdr *)(skb_network_header(skb) + ip_hlen);
> +	tpi->proto = greh->protocol;
> +
> +	options = (__be32 *)(greh + 1);
> +	if (greh->flags & GRE_CSUM) {
> +		if (check_checksum(skb)) {
> +			*csum_err = true;
> +			return -EINVAL;
> +		}
> +		options++;
> +	}
> +
> +	if (greh->flags & GRE_KEY) {
> +		tpi->key = *options;
> +		options++;
> +	} else
> +		tpi->key = 0;
> +
> +	if (unlikely(greh->flags & GRE_SEQ)) {
> +		tpi->seq = *options;
> +		options++;
> +	} else
> +		tpi->seq = 0;
> +
> +	/* WCCP version 1 and 2 protocol decoding.
> +	 * - Change protocol to IP
> +	 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
> +	 */
> +	if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) {
> +		tpi->proto = htons(ETH_P_IP);
> +		if ((*(u8 *)options & 0xF0) != 0x40) {
> +			hdr_len += 4;
> +			if (!pskb_may_pull(skb, hdr_len))
> +				return -EINVAL;
> +		}
> +	}
> +
> +	return iptunnel_pull_header(skb, hdr_len, tpi->proto);
> +}
> +
> +static int gre_cisco_rcv(struct sk_buff *skb)
> +{
> +	struct tnl_ptk_info tpi;
> +	bool csum_err = false;
> +	struct gre_cisco_protocol *proto;
> +
> +	rcu_read_lock();
> +	proto = rcu_dereference(gre_cisco_proto);
> +	if (!proto)
> +		goto drop;
> +
> +	if (parse_gre_header(skb, &tpi, &csum_err) < 0)
> +		goto drop;
> +	proto->handler(skb, &tpi);
> +	rcu_read_unlock();
> +	return 0;
> +
> +drop:
> +	rcu_read_unlock();
> +	kfree_skb(skb);
> +	return 0;
> +}
> +
> +static const struct gre_protocol ipgre_protocol = {
> +	.handler	=	gre_cisco_rcv,
> +};
> +
> +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37)
> +static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly;
> +
> +int gre_add_protocol(const struct gre_protocol *proto, u8 version)
> +{
> +	if (version >= GREPROTO_MAX)
> +		return -EINVAL;
> +
> +	return (cmpxchg((const struct gre_protocol **)&gre_proto[version], NULL, proto) == NULL) ?
> +		0 : -EBUSY;
> +}
> +
> +int gre_del_protocol(const struct gre_protocol *proto, u8 version)
> +{
> +	int ret;
> +
> +	if (version >= GREPROTO_MAX)
> +		return -EINVAL;
> +
> +	ret = (cmpxchg((const struct gre_protocol **)&gre_proto[version], proto, NULL) == proto) ?
> +		0 : -EBUSY;
> +
> +	if (ret)
> +		return ret;
> +
> +	synchronize_net();
> +	return 0;
> +}
> +
> +static int gre_rcv(struct sk_buff *skb)
> +{
> +	const struct gre_protocol *proto;
> +	u8 ver;
> +	int ret;
> +
> +	if (!pskb_may_pull(skb, 12))
> +		goto drop;

Is there a symbol you could use here instead the literal 12?

> +
> +	ver = skb->data[1]&0x7f;

Spaces...

> +	if (ver >= GREPROTO_MAX)
> +		goto drop;
> +
> +	rcu_read_lock();
> +	proto = rcu_dereference(gre_proto[ver]);
> +	if (!proto || !proto->handler)
> +		goto drop_unlock;
> +	ret = proto->handler(skb);
> +	rcu_read_unlock();
> +	return ret;
> +
> +drop_unlock:
> +	rcu_read_unlock();
> +drop:
> +	kfree_skb(skb);
> +	return NET_RX_DROP;
> +}
> +
...
> diff --git a/datapath/linux/compat/gso.c b/datapath/linux/compat/gso.c
> new file mode 100644
> index 0000000..f6a4ad3
> --- /dev/null
> +++ b/datapath/linux/compat/gso.c
> @@ -0,0 +1,166 @@
> +/*
> + * Copyright (c) 2007-2013 Nicira, Inc.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of version 2 of the GNU General Public
> + * License as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful, but
> + * WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> + * 02110-1301, USA
> + */
> +
> +#include <linux/module.h>
> +#include <linux/if.h>
> +#include <linux/if_tunnel.h>
> +#include <linux/icmp.h>
> +#include <linux/in.h>
> +#include <linux/ip.h>
> +#include <linux/kernel.h>
> +#include <linux/kmod.h>
> +#include <linux/netdevice.h>
> +#include <linux/skbuff.h>
> +#include <linux/spinlock.h>
> +
> +#include <net/gre.h>
> +#include <net/icmp.h>
> +#include <net/protocol.h>
> +#include <net/route.h>
> +#include <net/xfrm.h>
> +
> +#include "gso.h"
> +
> +static __be16 skb_network_protocol(struct sk_buff *skb)
> +{
> +	__be16 type = skb->protocol;
> +	int vlan_depth = ETH_HLEN;

Shouldn't this be 2*ETH_ALEN instead, since that is where VLAN header starts at if there is one?

> +
> +	while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
> +		struct vlan_hdr *vh;
> +
> +		if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
> +			return 0;
> +
> +		vh = (struct vlan_hdr *)(skb->data + vlan_depth);
> +		type = vh->h_vlan_encapsulated_proto;
> +		vlan_depth += VLAN_HLEN;
> +	}
> +
> +	return type;
> +}
> +
> +static struct sk_buff *tnl_skb_gso_segment(struct sk_buff *skb,
> +					   netdev_features_t features,
> +					   bool tx_path)
> +{
> +	struct iphdr *iph	= ip_hdr(skb);
> +	int tnl_hlen		= skb_inner_network_offset(skb);
> +	struct sk_buff *skb1	= skb;
> +	struct sk_buff *segs;
> +
> +	__skb_pull(skb, tnl_hlen);
> +	skb_reset_mac_header(skb);
> +	skb_reset_network_header(skb);
> +	skb_reset_transport_header(skb);
> +	skb->protocol = skb_network_protocol(skb);
> +
> +	segs = __skb_gso_segment(skb, 0, true);

Are you sure you want to ignore the tx_path and insist on 'true' here? Using 'false' selects a looser check of skb->ip_summed at __skb_gso_segment. We have seen the check implied by 'true' having the kernel fill the disk with non-rate-limited warnings, so this may be a significant choice you are making here.

> +	if (!segs || IS_ERR(segs))
> +		goto free;
> +
> +	skb = segs;
> +	while (skb) {
> +		__skb_push(skb, tnl_hlen);
> +		skb_reset_mac_header(skb);
> +		skb_reset_network_header(skb);
> +		skb_set_transport_header(skb, sizeof(struct iphdr));
> +		skb->mac_len = 0;
> +
> +		memcpy(ip_hdr(skb), iph, tnl_hlen);

Could you educate me a bit here and explain shortly why it is necessary to overwrite the IP headers of the segmented skbs?

> +		if (OVS_GSO_CB(skb)->fix_segment)
> +			OVS_GSO_CB(skb)->fix_segment(skb);
> +		skb = skb->next;
> +	}
> +free:
> +	consume_skb(skb1);
> +	return segs;
> +}
> +
> +static bool need_linearize(const struct sk_buff *skb)
> +{
> +	int i;
> +
> +	if (unlikely(skb_shinfo(skb)->frag_list))
> +		return true;
> +
> +	/*
> +	 * Generally speaking we should linearize if there are paged frags.
> +	 * However, if all of the refcounts are 1 we know nobody else can
> +	 * change them from underneath us and we can skip the linearization.
> +	 */
> +	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
> +		if (unlikely(page_count(skb_frag_page(&skb_shinfo(skb)->frags[i])) > 1))
> +			return true;
> +
> +	return false;
> +}
> +
> +int ip_local_out(struct sk_buff *skb)

IMO it would be less surprising if you used 'rpl_ip_local_out' here. Don't know if that would go against a convention though.

> +{
> +	struct dst_entry *dst = skb_dst(skb);
> +	int ret = NETDEV_TX_OK;
> +
> +	if (skb_is_gso(skb)) {
> +		skb = tnl_skb_gso_segment(skb, 0, true);

Maybe should use 'false' as the tx_path argument (Jesse?).

> +		if (!skb || IS_ERR(skb))
> +			return 0;
> +	}  else if (skb->ip_summed == CHECKSUM_PARTIAL) {
> +		int err;
> +
> +		if (unlikely(need_linearize(skb))) {
> +			err = __skb_linearize(skb);
> +			if (unlikely(err))
> +				return 0;
> +		}
> +
> +		err = skb_checksum_help(skb);
> +		if (unlikely(err))
> +			return 0;
> +	}
> +
> +	while (skb) {
> +		struct sk_buff *next_skb = skb->next;
> +		int err;
> +
> +		if (next_skb)
> +			skb_dst_set(skb, dst_clone(dst));
> +		else
> +			skb_dst_set(skb, dst);
> +
> +		skb->next = NULL;
> +
> +		/*
> +		 * Allow our local IP stack to fragment the outer packet even
> +		 * if the DF bit is set as a last resort.  We also need to
> +		 * force selection of an IP ID here because Linux will
> +		 * otherwise leave it at 0 if the packet originally had DF set.
> +		 */

This comment needs editing as the IP ID is not set here.

> +
> +		skb->local_df = 1;
> +		memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
> +

Is clearing of the skb control buffer necessary here?

> +#undef ip_local_out
> +		err = ip_local_out(skb);
> +		if (unlikely(net_xmit_eval(err)))
> +			ret = err;
> +
> +		skb = next_skb;
> +	}
> +	return ret;
> +}
...
> diff --git a/datapath/linux/compat/include/net/gre.h b/datapath/linux/compat/include/net/gre.h
> new file mode 100644
> index 0000000..16dc54e
> --- /dev/null
> +++ b/datapath/linux/compat/include/net/gre.h
> @@ -0,0 +1,109 @@
> +#ifndef __LINUX_GRE_WRAPPER_H
> +#define __LINUX_GRE_WRAPPER_H
> +
> +#include <linux/skbuff.h>
> +#include <net/ip_tunnels.h>
> +
> +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,37)
> +#include_next <net/gre.h>
> +
> +#else /* LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,37) */
> +
> +#define GREPROTO_CISCO		0
> +#define GREPROTO_PPTP		1
> +#define GREPROTO_MAX		2
> +#define GRE_IP_PROTO_MAX	2
> +
> +struct gre_protocol {
> +	int  (*handler)(struct sk_buff *skb);
> +	void (*err_handler)(struct sk_buff *skb, u32 info);
> +};
> +
> +int gre_add_protocol(const struct gre_protocol *proto, u8 version);
> +int gre_del_protocol(const struct gre_protocol *proto, u8 version);
> +
> +#endif
> +
> +#define GRE_IP_PROTO_MAX	2
> +
> +struct gre_base_hdr {
> +	__be16 flags;
> +	__be16 protocol;
> +};
> +#define GRE_HEADER_SECTION 4
> +
> +#define MAX_GRE_PROTO_PRIORITY 255
> +struct gre_cisco_protocol {
> +	int (*handler)(struct sk_buff *skb, const struct tnl_ptk_info *tpi);
> +	int (*err_handler)(struct sk_buff *skb, u32 info,
> +			   const struct tnl_ptk_info *tpi);
> +	u8 priority;
> +};
> +
> +#define gre_build_header rpl_gre_build_header
> +void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
> +		      int hdr_len);
> +
> +#define gre_handle_offloads rpl_gre_handle_offloads
> +struct sk_buff *gre_handle_offloads(struct sk_buff *skb, bool gre_csum);
> +
> +int gre_cisco_register(struct gre_cisco_protocol *proto);
> +int gre_cisco_unregister(struct gre_cisco_protocol *proto);
> +
> +static inline int ip_gre_calc_hlen(__be16 o_flags)
> +{
> +	int addend = 4;
> +
> +	if (o_flags&TUNNEL_CSUM)

Spaces...

> +		addend += 4;
> +	if (o_flags&TUNNEL_KEY)
> +		addend += 4;
> +	if (o_flags&TUNNEL_SEQ)
> +		addend += 4;
> +	return addend;
> +}
> +
...
> diff --git a/datapath/linux/compat/include/net/ip_tunnels.h b/datapath/linux/compat/include/net/ip_tunnels.h
> new file mode 100644
> index 0000000..ad17c9d
> --- /dev/null
> +++ b/datapath/linux/compat/include/net/ip_tunnels.h
> @@ -0,0 +1,54 @@
> +#ifndef __NET_IP_TUNNELS_WRAPPER_H
> +#define __NET_IP_TUNNELS_WRAPPER_H 1
> +
> +#include <linux/if_tunnel.h>
> +#include <linux/netdevice.h>
> +#include <linux/skbuff.h>
> +#include <linux/types.h>
> +#include <net/dsfield.h>
> +#include <net/flow.h>
> +#include <net/inet_ecn.h>
> +#include <net/ip.h>
> +#include <net/rtnetlink.h>
> +
> +#define TUNNEL_CSUM	__cpu_to_be16(0x01)
> +#define TUNNEL_ROUTING	__cpu_to_be16(0x02)
> +#define TUNNEL_KEY	__cpu_to_be16(0x04)
> +#define TUNNEL_SEQ	__cpu_to_be16(0x08)
> +#define TUNNEL_STRICT	__cpu_to_be16(0x10)
> +#define TUNNEL_REC	__cpu_to_be16(0x20)
> +#define TUNNEL_VERSION	__cpu_to_be16(0x40)
> +#define TUNNEL_NO_KEY	__cpu_to_be16(0x80)
> +#define TUNNEL_DONT_FRAGMENT	__cpu_to_be16(0x0100)
> +
> +struct tnl_ptk_info {
> +	__be16 flags;
> +	__be16 proto;
> +	__be32 key;
> +	__be32 seq;
> +};
> +
> +#define PACKET_RCVD	0
> +#define PACKET_REJECT	1
> +
> +static inline void tunnel_ip_select_ident(struct sk_buff *skb,
> +					  const struct iphdr  *old_iph,
> +					  struct dst_entry *dst)
> +{
> +	struct iphdr *iph = ip_hdr(skb);
> +
> +	/* Use inner packet iph-id if possible. */
> +	if (skb->protocol == htons(ETH_P_IP) && old_iph->id)
> +		iph->id = old_iph->id;

Was it concluded that it is safe to use the inner packet ip id? I get that this is a copy of upstream code, but would like to know.

> +	else
> +		__ip_select_ident(iph, dst,
> +				(skb_shinfo(skb)->gso_segs ?: 1) - 1);
> +}
> +
> +int iptunnel_xmit(struct net *net, struct rtable *rt,
> +		  struct sk_buff *skb,
> +		  __be32 src, __be32 dst, __u8 proto,
> +		  __u8 tos, __u8 ttl, __be16 df);
> +
> +int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto);
> +#endif /* __NET_IP_TUNNELS_H */
> diff --git a/datapath/linux/compat/ip_tunnels_core.c b/datapath/linux/compat/ip_tunnels_core.c
> new file mode 100644
> index 0000000..f8c8488
> --- /dev/null
> +++ b/datapath/linux/compat/ip_tunnels_core.c
> @@ -0,0 +1,115 @@
> +/*
> + * Copyright (c) 2007-2013 Nicira, Inc.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of version 2 of the GNU General Public
> + * License as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful, but
> + * WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> + * 02110-1301, USA
> + */
> +
> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> +
> +#include <linux/in.h>
> +#include <linux/in_route.h>
> +#include <linux/inetdevice.h>
> +#include <linux/jhash.h>
> +#include <linux/list.h>
> +#include <linux/kernel.h>
> +#include <linux/version.h>
> +#include <linux/workqueue.h>
> +#include <linux/rculist.h>
> +#include <net/ip_tunnels.h>
> +#include <net/route.h>
> +#include <net/xfrm.h>
> +
> +#include "gso.h"
> +#include "compat.h"
> +
> +int iptunnel_xmit(struct net *net, struct rtable *rt,
> +		  struct sk_buff *skb,
> +		  __be32 src, __be32 dst, __u8 proto,
> +		  __u8 tos, __u8 ttl, __be16 df)
> +{
> +	int pkt_len = skb->len;
> +	struct iphdr *iph;
> +	int err;
> +
> +	nf_reset(skb);
> +	secpath_reset(skb);
> +	skb_clear_rxhash(skb);
> +	skb_dst_drop(skb);
> +	skb_dst_set(skb, &rt_dst(rt));
> +#if 0
> +	/* Do not clear ovs-callback.  It will be done in gso code. */
> +	memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
> +#endif

Why does the skb control buffer need to be cleared in the first place?

> +
> +	/* Push down and install the IP header. */
> +	__skb_push(skb, sizeof(struct iphdr));
> +	skb_reset_network_header(skb);
> +
> +	iph = ip_hdr(skb);
> +
> +	iph->version	=	4;
> +	iph->ihl	=	sizeof(struct iphdr) >> 2;
> +	iph->frag_off	=	df;
> +	iph->protocol	=	proto;
> +	iph->tos	=	tos;
> +	iph->daddr	=	dst;
> +	iph->saddr	=	src;
> +	iph->ttl	=	ttl;
> +	tunnel_ip_select_ident(skb,
> +			       (const struct iphdr *)skb_inner_network_header(skb),
> +			       &rt_dst(rt));
> +
> +	err = ip_local_out(skb);
> +	if (unlikely(net_xmit_eval(err)))
> +		pkt_len = 0;
> +	return pkt_len;
> +}
> +
...
> diff --git a/datapath/vport-gre.c b/datapath/vport-gre.c
> index add17d9..1707eab 100644
> --- a/datapath/vport-gre.c
> +++ b/datapath/vport-gre.c
> @@ -24,50 +24,29 @@
> #include <linux/if_tunnel.h>
> #include <linux/if_vlan.h>
> #include <linux/in.h>
> +#include <linux/if_vlan.h>
> +#include <linux/in.h>
> +#include <linux/in_route.h>
> +#include <linux/inetdevice.h>
> +#include <linux/jhash.h>
> +#include <linux/list.h>
> +#include <linux/kernel.h>
> +#include <linux/workqueue.h>
> +#include <linux/rculist.h>
> +#include <net/route.h>
> +#include <net/xfrm.h>
> +
> 
> #include <net/icmp.h>
> #include <net/ip.h>
> +#include <net/ip_tunnels.h>
> +#include <net/gre.h>
> #include <net/protocol.h>
> 
> #include "datapath.h"
> #include "tunnel.h"
> #include "vport.h"
> 
> -/*
> - * The GRE header is composed of a series of sections: a base and then a variable
> - * number of options.
> - */
> -#define GRE_HEADER_SECTION 4
> -
> -struct gre_base_hdr {
> -	__be16 flags;
> -	__be16 protocol;
> -};
> -
> -static int gre_hdr_len(const struct ovs_key_ipv4_tunnel *tun_key)
> -{
> -	int len = GRE_HEADER_SECTION;
> -
> -	if (tun_key->tun_flags & OVS_TNL_F_KEY)
> -		len += GRE_HEADER_SECTION;
> -	if (tun_key->tun_flags & OVS_TNL_F_CSUM)
> -		len += GRE_HEADER_SECTION;
> -	return len;
> -}
> -
> -static int gre64_hdr_len(const struct ovs_key_ipv4_tunnel *tun_key)
> -{
> -	/* Set key for GRE64 tunnels, even when key if is zero. */
> -	int len = GRE_HEADER_SECTION +		/* GRE Hdr */
> -		  GRE_HEADER_SECTION +		/* GRE Key */
> -		  GRE_HEADER_SECTION;		/* GRE SEQ */
> -
> -	if (tun_key->tun_flags & OVS_TNL_F_CSUM)
> -		len += GRE_HEADER_SECTION;
> -
> -	return len;
> -}
> -
> /* Returns the least-significant 32 bits of a __be64. */
> static __be32 be64_get_low32(__be64 x)
> {
> @@ -78,61 +57,31 @@ static __be32 be64_get_low32(__be64 x)
> #endif
> }
> 
> -static __be32 be64_get_high32(__be64 x)
> +static __be16 filter_tnl_flags(__be16 flags)
> {
> -#ifdef __BIG_ENDIAN
> -	return (__force __be32)((__force u64)x >> 32);
> -#else
> -	return (__force __be32)x;
> -#endif
> +	return (flags & (TUNNEL_CSUM | TUNNEL_KEY));
> }
> 
> -static void __gre_build_header(struct sk_buff *skb,
> -			       int tunnel_hlen,
> -			       bool is_gre64)
> +static struct sk_buff *__build_header(const struct vport *vport,

'vport' is not used, so it could be removed here. 

> +				      struct sk_buff *skb,
> +				      int tunnel_hlen,
> +				      __be32 seq, __be16 gre64_flag)
> {
> 	const struct ovs_key_ipv4_tunnel *tun_key = OVS_CB(skb)->tun_key;
> -	__be32 *options = (__be32 *)(skb_network_header(skb) + tunnel_hlen
> -			- GRE_HEADER_SECTION);
> -	struct gre_base_hdr *greh = (struct gre_base_hdr *) skb_transport_header(skb);
> -	greh->protocol = htons(ETH_P_TEB);
> -	greh->flags = 0;
> -
> -	/* Work backwards over the options so the checksum is last. */
> -	if (tun_key->tun_flags & OVS_TNL_F_KEY || is_gre64) {
> -		greh->flags |= GRE_KEY;
> -		if (is_gre64) {
> -			/* Set higher 32 bits to seq. */
> -			*options = be64_get_high32(tun_key->tun_id);
> -			options--;
> -			greh->flags |= GRE_SEQ;
> -		}
> -		*options = be64_get_low32(tun_key->tun_id);
> -		options--;
> -	}
> +	struct tnl_ptk_info tpi;
> 
> -	if (tun_key->tun_flags & OVS_TNL_F_CSUM) {
> -		greh->flags |= GRE_CSUM;
> -		*options = 0;
> -		*(__sum16 *)options = csum_fold(skb_checksum(skb,
> -						skb_transport_offset(skb),
> -						skb->len - skb_transport_offset(skb),
> -						0));
> -	}
> -}
> +	skb = gre_handle_offloads(skb, !!(tun_key->tun_flags & TUNNEL_CSUM));
> +	if (IS_ERR(skb))
> +		return NULL;
> 
> -static void gre_build_header(const struct vport *vport,
> -			     struct sk_buff *skb,
> -			     int tunnel_hlen)
> -{
> -	__gre_build_header(skb, tunnel_hlen, false);
> -}
> +	tpi.flags = filter_tnl_flags(tun_key->tun_flags) | gre64_flag;
> 
> -static void gre64_build_header(const struct vport *vport,
> -			       struct sk_buff *skb,
> -			       int tunnel_hlen)
> -{
> -	__gre_build_header(skb, tunnel_hlen, true);
> +	tpi.proto = htons(ETH_P_TEB);
> +	tpi.key = be64_get_low32(tun_key->tun_id);
> +	tpi.seq = seq;
> +	gre_build_header(skb, &tpi, tunnel_hlen);
> +
> +	return skb;
> }
> 




More information about the dev mailing list