[ovs-dev] [PATCH 8/8] gre: Add support for path MTU discovery.

Thu Mar 4 23:50:18 UTC 2010

On Thu, Mar 04, 2010 at 01:22:13PM -0500, Jesse Gross wrote:
> @@ -455,6 +458,77 @@ static unsigned int tunnel_hard_header_len(struct net_device *dev)
>  #endif
>  }
>  
> +static void icmp_err_frag(struct sk_buff *skb, struct ip_tunnel *t)
> +{
> +	int mtu = ntohs(icmp_hdr(skb)->un.frag.mtu);
> +	int header_len = t->hlen + tunnel_hard_header_len(t->dev);
> +	unsigned int orig_mac_header = skb_mac_header(skb) - skb->data;
> +	unsigned int orig_nw_header = skb_network_header(skb) - skb->data;
> +	__be16 encap_proto;
> +
> +	/* Add the size of the IP header since this is the smallest
> +	 * packet size the we might do something with and we might as
> +	 * well fail early if we don't have it.  Plus it allows us to
> +	 * safely look at the VLAN header if there is one.  The final
> +	 * size is checked before use. */
> +	if (!pskb_may_pull(skb, header_len + sizeof(struct iphdr)))
> +		return;
> +
> +	if (t->dev->type == ARPHRD_ETHER) {
> +		skb_set_mac_header(skb, t->hlen);
> +		encap_proto = eth_hdr(skb)->h_proto;
> +
> +		if (encap_proto == htons(ETH_P_8021Q)) {
> +			header_len += VLAN_HLEN;
> +			encap_proto =
> +				   vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
> +		}
> +	} else {
> +		encap_proto = *(__be16 *)((skb->data +
> +					  (ip_hdr(skb)->ihl << 2)) + 2);

Can 'ip_hdr(skb)->ihl << 2' with a malicious ihl == 15 value cause us to
read past the end of the header?

The "+ 2" here looks like magic.  I guess there must be a gre_hdr
somewhere (I see one in nf_conntrack_proto_gre, for example) to reduce
the magic?

> +	}
> +
> +	skb_set_network_header(skb, header_len);
> +	skb->protocol = encap_proto;
> +	mtu -= header_len;
> +
> +	if (skb->protocol == htons(ETH_P_IP)) {
> +		if (mtu < 68) {
> +			if (ntohs(ip_hdr(skb)->tot_len) >= 68)
> +				mtu = 68;

It took me a few minutes to figure out where this number came from.  I
guess it is from RFC 791: "Every internet module must be able to forward
a datagram of 68 octets without further fragmentation.  This is because
an internet header may be up to 60 octets, and the minimum fragment is 8
octets."  I don't know whether that is obvious to everyone else.

I wonder whether the kernel upstream would be open to adding an
IP_MIN_MTU macro.

I don't see a guarantee that ->tot_len doesn't read past the end of the
header (and the same for the later IPv6 case).

> +			else
> +				goto out;
> +		}
> +
> +		header_len += sizeof(struct iphdr);

Should this be "header_len += ip_hdrlen(ip_hdr(skb))?"

> +	} else if (skb->protocol == htons(ETH_P_IPV6)) {
> +		if (mtu < IPV6_MIN_MTU) {
> +			unsigned int packet_length = sizeof(struct ipv6hdr) +
> +					      ntohs(ipv6_hdr(skb)->payload_len);
> +
> +			if (packet_length >= IPV6_MIN_MTU
> +			    || ntohs(ipv6_hdr(skb)->payload_len) == 0)
> +				mtu = IPV6_MIN_MTU;
> +			else
> +				goto out;
> +		}
> +
> +		header_len += sizeof(struct ipv6hdr);

I don't know whether there is a need to skip past IPv6 options.

> +	} else
> +		goto out;
> +
> +	if (pskb_may_pull(skb, header_len)) {
> +		__pskb_pull(skb, t->hlen);
> +		send_frag_needed(skb, t->dev, mtu);
> +		skb_push(skb, t->hlen);
> +	}
> +
> +out:
> +	skb_set_mac_header(skb, orig_mac_header);
> +	skb_set_network_header(skb, orig_nw_header);
> +	skb->protocol = htons(ETH_P_IP);
> +}
> +
>  static void ipgre_err(struct sk_buff *skb, u32 info)
>  {
>  

[...]

> @@ -491,9 +567,11 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
>  	}
>  
>  	/* If only 8 bytes returned, keyed message will be dropped here */
> -	if (skb_headlen(skb) < grehlen)
> +	if (!pskb_may_pull(skb, grehlen))
>  		return;

Wow, it's odd that pskb_may_pull() wasn't already used here.  I wonder
why.

> +static bool ipv4_should_icmp(struct sk_buff *skb)
> +{
> +	struct iphdr *old_iph = ip_hdr(skb);
> +
> +	/* Don't respond to L2 broadcast. */
> +	if (is_multicast_ether_addr(eth_hdr(skb)->h_dest))
> +		return false;
> +
> +	/* Don't respond to L3 broadcast or invalid addresses. */
> +	if (!check_ipv4_address(old_iph->daddr) ||
> +	    !check_ipv4_address(old_iph->saddr))
> +		return false;
> +
> +	/* Only respond to the first fragment. */
> +	if (old_iph->frag_off & htons(IP_OFFSET))
> +		return false;
> +
> +	/* Don't respond to ICMP error messages. */
> +	if (old_iph->protocol == IPPROTO_ICMP) {
> +		u8 icmp_type, *icmp_typep;
> +
> +		icmp_typep = skb_header_pointer(skb, (u8 *)old_iph +
> +						(old_iph->ihl << 2) +
> +						offsetof(struct icmphdr, type) -
> +						skb->data, sizeof(icmp_type),
> +						&icmp_type);
> +
> +		if (!icmp_typep)
> +			return false;
> +
> +		if (*icmp_typep > NR_ICMP_TYPES
> +			|| (*icmp_typep <= ICMP_PARAMETERPROB
> +				&& *icmp_typep != ICMP_ECHOREPLY
> +				&& *icmp_typep != ICMP_ECHO))
> +			return false;

RFC 792 just says "no ICMP messages are sent about ICMP messages."  This
seems to violate that.  Is it for a good reason?

> +	}
> +
> +	return true;
> +}
> +
> +static void ipv4_build_icmp(struct sk_buff *skb, struct sk_buff *nskb,
> +			    unsigned int mtu, unsigned int payload_length)
> +{
> +	struct iphdr *iph, *old_iph = ip_hdr(skb);
> +	struct icmphdr *icmph;
> +	u8 *payload;
> +
> +	iph = (struct iphdr *)skb_put(nskb, sizeof(struct iphdr));
> +	icmph = (struct icmphdr *)skb_put(nskb, sizeof(struct icmphdr));
> +	payload = skb_put(nskb, payload_length);
> +
> +	/* IP */
> +	iph->version		=	4;
> +	iph->ihl		=	sizeof(struct iphdr) >> 2;
> +	iph->tos		=	(old_iph->tos & IPTOS_TOS_MASK) |
> +					IPTOS_PREC_INTERNETCONTROL;
> +	iph->tot_len		=	htons(sizeof(struct iphdr)
> +					      + sizeof(struct icmphdr)
> +					      + payload_length);
> +	get_random_bytes(&iph->id, sizeof iph->id);

This selection of 'id' seems poor.  There is the ip_select_ident()
function, but we don't have a dst_entry here.  Maybe we could get one.
Or we could use 0 and set the DF bit, since the id is only used for
reassembly.

> +	iph->frag_off		=	0;
> +	iph->ttl		=	IPDEFTTL;
> +	iph->protocol		=	IPPROTO_ICMP;
> +	iph->daddr		=	old_iph->saddr;
> +	iph->saddr		=	old_iph->daddr;
> +
> +	ip_send_check(iph);
> +
> +	/* ICMP */
> +	icmph->type		=	ICMP_DEST_UNREACH;
> +	icmph->code		=	ICMP_FRAG_NEEDED;
> +	icmph->un.gateway	=	htonl(mtu);
> +	icmph->checksum		=	0;
> +
> +	nskb->csum = csum_partial((void *)icmph, sizeof *icmph, 0);

The cast can be omitted here.

> +	nskb->csum = skb_copy_and_csum_bits(skb, (u8 *)old_iph - skb->data,
> +					    payload, payload_length,
> +					    nskb->csum);
> +	icmph->checksum = csum_fold(nskb->csum);

Is there value in setting nskb->csum here?  Should we set
nskb->ip_summed to CHECKSUM_NONE?  That would match what
icmp_push_reply() in net/ipv4/icmp.c does in seemingly similar
circumstances.

> +}

I didn't really study the IPv6 versions of these functions.  Wow,
there's a lot of code here.

> +static bool send_frag_needed(struct sk_buff *skb, struct net_device *dev,
> +			     unsigned int mtu)
> +{
> +	unsigned int eth_hdr_len = ETH_HLEN;
> +	unsigned int total_length, header_length, payload_length;
> +	struct ethhdr *eh, *old_eh = eth_hdr(skb);
> +	struct sk_buff *nskb;
> +	struct net_device_stats *stats;
> +
> +	/* Normal IP stack. */
> +	if (!dev->br_port) {
> +		if (skb->protocol == htons(ETH_P_IP)) {
> +			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
> +				  htonl(mtu));
> +			return true;
> +		} else {
> +#ifdef CONFIG_IPV6
> +			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
> +			return true;
> +#else
> +			return false;
> +#endif
> +		}
> +	}
> +
> +	/* Sanity check */
> +	if (skb->protocol == htons(ETH_P_IP)) {
> +		if (mtu < 68)
> +			return false;
> +
> +		if (!ipv4_should_icmp(skb))
> +			return true;
> +	} else {
> +		if (mtu < IPV6_MIN_MTU)
> +			return false;
> +
> +		/* In theory we should do PMTUD on IPv6 multicast messages but
> +		 * we don't have an address to send from so just fragment. */
> +		if (ipv6_addr_type(&ipv6_hdr(skb)->daddr) & IPV6_ADDR_MULTICAST)
> +			return false;
> +
> +		if (!ipv6_should_icmp(skb))
> +			return true;
> +	}
> +
> +	/* Allocate */
> +	if (old_eh->h_proto == htons(ETH_P_8021Q))
> +		eth_hdr_len = VLAN_ETH_HLEN;
> +
> +	payload_length = skb->len - eth_hdr_len;
> +	if (skb->protocol == htons(ETH_P_IP)) {
> +		header_length = sizeof(struct iphdr) + sizeof(struct icmphdr);
> +		total_length = min_t(unsigned int, header_length +
> +						   payload_length, 576);
> +	} else {
> +		header_length = sizeof(struct ipv6hdr) +
> +				sizeof(struct icmp6hdr);
> +		total_length = min_t(unsigned int, header_length +
> +						  payload_length, IPV6_MIN_MTU);
> +	}
> +	total_length = min(total_length, dev->mtu);
> +	payload_length = total_length - header_length;
> +
> +	nskb = netdev_alloc_skb(dev, NET_IP_ALIGN + eth_hdr_len
> +				+ header_length + payload_length);

I don't know whether netdev_alloc_skb() is considered OK to use for
allocating an sk_buff to *send*.  The uses within net/ seem to be
oriented around *receiving*, and the comment on netdev_alloc_skb() says
that it is for allocating a buffer for *receiving* on a given device.

> +	if (!nskb)
> +		return false;
> +
> +	skb_reserve(nskb, NET_IP_ALIGN);

But if netdev_alloc_skb() is OK, then you could use
netdev_alloc_skb_ip_align() to avoid writing the skb_reserve()
explicitly.

> +	/* Ethernet / VLAN */
> +	eh = (struct ethhdr *)skb_put(nskb, eth_hdr_len);
> +	memcpy(eh->h_dest, old_eh->h_source, ETH_ALEN);
> +	memcpy(eh->h_source, dev->dev_addr, ETH_ALEN);
> +
> +	if (old_eh->h_proto != htons(ETH_P_8021Q)) {
> +		eh->h_proto = old_eh->h_proto;
> +	} else {
> +		struct vlan_ethhdr *vh = (struct vlan_ethhdr *)eh;
> +
> +		vh->h_vlan_proto = htons(ETH_P_8021Q);
> +		vh->h_vlan_TCI = vlan_eth_hdr(skb)->h_vlan_TCI;
> +		vh->h_vlan_encapsulated_proto = skb->protocol;;
> +	}

The last statement there ends with a double semicolon.

I would be tempted to write this as something like

        eh->h_proto = old_eh->h_proto;
        if (eh->h_proto == htons(ETH_P_8021Q)) {
		struct vlan_ethhdr *vh = (struct vlan_ethhdr *)eh;

		vh->h_vlan_TCI = vlan_eth_hdr(skb)->h_vlan_TCI;
		vh->h_vlan_encapsulated_proto = skb->protocol;
        }

(or would it even be possible to memcpy the vlan_eth_hdr from the old
skb into the new one?)

> @@ -758,6 +1143,9 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
>  			skb->protocol = 0;
>  	}
>  
> +	if (dev->br_port)
> +		skb_dst_drop(skb);
> +

I'm curious about why skb_dst_drop(skb) is conditional on the device
being a bridge port.

> @@ -898,7 +1298,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
>  
>  	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;
>  
> -	if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
> +	if (skb_headroom(skb) < max_headroom ||
>  	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
>  		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
>  		if (!new_skb) {

Why is the above change valid?

> @@ -1343,7 +1749,7 @@ static void ethtool_getinfo(struct net_device *dev,
>  }
>  
>  static struct ethtool_ops ethtool_ops = {
> -	.get_drvinfo = ethtool_getinfo,
> +	.get_drvinfo		= ethtool_getinfo,
>  };
>  
>  #ifdef HAVE_NET_DEVICE_OPS
> @@ -1876,7 +2282,7 @@ static int __init ipgre_init(void)
>  {
>  	int err;
>  
> -	printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
> +	printk("Open vSwitch GRE over IPv4, built "__DATE__" "__TIME__"\n");

It's probably best to retain the KERN_INFO annotation here.

> diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
> index e27299a..ade1387 100644
> --- a/lib/netdev-linux.c
> +++ b/lib/netdev-linux.c
> @@ -340,6 +340,10 @@ error:
>  #endif
>  }
>  
> +#ifndef IP_DF
> +#define IP_DF 0x4000
> +#endif

Here in userspace we have this available in packets.h as
IP_DONT_FRAGMENT.

>  static int
>  setup_gre_ioctl(const char *name, struct gre_config *config, bool create)
>  {

I must confess that I don't have a solid grasp of all of this code.  But
it certainly looks convincing!