[ovs-dev] [PATCH 8/8] gre: Add support for path MTU discovery.
Ben Pfaff
blp at nicira.com
Thu Mar 4 23:50:18 UTC 2010
On Thu, Mar 04, 2010 at 01:22:13PM -0500, Jesse Gross wrote:
> @@ -455,6 +458,77 @@ static unsigned int tunnel_hard_header_len(struct net_device *dev)
> #endif
> }
>
> +static void icmp_err_frag(struct sk_buff *skb, struct ip_tunnel *t)
> +{
> + int mtu = ntohs(icmp_hdr(skb)->un.frag.mtu);
> + int header_len = t->hlen + tunnel_hard_header_len(t->dev);
> + unsigned int orig_mac_header = skb_mac_header(skb) - skb->data;
> + unsigned int orig_nw_header = skb_network_header(skb) - skb->data;
> + __be16 encap_proto;
> +
> + /* Add the size of the IP header since this is the smallest
> + * packet size the we might do something with and we might as
> + * well fail early if we don't have it. Plus it allows us to
> + * safely look at the VLAN header if there is one. The final
> + * size is checked before use. */
> + if (!pskb_may_pull(skb, header_len + sizeof(struct iphdr)))
> + return;
> +
> + if (t->dev->type == ARPHRD_ETHER) {
> + skb_set_mac_header(skb, t->hlen);
> + encap_proto = eth_hdr(skb)->h_proto;
> +
> + if (encap_proto == htons(ETH_P_8021Q)) {
> + header_len += VLAN_HLEN;
> + encap_proto =
> + vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
> + }
> + } else {
> + encap_proto = *(__be16 *)((skb->data +
> + (ip_hdr(skb)->ihl << 2)) + 2);
Can 'ip_hdr(skb)->ihl << 2' with a malicious ihl == 15 value cause us to
read past the end of the header?
The "+ 2" here looks like magic. I guess there must be a gre_hdr
somewhere (I see one in nf_conntrack_proto_gre, for example) to reduce
the magic?
> + }
> +
> + skb_set_network_header(skb, header_len);
> + skb->protocol = encap_proto;
> + mtu -= header_len;
> +
> + if (skb->protocol == htons(ETH_P_IP)) {
> + if (mtu < 68) {
> + if (ntohs(ip_hdr(skb)->tot_len) >= 68)
> + mtu = 68;
It took me a few minutes to figure out where this number came from. I
guess it is from RFC 791: "Every internet module must be able to forward
a datagram of 68 octets without further fragmentation. This is because
an internet header may be up to 60 octets, and the minimum fragment is 8
octets." I don't know whether that is obvious to everyone else.
I wonder whether the kernel upstream would be open to adding an
IP_MIN_MTU macro.
I don't see a guarantee that ->tot_len doesn't read past the end of the
header (and the same for the later IPv6 case).
> + else
> + goto out;
> + }
> +
> + header_len += sizeof(struct iphdr);
Should this be "header_len += ip_hdrlen(ip_hdr(skb))?"
> + } else if (skb->protocol == htons(ETH_P_IPV6)) {
> + if (mtu < IPV6_MIN_MTU) {
> + unsigned int packet_length = sizeof(struct ipv6hdr) +
> + ntohs(ipv6_hdr(skb)->payload_len);
> +
> + if (packet_length >= IPV6_MIN_MTU
> + || ntohs(ipv6_hdr(skb)->payload_len) == 0)
> + mtu = IPV6_MIN_MTU;
> + else
> + goto out;
> + }
> +
> + header_len += sizeof(struct ipv6hdr);
I don't know whether there is a need to skip past IPv6 options.
> + } else
> + goto out;
> +
> + if (pskb_may_pull(skb, header_len)) {
> + __pskb_pull(skb, t->hlen);
> + send_frag_needed(skb, t->dev, mtu);
> + skb_push(skb, t->hlen);
> + }
> +
> +out:
> + skb_set_mac_header(skb, orig_mac_header);
> + skb_set_network_header(skb, orig_nw_header);
> + skb->protocol = htons(ETH_P_IP);
> +}
> +
> static void ipgre_err(struct sk_buff *skb, u32 info)
> {
>
[...]
> @@ -491,9 +567,11 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
> }
>
> /* If only 8 bytes returned, keyed message will be dropped here */
> - if (skb_headlen(skb) < grehlen)
> + if (!pskb_may_pull(skb, grehlen))
> return;
Wow, it's odd that pskb_may_pull() wasn't already used here. I wonder
why.
> +static bool ipv4_should_icmp(struct sk_buff *skb)
> +{
> + struct iphdr *old_iph = ip_hdr(skb);
> +
> + /* Don't respond to L2 broadcast. */
> + if (is_multicast_ether_addr(eth_hdr(skb)->h_dest))
> + return false;
> +
> + /* Don't respond to L3 broadcast or invalid addresses. */
> + if (!check_ipv4_address(old_iph->daddr) ||
> + !check_ipv4_address(old_iph->saddr))
> + return false;
> +
> + /* Only respond to the first fragment. */
> + if (old_iph->frag_off & htons(IP_OFFSET))
> + return false;
> +
> + /* Don't respond to ICMP error messages. */
> + if (old_iph->protocol == IPPROTO_ICMP) {
> + u8 icmp_type, *icmp_typep;
> +
> + icmp_typep = skb_header_pointer(skb, (u8 *)old_iph +
> + (old_iph->ihl << 2) +
> + offsetof(struct icmphdr, type) -
> + skb->data, sizeof(icmp_type),
> + &icmp_type);
> +
> + if (!icmp_typep)
> + return false;
> +
> + if (*icmp_typep > NR_ICMP_TYPES
> + || (*icmp_typep <= ICMP_PARAMETERPROB
> + && *icmp_typep != ICMP_ECHOREPLY
> + && *icmp_typep != ICMP_ECHO))
> + return false;
RFC 792 just says "no ICMP messages are sent about ICMP messages." This
seems to violate that. Is it for a good reason?
> + }
> +
> + return true;
> +}
> +
> +static void ipv4_build_icmp(struct sk_buff *skb, struct sk_buff *nskb,
> + unsigned int mtu, unsigned int payload_length)
> +{
> + struct iphdr *iph, *old_iph = ip_hdr(skb);
> + struct icmphdr *icmph;
> + u8 *payload;
> +
> + iph = (struct iphdr *)skb_put(nskb, sizeof(struct iphdr));
> + icmph = (struct icmphdr *)skb_put(nskb, sizeof(struct icmphdr));
> + payload = skb_put(nskb, payload_length);
> +
> + /* IP */
> + iph->version = 4;
> + iph->ihl = sizeof(struct iphdr) >> 2;
> + iph->tos = (old_iph->tos & IPTOS_TOS_MASK) |
> + IPTOS_PREC_INTERNETCONTROL;
> + iph->tot_len = htons(sizeof(struct iphdr)
> + + sizeof(struct icmphdr)
> + + payload_length);
> + get_random_bytes(&iph->id, sizeof iph->id);
This selection of 'id' seems poor. There is the ip_select_ident()
function, but we don't have a dst_entry here. Maybe we could get one.
Or we could use 0 and set the DF bit, since the id is only used for
reassembly.
> + iph->frag_off = 0;
> + iph->ttl = IPDEFTTL;
> + iph->protocol = IPPROTO_ICMP;
> + iph->daddr = old_iph->saddr;
> + iph->saddr = old_iph->daddr;
> +
> + ip_send_check(iph);
> +
> + /* ICMP */
> + icmph->type = ICMP_DEST_UNREACH;
> + icmph->code = ICMP_FRAG_NEEDED;
> + icmph->un.gateway = htonl(mtu);
> + icmph->checksum = 0;
> +
> + nskb->csum = csum_partial((void *)icmph, sizeof *icmph, 0);
The cast can be omitted here.
> + nskb->csum = skb_copy_and_csum_bits(skb, (u8 *)old_iph - skb->data,
> + payload, payload_length,
> + nskb->csum);
> + icmph->checksum = csum_fold(nskb->csum);
Is there value in setting nskb->csum here? Should we set
nskb->ip_summed to CHECKSUM_NONE? That would match what
icmp_push_reply() in net/ipv4/icmp.c does in seemingly similar
circumstances.
> +}
I didn't really study the IPv6 versions of these functions. Wow,
there's a lot of code here.
> +static bool send_frag_needed(struct sk_buff *skb, struct net_device *dev,
> + unsigned int mtu)
> +{
> + unsigned int eth_hdr_len = ETH_HLEN;
> + unsigned int total_length, header_length, payload_length;
> + struct ethhdr *eh, *old_eh = eth_hdr(skb);
> + struct sk_buff *nskb;
> + struct net_device_stats *stats;
> +
> + /* Normal IP stack. */
> + if (!dev->br_port) {
> + if (skb->protocol == htons(ETH_P_IP)) {
> + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
> + htonl(mtu));
> + return true;
> + } else {
> +#ifdef CONFIG_IPV6
> + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
> + return true;
> +#else
> + return false;
> +#endif
> + }
> + }
> +
> + /* Sanity check */
> + if (skb->protocol == htons(ETH_P_IP)) {
> + if (mtu < 68)
> + return false;
> +
> + if (!ipv4_should_icmp(skb))
> + return true;
> + } else {
> + if (mtu < IPV6_MIN_MTU)
> + return false;
> +
> + /* In theory we should do PMTUD on IPv6 multicast messages but
> + * we don't have an address to send from so just fragment. */
> + if (ipv6_addr_type(&ipv6_hdr(skb)->daddr) & IPV6_ADDR_MULTICAST)
> + return false;
> +
> + if (!ipv6_should_icmp(skb))
> + return true;
> + }
> +
> + /* Allocate */
> + if (old_eh->h_proto == htons(ETH_P_8021Q))
> + eth_hdr_len = VLAN_ETH_HLEN;
> +
> + payload_length = skb->len - eth_hdr_len;
> + if (skb->protocol == htons(ETH_P_IP)) {
> + header_length = sizeof(struct iphdr) + sizeof(struct icmphdr);
> + total_length = min_t(unsigned int, header_length +
> + payload_length, 576);
> + } else {
> + header_length = sizeof(struct ipv6hdr) +
> + sizeof(struct icmp6hdr);
> + total_length = min_t(unsigned int, header_length +
> + payload_length, IPV6_MIN_MTU);
> + }
> + total_length = min(total_length, dev->mtu);
> + payload_length = total_length - header_length;
> +
> + nskb = netdev_alloc_skb(dev, NET_IP_ALIGN + eth_hdr_len
> + + header_length + payload_length);
I don't know whether netdev_alloc_skb() is considered OK to use for
allocating an sk_buff to *send*. The uses within net/ seem to be
oriented around *receiving*, and the comment on netdev_alloc_skb() says
that it is for allocating a buffer for *receiving* on a given device.
> + if (!nskb)
> + return false;
> +
> + skb_reserve(nskb, NET_IP_ALIGN);
But if netdev_alloc_skb() is OK, then you could use
netdev_alloc_skb_ip_align() to avoid writing the skb_reserve()
explicitly.
> + /* Ethernet / VLAN */
> + eh = (struct ethhdr *)skb_put(nskb, eth_hdr_len);
> + memcpy(eh->h_dest, old_eh->h_source, ETH_ALEN);
> + memcpy(eh->h_source, dev->dev_addr, ETH_ALEN);
> +
> + if (old_eh->h_proto != htons(ETH_P_8021Q)) {
> + eh->h_proto = old_eh->h_proto;
> + } else {
> + struct vlan_ethhdr *vh = (struct vlan_ethhdr *)eh;
> +
> + vh->h_vlan_proto = htons(ETH_P_8021Q);
> + vh->h_vlan_TCI = vlan_eth_hdr(skb)->h_vlan_TCI;
> + vh->h_vlan_encapsulated_proto = skb->protocol;;
> + }
The last statement there ends with a double semicolon.
I would be tempted to write this as something like
eh->h_proto = old_eh->h_proto;
if (eh->h_proto == htons(ETH_P_8021Q)) {
struct vlan_ethhdr *vh = (struct vlan_ethhdr *)eh;
vh->h_vlan_TCI = vlan_eth_hdr(skb)->h_vlan_TCI;
vh->h_vlan_encapsulated_proto = skb->protocol;
}
(or would it even be possible to memcpy the vlan_eth_hdr from the old
skb into the new one?)
> @@ -758,6 +1143,9 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
> skb->protocol = 0;
> }
>
> + if (dev->br_port)
> + skb_dst_drop(skb);
> +
I'm curious about why skb_dst_drop(skb) is conditional on the device
being a bridge port.
> @@ -898,7 +1298,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
>
> max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;
>
> - if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
> + if (skb_headroom(skb) < max_headroom ||
> (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
> struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
> if (!new_skb) {
Why is the above change valid?
> @@ -1343,7 +1749,7 @@ static void ethtool_getinfo(struct net_device *dev,
> }
>
> static struct ethtool_ops ethtool_ops = {
> - .get_drvinfo = ethtool_getinfo,
> + .get_drvinfo = ethtool_getinfo,
> };
>
> #ifdef HAVE_NET_DEVICE_OPS
> @@ -1876,7 +2282,7 @@ static int __init ipgre_init(void)
> {
> int err;
>
> - printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
> + printk("Open vSwitch GRE over IPv4, built "__DATE__" "__TIME__"\n");
It's probably best to retain the KERN_INFO annotation here.
> diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
> index e27299a..ade1387 100644
> --- a/lib/netdev-linux.c
> +++ b/lib/netdev-linux.c
> @@ -340,6 +340,10 @@ error:
> #endif
> }
>
> +#ifndef IP_DF
> +#define IP_DF 0x4000
> +#endif
Here in userspace we have this available in packets.h as
IP_DONT_FRAGMENT.
> static int
> setup_gre_ioctl(const char *name, struct gre_config *config, bool create)
> {
I must confess that I don't have a solid grasp of all of this code. But
it certainly looks convincing!
More information about the dev
mailing list