[ovs-dev] [PATCH v4] gre: Restructure tunneling.

Jesse Gross jesse at nicira.com
Thu Jun 6 00:46:56 UTC 2013


On Tue, Jun 4, 2013 at 1:31 PM, Pravin B Shelar <pshelar at nicira.com> wrote:
> diff --git a/datapath/compat.h b/datapath/compat.h
> index c7fd225..6095323 100644
> --- a/datapath/compat.h
> +++ b/datapath/compat.h
> @@ -93,5 +93,11 @@ static inline void skb_set_mark(struct sk_buff *skb, u32 mark)
>         skb->mark = mark;
>  }
>  #endif /* after 2.6.20 */
> +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36)
> +#define rt_dst(rt) (rt->dst)
> +#else
> +#define rt_dst(rt) (rt->u.dst)
> +#endif
> +

I think the spacing is a little off here - this adds an extra blank
line at the end of this block and none before.

> diff --git a/datapath/linux/compat/gre.c b/datapath/linux/compat/gre.c
> new file mode 100644
> index 0000000..63c298b
> --- /dev/null
> +++ b/datapath/linux/compat/gre.c
> +static int gre_compat_init(void)
> +{
> +       int err;
> +
> +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37)
> +       if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) {
> +               pr_err("can't add protocol\n");

We should have a more descriptive error message since there isn't much
context in the operation of the larger OVS module.

> diff --git a/datapath/linux/compat/gso.c b/datapath/linux/compat/gso.c
> new file mode 100644
> index 0000000..5a3a786
> --- /dev/null
> +++ b/datapath/linux/compat/gso.c
> +static __be16 skb_network_protocol(struct sk_buff *skb)
> +{
> +       __be16 type = skb->protocol;
> +       int vlan_depth = ETH_HLEN;
> +
> +       while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
> +               struct vlan_hdr *vh;
> +
> +               if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
> +                       return 0;
> +
> +               vh = (struct vlan_hdr *)(skb->data + vlan_depth);
> +               type = vh->h_vlan_encapsulated_proto;
> +               vlan_depth += VLAN_HLEN;
> +       }
> +
> +       return type;
> +}

Since we're assuming that these are raw IP packets, is this check even
necessary?

> +static struct sk_buff *tnl_skb_gso_segment(struct sk_buff *skb,
> +                                          netdev_features_t features,
> +                                          bool tx_path)
> +{
> +       struct iphdr *iph       = ip_hdr(skb);
> +       int tnl_hlen            = skb_inner_network_offset(skb);

I think this also includes the inner MAC header, so I would probably
chose a different name.

> +       struct sk_buff *skb1    = skb;
> +       struct ovs_gso_cb cb    = *OVS_GSO_CB(skb);
> +       struct sk_buff *segs;
> +
> +       __skb_pull(skb, tnl_hlen);
> +       skb_reset_mac_header(skb);
> +       skb_reset_network_header(skb);
> +       skb_reset_transport_header(skb);
> +       skb->protocol = skb_network_protocol(skb);
> +
> +       segs = __skb_gso_segment(skb, 0, tx_path);
> +       if (!segs || IS_ERR(segs))
> +               goto free;
> +
> +       skb = segs;
> +       while (skb) {
> +               __skb_push(skb, tnl_hlen);
> +               skb_reset_mac_header(skb);
> +               skb_reset_network_header(skb);
> +               skb_set_transport_header(skb, sizeof(struct iphdr));
> +               skb->mac_len = 0;
> +
> +               memcpy(ip_hdr(skb), iph, tnl_hlen);
> +               *OVS_GSO_CB(skb) = cb;

Doesn't the cb automatically get replicated onto every segment?

> +static bool need_linearize(const struct sk_buff *skb)
> +{
> +       int i;
> +
> +       if (unlikely(skb_shinfo(skb)->frag_list))
> +               return true;
> +
> +       /*
> +        * Generally speaking we should linearize if there are paged frags.
> +        * However, if all of the refcounts are 1 we know nobody else can
> +        * change them from underneath us and we can skip the linearization.
> +        */
> +       for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
> +               if (unlikely(page_count(skb_frag_page(&skb_shinfo(skb)->frags[i])) > 1))
> +                       return true;
> +
> +       return false;
> +}

I think we should probably not try to be too clever and just use the
same logic as skb_needs_linearize().

> +int rpl_ip_local_out(struct sk_buff *skb)
[...]
> +       while (skb) {
> +               struct sk_buff *next_skb = skb->next;
> +               struct iphdr *iph;
> +               int err;
> +
> +               if (next_skb)
> +                       skb_dst_set(skb, dst_clone(dst));
> +               else
> +                       skb_dst_set(skb, dst);

This doesn't seem quite right conceptually, since it will give two
references to the first skb and then none to the last. It likely works
in most cases but it seems risky.

> +               skb->next = NULL;
> +
> +               /*
> +                * Allow our local IP stack to fragment the outer packet even
> +                * if the DF bit is set as a last resort.  We also need to
> +                * force selection of an IP ID here because Linux will
> +                * otherwise leave it at 0 if the packet originally had DF set.
> +                */
> +
> +               skb->local_df = 1;

This should be done in the protocol code, since it's not really a
function of GSO.

> +               iph = ip_hdr(skb);
> +               tunnel_ip_select_ident(skb,
> +                      (const struct iphdr *)skb_inner_network_header(skb),
> +                       skb_dst(skb));

Should we just increment the IP ID blindly like is done in GSO currently?

> diff --git a/datapath/linux/compat/gso.h b/datapath/linux/compat/gso.h
> new file mode 100644
> index 0000000..3157041
> --- /dev/null
> +++ b/datapath/linux/compat/gso.h
> +#define skb_reset_inner_headers rpl_skb_reset_inner_headers
> +static inline void skb_reset_inner_headers(struct sk_buff *skb)
> +{
> +       BUILD_BUG_ON(sizeof(struct ovs_gso_cb) > 48);

You can use FIELD_SIZEOF to programmatically get the size of skb->cb
rather than hard coding it.

> diff --git a/datapath/linux/compat/include/linux/err.h b/datapath/linux/compat/include/linux/err.h
> index 50faf2a..d640298 100644
> --- a/datapath/linux/compat/include/linux/err.h
> +++ b/datapath/linux/compat/include/linux/err.h
> +#ifndef IS_ERR_VALUE
> +#define IS_ERR_VALUE(x) unlikely((x) >= (unsigned long)-MAX_ERRNO)

Isn't IS_ERR_VALUE very old (it seems to be in 2.6.18)?

> diff --git a/datapath/linux/compat/include/net/gre.h b/datapath/linux/compat/include/net/gre.h
> new file mode 100644
> index 0000000..3285ccd
> --- /dev/null
> +++ b/datapath/linux/compat/include/net/gre.h
> @@ -0,0 +1,109 @@
> +#ifndef __LINUX_GRE_WRAPPER_H
> +#define __LINUX_GRE_WRAPPER_H
> +
> +#include <linux/skbuff.h>
> +#include <net/ip_tunnels.h>
> +
> +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,37)
> +#include_next <net/gre.h>
> +
> +#else /* LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,37) */
> +
> +#define GREPROTO_CISCO         0
> +#define GREPROTO_PPTP          1
> +#define GREPROTO_MAX           2
> +#define GRE_IP_PROTO_MAX       2

It looks like we have two definitions of GRE_IP_PROTO_MAX for kernels <= 2.6.37.

> +static inline __be16 gre_flags_to_tnl_flags(__be16 flags)
> +{
> +       __be16 tflags = 0;
> +
> +       if (flags & GRE_CSUM)
> +               tflags |= TUNNEL_CSUM;
> +       if (flags & GRE_ROUTING)
> +               tflags |= TUNNEL_ROUTING;
> +       if (flags & GRE_KEY)
> +               tflags |= TUNNEL_KEY;
> +       if (flags & GRE_SEQ)
> +               tflags |= TUNNEL_SEQ;
> +       if (flags & GRE_STRICT)
> +               tflags |= TUNNEL_STRICT;
> +       if (flags & GRE_REC)
> +               tflags |= TUNNEL_REC;
> +       if (flags & GRE_VERSION)
> +               tflags |= TUNNEL_VERSION;
> +
> +       return tflags;
> +}
> +
> +static inline __be16 tnl_flags_to_gre_flags(__be16 tflags)
> +{
> +       __be16 flags = 0;
> +
> +       if (tflags & TUNNEL_CSUM)
> +               flags |= GRE_CSUM;
> +       if (tflags & TUNNEL_ROUTING)
> +               flags |= GRE_ROUTING;
> +       if (tflags & TUNNEL_KEY)
> +               flags |= GRE_KEY;
> +       if (tflags & TUNNEL_SEQ)
> +               flags |= GRE_SEQ;
> +       if (tflags & TUNNEL_STRICT)
> +               flags |= GRE_STRICT;
> +       if (tflags & TUNNEL_REC)
> +               flags |= GRE_REC;
> +       if (tflags & TUNNEL_VERSION)
> +               flags |= GRE_VERSION;
> +
> +       return flags;
> +}

I think these functions are already upstream, should they have version guards?

> diff --git a/datapath/linux/compat/include/net/ip_tunnels.h b/datapath/linux/compat/include/net/ip_tunnels.h
> new file mode 100644
> index 0000000..ad17c9d
> --- /dev/null
> +++ b/datapath/linux/compat/include/net/ip_tunnels.h
> @@ -0,0 +1,54 @@
> +#ifndef __NET_IP_TUNNELS_WRAPPER_H
> +#define __NET_IP_TUNNELS_WRAPPER_H 1

I believe that all of these functions are already upstream but it
looks like we unconditionally use our copy. It's probably better to
use the upstream version on kernels that have it so that if the values
change, we'll automatically change with it.

> diff --git a/datapath/linux/compat/ip_tunnels_core.c b/datapath/linux/compat/ip_tunnels_core.c
> new file mode 100644
> index 0000000..a8e62ee
> --- /dev/null
> +++ b/datapath/linux/compat/ip_tunnels_core.c
> +int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto)
> +{
> +       if (inner_proto == htons(ETH_P_TEB)) {
> +               struct ethhdr *eh;
> +
> +               if (unlikely(!pskb_may_pull(skb, hdr_len + ETH_HLEN)))
> +                       return -ENOMEM;
> +
> +               __skb_pull(skb, hdr_len);
> +               skb_postpull_rcsum(skb, skb_transport_header(skb), hdr_len + ETH_HLEN);
> +
> +               eh = (struct ethhdr *)skb->data;

This whole thing about having the Ethernet header not covered
skb->csum is really an old bug that I started claiming as an
optimization at some point. However, it's really not worth much and we
probably shouldn't propagate it beyond OVS. The rule in general is
that packet headers that have not been pulled off are covered by
skb->csum so we should really just make ourselves consistent.

> +               if (likely(ntohs(eh->h_proto) >= ETH_P_802_3_MIN))
> +                       skb->protocol = eh->h_proto;
> +               else
> +                       skb->protocol = htons(ETH_P_802_2);
> +
> +       } else {
> +               if (unlikely(!pskb_may_pull(skb, hdr_len)))
> +                       return -ENOMEM;
> +
> +               __skb_pull(skb, hdr_len);
> +               skb_postpull_rcsum(skb, skb_transport_header(skb), hdr_len);

You can combine the above two steps into skb_pull_rcsum().

> +               skb->protocol = inner_proto;
> +       }
> +
> +       nf_reset(skb);
> +       secpath_reset(skb);
> +       skb_clear_rxhash(skb);
> +       skb_dst_drop(skb);
> +       vlan_set_tci(skb, 0);
> +       skb_set_queue_mapping(skb, 0);
> +       skb->pkt_type = PACKET_HOST;
> +       return 0;
> +}

Several of these come from different places (such as
__skb_tunnel_rx()). It would be good to make the compat code more
closely mirror upstream.



More information about the dev mailing list