[ovs-dev] [PATCH v4] gre: Restructure tunneling.

Pravin Shelar pshelar at nicira.com
Thu Jun 6 01:30:15 UTC 2013


On Wed, Jun 5, 2013 at 5:46 PM, Jesse Gross <jesse at nicira.com> wrote:
> On Tue, Jun 4, 2013 at 1:31 PM, Pravin B Shelar <pshelar at nicira.com> wrote:
>> diff --git a/datapath/compat.h b/datapath/compat.h
>> index c7fd225..6095323 100644
>> --- a/datapath/compat.h
>> +++ b/datapath/compat.h
>> @@ -93,5 +93,11 @@ static inline void skb_set_mark(struct sk_buff *skb, u32 mark)
>>         skb->mark = mark;
>>  }
>>  #endif /* after 2.6.20 */
>> +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36)
>> +#define rt_dst(rt) (rt->dst)
>> +#else
>> +#define rt_dst(rt) (rt->u.dst)
>> +#endif
>> +
>
> I think the spacing is a little off here - this adds an extra blank
> line at the end of this block and none before.
>
ok.

>> diff --git a/datapath/linux/compat/gre.c b/datapath/linux/compat/gre.c
>> new file mode 100644
>> index 0000000..63c298b
>> --- /dev/null
>> +++ b/datapath/linux/compat/gre.c
>> +static int gre_compat_init(void)
>> +{
>> +       int err;
>> +
>> +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37)
>> +       if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) {
>> +               pr_err("can't add protocol\n");
>
> We should have a more descriptive error message since there isn't much
> context in the operation of the larger OVS module.
>
ok.

>> diff --git a/datapath/linux/compat/gso.c b/datapath/linux/compat/gso.c
>> new file mode 100644
>> index 0000000..5a3a786
>> --- /dev/null
>> +++ b/datapath/linux/compat/gso.c
>> +static __be16 skb_network_protocol(struct sk_buff *skb)
>> +{
>> +       __be16 type = skb->protocol;
>> +       int vlan_depth = ETH_HLEN;
>> +
>> +       while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
>> +               struct vlan_hdr *vh;
>> +
>> +               if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
>> +                       return 0;
>> +
>> +               vh = (struct vlan_hdr *)(skb->data + vlan_depth);
>> +               type = vh->h_vlan_encapsulated_proto;
>> +               vlan_depth += VLAN_HLEN;
>> +       }
>> +
>> +       return type;
>> +}
>
> Since we're assuming that these are raw IP packets, is this check even
> necessary?
>
right, it is not necessary, I am also planning on getting rid of it
upstream kernel using inner protocol.

>> +static struct sk_buff *tnl_skb_gso_segment(struct sk_buff *skb,
>> +                                          netdev_features_t features,
>> +                                          bool tx_path)
>> +{
>> +       struct iphdr *iph       = ip_hdr(skb);
>> +       int tnl_hlen            = skb_inner_network_offset(skb);
>
> I think this also includes the inner MAC header, so I would probably
> chose a different name.
>
ok.

>> +       struct sk_buff *skb1    = skb;
>> +       struct ovs_gso_cb cb    = *OVS_GSO_CB(skb);
>> +       struct sk_buff *segs;
>> +
>> +       __skb_pull(skb, tnl_hlen);
>> +       skb_reset_mac_header(skb);
>> +       skb_reset_network_header(skb);
>> +       skb_reset_transport_header(skb);
>> +       skb->protocol = skb_network_protocol(skb);
>> +
>> +       segs = __skb_gso_segment(skb, 0, tx_path);
>> +       if (!segs || IS_ERR(segs))
>> +               goto free;
>> +
>> +       skb = segs;
>> +       while (skb) {
>> +               __skb_push(skb, tnl_hlen);
>> +               skb_reset_mac_header(skb);
>> +               skb_reset_network_header(skb);
>> +               skb_set_transport_header(skb, sizeof(struct iphdr));
>> +               skb->mac_len = 0;
>> +
>> +               memcpy(ip_hdr(skb), iph, tnl_hlen);
>> +               *OVS_GSO_CB(skb) = cb;
>
> Doesn't the cb automatically get replicated onto every segment?
>
right.

>> +static bool need_linearize(const struct sk_buff *skb)
>> +{
>> +       int i;
>> +
>> +       if (unlikely(skb_shinfo(skb)->frag_list))
>> +               return true;
>> +
>> +       /*
>> +        * Generally speaking we should linearize if there are paged frags.
>> +        * However, if all of the refcounts are 1 we know nobody else can
>> +        * change them from underneath us and we can skip the linearization.
>> +        */
>> +       for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
>> +               if (unlikely(page_count(skb_frag_page(&skb_shinfo(skb)->frags[i])) > 1))
>> +                       return true;
>> +
>> +       return false;
>> +}
>
> I think we should probably not try to be too clever and just use the
> same logic as skb_needs_linearize().
>
ok.

>> +int rpl_ip_local_out(struct sk_buff *skb)
> [...]
>> +       while (skb) {
>> +               struct sk_buff *next_skb = skb->next;
>> +               struct iphdr *iph;
>> +               int err;
>> +
>> +               if (next_skb)
>> +                       skb_dst_set(skb, dst_clone(dst));
>> +               else
>> +                       skb_dst_set(skb, dst);
>
> This doesn't seem quite right conceptually, since it will give two
> references to the first skb and then none to the last. It likely works
> in most cases but it seems risky.
>
ok, I will fix it.

>> +               skb->next = NULL;
>> +
>> +               /*
>> +                * Allow our local IP stack to fragment the outer packet even
>> +                * if the DF bit is set as a last resort.  We also need to
>> +                * force selection of an IP ID here because Linux will
>> +                * otherwise leave it at 0 if the packet originally had DF set.
>> +                */
>> +
>> +               skb->local_df = 1;
>
> This should be done in the protocol code, since it's not really a
> function of GSO.
>
ok.

>> +               iph = ip_hdr(skb);
>> +               tunnel_ip_select_ident(skb,
>> +                      (const struct iphdr *)skb_inner_network_header(skb),
>> +                       skb_dst(skb));
>
> Should we just increment the IP ID blindly like is done in GSO currently?
>
ok.

>> diff --git a/datapath/linux/compat/gso.h b/datapath/linux/compat/gso.h
>> new file mode 100644
>> index 0000000..3157041
>> --- /dev/null
>> +++ b/datapath/linux/compat/gso.h
>> +#define skb_reset_inner_headers rpl_skb_reset_inner_headers
>> +static inline void skb_reset_inner_headers(struct sk_buff *skb)
>> +{
>> +       BUILD_BUG_ON(sizeof(struct ovs_gso_cb) > 48);
>
> You can use FIELD_SIZEOF to programmatically get the size of skb->cb
> rather than hard coding it.
>
ok.

>> diff --git a/datapath/linux/compat/include/linux/err.h b/datapath/linux/compat/include/linux/err.h
>> index 50faf2a..d640298 100644
>> --- a/datapath/linux/compat/include/linux/err.h
>> +++ b/datapath/linux/compat/include/linux/err.h
>> +#ifndef IS_ERR_VALUE
>> +#define IS_ERR_VALUE(x) unlikely((x) >= (unsigned long)-MAX_ERRNO)
>
> Isn't IS_ERR_VALUE very old (it seems to be in 2.6.18)?
>
right, it is not required.

>> diff --git a/datapath/linux/compat/include/net/gre.h b/datapath/linux/compat/include/net/gre.h
>> new file mode 100644
>> index 0000000..3285ccd
>> --- /dev/null
>> +++ b/datapath/linux/compat/include/net/gre.h
>> @@ -0,0 +1,109 @@
>> +#ifndef __LINUX_GRE_WRAPPER_H
>> +#define __LINUX_GRE_WRAPPER_H
>> +
>> +#include <linux/skbuff.h>
>> +#include <net/ip_tunnels.h>
>> +
>> +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,37)
>> +#include_next <net/gre.h>
>> +
>> +#else /* LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,37) */
>> +
>> +#define GREPROTO_CISCO         0
>> +#define GREPROTO_PPTP          1
>> +#define GREPROTO_MAX           2
>> +#define GRE_IP_PROTO_MAX       2
>
> It looks like we have two definitions of GRE_IP_PROTO_MAX for kernels <= 2.6.37.
>
ok.

>> +static inline __be16 gre_flags_to_tnl_flags(__be16 flags)
>> +{
>> +       __be16 tflags = 0;
>> +
>> +       if (flags & GRE_CSUM)
>> +               tflags |= TUNNEL_CSUM;
>> +       if (flags & GRE_ROUTING)
>> +               tflags |= TUNNEL_ROUTING;
>> +       if (flags & GRE_KEY)
>> +               tflags |= TUNNEL_KEY;
>> +       if (flags & GRE_SEQ)
>> +               tflags |= TUNNEL_SEQ;
>> +       if (flags & GRE_STRICT)
>> +               tflags |= TUNNEL_STRICT;
>> +       if (flags & GRE_REC)
>> +               tflags |= TUNNEL_REC;
>> +       if (flags & GRE_VERSION)
>> +               tflags |= TUNNEL_VERSION;
>> +
>> +       return tflags;
>> +}
>> +
>> +static inline __be16 tnl_flags_to_gre_flags(__be16 tflags)
>> +{
>> +       __be16 flags = 0;
>> +
>> +       if (tflags & TUNNEL_CSUM)
>> +               flags |= GRE_CSUM;
>> +       if (tflags & TUNNEL_ROUTING)
>> +               flags |= GRE_ROUTING;
>> +       if (tflags & TUNNEL_KEY)
>> +               flags |= GRE_KEY;
>> +       if (tflags & TUNNEL_SEQ)
>> +               flags |= GRE_SEQ;
>> +       if (tflags & TUNNEL_STRICT)
>> +               flags |= GRE_STRICT;
>> +       if (tflags & TUNNEL_REC)
>> +               flags |= GRE_REC;
>> +       if (tflags & TUNNEL_VERSION)
>> +               flags |= GRE_VERSION;
>> +
>> +       return flags;
>> +}
>
> I think these functions are already upstream, should they have version guards?
>
This patch is against 3.8 kernel, I will have version check for 3.9
support patch.

>> diff --git a/datapath/linux/compat/include/net/ip_tunnels.h b/datapath/linux/compat/include/net/ip_tunnels.h
>> new file mode 100644
>> index 0000000..ad17c9d
>> --- /dev/null
>> +++ b/datapath/linux/compat/include/net/ip_tunnels.h
>> @@ -0,0 +1,54 @@
>> +#ifndef __NET_IP_TUNNELS_WRAPPER_H
>> +#define __NET_IP_TUNNELS_WRAPPER_H 1
>
> I believe that all of these functions are already upstream but it
> looks like we unconditionally use our copy. It's probably better to
> use the upstream version on kernels that have it so that if the values
> change, we'll automatically change with it.
>
These changes are for 3.8 or less.

>> diff --git a/datapath/linux/compat/ip_tunnels_core.c b/datapath/linux/compat/ip_tunnels_core.c
>> new file mode 100644
>> index 0000000..a8e62ee
>> --- /dev/null
>> +++ b/datapath/linux/compat/ip_tunnels_core.c
>> +int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto)
>> +{
>> +       if (inner_proto == htons(ETH_P_TEB)) {
>> +               struct ethhdr *eh;
>> +
>> +               if (unlikely(!pskb_may_pull(skb, hdr_len + ETH_HLEN)))
>> +                       return -ENOMEM;
>> +
>> +               __skb_pull(skb, hdr_len);
>> +               skb_postpull_rcsum(skb, skb_transport_header(skb), hdr_len + ETH_HLEN);
>> +
>> +               eh = (struct ethhdr *)skb->data;
>
> This whole thing about having the Ethernet header not covered
> skb->csum is really an old bug that I started claiming as an
> optimization at some point. However, it's really not worth much and we
> probably shouldn't propagate it beyond OVS. The rule in general is
> that packet headers that have not been pulled off are covered by
> skb->csum so we should really just make ourselves consistent.
>
ok.

>> +               if (likely(ntohs(eh->h_proto) >= ETH_P_802_3_MIN))
>> +                       skb->protocol = eh->h_proto;
>> +               else
>> +                       skb->protocol = htons(ETH_P_802_2);
>> +
>> +       } else {
>> +               if (unlikely(!pskb_may_pull(skb, hdr_len)))
>> +                       return -ENOMEM;
>> +
>> +               __skb_pull(skb, hdr_len);
>> +               skb_postpull_rcsum(skb, skb_transport_header(skb), hdr_len);
>
> You can combine the above two steps into skb_pull_rcsum().
>
ok.

>> +               skb->protocol = inner_proto;
>> +       }
>> +
>> +       nf_reset(skb);
>> +       secpath_reset(skb);
>> +       skb_clear_rxhash(skb);
>> +       skb_dst_drop(skb);
>> +       vlan_set_tci(skb, 0);
>> +       skb_set_queue_mapping(skb, 0);
>> +       skb->pkt_type = PACKET_HOST;
>> +       return 0;
>> +}
>
> Several of these come from different places (such as
> __skb_tunnel_rx()). It would be good to make the compat code more
> closely mirror upstream.

I am planning on adding iptunnel_pull_header() function to upstream
kernel, it is in gre-upstream series that I am going to send.



More information about the dev mailing list