[ovs-dev] [PATCH 1/2] datapath: Add support for tun_key to Open vSwitch datapath

Pravin Shelar pshelar at nicira.com
Wed Oct 10 23:23:53 UTC 2012


Patch looks good, I have few comments inlined.


On Tue, Oct 9, 2012 at 11:49 AM, Kyle Mestery <kmestery at cisco.com> wrote:
> This is a first pass at providing a tun_key which can be
> used as the basis for flow-based tunnelling. The
> tun_key includes and replaces the tun_id in both struct
> ovs_skb_cb and struct sw_tun_key.
>
> This patch allows all existing tun_id behaviour to still work. Existing
> users of tun_id are redirected to tun_key->tun_id to retain compatibility.
> However, when the userspace code is updated to make use of the new tun_key,
> the old behaviour will be deprecated and removed.
>
> NOTE: With these changes, the tunneling code no longer assumes input and
> output keys are symmetric.  If they are not, PMTUD needs to be disabled
> for tunneling to work.
>
> Signed-off-by: Kyle Mestery <kmestery at cisco.com>
> Cc: Simon Horman <horms at verge.net.au>
> Cc: Jesse Gross <jesse at nicira.com>
> ---
> V6:
> - Fix more comments addressed from Jesse.
> V5:
> - Address another round of comments from Jesse.
> V4:
> - Address 2 comments from Jesse:
>   - When processing actions, if OVS_CB(skb)->tun_key is NULL, point it at one
>     on the stack temporarily. This goes away when we remove the ability to set
>     tun_id outside the scope of tun_key.
>   - Move tun_key to the end of struct sw_flow_key.
> V3:
> - Fix issues found during review by Jesse.
> - Add a NEWS entry around tunnel code no longer assuming symmetric input and
>   output tunnel keys.
>
> V2:
> - Fix blank line addition/removal found by Simon.
> - Fix hex printing output found by Simon.
> ---
>  NEWS                        |   3 ++
>  datapath/actions.c          |  38 +++++++++++++----
>  datapath/datapath.c         |  10 ++++-
>  datapath/datapath.h         |   5 ++-
>  datapath/flow.c             |  64 ++++++++++++++++++++++++----
>  datapath/flow.h             |  12 ++++--
>  datapath/tunnel.c           | 101 ++++++++++++++++++++++++++++----------------
>  datapath/tunnel.h           |  15 ++++++-
>  datapath/vport-capwap.c     |  12 +++---
>  datapath/vport-gre.c        |  15 ++++---
>  datapath/vport.c            |   2 +-
>  include/linux/openvswitch.h |  18 +++++++-
>  lib/dpif-netdev.c           |   1 +
>  lib/odp-util.c              |  15 ++++++-
>  lib/odp-util.h              |   3 +-
>  15 files changed, 235 insertions(+), 79 deletions(-)
>
> diff --git a/NEWS b/NEWS
> index 29fd9f3..ae831e3 100644
> --- a/NEWS
> +++ b/NEWS
> @@ -1,5 +1,8 @@
>  post-v1.8.0
>  ------------------------
> +    - The tunneling code no longer assumes input and output keys are symmetric.
> +      If they are not, PMTUD needs to be disabled for tunneling to work. Note
> +      this only applies to flow-based keys.
>      - FreeBSD is now a supported platform, thanks to code contributions from
>        Gaetano Catalli, Ed Maste, and Giuseppe Lettieri.
>      - ovs-bugtool: New --ovs option to report only OVS related information.
> diff --git a/datapath/actions.c b/datapath/actions.c
> index ec9b595..fa8c10d 100644
> --- a/datapath/actions.c
> +++ b/datapath/actions.c
> @@ -37,7 +37,8 @@
>  #include "vport.h"
>
>  static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
> -                       const struct nlattr *attr, int len, bool keep_skb);
> +                       const struct nlattr *attr, int len,
> +                       struct ovs_key_ipv4_tunnel *tun_key, bool keep_skb);
>
>  static int make_writable(struct sk_buff *skb, int write_len)
>  {
> @@ -329,11 +330,14 @@ static int sample(struct datapath *dp, struct sk_buff *skb,
>         }
>
>         return do_execute_actions(dp, skb, nla_data(acts_list),
> -                                                nla_len(acts_list), true);
> +                                                nla_len(acts_list),
> +                                                OVS_CB(skb)->tun_key,
> +                                                true);
>  }
>
>  static int execute_set_action(struct sk_buff *skb,
> -                                const struct nlattr *nested_attr)
> +                                const struct nlattr *nested_attr,
> +                                struct ovs_key_ipv4_tunnel *tun_key)
>  {
>         int err = 0;
>
> @@ -343,7 +347,21 @@ static int execute_set_action(struct sk_buff *skb,
>                 break;
>
>         case OVS_KEY_ATTR_TUN_ID:
> -               OVS_CB(skb)->tun_id = nla_get_be64(nested_attr);
> +               if (OVS_CB(skb)->tun_key == NULL) {
> +                       /* If tun_key is NULL for this skb, assign it to
> +                        * a value the caller passed in for action processing
> +                        * and output. This can disappear once we drop support
> +                        * for setting tun_id outside of tun_key.
> +                        */
> +                       memset(tun_key, 0, sizeof(struct ovs_key_ipv4_tunnel));
> +                       OVS_CB(skb)->tun_key = tun_key;
> +               }
> +
> +               OVS_CB(skb)->tun_key->tun_id = nla_get_be64(nested_attr);
> +               break;
> +
> +       case OVS_KEY_ATTR_IPV4_TUNNEL:
> +               OVS_CB(skb)->tun_key = nla_data(nested_attr);
>                 break;
>
>         case OVS_KEY_ATTR_ETHERNET:
> @@ -368,7 +386,8 @@ static int execute_set_action(struct sk_buff *skb,
>
>  /* Execute a list of actions against 'skb'. */
>  static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
> -                       const struct nlattr *attr, int len, bool keep_skb)
> +                       const struct nlattr *attr, int len,
> +                       struct ovs_key_ipv4_tunnel *tun_key, bool keep_skb)
>  {
>         /* Every output action needs a separate clone of 'skb', but the common
>          * case is just a single output action, so that doing a clone and
> @@ -407,7 +426,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
>                         break;
>
>                 case OVS_ACTION_ATTR_SET:
> -                       err = execute_set_action(skb, nla_data(a));
> +                       err = execute_set_action(skb, nla_data(a), tun_key);
>                         break;
>
>                 case OVS_ACTION_ATTR_SAMPLE:
> @@ -458,6 +477,9 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb)
>         struct sw_flow_actions *acts = rcu_dereference(OVS_CB(skb)->flow->sf_acts);
>         struct loop_counter *loop;
>         int error;
> +       struct ovs_key_ipv4_tunnel tun_key;
> +
> +        memset(&tun_key, 0, sizeof(tun_key));
>
tun key will zeroed while executing tunnel set action, Do we still
need to zero it here?

>         /* Check whether we've looped too much. */
>         loop = &__get_cpu_var(loop_counters);
> @@ -469,9 +491,9 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb)
>                 goto out_loop;
>         }
>
> -       OVS_CB(skb)->tun_id = 0;
> +       OVS_CB(skb)->tun_key = NULL;
>         error = do_execute_actions(dp, skb, acts->actions,
> -                                        acts->actions_len, false);
> +                                        acts->actions_len, &tun_key, false);
>
>         /* Check whether sub-actions looped too much. */
>         if (unlikely(loop->looping))
> diff --git a/datapath/datapath.c b/datapath/datapath.c
> index a6915fb..d8a198e 100644
> --- a/datapath/datapath.c
> +++ b/datapath/datapath.c
> @@ -587,12 +587,20 @@ static int validate_set(const struct nlattr *a,
>
>         switch (key_type) {
>         const struct ovs_key_ipv4 *ipv4_key;
> +       const struct ovs_key_ipv4_tunnel *tun_key;
>
>         case OVS_KEY_ATTR_PRIORITY:
>         case OVS_KEY_ATTR_TUN_ID:
>         case OVS_KEY_ATTR_ETHERNET:
>                 break;
>
> +       case OVS_KEY_ATTR_IPV4_TUNNEL:
> +               tun_key = nla_data(ovs_key);
> +               if (!tun_key->ipv4_dst) {
> +                       return -EINVAL;
> +               }
> +               break;
> +
>         case OVS_KEY_ATTR_IPV4:
>                 if (flow_key->eth.type != htons(ETH_P_IP))
>                         return -EINVAL;
> @@ -785,7 +793,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
>
>         err = ovs_flow_metadata_from_nlattrs(&flow->key.phy.priority,
>                                              &flow->key.phy.in_port,
> -                                            &flow->key.phy.tun_id,
> +                                            &flow->key.tun.tun_key,
>                                              a[OVS_PACKET_ATTR_KEY]);
>         if (err)
>                 goto err_flow_put;
> diff --git a/datapath/datapath.h b/datapath/datapath.h
> index affbf0e..c5df12d 100644
> --- a/datapath/datapath.h
> +++ b/datapath/datapath.h
> @@ -96,7 +96,8 @@ struct datapath {
>  /**
>   * struct ovs_skb_cb - OVS data in skb CB
>   * @flow: The flow associated with this packet.  May be %NULL if no flow.
> - * @tun_id: ID of the tunnel that encapsulated this packet.  It is 0 if the
> + * @tun_key: Key for the tunnel that encapsulated this packet. NULL if the
> + * packet is not being tunneled.
>   * @ip_summed: Consistently stores L4 checksumming status across different
>   * kernel versions.
>   * @csum_start: Stores the offset from which to start checksumming independent
> @@ -107,7 +108,7 @@ struct datapath {
>   */
>  struct ovs_skb_cb {
>         struct sw_flow          *flow;
> -       __be64                  tun_id;
> +       struct ovs_key_ipv4_tunnel  *tun_key;
>  #ifdef NEED_CSUM_NORMALIZE
>         enum csum_type          ip_summed;
>         u16                     csum_start;
> diff --git a/datapath/flow.c b/datapath/flow.c
> index d07337c..376f4be 100644
> --- a/datapath/flow.c
> +++ b/datapath/flow.c
> @@ -629,7 +629,8 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
>         memset(key, 0, sizeof(*key));
>
>         key->phy.priority = skb->priority;
> -       key->phy.tun_id = OVS_CB(skb)->tun_id;
> +       if (OVS_CB(skb)->tun_key)
> +               key->tun.tun_key = *OVS_CB(skb)->tun_key;
>         key->phy.in_port = in_port;
>
>         skb_reset_mac_header(skb);
> @@ -847,6 +848,7 @@ const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
>
>         /* Not upstream. */
>         [OVS_KEY_ATTR_TUN_ID] = sizeof(__be64),
> +       [OVS_KEY_ATTR_IPV4_TUNNEL] = sizeof(struct ovs_key_ipv4_tunnel),
>  };
>
>  static int ipv4_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_len,
> @@ -1022,9 +1024,30 @@ int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp,
>                 swkey->phy.in_port = DP_MAX_PORTS;
>         }
>
> -       if (attrs & (1ULL << OVS_KEY_ATTR_TUN_ID)) {
> -               swkey->phy.tun_id = nla_get_be64(a[OVS_KEY_ATTR_TUN_ID]);
> +       /* OVS_KEY_ATTR_TUN_ID and OVS_KEY_ATTR_IPV4_TUNEL must both arrive
> +        * together.
> +        */
> +       if (attrs & (1ULL << OVS_KEY_ATTR_TUN_ID) &&
> +           attrs & (1ULL << OVS_KEY_ATTR_IPV4_TUNNEL)) {
> +               struct ovs_key_ipv4_tunnel *tun_key;
> +               __be64 tun_id = 0;
> +               tun_key = nla_data(a[OVS_KEY_ATTR_IPV4_TUNNEL]);
> +               tun_id = nla_get_be64(a[OVS_KEY_ATTR_TUN_ID]);
> +
> +               if (tun_id != tun_key->tun_id)
> +                       return -EINVAL;
> +
> +               swkey->tun.tun_key = *tun_key;
>                 attrs &= ~(1ULL << OVS_KEY_ATTR_TUN_ID);
> +               attrs &= ~(1ULL << OVS_KEY_ATTR_IPV4_TUNNEL);
> +       } else if (attrs & (1ULL << OVS_KEY_ATTR_TUN_ID)) {
> +               swkey->tun.tun_key.tun_id = nla_get_be64(a[OVS_KEY_ATTR_TUN_ID]);
> +               attrs &= ~(1ULL << OVS_KEY_ATTR_TUN_ID);
> +       } else if (attrs & (1ULL << OVS_KEY_ATTR_IPV4_TUNNEL)) {
> +               struct ovs_key_ipv4_tunnel *tun_key;
> +               tun_key = nla_data(a[OVS_KEY_ATTR_IPV4_TUNNEL]);
> +               swkey->tun.tun_key = *tun_key;
> +               attrs &= ~(1ULL << OVS_KEY_ATTR_IPV4_TUNNEL);
>         }

While validating set action OVS_KEY_ATTR_IPV4_TUNNEL, we are checking
for ipv4_dst, I think we need similar check here.

>
>         /* Data attributes. */
> @@ -1162,14 +1185,16 @@ int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp,
>   * get the metadata, that is, the parts of the flow key that cannot be
>   * extracted from the packet itself.
>   */
> -int ovs_flow_metadata_from_nlattrs(u32 *priority, u16 *in_port, __be64 *tun_id,
> +int ovs_flow_metadata_from_nlattrs(u32 *priority, u16 *in_port,
> +                                  struct ovs_key_ipv4_tunnel *tun_key,
>                                    const struct nlattr *attr)
>  {
>         const struct nlattr *nla;
>         int rem;
> +       __be64 tun_id = 0;
>
>         *in_port = DP_MAX_PORTS;
> -       *tun_id = 0;
> +       memset(tun_key, 0, sizeof(*tun_key));
>         *priority = 0;
>
>         nla_for_each_nested(nla, attr, rem) {
> @@ -1185,7 +1210,20 @@ int ovs_flow_metadata_from_nlattrs(u32 *priority, u16 *in_port, __be64 *tun_id,
>                                 break;
>
>                         case OVS_KEY_ATTR_TUN_ID:
> -                               *tun_id = nla_get_be64(nla);
> +                               tun_id = nla_get_be64(nla);
> +                               if ((tun_key->tun_flags & FLOW_TNL_F_KEY) &&
> +                                   tun_key->tun_id != tun_id)
> +                                       return -EINVAL;
> +                               break;
> +
> +                       case OVS_KEY_ATTR_IPV4_TUNNEL:
> +                               if (tun_key->tun_flags & FLOW_TNL_F_KEY)
> +                                       tun_id = tun_key->tun_id;
> +                               memcpy(tun_key, nla_data(nla),
> +                                      sizeof(*tun_key));
> +                               if ((tun_key->tun_flags & FLOW_TNL_F_KEY) &&
> +                                   tun_id != tun_key->tun_id)
> +                                       return -EINVAL;
>                                 break;
>
>                         case OVS_KEY_ATTR_IN_PORT:
> @@ -1210,10 +1248,20 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb)
>             nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, swkey->phy.priority))
>                 goto nla_put_failure;
>
> -       if (swkey->phy.tun_id != cpu_to_be64(0) &&
> -           nla_put_be64(skb, OVS_KEY_ATTR_TUN_ID, swkey->phy.tun_id))
> +        if ((swkey->tun.tun_key.tun_flags & FLOW_TNL_F_KEY) &&
> +           nla_put_be64(skb, OVS_KEY_ATTR_TUN_ID, swkey->tun.tun_key.tun_id))
>                 goto nla_put_failure;
>
> +       if (swkey->tun.tun_key.ipv4_dst) {
> +               struct ovs_key_ipv4_tunnel *tun_key;
> +               nla = nla_reserve(skb, OVS_KEY_ATTR_IPV4_TUNNEL,
> +                                 sizeof(*tun_key));
> +               if (!nla)
> +                       goto nla_put_failure;
> +               tun_key = nla_data(nla);
> +               *tun_key = swkey->tun.tun_key;
> +       }
> +

Why are we sending both OVS_KEY_ATTR_TUN_ID and
OVS_KEY_ATTR_IPV4_TUNNEL attributed here?
we could first check for ipv4_dst else for FLOW_TNL_F_KEY in flow key.

>         if (swkey->phy.in_port != DP_MAX_PORTS &&
>             nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, swkey->phy.in_port))
>                 goto nla_put_failure;
> diff --git a/datapath/flow.h b/datapath/flow.h
> index 02c563a..4430b32 100644
> --- a/datapath/flow.h
> +++ b/datapath/flow.h
> @@ -42,10 +42,12 @@ struct sw_flow_actions {
>
>  struct sw_flow_key {
>         struct {
> -               __be64  tun_id;         /* Encapsulating tunnel ID. */
>                 u32     priority;       /* Packet QoS priority. */
>                 u16     in_port;        /* Input switch port (or DP_MAX_PORTS). */
>         } phy;
> +        struct {
> +               struct ovs_key_ipv4_tunnel tun_key;  /* Encapsulating tunnel key. */
> +       } tun;
>         struct {
>                 u8     src[ETH_ALEN];   /* Ethernet source address. */
>                 u8     dst[ETH_ALEN];   /* Ethernet destination address. */
> @@ -150,6 +152,7 @@ u64 ovs_flow_used_time(unsigned long flow_jiffies);
>   *                         ------  ---  ------  -----
>   *  OVS_KEY_ATTR_PRIORITY      4    --     4      8
>   *  OVS_KEY_ATTR_TUN_ID        8    --     4     12
> + *  OVS_KEY_ATTR_IPV4_TUNNEL  24    --     4     28
>   *  OVS_KEY_ATTR_IN_PORT       4    --     4      8
>   *  OVS_KEY_ATTR_ETHERNET     12    --     4     16
>   *  OVS_KEY_ATTR_ETHERTYPE     2     2     4      8  (outer VLAN ethertype)
> @@ -160,14 +163,15 @@ u64 ovs_flow_used_time(unsigned long flow_jiffies);
>   *  OVS_KEY_ATTR_ICMPV6        2     2     4      8
>   *  OVS_KEY_ATTR_ND           28    --     4     32
>   *  -------------------------------------------------
> - *  total                                       156
> + *  total                                       184
>   */
> -#define FLOW_BUFSIZE 156
> +#define FLOW_BUFSIZE 184
>
>  int ovs_flow_to_nlattrs(const struct sw_flow_key *, struct sk_buff *);
>  int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp,
>                       const struct nlattr *);
> -int ovs_flow_metadata_from_nlattrs(u32 *priority, u16 *in_port, __be64 *tun_id,
> +int ovs_flow_metadata_from_nlattrs(u32 *priority, u16 *in_port,
> +                                  struct ovs_key_ipv4_tunnel *tun_key,
>                                    const struct nlattr *);
>
>  #define MAX_ACTIONS_BUFSIZE    (16 * 1024)
> diff --git a/datapath/tunnel.c b/datapath/tunnel.c
> index d651c11..739f098 100644
> --- a/datapath/tunnel.c
> +++ b/datapath/tunnel.c
> @@ -367,9 +367,9 @@ struct vport *ovs_tnl_find_port(struct net *net, __be32 saddr, __be32 daddr,
>         return NULL;
ñ>  }
>
> -static void ecn_decapsulate(struct sk_buff *skb, u8 tos)
> +static void ecn_decapsulate(struct sk_buff *skb)
>  {
> -       if (unlikely(INET_ECN_is_ce(tos))) {
> +       if (unlikely(INET_ECN_is_ce(OVS_CB(skb)->tun_key->ipv4_tos))) {
>                 __be16 protocol = skb->protocol;
>
>                 skb_set_network_header(skb, ETH_HLEN);
> @@ -416,7 +416,7 @@ static void ecn_decapsulate(struct sk_buff *skb, u8 tos)
>   * - skb->csum does not include the inner Ethernet header.
>   * - The layer pointers are undefined.
>   */
> -void ovs_tnl_rcv(struct vport *vport, struct sk_buff *skb, u8 tos)
> +void ovs_tnl_rcv(struct vport *vport, struct sk_buff *skb)
>  {
>         struct ethhdr *eh;
>
> @@ -433,7 +433,7 @@ void ovs_tnl_rcv(struct vport *vport, struct sk_buff *skb, u8 tos)
>         skb_clear_rxhash(skb);
>         secpath_reset(skb);
>
> -       ecn_decapsulate(skb, tos);
> +       ecn_decapsulate(skb);
>         vlan_set_tci(skb, 0);
>
>         if (unlikely(compute_ip_summed(skb, false))) {
> @@ -613,7 +613,7 @@ static void ipv6_build_icmp(struct sk_buff *skb, struct sk_buff *nskb,
>
>  bool ovs_tnl_frag_needed(struct vport *vport,
>                          const struct tnl_mutable_config *mutable,
> -                        struct sk_buff *skb, unsigned int mtu, __be64 flow_key)
> +                        struct sk_buff *skb, unsigned int mtu)
>  {
>         unsigned int eth_hdr_len = ETH_HLEN;
>         unsigned int total_length = 0, header_length = 0, payload_length;
> @@ -697,17 +697,6 @@ bool ovs_tnl_frag_needed(struct vport *vport,
>                 ipv6_build_icmp(skb, nskb, mtu, payload_length);
>  #endif
>
> -       /*
> -        * Assume that flow based keys are symmetric with respect to input
> -        * and output and use the key that we were going to put on the
> -        * outgoing packet for the fake received packet.  If the keys are
> -        * not symmetric then PMTUD needs to be disabled since we won't have
> -        * any way of synthesizing packets.
> -        */
> -       if ((mutable->flags & (TNL_F_IN_KEY_MATCH | TNL_F_OUT_KEY_ACTION)) ==
> -           (TNL_F_IN_KEY_MATCH | TNL_F_OUT_KEY_ACTION))
> -               OVS_CB(nskb)->tun_id = flow_key;
> -
>         if (unlikely(compute_ip_summed(nskb, false))) {
>                 kfree_skb(nskb);
>                 return false;
> @@ -723,12 +712,20 @@ static bool check_mtu(struct sk_buff *skb,
>                       const struct tnl_mutable_config *mutable,
>                       const struct rtable *rt, __be16 *frag_offp)
>  {
> -       bool df_inherit = mutable->flags & TNL_F_DF_INHERIT;
> +       bool df_inherit;
>         bool pmtud = mutable->flags & TNL_F_PMTUD;
> -       __be16 frag_off = mutable->flags & TNL_F_DF_DEFAULT ? htons(IP_DF) : 0;
> +       __be16 frag_off;
>         int mtu = 0;
>         unsigned int packet_length = skb->len - ETH_HLEN;
>
> +       if (OVS_CB(skb)->tun_key->ipv4_dst != 0) {
> +               df_inherit = false;
> +               frag_off = OVS_CB(skb)->tun_key->tun_flags & FLOW_TNL_F_DONT_FRAGMENT;
> +       } else {
> +               df_inherit = mutable->flags & TNL_F_DF_INHERIT;
> +               frag_off = mutable->flags & TNL_F_DF_DEFAULT ? htons(IP_DF) : 0;
> +        }
> +
Why not set pmtud to false incase of flow-based tunneling like you did
it for df_inherit?

>         /* Allow for one level of tagging in the packet length. */
>         if (!vlan_tx_tag_present(skb) &&
>             eth_hdr(skb)->h_proto == htons(ETH_P_8021Q))
> @@ -760,8 +757,7 @@ static bool check_mtu(struct sk_buff *skb,
>                         mtu = max(mtu, IP_MIN_MTU);
>
>                         if (packet_length > mtu &&
> -                           ovs_tnl_frag_needed(vport, mutable, skb, mtu,
> -                                               OVS_CB(skb)->tun_id))
> +                           ovs_tnl_frag_needed(vport, mutable, skb, mtu))
>                                 return false;
>                 }
>         }
> @@ -777,8 +773,7 @@ static bool check_mtu(struct sk_buff *skb,
>                         mtu = max(mtu, IPV6_MIN_MTU);
>
>                         if (packet_length > mtu &&
> -                           ovs_tnl_frag_needed(vport, mutable, skb, mtu,
> -                                               OVS_CB(skb)->tun_id))
> +                           ovs_tnl_frag_needed(vport, mutable, skb, mtu))
>                                 return false;
>                 }
>         }
> @@ -1000,15 +995,16 @@ unlock:
>  }
>
>  static struct rtable *__find_route(const struct tnl_mutable_config *mutable,
> -                                  u8 ipproto, u8 tos)
> +                                  __be32 saddr, __be32 daddr, u8 ipproto,
> +                                  u8 tos)
>  {
>         /* Tunnel configuration keeps DSCP part of TOS bits, But Linux
>          * router expect RT_TOS bits only. */
>
>  #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,39)
>         struct flowi fl = { .nl_u = { .ip4_u = {
> -                                       .daddr = mutable->key.daddr,
> -                                       .saddr = mutable->key.saddr,
> +                                       .daddr = daddr,
> +                                       .saddr = saddr,
>                                         .tos   = RT_TOS(tos) } },
>                                         .proto = ipproto };
>         struct rtable *rt;
> @@ -1018,8 +1014,8 @@ static struct rtable *__find_route(const struct tnl_mutable_config *mutable,
>
>         return rt;
>  #else
> -       struct flowi4 fl = { .daddr = mutable->key.daddr,
> -                            .saddr = mutable->key.saddr,
> +       struct flowi4 fl = { .daddr = daddr,
> +                            .saddr = saddr,
>                              .flowi4_tos = RT_TOS(tos),
>                              .flowi4_proto = ipproto };
>
> @@ -1029,7 +1025,8 @@ static struct rtable *__find_route(const struct tnl_mutable_config *mutable,
>
>  static struct rtable *find_route(struct vport *vport,
>                                  const struct tnl_mutable_config *mutable,
> -                                u8 tos, struct tnl_cache **cache)
> +                                __be32 saddr, __be32 daddr, u8 tos,
> +                                struct tnl_cache **cache)
>  {
>         struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
>         struct tnl_cache *cur_cache = rcu_dereference(tnl_vport->cache);
> @@ -1037,18 +1034,21 @@ static struct rtable *find_route(struct vport *vport,
>         *cache = NULL;
>         tos = RT_TOS(tos);
>
> -       if (likely(tos == RT_TOS(mutable->tos) &&
> -           check_cache_valid(cur_cache, mutable))) {
> +       if (daddr == mutable->key.daddr && saddr == mutable->key.saddr &&
> +           tos == RT_TOS(mutable->tos) &&
> +           check_cache_valid(cur_cache, mutable)) {
>                 *cache = cur_cache;
>                 return cur_cache->rt;
>         } else {
>                 struct rtable *rt;
>
> -               rt = __find_route(mutable, tnl_vport->tnl_ops->ipproto, tos);
> +               rt = __find_route(mutable, saddr, daddr,
> +                                 tnl_vport->tnl_ops->ipproto, tos);
>                 if (IS_ERR(rt))
>                         return NULL;
>
> -               if (likely(tos == RT_TOS(mutable->tos)))
> +               if (daddr == mutable->key.daddr && saddr == mutable->key.saddr &&
> +                   tos == RT_TOS(mutable->tos))
>                         *cache = build_cache(vport, mutable, rt);
>
I am not sure why checks are added on saddr and daddr here?
header caching is not set in case of flow based tunneling to
build_cache call is no-op anyways.

>                 return rt;
> @@ -1178,8 +1178,11 @@ int ovs_tnl_send(struct vport *vport, struct sk_buff *skb)
>         struct rtable *rt;
>         struct dst_entry *unattached_dst = NULL;
>         struct tnl_cache *cache;
> +       struct ovs_key_ipv4_tunnel tun_key;
>         int sent_len = 0;
>         __be16 frag_off = 0;
> +       __be32 daddr;
> +       __be32 saddr;
>         u8 ttl;
>         u8 inner_tos;
>         u8 tos;
> @@ -1207,6 +1210,13 @@ int ovs_tnl_send(struct vport *vport, struct sk_buff *skb)
>         }
>  #endif
>
> +       /* If OVS_CB(skb)->tun_key is NULL, point it at the local tun_key here,
> +        * and zero it out.
> +        */
> +       if (OVS_CB(skb)->tun_key == NULL) {
> +               memset(&tun_key, 0, sizeof(tun_key));
> +               OVS_CB(skb)->tun_key = &tun_key;
> +       }
>         /* ToS */
>         if (skb->protocol == htons(ETH_P_IP))
>                 inner_tos = ip_hdr(skb)->tos;
> @@ -1219,11 +1229,23 @@ int ovs_tnl_send(struct vport *vport, struct sk_buff *skb)
>
>         if (mutable->flags & TNL_F_TOS_INHERIT)
>                 tos = inner_tos;
> +       else if (OVS_CB(skb)->tun_key->ipv4_dst != 0)
> +               tos = OVS_CB(skb)->tun_key->ipv4_tos;
>         else
>                 tos = mutable->tos;
>
> +       if (OVS_CB(skb)->tun_key->ipv4_dst != 0)
> +               daddr = OVS_CB(skb)->tun_key->ipv4_dst;
> +       else
> +               daddr = mutable->key.daddr;
> +
> +       if (OVS_CB(skb)->tun_key->ipv4_dst != 0)
> +               saddr = OVS_CB(skb)->tun_key->ipv4_src;
> +       else
> +               saddr = mutable->key.saddr;
> +

It will be easier to read if we have one if check on
OVS_CB(skb)->tun_key->ipv4_dst which is retrieve all tunnel parameter
for flow-based tunning and else block for campat code.

>         /* Route lookup */
> -       rt = find_route(vport, mutable, tos, &cache);
> +       rt = find_route(vport, mutable, saddr, daddr, tos, &cache);
>         if (unlikely(!rt))
>                 goto error_free;
>         if (unlikely(!cache))
> @@ -1260,9 +1282,13 @@ int ovs_tnl_send(struct vport *vport, struct sk_buff *skb)
>         }
>
>         /* TTL */
> -       ttl = mutable->ttl;
> -       if (!ttl)
> -               ttl = ip4_dst_hoplimit(&rt_dst(rt));
> +       if (OVS_CB(skb)->tun_key->ipv4_dst != 0)
> +               ttl = OVS_CB(skb)->tun_key->ipv4_ttl;
> +       else {
> +               ttl = mutable->ttl;
> +               if (!ttl)
> +                       ttl = ip4_dst_hoplimit(&rt_dst(rt));
> +       }
>
>         if (mutable->flags & TNL_F_TTL_INHERIT) {

This check could be in the compat code block, so it is less confusing.

>                 if (skb->protocol == htons(ETH_P_IP))
> @@ -1442,7 +1468,8 @@ static int tnl_set_config(struct net *net, struct nlattr *options,
>                 struct net_device *dev;
>                 struct rtable *rt;
>
> -               rt = __find_route(mutable, tnl_ops->ipproto, mutable->tos);
> +               rt = __find_route(mutable, mutable->key.saddr, mutable->key.daddr,
> +                                 tnl_ops->ipproto, mutable->tos);
>                 if (IS_ERR(rt))
>                         return -EADDRNOTAVAIL;
>                 dev = rt_dst(rt).dev;
> diff --git a/datapath/tunnel.h b/datapath/tunnel.h
> index 1924017..b7858d3 100644
> --- a/datapath/tunnel.h
> +++ b/datapath/tunnel.h
> @@ -269,14 +269,14 @@ int ovs_tnl_set_addr(struct vport *vport, const unsigned char *addr);
>  const char *ovs_tnl_get_name(const struct vport *vport);
>  const unsigned char *ovs_tnl_get_addr(const struct vport *vport);
>  int ovs_tnl_send(struct vport *vport, struct sk_buff *skb);
> -void ovs_tnl_rcv(struct vport *vport, struct sk_buff *skb, u8 tos);
> +void ovs_tnl_rcv(struct vport *vport, struct sk_buff *skb);
>
>  struct vport *ovs_tnl_find_port(struct net *net, __be32 saddr, __be32 daddr,
>                                 __be64 key, int tunnel_type,
>                                 const struct tnl_mutable_config **mutable);
>  bool ovs_tnl_frag_needed(struct vport *vport,
>                          const struct tnl_mutable_config *mutable,
> -                        struct sk_buff *skb, unsigned int mtu, __be64 flow_key);
> +                        struct sk_buff *skb, unsigned int mtu);
>  void ovs_tnl_free_linked_skbs(struct sk_buff *skb);
>
>  int ovs_tnl_init(void);
> @@ -286,4 +286,15 @@ static inline struct tnl_vport *tnl_vport_priv(const struct vport *vport)
>         return vport_priv(vport);
>  }
>
> +static inline void tnl_tun_key_init(struct ovs_key_ipv4_tunnel *tun_key,
> +                                   const struct iphdr *iph, __be64 tun_id)
> +{
> +       tun_key->tun_id = tun_id;
> +       tun_key->ipv4_src = iph->saddr;
> +       tun_key->ipv4_dst = iph->daddr;
> +       tun_key->ipv4_tos = iph->tos;
> +       tun_key->ipv4_ttl = iph->ttl;
> +       tun_key->tun_flags = 0;
> +}
> +
>  #endif /* tunnel.h */
> diff --git a/datapath/vport-capwap.c b/datapath/vport-capwap.c
> index 05a099d..fbf50d9 100644
> --- a/datapath/vport-capwap.c
> +++ b/datapath/vport-capwap.c
> @@ -220,7 +220,7 @@ static struct sk_buff *capwap_update_header(const struct vport *vport,
>                 struct capwaphdr_wsi *wsi = (struct capwaphdr_wsi *)(cwh + 1);
>                 struct capwaphdr_wsi_key *opt = (struct capwaphdr_wsi_key *)(wsi + 1);
>
> -               opt->key = OVS_CB(skb)->tun_id;
> +               opt->key = OVS_CB(skb)->tun_key->tun_id;
>         }
>
>         udph->len = htons(skb->len - skb_transport_offset(skb));
> @@ -316,6 +316,7 @@ static int capwap_rcv(struct sock *sk, struct sk_buff *skb)
>         struct vport *vport;
>         const struct tnl_mutable_config *mutable;
>         struct iphdr *iph;
> +       struct ovs_key_ipv4_tunnel tun_key;
>         __be64 key = 0;
>
>         if (unlikely(!pskb_may_pull(skb, CAPWAP_MIN_HLEN + ETH_HLEN)))
> @@ -333,12 +334,11 @@ static int capwap_rcv(struct sock *sk, struct sk_buff *skb)
>                 goto error;
>         }
>
> -       if (mutable->flags & TNL_F_IN_KEY_MATCH)
> -               OVS_CB(skb)->tun_id = key;
> -       else
> -               OVS_CB(skb)->tun_id = 0;
> +       tnl_tun_key_init(&tun_key, iph,
> +                    mutable->flags & TNL_F_IN_KEY_MATCH ? key : 0);
> +       OVS_CB(skb)->tun_key = &tun_key;
>
> -       ovs_tnl_rcv(vport, skb, iph->tos);
> +       ovs_tnl_rcv(vport, skb);
>         goto out;
>
>  error:
> diff --git a/datapath/vport-gre.c b/datapath/vport-gre.c
> index ab89c5b..8076b1d 100644
> --- a/datapath/vport-gre.c
> +++ b/datapath/vport-gre.c
> @@ -103,7 +103,7 @@ static struct sk_buff *gre_update_header(const struct vport *vport,
>
>         /* Work backwards over the options so the checksum is last. */
>         if (mutable->flags & TNL_F_OUT_KEY_ACTION)
> -               *options = be64_get_low32(OVS_CB(skb)->tun_id);
> +               *options = be64_get_low32(OVS_CB(skb)->tun_key->tun_id);
>
Here we do need check if we doing flow-based tunneling. in case of
flow-based tunneling mutable is shared between all tunnels, so there
is no-way to we can do port specific flags check from mutable. about
GRE checksum, we need to check for FLOW_TNL_F_CSUM flag for doing
checksum on a packet.

>         if (mutable->out_key || mutable->flags & TNL_F_OUT_KEY_ACTION)
>                 options--;
> @@ -285,7 +285,7 @@ static void gre_err(struct sk_buff *skb, u32 info)
>  #endif
>
>         __skb_pull(skb, tunnel_hdr_len);
> -       ovs_tnl_frag_needed(vport, mutable, skb, mtu, key);
> +       ovs_tnl_frag_needed(vport, mutable, skb, mtu);
>         __skb_push(skb, tunnel_hdr_len);
>
same here, tunnel_hdr_len is stored in mutable which is going to be
shared for all tunnels for a type in flow based tunneling, so we need
to calculate it on packet send.

>  out:
> @@ -327,6 +327,7 @@ static int gre_rcv(struct sk_buff *skb)
>         const struct tnl_mutable_config *mutable;
>         int hdr_len;
>         struct iphdr *iph;
> +       struct ovs_key_ipv4_tunnel tun_key;
>         __be16 flags;
>         __be64 key;
>
> @@ -351,15 +352,15 @@ static int gre_rcv(struct sk_buff *skb)
>                 goto error;
>         }
>
> -       if (mutable->flags & TNL_F_IN_KEY_MATCH)
> -               OVS_CB(skb)->tun_id = key;
> -       else
> -               OVS_CB(skb)->tun_id = 0;
> +
> +       tnl_tun_key_init(&tun_key, iph,
> +                    mutable->flags & TNL_F_IN_KEY_MATCH ? key : 0);
> +       OVS_CB(skb)->tun_key = &tun_key;
>
>         __skb_pull(skb, hdr_len);
>         skb_postpull_rcsum(skb, skb_transport_header(skb), hdr_len + ETH_HLEN);
>
> -       ovs_tnl_rcv(vport, skb, iph->tos);
> +       ovs_tnl_rcv(vport, skb);
>         return 0;
>
>  error:
> diff --git a/datapath/vport.c b/datapath/vport.c
> index dc7adfa..e8279fb 100644
> --- a/datapath/vport.c
> +++ b/datapath/vport.c
> @@ -462,7 +462,7 @@ void ovs_vport_receive(struct vport *vport, struct sk_buff *skb)
>                 OVS_CB(skb)->flow = NULL;
>
>         if (!(vport->ops->flags & VPORT_F_TUN_ID))
> -               OVS_CB(skb)->tun_id = 0;
> +               OVS_CB(skb)->tun_key = NULL;
>
>         ovs_dp_process_received_packet(vport, skb);
>  }
> diff --git a/include/linux/openvswitch.h b/include/linux/openvswitch.h
> index f5c9cca..8334cc1 100644
> --- a/include/linux/openvswitch.h
> +++ b/include/linux/openvswitch.h
> @@ -278,7 +278,8 @@ enum ovs_key_attr {
>         OVS_KEY_ATTR_ICMPV6,    /* struct ovs_key_icmpv6 */
>         OVS_KEY_ATTR_ARP,       /* struct ovs_key_arp */
>         OVS_KEY_ATTR_ND,        /* struct ovs_key_nd */
> -       OVS_KEY_ATTR_TUN_ID = 63, /* be64 tunnel ID */
> +       OVS_KEY_ATTR_IPV4_TUNNEL = 62,  /* struct ovs_key_ipv4_tunnel */
> +       OVS_KEY_ATTR_TUN_ID = 63,  /* be64 tunnel ID */
>         __OVS_KEY_ATTR_MAX
>  };
>
> @@ -360,6 +361,21 @@ struct ovs_key_nd {
>         __u8  nd_tll[6];
>  };
>
> +/* Values for ovs_key_ipv4_tunnel->tun_flags */
> +#define FLOW_TNL_F_DONT_FRAGMENT (1 << 0)
> +#define FLOW_TNL_F_CSUM (1 << 1)
> +#define FLOW_TNL_F_KEY (1 << 2)
> +
> +struct ovs_key_ipv4_tunnel {
> +       __be64 tun_id;
> +       __u32  tun_flags;
> +       __be32 ipv4_src;
> +       __be32 ipv4_dst;
> +       __u8   ipv4_tos;
> +       __u8   ipv4_ttl;
> +        __u8   pad[2];
> +};
> +
>  /**
>   * enum ovs_flow_attr - attributes for %OVS_FLOW_* commands.
>   * @OVS_FLOW_ATTR_KEY: Nested %OVS_KEY_ATTR_* attributes specifying the flow
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index c9e3210..797cb06 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -1179,6 +1179,7 @@ execute_set_action(struct ofpbuf *packet, const struct nlattr *a)
>      case OVS_KEY_ATTR_TUN_ID:
>      case OVS_KEY_ATTR_PRIORITY:
>      case OVS_KEY_ATTR_IPV6:
> +    case OVS_KEY_ATTR_IPV4_TUNNEL:
>          /* not implemented */
>          break;
>
> diff --git a/lib/odp-util.c b/lib/odp-util.c
> index 257d7a7..9ed17ed 100644
> --- a/lib/odp-util.c
> +++ b/lib/odp-util.c
> @@ -93,6 +93,8 @@ ovs_key_attr_to_string(enum ovs_key_attr attr)
>      case OVS_KEY_ATTR_UNSPEC: return "unspec";
>      case OVS_KEY_ATTR_ENCAP: return "encap";
>      case OVS_KEY_ATTR_PRIORITY: return "priority";
> +    case OVS_KEY_ATTR_TUN_ID: return "tun_id";
> +    case OVS_KEY_ATTR_IPV4_TUNNEL: return "ipv4_tunnel";
>      case OVS_KEY_ATTR_IN_PORT: return "in_port";
>      case OVS_KEY_ATTR_ETHERNET: return "eth";
>      case OVS_KEY_ATTR_VLAN: return "vlan";
> @@ -105,7 +107,6 @@ ovs_key_attr_to_string(enum ovs_key_attr attr)
>      case OVS_KEY_ATTR_ICMPV6: return "icmpv6";
>      case OVS_KEY_ATTR_ARP: return "arp";
>      case OVS_KEY_ATTR_ND: return "nd";
> -    case OVS_KEY_ATTR_TUN_ID: return "tun_id";
>
>      case __OVS_KEY_ATTR_MAX:
>      default:
> @@ -602,6 +603,7 @@ odp_flow_key_attr_len(uint16_t type)
>      case OVS_KEY_ATTR_ENCAP: return -2;
>      case OVS_KEY_ATTR_PRIORITY: return 4;
>      case OVS_KEY_ATTR_TUN_ID: return 8;
> +    case OVS_KEY_ATTR_IPV4_TUNNEL: return sizeof(struct ovs_key_ipv4_tunnel);
>      case OVS_KEY_ATTR_IN_PORT: return 4;
>      case OVS_KEY_ATTR_ETHERNET: return sizeof(struct ovs_key_ethernet);
>      case OVS_KEY_ATTR_VLAN: return sizeof(ovs_be16);
> @@ -668,6 +670,7 @@ format_odp_key_attr(const struct nlattr *a, struct ds *ds)
>      const struct ovs_key_icmpv6 *icmpv6_key;
>      const struct ovs_key_arp *arp_key;
>      const struct ovs_key_nd *nd_key;
> +    const struct ovs_key_ipv4_tunnel *ipv4_tun_key;
>      enum ovs_key_attr attr = nl_attr_type(a);
>      int expected_len;
>
> @@ -698,6 +701,16 @@ format_odp_key_attr(const struct nlattr *a, struct ds *ds)
>          ds_put_format(ds, "(%#"PRIx64")", ntohll(nl_attr_get_be64(a)));
>          break;
>
> +    case OVS_KEY_ATTR_IPV4_TUNNEL:
> +        ipv4_tun_key = nl_attr_get(a);
> +        ds_put_format(ds, "(tun_id=0x%"PRIx64",flags=0x%"PRIx32
> +                      ",src="IP_FMT",dst="IP_FMT",tos=0x%"PRIx8",ttl=%"PRIu8")",
> +                      ntohll(ipv4_tun_key->tun_id), ipv4_tun_key->tun_flags,
> +                      IP_ARGS(&ipv4_tun_key->ipv4_src),
> +                      IP_ARGS(&ipv4_tun_key->ipv4_dst),
> +                      ipv4_tun_key->ipv4_tos, ipv4_tun_key->ipv4_ttl);
> +        break;
> +
>      case OVS_KEY_ATTR_IN_PORT:
>          ds_put_format(ds, "(%"PRIu32")", nl_attr_get_u32(a));
>          break;
> diff --git a/lib/odp-util.h b/lib/odp-util.h
> index 16f2b15..57073ba 100644
> --- a/lib/odp-util.h
> +++ b/lib/odp-util.h
> @@ -80,6 +80,7 @@ int odp_actions_from_string(const char *, const struct simap *port_names,
>   *                         ------  ---  ------  -----
>   *  OVS_KEY_ATTR_PRIORITY      4    --     4      8
>   *  OVS_KEY_ATTR_TUN_ID        8    --     4     12
> + *  OVS_KEY_ATTR_IPV4_TUNNEL  24    --     4     28
>   *  OVS_KEY_ATTR_IN_PORT       4    --     4      8
>   *  OVS_KEY_ATTR_ETHERNET     12    --     4     16
>   *  OVS_KEY_ATTR_ETHERTYPE     2     2     4      8  (outer VLAN ethertype)
> @@ -90,7 +91,7 @@ int odp_actions_from_string(const char *, const struct simap *port_names,
>   *  OVS_KEY_ATTR_ICMPV6        2     2     4      8
>   *  OVS_KEY_ATTR_ND           28    --     4     32
>   *  -------------------------------------------------
> - *  total                                       156
> + *  total                                       184
>   *
>   * We include some slack space in case the calculation isn't quite right or we
>   * add another field and forget to adjust this value.
> --
> 1.7.11.4
>
> _______________________________________________
> dev mailing list
> dev at openvswitch.org
> http://openvswitch.org/mailman/listinfo/dev



More information about the dev mailing list