[ovs-dev] [PATCH 1/2] datapath: Add support for tun_key to Open vSwitch datapath
Kyle Mestery (kmestery)
kmestery at cisco.com
Thu Oct 11 20:52:37 UTC 2012
On Oct 10, 2012, at 6:23 PM, Pravin Shelar <pshelar at nicira.com> wrote:
> Patch looks good, I have few comments inlined.
>
>
> On Tue, Oct 9, 2012 at 11:49 AM, Kyle Mestery <kmestery at cisco.com> wrote:
>> This is a first pass at providing a tun_key which can be
>> used as the basis for flow-based tunnelling. The
>> tun_key includes and replaces the tun_id in both struct
>> ovs_skb_cb and struct sw_tun_key.
>>
>> This patch allows all existing tun_id behaviour to still work. Existing
>> users of tun_id are redirected to tun_key->tun_id to retain compatibility.
>> However, when the userspace code is updated to make use of the new tun_key,
>> the old behaviour will be deprecated and removed.
>>
>> NOTE: With these changes, the tunneling code no longer assumes input and
>> output keys are symmetric. If they are not, PMTUD needs to be disabled
>> for tunneling to work.
>>
>> Signed-off-by: Kyle Mestery <kmestery at cisco.com>
>> Cc: Simon Horman <horms at verge.net.au>
>> Cc: Jesse Gross <jesse at nicira.com>
>> ---
>> V6:
>> - Fix more comments addressed from Jesse.
>> V5:
>> - Address another round of comments from Jesse.
>> V4:
>> - Address 2 comments from Jesse:
>> - When processing actions, if OVS_CB(skb)->tun_key is NULL, point it at one
>> on the stack temporarily. This goes away when we remove the ability to set
>> tun_id outside the scope of tun_key.
>> - Move tun_key to the end of struct sw_flow_key.
>> V3:
>> - Fix issues found during review by Jesse.
>> - Add a NEWS entry around tunnel code no longer assuming symmetric input and
>> output tunnel keys.
>>
>> V2:
>> - Fix blank line addition/removal found by Simon.
>> - Fix hex printing output found by Simon.
>> ---
>> NEWS | 3 ++
>> datapath/actions.c | 38 +++++++++++++----
>> datapath/datapath.c | 10 ++++-
>> datapath/datapath.h | 5 ++-
>> datapath/flow.c | 64 ++++++++++++++++++++++++----
>> datapath/flow.h | 12 ++++--
>> datapath/tunnel.c | 101 ++++++++++++++++++++++++++++----------------
>> datapath/tunnel.h | 15 ++++++-
>> datapath/vport-capwap.c | 12 +++---
>> datapath/vport-gre.c | 15 ++++---
>> datapath/vport.c | 2 +-
>> include/linux/openvswitch.h | 18 +++++++-
>> lib/dpif-netdev.c | 1 +
>> lib/odp-util.c | 15 ++++++-
>> lib/odp-util.h | 3 +-
>> 15 files changed, 235 insertions(+), 79 deletions(-)
>>
>> diff --git a/NEWS b/NEWS
>> index 29fd9f3..ae831e3 100644
>> --- a/NEWS
>> +++ b/NEWS
>> @@ -1,5 +1,8 @@
>> post-v1.8.0
>> ------------------------
>> + - The tunneling code no longer assumes input and output keys are symmetric.
>> + If they are not, PMTUD needs to be disabled for tunneling to work. Note
>> + this only applies to flow-based keys.
>> - FreeBSD is now a supported platform, thanks to code contributions from
>> Gaetano Catalli, Ed Maste, and Giuseppe Lettieri.
>> - ovs-bugtool: New --ovs option to report only OVS related information.
>> diff --git a/datapath/actions.c b/datapath/actions.c
>> index ec9b595..fa8c10d 100644
>> --- a/datapath/actions.c
>> +++ b/datapath/actions.c
>> @@ -37,7 +37,8 @@
>> #include "vport.h"
>>
>> static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
>> - const struct nlattr *attr, int len, bool keep_skb);
>> + const struct nlattr *attr, int len,
>> + struct ovs_key_ipv4_tunnel *tun_key, bool keep_skb);
>>
>> static int make_writable(struct sk_buff *skb, int write_len)
>> {
>> @@ -329,11 +330,14 @@ static int sample(struct datapath *dp, struct sk_buff *skb,
>> }
>>
>> return do_execute_actions(dp, skb, nla_data(acts_list),
>> - nla_len(acts_list), true);
>> + nla_len(acts_list),
>> + OVS_CB(skb)->tun_key,
>> + true);
>> }
>>
>> static int execute_set_action(struct sk_buff *skb,
>> - const struct nlattr *nested_attr)
>> + const struct nlattr *nested_attr,
>> + struct ovs_key_ipv4_tunnel *tun_key)
>> {
>> int err = 0;
>>
>> @@ -343,7 +347,21 @@ static int execute_set_action(struct sk_buff *skb,
>> break;
>>
>> case OVS_KEY_ATTR_TUN_ID:
>> - OVS_CB(skb)->tun_id = nla_get_be64(nested_attr);
>> + if (OVS_CB(skb)->tun_key == NULL) {
>> + /* If tun_key is NULL for this skb, assign it to
>> + * a value the caller passed in for action processing
>> + * and output. This can disappear once we drop support
>> + * for setting tun_id outside of tun_key.
>> + */
>> + memset(tun_key, 0, sizeof(struct ovs_key_ipv4_tunnel));
>> + OVS_CB(skb)->tun_key = tun_key;
>> + }
>> +
>> + OVS_CB(skb)->tun_key->tun_id = nla_get_be64(nested_attr);
>> + break;
>> +
>> + case OVS_KEY_ATTR_IPV4_TUNNEL:
>> + OVS_CB(skb)->tun_key = nla_data(nested_attr);
>> break;
>>
>> case OVS_KEY_ATTR_ETHERNET:
>> @@ -368,7 +386,8 @@ static int execute_set_action(struct sk_buff *skb,
>>
>> /* Execute a list of actions against 'skb'. */
>> static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
>> - const struct nlattr *attr, int len, bool keep_skb)
>> + const struct nlattr *attr, int len,
>> + struct ovs_key_ipv4_tunnel *tun_key, bool keep_skb)
>> {
>> /* Every output action needs a separate clone of 'skb', but the common
>> * case is just a single output action, so that doing a clone and
>> @@ -407,7 +426,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
>> break;
>>
>> case OVS_ACTION_ATTR_SET:
>> - err = execute_set_action(skb, nla_data(a));
>> + err = execute_set_action(skb, nla_data(a), tun_key);
>> break;
>>
>> case OVS_ACTION_ATTR_SAMPLE:
>> @@ -458,6 +477,9 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb)
>> struct sw_flow_actions *acts = rcu_dereference(OVS_CB(skb)->flow->sf_acts);
>> struct loop_counter *loop;
>> int error;
>> + struct ovs_key_ipv4_tunnel tun_key;
>> +
>> + memset(&tun_key, 0, sizeof(tun_key));
>>
> tun key will zeroed while executing tunnel set action, Do we still
> need to zero it here?
>
>> /* Check whether we've looped too much. */
>> loop = &__get_cpu_var(loop_counters);
>> @@ -469,9 +491,9 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb)
>> goto out_loop;
>> }
>>
>> - OVS_CB(skb)->tun_id = 0;
>> + OVS_CB(skb)->tun_key = NULL;
>> error = do_execute_actions(dp, skb, acts->actions,
>> - acts->actions_len, false);
>> + acts->actions_len, &tun_key, false);
>>
>> /* Check whether sub-actions looped too much. */
>> if (unlikely(loop->looping))
>> diff --git a/datapath/datapath.c b/datapath/datapath.c
>> index a6915fb..d8a198e 100644
>> --- a/datapath/datapath.c
>> +++ b/datapath/datapath.c
>> @@ -587,12 +587,20 @@ static int validate_set(const struct nlattr *a,
>>
>> switch (key_type) {
>> const struct ovs_key_ipv4 *ipv4_key;
>> + const struct ovs_key_ipv4_tunnel *tun_key;
>>
>> case OVS_KEY_ATTR_PRIORITY:
>> case OVS_KEY_ATTR_TUN_ID:
>> case OVS_KEY_ATTR_ETHERNET:
>> break;
>>
>> + case OVS_KEY_ATTR_IPV4_TUNNEL:
>> + tun_key = nla_data(ovs_key);
>> + if (!tun_key->ipv4_dst) {
>> + return -EINVAL;
>> + }
>> + break;
>> +
>> case OVS_KEY_ATTR_IPV4:
>> if (flow_key->eth.type != htons(ETH_P_IP))
>> return -EINVAL;
>> @@ -785,7 +793,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
>>
>> err = ovs_flow_metadata_from_nlattrs(&flow->key.phy.priority,
>> &flow->key.phy.in_port,
>> - &flow->key.phy.tun_id,
>> + &flow->key.tun.tun_key,
>> a[OVS_PACKET_ATTR_KEY]);
>> if (err)
>> goto err_flow_put;
>> diff --git a/datapath/datapath.h b/datapath/datapath.h
>> index affbf0e..c5df12d 100644
>> --- a/datapath/datapath.h
>> +++ b/datapath/datapath.h
>> @@ -96,7 +96,8 @@ struct datapath {
>> /**
>> * struct ovs_skb_cb - OVS data in skb CB
>> * @flow: The flow associated with this packet. May be %NULL if no flow.
>> - * @tun_id: ID of the tunnel that encapsulated this packet. It is 0 if the
>> + * @tun_key: Key for the tunnel that encapsulated this packet. NULL if the
>> + * packet is not being tunneled.
>> * @ip_summed: Consistently stores L4 checksumming status across different
>> * kernel versions.
>> * @csum_start: Stores the offset from which to start checksumming independent
>> @@ -107,7 +108,7 @@ struct datapath {
>> */
>> struct ovs_skb_cb {
>> struct sw_flow *flow;
>> - __be64 tun_id;
>> + struct ovs_key_ipv4_tunnel *tun_key;
>> #ifdef NEED_CSUM_NORMALIZE
>> enum csum_type ip_summed;
>> u16 csum_start;
>> diff --git a/datapath/flow.c b/datapath/flow.c
>> index d07337c..376f4be 100644
>> --- a/datapath/flow.c
>> +++ b/datapath/flow.c
>> @@ -629,7 +629,8 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
>> memset(key, 0, sizeof(*key));
>>
>> key->phy.priority = skb->priority;
>> - key->phy.tun_id = OVS_CB(skb)->tun_id;
>> + if (OVS_CB(skb)->tun_key)
>> + key->tun.tun_key = *OVS_CB(skb)->tun_key;
>> key->phy.in_port = in_port;
>>
>> skb_reset_mac_header(skb);
>> @@ -847,6 +848,7 @@ const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
>>
>> /* Not upstream. */
>> [OVS_KEY_ATTR_TUN_ID] = sizeof(__be64),
>> + [OVS_KEY_ATTR_IPV4_TUNNEL] = sizeof(struct ovs_key_ipv4_tunnel),
>> };
>>
>> static int ipv4_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_len,
>> @@ -1022,9 +1024,30 @@ int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp,
>> swkey->phy.in_port = DP_MAX_PORTS;
>> }
>>
>> - if (attrs & (1ULL << OVS_KEY_ATTR_TUN_ID)) {
>> - swkey->phy.tun_id = nla_get_be64(a[OVS_KEY_ATTR_TUN_ID]);
>> + /* OVS_KEY_ATTR_TUN_ID and OVS_KEY_ATTR_IPV4_TUNEL must both arrive
>> + * together.
>> + */
>> + if (attrs & (1ULL << OVS_KEY_ATTR_TUN_ID) &&
>> + attrs & (1ULL << OVS_KEY_ATTR_IPV4_TUNNEL)) {
>> + struct ovs_key_ipv4_tunnel *tun_key;
>> + __be64 tun_id = 0;
>> + tun_key = nla_data(a[OVS_KEY_ATTR_IPV4_TUNNEL]);
>> + tun_id = nla_get_be64(a[OVS_KEY_ATTR_TUN_ID]);
>> +
>> + if (tun_id != tun_key->tun_id)
>> + return -EINVAL;
>> +
>> + swkey->tun.tun_key = *tun_key;
>> attrs &= ~(1ULL << OVS_KEY_ATTR_TUN_ID);
>> + attrs &= ~(1ULL << OVS_KEY_ATTR_IPV4_TUNNEL);
>> + } else if (attrs & (1ULL << OVS_KEY_ATTR_TUN_ID)) {
>> + swkey->tun.tun_key.tun_id = nla_get_be64(a[OVS_KEY_ATTR_TUN_ID]);
>> + attrs &= ~(1ULL << OVS_KEY_ATTR_TUN_ID);
>> + } else if (attrs & (1ULL << OVS_KEY_ATTR_IPV4_TUNNEL)) {
>> + struct ovs_key_ipv4_tunnel *tun_key;
>> + tun_key = nla_data(a[OVS_KEY_ATTR_IPV4_TUNNEL]);
>> + swkey->tun.tun_key = *tun_key;
>> + attrs &= ~(1ULL << OVS_KEY_ATTR_IPV4_TUNNEL);
>> }
>
> While validating set action OVS_KEY_ATTR_IPV4_TUNNEL, we are checking
> for ipv4_dst, I think we need similar check here.
>
>>
>> /* Data attributes. */
>> @@ -1162,14 +1185,16 @@ int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp,
>> * get the metadata, that is, the parts of the flow key that cannot be
>> * extracted from the packet itself.
>> */
>> -int ovs_flow_metadata_from_nlattrs(u32 *priority, u16 *in_port, __be64 *tun_id,
>> +int ovs_flow_metadata_from_nlattrs(u32 *priority, u16 *in_port,
>> + struct ovs_key_ipv4_tunnel *tun_key,
>> const struct nlattr *attr)
>> {
>> const struct nlattr *nla;
>> int rem;
>> + __be64 tun_id = 0;
>>
>> *in_port = DP_MAX_PORTS;
>> - *tun_id = 0;
>> + memset(tun_key, 0, sizeof(*tun_key));
>> *priority = 0;
>>
>> nla_for_each_nested(nla, attr, rem) {
>> @@ -1185,7 +1210,20 @@ int ovs_flow_metadata_from_nlattrs(u32 *priority, u16 *in_port, __be64 *tun_id,
>> break;
>>
>> case OVS_KEY_ATTR_TUN_ID:
>> - *tun_id = nla_get_be64(nla);
>> + tun_id = nla_get_be64(nla);
>> + if ((tun_key->tun_flags & FLOW_TNL_F_KEY) &&
>> + tun_key->tun_id != tun_id)
>> + return -EINVAL;
>> + break;
>> +
>> + case OVS_KEY_ATTR_IPV4_TUNNEL:
>> + if (tun_key->tun_flags & FLOW_TNL_F_KEY)
>> + tun_id = tun_key->tun_id;
>> + memcpy(tun_key, nla_data(nla),
>> + sizeof(*tun_key));
>> + if ((tun_key->tun_flags & FLOW_TNL_F_KEY) &&
>> + tun_id != tun_key->tun_id)
>> + return -EINVAL;
>> break;
>>
>> case OVS_KEY_ATTR_IN_PORT:
>> @@ -1210,10 +1248,20 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb)
>> nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, swkey->phy.priority))
>> goto nla_put_failure;
>>
>> - if (swkey->phy.tun_id != cpu_to_be64(0) &&
>> - nla_put_be64(skb, OVS_KEY_ATTR_TUN_ID, swkey->phy.tun_id))
>> + if ((swkey->tun.tun_key.tun_flags & FLOW_TNL_F_KEY) &&
>> + nla_put_be64(skb, OVS_KEY_ATTR_TUN_ID, swkey->tun.tun_key.tun_id))
>> goto nla_put_failure;
>>
>> + if (swkey->tun.tun_key.ipv4_dst) {
>> + struct ovs_key_ipv4_tunnel *tun_key;
>> + nla = nla_reserve(skb, OVS_KEY_ATTR_IPV4_TUNNEL,
>> + sizeof(*tun_key));
>> + if (!nla)
>> + goto nla_put_failure;
>> + tun_key = nla_data(nla);
>> + *tun_key = swkey->tun.tun_key;
>> + }
>> +
>
> Why are we sending both OVS_KEY_ATTR_TUN_ID and
> OVS_KEY_ATTR_IPV4_TUNNEL attributed here?
> we could first check for ipv4_dst else for FLOW_TNL_F_KEY in flow key.
>
>> if (swkey->phy.in_port != DP_MAX_PORTS &&
>> nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, swkey->phy.in_port))
>> goto nla_put_failure;
>> diff --git a/datapath/flow.h b/datapath/flow.h
>> index 02c563a..4430b32 100644
>> --- a/datapath/flow.h
>> +++ b/datapath/flow.h
>> @@ -42,10 +42,12 @@ struct sw_flow_actions {
>>
>> struct sw_flow_key {
>> struct {
>> - __be64 tun_id; /* Encapsulating tunnel ID. */
>> u32 priority; /* Packet QoS priority. */
>> u16 in_port; /* Input switch port (or DP_MAX_PORTS). */
>> } phy;
>> + struct {
>> + struct ovs_key_ipv4_tunnel tun_key; /* Encapsulating tunnel key. */
>> + } tun;
>> struct {
>> u8 src[ETH_ALEN]; /* Ethernet source address. */
>> u8 dst[ETH_ALEN]; /* Ethernet destination address. */
>> @@ -150,6 +152,7 @@ u64 ovs_flow_used_time(unsigned long flow_jiffies);
>> * ------ --- ------ -----
>> * OVS_KEY_ATTR_PRIORITY 4 -- 4 8
>> * OVS_KEY_ATTR_TUN_ID 8 -- 4 12
>> + * OVS_KEY_ATTR_IPV4_TUNNEL 24 -- 4 28
>> * OVS_KEY_ATTR_IN_PORT 4 -- 4 8
>> * OVS_KEY_ATTR_ETHERNET 12 -- 4 16
>> * OVS_KEY_ATTR_ETHERTYPE 2 2 4 8 (outer VLAN ethertype)
>> @@ -160,14 +163,15 @@ u64 ovs_flow_used_time(unsigned long flow_jiffies);
>> * OVS_KEY_ATTR_ICMPV6 2 2 4 8
>> * OVS_KEY_ATTR_ND 28 -- 4 32
>> * -------------------------------------------------
>> - * total 156
>> + * total 184
>> */
>> -#define FLOW_BUFSIZE 156
>> +#define FLOW_BUFSIZE 184
>>
>> int ovs_flow_to_nlattrs(const struct sw_flow_key *, struct sk_buff *);
>> int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp,
>> const struct nlattr *);
>> -int ovs_flow_metadata_from_nlattrs(u32 *priority, u16 *in_port, __be64 *tun_id,
>> +int ovs_flow_metadata_from_nlattrs(u32 *priority, u16 *in_port,
>> + struct ovs_key_ipv4_tunnel *tun_key,
>> const struct nlattr *);
>>
>> #define MAX_ACTIONS_BUFSIZE (16 * 1024)
>> diff --git a/datapath/tunnel.c b/datapath/tunnel.c
>> index d651c11..739f098 100644
>> --- a/datapath/tunnel.c
>> +++ b/datapath/tunnel.c
>> @@ -367,9 +367,9 @@ struct vport *ovs_tnl_find_port(struct net *net, __be32 saddr, __be32 daddr,
>> return NULL;
> ñ> }
>>
>> -static void ecn_decapsulate(struct sk_buff *skb, u8 tos)
>> +static void ecn_decapsulate(struct sk_buff *skb)
>> {
>> - if (unlikely(INET_ECN_is_ce(tos))) {
>> + if (unlikely(INET_ECN_is_ce(OVS_CB(skb)->tun_key->ipv4_tos))) {
>> __be16 protocol = skb->protocol;
>>
>> skb_set_network_header(skb, ETH_HLEN);
>> @@ -416,7 +416,7 @@ static void ecn_decapsulate(struct sk_buff *skb, u8 tos)
>> * - skb->csum does not include the inner Ethernet header.
>> * - The layer pointers are undefined.
>> */
>> -void ovs_tnl_rcv(struct vport *vport, struct sk_buff *skb, u8 tos)
>> +void ovs_tnl_rcv(struct vport *vport, struct sk_buff *skb)
>> {
>> struct ethhdr *eh;
>>
>> @@ -433,7 +433,7 @@ void ovs_tnl_rcv(struct vport *vport, struct sk_buff *skb, u8 tos)
>> skb_clear_rxhash(skb);
>> secpath_reset(skb);
>>
>> - ecn_decapsulate(skb, tos);
>> + ecn_decapsulate(skb);
>> vlan_set_tci(skb, 0);
>>
>> if (unlikely(compute_ip_summed(skb, false))) {
>> @@ -613,7 +613,7 @@ static void ipv6_build_icmp(struct sk_buff *skb, struct sk_buff *nskb,
>>
>> bool ovs_tnl_frag_needed(struct vport *vport,
>> const struct tnl_mutable_config *mutable,
>> - struct sk_buff *skb, unsigned int mtu, __be64 flow_key)
>> + struct sk_buff *skb, unsigned int mtu)
>> {
>> unsigned int eth_hdr_len = ETH_HLEN;
>> unsigned int total_length = 0, header_length = 0, payload_length;
>> @@ -697,17 +697,6 @@ bool ovs_tnl_frag_needed(struct vport *vport,
>> ipv6_build_icmp(skb, nskb, mtu, payload_length);
>> #endif
>>
>> - /*
>> - * Assume that flow based keys are symmetric with respect to input
>> - * and output and use the key that we were going to put on the
>> - * outgoing packet for the fake received packet. If the keys are
>> - * not symmetric then PMTUD needs to be disabled since we won't have
>> - * any way of synthesizing packets.
>> - */
>> - if ((mutable->flags & (TNL_F_IN_KEY_MATCH | TNL_F_OUT_KEY_ACTION)) ==
>> - (TNL_F_IN_KEY_MATCH | TNL_F_OUT_KEY_ACTION))
>> - OVS_CB(nskb)->tun_id = flow_key;
>> -
>> if (unlikely(compute_ip_summed(nskb, false))) {
>> kfree_skb(nskb);
>> return false;
>> @@ -723,12 +712,20 @@ static bool check_mtu(struct sk_buff *skb,
>> const struct tnl_mutable_config *mutable,
>> const struct rtable *rt, __be16 *frag_offp)
>> {
>> - bool df_inherit = mutable->flags & TNL_F_DF_INHERIT;
>> + bool df_inherit;
>> bool pmtud = mutable->flags & TNL_F_PMTUD;
>> - __be16 frag_off = mutable->flags & TNL_F_DF_DEFAULT ? htons(IP_DF) : 0;
>> + __be16 frag_off;
>> int mtu = 0;
>> unsigned int packet_length = skb->len - ETH_HLEN;
>>
>> + if (OVS_CB(skb)->tun_key->ipv4_dst != 0) {
>> + df_inherit = false;
>> + frag_off = OVS_CB(skb)->tun_key->tun_flags & FLOW_TNL_F_DONT_FRAGMENT;
>> + } else {
>> + df_inherit = mutable->flags & TNL_F_DF_INHERIT;
>> + frag_off = mutable->flags & TNL_F_DF_DEFAULT ? htons(IP_DF) : 0;
>> + }
>> +
> Why not set pmtud to false incase of flow-based tunneling like you did
> it for df_inherit?
>
>> /* Allow for one level of tagging in the packet length. */
>> if (!vlan_tx_tag_present(skb) &&
>> eth_hdr(skb)->h_proto == htons(ETH_P_8021Q))
>> @@ -760,8 +757,7 @@ static bool check_mtu(struct sk_buff *skb,
>> mtu = max(mtu, IP_MIN_MTU);
>>
>> if (packet_length > mtu &&
>> - ovs_tnl_frag_needed(vport, mutable, skb, mtu,
>> - OVS_CB(skb)->tun_id))
>> + ovs_tnl_frag_needed(vport, mutable, skb, mtu))
>> return false;
>> }
>> }
>> @@ -777,8 +773,7 @@ static bool check_mtu(struct sk_buff *skb,
>> mtu = max(mtu, IPV6_MIN_MTU);
>>
>> if (packet_length > mtu &&
>> - ovs_tnl_frag_needed(vport, mutable, skb, mtu,
>> - OVS_CB(skb)->tun_id))
>> + ovs_tnl_frag_needed(vport, mutable, skb, mtu))
>> return false;
>> }
>> }
>> @@ -1000,15 +995,16 @@ unlock:
>> }
>>
>> static struct rtable *__find_route(const struct tnl_mutable_config *mutable,
>> - u8 ipproto, u8 tos)
>> + __be32 saddr, __be32 daddr, u8 ipproto,
>> + u8 tos)
>> {
>> /* Tunnel configuration keeps DSCP part of TOS bits, But Linux
>> * router expect RT_TOS bits only. */
>>
>> #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,39)
>> struct flowi fl = { .nl_u = { .ip4_u = {
>> - .daddr = mutable->key.daddr,
>> - .saddr = mutable->key.saddr,
>> + .daddr = daddr,
>> + .saddr = saddr,
>> .tos = RT_TOS(tos) } },
>> .proto = ipproto };
>> struct rtable *rt;
>> @@ -1018,8 +1014,8 @@ static struct rtable *__find_route(const struct tnl_mutable_config *mutable,
>>
>> return rt;
>> #else
>> - struct flowi4 fl = { .daddr = mutable->key.daddr,
>> - .saddr = mutable->key.saddr,
>> + struct flowi4 fl = { .daddr = daddr,
>> + .saddr = saddr,
>> .flowi4_tos = RT_TOS(tos),
>> .flowi4_proto = ipproto };
>>
>> @@ -1029,7 +1025,8 @@ static struct rtable *__find_route(const struct tnl_mutable_config *mutable,
>>
>> static struct rtable *find_route(struct vport *vport,
>> const struct tnl_mutable_config *mutable,
>> - u8 tos, struct tnl_cache **cache)
>> + __be32 saddr, __be32 daddr, u8 tos,
>> + struct tnl_cache **cache)
>> {
>> struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
>> struct tnl_cache *cur_cache = rcu_dereference(tnl_vport->cache);
>> @@ -1037,18 +1034,21 @@ static struct rtable *find_route(struct vport *vport,
>> *cache = NULL;
>> tos = RT_TOS(tos);
>>
>> - if (likely(tos == RT_TOS(mutable->tos) &&
>> - check_cache_valid(cur_cache, mutable))) {
>> + if (daddr == mutable->key.daddr && saddr == mutable->key.saddr &&
>> + tos == RT_TOS(mutable->tos) &&
>> + check_cache_valid(cur_cache, mutable)) {
>> *cache = cur_cache;
>> return cur_cache->rt;
>> } else {
>> struct rtable *rt;
>>
>> - rt = __find_route(mutable, tnl_vport->tnl_ops->ipproto, tos);
>> + rt = __find_route(mutable, saddr, daddr,
>> + tnl_vport->tnl_ops->ipproto, tos);
>> if (IS_ERR(rt))
>> return NULL;
>>
>> - if (likely(tos == RT_TOS(mutable->tos)))
>> + if (daddr == mutable->key.daddr && saddr == mutable->key.saddr &&
>> + tos == RT_TOS(mutable->tos))
>> *cache = build_cache(vport, mutable, rt);
>>
> I am not sure why checks are added on saddr and daddr here?
> header caching is not set in case of flow based tunneling to
> build_cache call is no-op anyways.
>
>> return rt;
>> @@ -1178,8 +1178,11 @@ int ovs_tnl_send(struct vport *vport, struct sk_buff *skb)
>> struct rtable *rt;
>> struct dst_entry *unattached_dst = NULL;
>> struct tnl_cache *cache;
>> + struct ovs_key_ipv4_tunnel tun_key;
>> int sent_len = 0;
>> __be16 frag_off = 0;
>> + __be32 daddr;
>> + __be32 saddr;
>> u8 ttl;
>> u8 inner_tos;
>> u8 tos;
>> @@ -1207,6 +1210,13 @@ int ovs_tnl_send(struct vport *vport, struct sk_buff *skb)
>> }
>> #endif
>>
>> + /* If OVS_CB(skb)->tun_key is NULL, point it at the local tun_key here,
>> + * and zero it out.
>> + */
>> + if (OVS_CB(skb)->tun_key == NULL) {
>> + memset(&tun_key, 0, sizeof(tun_key));
>> + OVS_CB(skb)->tun_key = &tun_key;
>> + }
>> /* ToS */
>> if (skb->protocol == htons(ETH_P_IP))
>> inner_tos = ip_hdr(skb)->tos;
>> @@ -1219,11 +1229,23 @@ int ovs_tnl_send(struct vport *vport, struct sk_buff *skb)
>>
>> if (mutable->flags & TNL_F_TOS_INHERIT)
>> tos = inner_tos;
>> + else if (OVS_CB(skb)->tun_key->ipv4_dst != 0)
>> + tos = OVS_CB(skb)->tun_key->ipv4_tos;
>> else
>> tos = mutable->tos;
>>
>> + if (OVS_CB(skb)->tun_key->ipv4_dst != 0)
>> + daddr = OVS_CB(skb)->tun_key->ipv4_dst;
>> + else
>> + daddr = mutable->key.daddr;
>> +
>> + if (OVS_CB(skb)->tun_key->ipv4_dst != 0)
>> + saddr = OVS_CB(skb)->tun_key->ipv4_src;
>> + else
>> + saddr = mutable->key.saddr;
>> +
>
> It will be easier to read if we have one if check on
> OVS_CB(skb)->tun_key->ipv4_dst which is retrieve all tunnel parameter
> for flow-based tunning and else block for campat code.
>
>> /* Route lookup */
>> - rt = find_route(vport, mutable, tos, &cache);
>> + rt = find_route(vport, mutable, saddr, daddr, tos, &cache);
>> if (unlikely(!rt))
>> goto error_free;
>> if (unlikely(!cache))
>> @@ -1260,9 +1282,13 @@ int ovs_tnl_send(struct vport *vport, struct sk_buff *skb)
>> }
>>
>> /* TTL */
>> - ttl = mutable->ttl;
>> - if (!ttl)
>> - ttl = ip4_dst_hoplimit(&rt_dst(rt));
>> + if (OVS_CB(skb)->tun_key->ipv4_dst != 0)
>> + ttl = OVS_CB(skb)->tun_key->ipv4_ttl;
>> + else {
>> + ttl = mutable->ttl;
>> + if (!ttl)
>> + ttl = ip4_dst_hoplimit(&rt_dst(rt));
>> + }
>>
>> if (mutable->flags & TNL_F_TTL_INHERIT) {
>
> This check could be in the compat code block, so it is less confusing.
>
>> if (skb->protocol == htons(ETH_P_IP))
>> @@ -1442,7 +1468,8 @@ static int tnl_set_config(struct net *net, struct nlattr *options,
>> struct net_device *dev;
>> struct rtable *rt;
>>
>> - rt = __find_route(mutable, tnl_ops->ipproto, mutable->tos);
>> + rt = __find_route(mutable, mutable->key.saddr, mutable->key.daddr,
>> + tnl_ops->ipproto, mutable->tos);
>> if (IS_ERR(rt))
>> return -EADDRNOTAVAIL;
>> dev = rt_dst(rt).dev;
>> diff --git a/datapath/tunnel.h b/datapath/tunnel.h
>> index 1924017..b7858d3 100644
>> --- a/datapath/tunnel.h
>> +++ b/datapath/tunnel.h
>> @@ -269,14 +269,14 @@ int ovs_tnl_set_addr(struct vport *vport, const unsigned char *addr);
>> const char *ovs_tnl_get_name(const struct vport *vport);
>> const unsigned char *ovs_tnl_get_addr(const struct vport *vport);
>> int ovs_tnl_send(struct vport *vport, struct sk_buff *skb);
>> -void ovs_tnl_rcv(struct vport *vport, struct sk_buff *skb, u8 tos);
>> +void ovs_tnl_rcv(struct vport *vport, struct sk_buff *skb);
>>
>> struct vport *ovs_tnl_find_port(struct net *net, __be32 saddr, __be32 daddr,
>> __be64 key, int tunnel_type,
>> const struct tnl_mutable_config **mutable);
>> bool ovs_tnl_frag_needed(struct vport *vport,
>> const struct tnl_mutable_config *mutable,
>> - struct sk_buff *skb, unsigned int mtu, __be64 flow_key);
>> + struct sk_buff *skb, unsigned int mtu);
>> void ovs_tnl_free_linked_skbs(struct sk_buff *skb);
>>
>> int ovs_tnl_init(void);
>> @@ -286,4 +286,15 @@ static inline struct tnl_vport *tnl_vport_priv(const struct vport *vport)
>> return vport_priv(vport);
>> }
>>
>> +static inline void tnl_tun_key_init(struct ovs_key_ipv4_tunnel *tun_key,
>> + const struct iphdr *iph, __be64 tun_id)
>> +{
>> + tun_key->tun_id = tun_id;
>> + tun_key->ipv4_src = iph->saddr;
>> + tun_key->ipv4_dst = iph->daddr;
>> + tun_key->ipv4_tos = iph->tos;
>> + tun_key->ipv4_ttl = iph->ttl;
>> + tun_key->tun_flags = 0;
>> +}
>> +
>> #endif /* tunnel.h */
>> diff --git a/datapath/vport-capwap.c b/datapath/vport-capwap.c
>> index 05a099d..fbf50d9 100644
>> --- a/datapath/vport-capwap.c
>> +++ b/datapath/vport-capwap.c
>> @@ -220,7 +220,7 @@ static struct sk_buff *capwap_update_header(const struct vport *vport,
>> struct capwaphdr_wsi *wsi = (struct capwaphdr_wsi *)(cwh + 1);
>> struct capwaphdr_wsi_key *opt = (struct capwaphdr_wsi_key *)(wsi + 1);
>>
>> - opt->key = OVS_CB(skb)->tun_id;
>> + opt->key = OVS_CB(skb)->tun_key->tun_id;
>> }
>>
>> udph->len = htons(skb->len - skb_transport_offset(skb));
>> @@ -316,6 +316,7 @@ static int capwap_rcv(struct sock *sk, struct sk_buff *skb)
>> struct vport *vport;
>> const struct tnl_mutable_config *mutable;
>> struct iphdr *iph;
>> + struct ovs_key_ipv4_tunnel tun_key;
>> __be64 key = 0;
>>
>> if (unlikely(!pskb_may_pull(skb, CAPWAP_MIN_HLEN + ETH_HLEN)))
>> @@ -333,12 +334,11 @@ static int capwap_rcv(struct sock *sk, struct sk_buff *skb)
>> goto error;
>> }
>>
>> - if (mutable->flags & TNL_F_IN_KEY_MATCH)
>> - OVS_CB(skb)->tun_id = key;
>> - else
>> - OVS_CB(skb)->tun_id = 0;
>> + tnl_tun_key_init(&tun_key, iph,
>> + mutable->flags & TNL_F_IN_KEY_MATCH ? key : 0);
>> + OVS_CB(skb)->tun_key = &tun_key;
>>
>> - ovs_tnl_rcv(vport, skb, iph->tos);
>> + ovs_tnl_rcv(vport, skb);
>> goto out;
>>
>> error:
>> diff --git a/datapath/vport-gre.c b/datapath/vport-gre.c
>> index ab89c5b..8076b1d 100644
>> --- a/datapath/vport-gre.c
>> +++ b/datapath/vport-gre.c
>> @@ -103,7 +103,7 @@ static struct sk_buff *gre_update_header(const struct vport *vport,
>>
>> /* Work backwards over the options so the checksum is last. */
>> if (mutable->flags & TNL_F_OUT_KEY_ACTION)
>> - *options = be64_get_low32(OVS_CB(skb)->tun_id);
>> + *options = be64_get_low32(OVS_CB(skb)->tun_key->tun_id);
>>
> Here we do need check if we doing flow-based tunneling. in case of
> flow-based tunneling mutable is shared between all tunnels, so there
> is no-way to we can do port specific flags check from mutable. about
> GRE checksum, we need to check for FLOW_TNL_F_CSUM flag for doing
> checksum on a packet.
>
>> if (mutable->out_key || mutable->flags & TNL_F_OUT_KEY_ACTION)
>> options--;
>> @@ -285,7 +285,7 @@ static void gre_err(struct sk_buff *skb, u32 info)
>> #endif
>>
>> __skb_pull(skb, tunnel_hdr_len);
>> - ovs_tnl_frag_needed(vport, mutable, skb, mtu, key);
>> + ovs_tnl_frag_needed(vport, mutable, skb, mtu);
>> __skb_push(skb, tunnel_hdr_len);
>>
> same here, tunnel_hdr_len is stored in mutable which is going to be
> shared for all tunnels for a type in flow based tunneling, so we need
> to calculate it on packet send.
>
I looked, and in the context of this function, tunnel_hdr_len is actually calculated
by the static function parse_header(), which actually is looking at the packet to
find it. Now, in the broader context, mutable does have a tunnel header as a part
of it, and if that's the tunnel header length you are concerned about, I agree we
need to address that. Can you confirm?
Thanks,
Kyle
>> out:
>> @@ -327,6 +327,7 @@ static int gre_rcv(struct sk_buff *skb)
>> const struct tnl_mutable_config *mutable;
>> int hdr_len;
>> struct iphdr *iph;
>> + struct ovs_key_ipv4_tunnel tun_key;
>> __be16 flags;
>> __be64 key;
>>
>> @@ -351,15 +352,15 @@ static int gre_rcv(struct sk_buff *skb)
>> goto error;
>> }
>>
>> - if (mutable->flags & TNL_F_IN_KEY_MATCH)
>> - OVS_CB(skb)->tun_id = key;
>> - else
>> - OVS_CB(skb)->tun_id = 0;
>> +
>> + tnl_tun_key_init(&tun_key, iph,
>> + mutable->flags & TNL_F_IN_KEY_MATCH ? key : 0);
>> + OVS_CB(skb)->tun_key = &tun_key;
>>
>> __skb_pull(skb, hdr_len);
>> skb_postpull_rcsum(skb, skb_transport_header(skb), hdr_len + ETH_HLEN);
>>
>> - ovs_tnl_rcv(vport, skb, iph->tos);
>> + ovs_tnl_rcv(vport, skb);
>> return 0;
>>
>> error:
>> diff --git a/datapath/vport.c b/datapath/vport.c
>> index dc7adfa..e8279fb 100644
>> --- a/datapath/vport.c
>> +++ b/datapath/vport.c
>> @@ -462,7 +462,7 @@ void ovs_vport_receive(struct vport *vport, struct sk_buff *skb)
>> OVS_CB(skb)->flow = NULL;
>>
>> if (!(vport->ops->flags & VPORT_F_TUN_ID))
>> - OVS_CB(skb)->tun_id = 0;
>> + OVS_CB(skb)->tun_key = NULL;
>>
>> ovs_dp_process_received_packet(vport, skb);
>> }
>> diff --git a/include/linux/openvswitch.h b/include/linux/openvswitch.h
>> index f5c9cca..8334cc1 100644
>> --- a/include/linux/openvswitch.h
>> +++ b/include/linux/openvswitch.h
>> @@ -278,7 +278,8 @@ enum ovs_key_attr {
>> OVS_KEY_ATTR_ICMPV6, /* struct ovs_key_icmpv6 */
>> OVS_KEY_ATTR_ARP, /* struct ovs_key_arp */
>> OVS_KEY_ATTR_ND, /* struct ovs_key_nd */
>> - OVS_KEY_ATTR_TUN_ID = 63, /* be64 tunnel ID */
>> + OVS_KEY_ATTR_IPV4_TUNNEL = 62, /* struct ovs_key_ipv4_tunnel */
>> + OVS_KEY_ATTR_TUN_ID = 63, /* be64 tunnel ID */
>> __OVS_KEY_ATTR_MAX
>> };
>>
>> @@ -360,6 +361,21 @@ struct ovs_key_nd {
>> __u8 nd_tll[6];
>> };
>>
>> +/* Values for ovs_key_ipv4_tunnel->tun_flags */
>> +#define FLOW_TNL_F_DONT_FRAGMENT (1 << 0)
>> +#define FLOW_TNL_F_CSUM (1 << 1)
>> +#define FLOW_TNL_F_KEY (1 << 2)
>> +
>> +struct ovs_key_ipv4_tunnel {
>> + __be64 tun_id;
>> + __u32 tun_flags;
>> + __be32 ipv4_src;
>> + __be32 ipv4_dst;
>> + __u8 ipv4_tos;
>> + __u8 ipv4_ttl;
>> + __u8 pad[2];
>> +};
>> +
>> /**
>> * enum ovs_flow_attr - attributes for %OVS_FLOW_* commands.
>> * @OVS_FLOW_ATTR_KEY: Nested %OVS_KEY_ATTR_* attributes specifying the flow
>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
>> index c9e3210..797cb06 100644
>> --- a/lib/dpif-netdev.c
>> +++ b/lib/dpif-netdev.c
>> @@ -1179,6 +1179,7 @@ execute_set_action(struct ofpbuf *packet, const struct nlattr *a)
>> case OVS_KEY_ATTR_TUN_ID:
>> case OVS_KEY_ATTR_PRIORITY:
>> case OVS_KEY_ATTR_IPV6:
>> + case OVS_KEY_ATTR_IPV4_TUNNEL:
>> /* not implemented */
>> break;
>>
>> diff --git a/lib/odp-util.c b/lib/odp-util.c
>> index 257d7a7..9ed17ed 100644
>> --- a/lib/odp-util.c
>> +++ b/lib/odp-util.c
>> @@ -93,6 +93,8 @@ ovs_key_attr_to_string(enum ovs_key_attr attr)
>> case OVS_KEY_ATTR_UNSPEC: return "unspec";
>> case OVS_KEY_ATTR_ENCAP: return "encap";
>> case OVS_KEY_ATTR_PRIORITY: return "priority";
>> + case OVS_KEY_ATTR_TUN_ID: return "tun_id";
>> + case OVS_KEY_ATTR_IPV4_TUNNEL: return "ipv4_tunnel";
>> case OVS_KEY_ATTR_IN_PORT: return "in_port";
>> case OVS_KEY_ATTR_ETHERNET: return "eth";
>> case OVS_KEY_ATTR_VLAN: return "vlan";
>> @@ -105,7 +107,6 @@ ovs_key_attr_to_string(enum ovs_key_attr attr)
>> case OVS_KEY_ATTR_ICMPV6: return "icmpv6";
>> case OVS_KEY_ATTR_ARP: return "arp";
>> case OVS_KEY_ATTR_ND: return "nd";
>> - case OVS_KEY_ATTR_TUN_ID: return "tun_id";
>>
>> case __OVS_KEY_ATTR_MAX:
>> default:
>> @@ -602,6 +603,7 @@ odp_flow_key_attr_len(uint16_t type)
>> case OVS_KEY_ATTR_ENCAP: return -2;
>> case OVS_KEY_ATTR_PRIORITY: return 4;
>> case OVS_KEY_ATTR_TUN_ID: return 8;
>> + case OVS_KEY_ATTR_IPV4_TUNNEL: return sizeof(struct ovs_key_ipv4_tunnel);
>> case OVS_KEY_ATTR_IN_PORT: return 4;
>> case OVS_KEY_ATTR_ETHERNET: return sizeof(struct ovs_key_ethernet);
>> case OVS_KEY_ATTR_VLAN: return sizeof(ovs_be16);
>> @@ -668,6 +670,7 @@ format_odp_key_attr(const struct nlattr *a, struct ds *ds)
>> const struct ovs_key_icmpv6 *icmpv6_key;
>> const struct ovs_key_arp *arp_key;
>> const struct ovs_key_nd *nd_key;
>> + const struct ovs_key_ipv4_tunnel *ipv4_tun_key;
>> enum ovs_key_attr attr = nl_attr_type(a);
>> int expected_len;
>>
>> @@ -698,6 +701,16 @@ format_odp_key_attr(const struct nlattr *a, struct ds *ds)
>> ds_put_format(ds, "(%#"PRIx64")", ntohll(nl_attr_get_be64(a)));
>> break;
>>
>> + case OVS_KEY_ATTR_IPV4_TUNNEL:
>> + ipv4_tun_key = nl_attr_get(a);
>> + ds_put_format(ds, "(tun_id=0x%"PRIx64",flags=0x%"PRIx32
>> + ",src="IP_FMT",dst="IP_FMT",tos=0x%"PRIx8",ttl=%"PRIu8")",
>> + ntohll(ipv4_tun_key->tun_id), ipv4_tun_key->tun_flags,
>> + IP_ARGS(&ipv4_tun_key->ipv4_src),
>> + IP_ARGS(&ipv4_tun_key->ipv4_dst),
>> + ipv4_tun_key->ipv4_tos, ipv4_tun_key->ipv4_ttl);
>> + break;
>> +
>> case OVS_KEY_ATTR_IN_PORT:
>> ds_put_format(ds, "(%"PRIu32")", nl_attr_get_u32(a));
>> break;
>> diff --git a/lib/odp-util.h b/lib/odp-util.h
>> index 16f2b15..57073ba 100644
>> --- a/lib/odp-util.h
>> +++ b/lib/odp-util.h
>> @@ -80,6 +80,7 @@ int odp_actions_from_string(const char *, const struct simap *port_names,
>> * ------ --- ------ -----
>> * OVS_KEY_ATTR_PRIORITY 4 -- 4 8
>> * OVS_KEY_ATTR_TUN_ID 8 -- 4 12
>> + * OVS_KEY_ATTR_IPV4_TUNNEL 24 -- 4 28
>> * OVS_KEY_ATTR_IN_PORT 4 -- 4 8
>> * OVS_KEY_ATTR_ETHERNET 12 -- 4 16
>> * OVS_KEY_ATTR_ETHERTYPE 2 2 4 8 (outer VLAN ethertype)
>> @@ -90,7 +91,7 @@ int odp_actions_from_string(const char *, const struct simap *port_names,
>> * OVS_KEY_ATTR_ICMPV6 2 2 4 8
>> * OVS_KEY_ATTR_ND 28 -- 4 32
>> * -------------------------------------------------
>> - * total 156
>> + * total 184
>> *
>> * We include some slack space in case the calculation isn't quite right or we
>> * add another field and forget to adjust this value.
>> --
>> 1.7.11.4
>>
>> _______________________________________________
>> dev mailing list
>> dev at openvswitch.org
>> http://openvswitch.org/mailman/listinfo/dev
More information about the dev
mailing list