[ovs-dev] [PATCH] datapath: More flexible kernel/userspace tunneling attribute.

Pravin Shelar pshelar at nicira.com
Wed Jan 16 21:54:57 UTC 2013


Thanks for comments.
I posted patch according to comments except refactoring
netlink-parsing. I will post separate patch for that.

--Pravin.

On Tue, Jan 15, 2013 at 10:38 AM, Jesse Gross <jesse at nicira.com> wrote:
> On Fri, Jan 11, 2013 at 5:23 PM, Pravin B Shelar <pshelar at nicira.com> wrote:
>> diff --git a/datapath/datapath.c b/datapath/datapath.c
>> index 30e26a7..4ed00cf 100644
>> --- a/datapath/datapath.c
>> +++ b/datapath/datapath.c
>>  static int validate_sample(const struct nlattr *attr,
>> -                               const struct sw_flow_key *key, int depth)
>> +                          const struct sw_flow_key *key, int depth,
>> +                          struct sw_flow_actions *sfa)
>>  {
>>         const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1];
>>         const struct nlattr *probability, *actions;
>> @@ -451,7 +453,7 @@ static int validate_sample(const struct nlattr *attr,
>>         actions = attrs[OVS_SAMPLE_ATTR_ACTIONS];
>>         if (!actions || (nla_len(actions) && nla_len(actions) < NLA_HDRLEN))
>>                 return -EINVAL;
>> -       return validate_actions(actions, key, depth + 1);
>> +       return validate_and_copy_actions(actions, key, depth + 1, sfa);
>
> I think this will result in the inner actions being output before the
> sample attribute itself.
>
>> +static int ____nla_put(struct sw_flow_actions *sfa, int attrtype, void *data, int len)
>
> I would probably give this a name other than nla_put since it makes it
> sound like it is part of the Netlink library itself.
>
>> +{
>> +       struct nlattr *a = (struct nlattr *) ((unsigned char *)sfa->actions + sfa->used);
>> +
>> +       if (NLA_ALIGN(nla_attr_size(len)) > (sfa->actions_len - sfa->used));
>> +               return -EMSGSIZE;
>
> What if we made actions_len the current length of the attributes that
> we've put and then used ksize() to check for overflow?  That way we
> wouldn't need sfa->used and actions_len will always reflect the
> correct data.
>
>> +static int validate_and_copy_set_tun(const struct nlattr *attr,
>> +                                    struct sw_flow_actions *sfa)
>> +{
>> +       struct ovs_key_ipv4_tunnel tun_key;
>> +       int err;
>> +
>> +       err = ipv4_tun_from_nlattr(attr, &tun_key);
>> +       if (err)
>> +               return err;
>> +
>> +       err = ____nla_put(sfa, OVS_KEY_ATTR_IPV4_TUNNEL, &tun_key, sizeof(tun_key));
>> +       if (err)
>> +               return err;
>
> I think this will result in only having the OVS_KEY_ATTR_IPV4_TUNNEL
> and not the wrapping set attribute.
>
>> @@ -600,12 +653,16 @@ static int validate_actions(const struct nlattr *attr,
>>                 };
>>                 const struct ovs_action_push_vlan *vlan;
>>                 int type = nla_type(a);
>> +               bool set_tun;
>> +
>>
>
> Extra blank line.
>
>> @@ -634,13 +691,13 @@ static int validate_actions(const struct nlattr *attr,
>>                         break;
>>
>>                 case OVS_ACTION_ATTR_SET:
>> -                       err = validate_set(a, key);
>> +                       err = validate_set(a, key, sfa, &set_tun);
>>                         if (err)
>>                                 return err;
>>                         break;
>>
>>                 case OVS_ACTION_ATTR_SAMPLE:
>> -                       err = validate_sample(a, key, depth);
>> +                       err = validate_sample(a, key, depth, sfa);
>
> I think if we have a sample attribute with a nested tunnel attribute
> then we will get both the original translated tunnel key and the
> original full sample attribute.
>
> Don't we need to translate this back the other way as well?  If we
> there is a query for a flow we should return the original value.
>
>> @@ -951,9 +1013,16 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
>>
>>         /* Validate actions. */
>>         if (a[OVS_FLOW_ATTR_ACTIONS]) {
>> -               error = validate_actions(a[OVS_FLOW_ATTR_ACTIONS], &key,  0);
>> -               if (error)
>> +               acts = ovs_flow_actions_alloc(a[OVS_FLOW_ATTR_ACTIONS]);
>> +               error = PTR_ERR(acts);
>> +               if (IS_ERR(acts))
>> +                       goto error;
>> +
>> +               error = validate_and_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], &key,  0, acts);
>> +               if (error) {
>> +                       ovs_flow_deferred_free_acts(acts);
>
> We could just kfree() the actions here, no need to use RCU.
>
> Doesn't this leak the actions if we later encounter an error?
>
>> @@ -1026,21 +1087,14 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
>>                 /* Update actions. */
>>                 old_acts = rcu_dereference_protected(flow->sf_acts,
>>                                                      lockdep_genl_is_held());
>> -               acts_attrs = a[OVS_FLOW_ATTR_ACTIONS];
>> -               if (acts_attrs &&
>> -                  (old_acts->actions_len != nla_len(acts_attrs) ||
>> -                  memcmp(old_acts->actions, nla_data(acts_attrs),
>> -                         old_acts->actions_len))) {
>> -                       struct sw_flow_actions *new_acts;
>> -
>> -                       new_acts = ovs_flow_actions_alloc(acts_attrs);
>> -                       error = PTR_ERR(new_acts);
>> -                       if (IS_ERR(new_acts))
>> -                               goto error;
>> -
>> -                       rcu_assign_pointer(flow->sf_acts, new_acts);
>> +               if (acts &&
>> +                  (old_acts->actions_len != acts->actions_len ||
>> +                   memcmp(old_acts->actions, acts->actions,
>> +                          old_acts->actions_len))) {
>> +                       rcu_assign_pointer(flow->sf_acts, acts);
>>                         ovs_flow_deferred_free_acts(old_acts);
>> -               }
>> +               } else
>> +                       ovs_flow_deferred_free_acts(acts);
>
> I'm not sure that there's much point in this check any more.  Before
> it used to save an allocation but now it's just a pointer assignment.
>
>> diff --git a/datapath/flow.c b/datapath/flow.c
>> index 63eef77..ce563f3 100644
>> --- a/datapath/flow.c
>> +++ b/datapath/flow.c
>> @@ -213,7 +213,7 @@ struct sw_flow_actions *ovs_flow_actions_alloc(const struct nlattr *actions)
>>                 return ERR_PTR(-ENOMEM);
>>
>>         sfa->actions_len = actions_len;
>> -       memcpy(sfa->actions, nla_data(actions), actions_len);
>> +       sfa->used = 0;
>>         return sfa;
>>  }
>
> We should allow for the possibility that the size of fixed structure
> is larger than the Netlink attributes.
>
>> +int ipv4_tun_from_nlattr(const struct nlattr *attr,
>> +                        struct ovs_key_ipv4_tunnel *tun_key)
>> +{
>> +       struct nlattr *a;
>> +       int rem;
>> +
>> +       memset(tun_key, 0, sizeof(*tun_key));
>> +
>> +       nla_for_each_nested(a, attr, rem) {
>> +               int type = nla_type(a);
>> +
>> +               switch (type) {
>> +               case OVS_KEY_ATTR_TUN_ID:
>> +                       memcpy(&tun_key->tun_id, nla_data(a), sizeof(__be64));
>
> This should check the sizes of the attributes before copying,
> otherwise we could run off the end.
>
> It seems like we have a bunch of this type of parsing, which is
> basically the same as nla_parse_attribute() but it complains if there
> are unknown attributes.  Can we somehow factor it out?
>
>> +               break;
>> +               case OVS_TUNNEL_IPV4_SRC:
>> +                       memcpy(&tun_key->ipv4_src, nla_data(a), sizeof(__be32));
>> +               break;
>> +               case OVS_TUNNEL_IPV4_DST:
>> +                       memcpy(&tun_key->ipv4_dst, nla_data(a), sizeof(__be32));
>> +               break;
>> +               case OVS_TUNNEL_TOS:
>> +                       tun_key->ipv4_tos = nla_get_u8(a);
>> +               break;
>> +               case OVS_TUNNEL_TTL:
>> +                       tun_key->ipv4_ttl = nla_get_u8(a);
>
> We should enforce that TTL is set since zero isn't a great default here.
>
>> diff --git a/datapath/flow.h b/datapath/flow.h
>> index 3f3624f..ae15024 100644
>> --- a/datapath/flow.h
>> +++ b/datapath/flow.h
>> @@ -37,9 +37,20 @@ struct sk_buff;
>>  struct sw_flow_actions {
>>         struct rcu_head rcu;
>>         u32 actions_len;
>> +       u32 used;
>>         struct nlattr actions[];
>>  };
>>
>> +struct ovs_key_ipv4_tunnel {
>> +       __be64 tun_id;
>> +       __u32  tun_flags;
>> +       __be32 ipv4_src;
>> +       __be32 ipv4_dst;
>> +       __u8   ipv4_tos;
>> +       __u8   ipv4_ttl;
>> +       __u8   pad[2];
>
> I'm not sure that we need to explicitly include the pad at this point
> any more.  Since it's no longer an interface, it should be fine to
> allow the compiler to choose the padding as appropriate.
>
>> diff --git a/datapath/tunnel.h b/datapath/tunnel.h
>> index 7705475..b7de7a9 100644
>> --- a/datapath/tunnel.h
>> +++ b/datapath/tunnel.h
>> @@ -201,7 +201,6 @@ static inline void tnl_tun_key_init(struct ovs_key_ipv4_tunnel *tun_key,
>>         tun_key->ipv4_tos = iph->tos;
>>         tun_key->ipv4_ttl = iph->ttl;
>>         tun_key->tun_flags = tun_flags;
>> -       memset(tun_key->pad, 0, sizeof(tun_key->pad));
>
> If we do keep the padding, then I think we need to continue
> initializing it here since otherwise we'll look up garbage data in the
> flow table.
>
>> diff --git a/include/linux/openvswitch.h b/include/linux/openvswitch.h
>> index 5e32965..96f8bcc 100644
>> --- a/include/linux/openvswitch.h
>> +++ b/include/linux/openvswitch.h
>> @@ -265,6 +265,21 @@ struct ovs_flow_stats {
>>         __u64 n_bytes;           /* Number of matched bytes. */
>>  };
>>
>> +
>
> Extra blank line.
>
>> +/* Values for OVS_TUNNEL_FLAGS */
>> +#define OVS_TNL_F_DONT_FRAGMENT (1 << 0)
>> +#define OVS_TNL_F_CSUM (1 << 1)
>> +#define OVS_TNL_F_KEY (1 << 2)
>
> I think at this point it's no longer necessary to define these
> specially, we can just make them Netlink flag attributes.  We think we
> can actually drop OVS_TNL_F_KEY now and use the presence or absence of
> the corresponding attribute instead.
>
>> +enum ovs_tunnel {
>
> I would name this ovs_tunnel_attr for consistency.
>
>> +       OVS_TUNNEL_IPV4_SRC,    /* Tunnel src ip address. */
>> +       OVS_TUNNEL_IPV4_DST,    /* Tunnel dst ip address. */
>> +       OVS_TUNNEL_TOS,         /* Tunnel ip TOS. */
>> +       OVS_TUNNEL_TTL,         /* Tunnel ip TTL. */
>> +       OVS_TUNNEL_FLAGS,       /* OVS_TNL_F_* flags. */
>
> Can you include the types in the comments and also use more standard
> capitalization (IP, ToS)?
>
>> +       __OVS_TUNNEL_MAX
>> +};
>
> Can you also define OVS_TUNNEL_MAX for completeness?
>
> I would also put this below the definition of ovs_key_attr so that it
> is with the other related attributes.
>
>>  enum ovs_key_attr {
>>         OVS_KEY_ATTR_UNSPEC,
>>         OVS_KEY_ATTR_ENCAP,     /* Nested set of encapsulated attributes. */
>> @@ -282,8 +297,12 @@ enum ovs_key_attr {
>>         OVS_KEY_ATTR_ARP,       /* struct ovs_key_arp */
>>         OVS_KEY_ATTR_ND,        /* struct ovs_key_nd */
>>         OVS_KEY_ATTR_SKB_MARK,  /* u32 skb mark */
>> -       OVS_KEY_ATTR_IPV4_TUNNEL,  /* struct ovs_key_ipv4_tunnel */
>> +       OVS_KEY_ATTR_TUNNEL,    /* Nested set of ovs_tunnel attributes */
>>         OVS_KEY_ATTR_TUN_ID = 63,  /* be64 tunnel ID */
>> +
>> +#ifdef __KERNEL__
>> +       OVS_KEY_ATTR_IPV4_TUNNEL,  /* struct ovs_key_ipv4_tunnel */
>> +#endif
>
> I think it would be better to define this before OVS_KEY_ATTR_TUN_ID
> to prevent any possibility of overflowing our 64-bit data types.
>
>> diff --git a/lib/flow.h b/lib/flow.h
>> index 8e79e62..323b248 100644
>> --- a/lib/flow.h
>> +++ b/lib/flow.h
>> @@ -63,9 +63,10 @@ struct flow_tnl {
>>      ovs_be64 tun_id;
>>      ovs_be32 ip_src;
>>      ovs_be32 ip_dst;
>> -    uint16_t flags;
>> +    uint32_t flags;
>>      uint8_t ip_tos;
>>      uint8_t ip_ttl;
>> +    uint8_t zeros[2];
>>  };
>>
>>  /*
>> @@ -103,7 +104,6 @@ struct flow {
>>      uint8_t arp_tha[6];         /* ARP/ND target hardware address. */
>>      uint8_t nw_ttl;             /* IP TTL/Hop Limit. */
>>      uint8_t nw_frag;            /* FLOW_FRAG_* flags. */
>> -    uint8_t zeros[4];
>
> I don't think that this will build on 32-bit systems.  The compiler is
> silently adding padding to the end of struct flow on 64-bit systems,
> which is causing the original size to be the same and the assertion to
> continue passing.  However, I'm not sure that I really see what's
> being achieved here in general though.
>
>> diff --git a/lib/odp-util.c b/lib/odp-util.c
>> index e2f21da..4cb07fc 100644
>> --- a/lib/odp-util.c
>> +++ b/lib/odp-util.c
>> @@ -95,7 +95,7 @@ ovs_key_attr_to_string(enum ovs_key_attr attr)
>>      case OVS_KEY_ATTR_PRIORITY: return "skb_priority";
>>      case OVS_KEY_ATTR_SKB_MARK: return "skb_mark";
>>      case OVS_KEY_ATTR_TUN_ID: return "tun_id";
>> -    case OVS_KEY_ATTR_IPV4_TUNNEL: return "ipv4_tunnel";
>> +    case OVS_KEY_ATTR_TUNNEL: return "ipv4_tunnel";
>
> I think this should now be "tunnel"
>
>> +static void
>> +tun_key_to_attr(struct ofpbuf *a, const struct flow_tnl *tun_key, bool odp_flags)
>> +{
>> +    size_t tun_key_ofs;
>> +
>> +    tun_key_ofs = nl_msg_start_nested(a, OVS_KEY_ATTR_TUNNEL);
>> +
>> +    /* layouts differ, flags has different size */
>
> I'm not sure that this comment is really relevant any more.
>
>> +    if (tun_key->tun_id) {
>> +        nl_msg_put_be64(a, OVS_KEY_ATTR_TUN_ID, tun_key->tun_id);
>> +    }
>> +    if (tun_key->ip_src) {
>> +        nl_msg_put_be32(a, OVS_TUNNEL_IPV4_SRC, tun_key->ip_src);
>> +    }
>> +    if (tun_key->ip_dst) {
>> +        nl_msg_put_be32(a, OVS_TUNNEL_IPV4_DST, tun_key->ip_dst);
>> +    }
>> +    if (tun_key->ip_tos) {
>> +        nl_msg_put_u8(a, OVS_TUNNEL_TOS, tun_key->ip_tos);
>> +    }
>> +    if (tun_key->ip_ttl) {
>> +        nl_msg_put_u8(a, OVS_TUNNEL_TTL, tun_key->ip_ttl);
>> +    }
>
> If we start enforcing the TTL is present then we should make sure that
> we put it in on both userspace and kernel sides.
>
>> @@ -734,17 +840,17 @@ format_odp_key_attr(const struct nlattr *a, struct ds *ds)
>>          ds_put_format(ds, "(%#"PRIx64")", ntohll(nl_attr_get_be64(a)));
>>          break;
>>
>> -    case OVS_KEY_ATTR_IPV4_TUNNEL:
>> -        ipv4_tun_key = nl_attr_get(a);
>> +    case OVS_KEY_ATTR_TUNNEL:
>> +        tun_key_from_attr(a, &tun_key);
>
> I don't think it's really right to convert this to userspace format
> first.  They do have a similar structure but it's not really
> semantically correct.  I guess this is why you expanded the size of
> the flags field in struct flow but we don't really want to impose
> those types of requirements for things like this.
>
>> @@ -974,23 +1080,23 @@ parse_odp_key_attr(const char *s, const struct simap *port_names,
>>      {
>>          char tun_id_s[32];
>>          int tos, ttl;
>> -        struct ovs_key_ipv4_tunnel tun_key;
>> +        struct flow_tnl tun_key;
>>          int n = -1;
>>
>>          if (sscanf(s, "ipv4_tunnel(tun_id=%31[x0123456789abcdefABCDEF],"
>>                     "src="IP_SCAN_FMT",dst="IP_SCAN_FMT
>>                     ",tos=%i,ttl=%i,flags%n", tun_id_s,
>> -                    IP_SCAN_ARGS(&tun_key.ipv4_src),
>> -                    IP_SCAN_ARGS(&tun_key.ipv4_dst), &tos, &ttl,
>> +                    IP_SCAN_ARGS(&tun_key.ip_src),
>> +                    IP_SCAN_ARGS(&tun_key.ip_dst), &tos, &ttl,
>
> Similarly, we're also dealing with kernel format here.
>
>> @@ -1905,23 +1977,17 @@ odp_flow_key_to_flow(const struct nlattr *key, size_t key_len,
>>          expected_attrs |= UINT64_C(1) << OVS_KEY_ATTR_TUN_ID;
>>      }
>>
>> -    if (present_attrs & (UINT64_C(1) << OVS_KEY_ATTR_IPV4_TUNNEL)) {
>> -        const struct ovs_key_ipv4_tunnel *ipv4_tun_key;
>> +    if (present_attrs & (UINT64_C(1) << OVS_KEY_ATTR_TUNNEL)) {
>>
>> -        ipv4_tun_key = nl_attr_get(attrs[OVS_KEY_ATTR_IPV4_TUNNEL]);
>> -
>> -        flow->tunnel.tun_id = ipv4_tun_key->tun_id;
>> -        flow->tunnel.ip_src = ipv4_tun_key->ipv4_src;
>> -        flow->tunnel.ip_dst = ipv4_tun_key->ipv4_dst;
>> -        flow->tunnel.flags = odp_to_flow_flags(ipv4_tun_key->tun_flags);
>> -        flow->tunnel.ip_tos = ipv4_tun_key->ipv4_tos;
>> -        flow->tunnel.ip_ttl = ipv4_tun_key->ipv4_ttl;
>> +        if (tun_key_from_attr(attrs[OVS_KEY_ATTR_TUNNEL], &flow->tunnel))
>> +            return ODP_FIT_ERROR;
>
> This needs to be able to check whether there were extra attributes,
> otherwise we won't be able to recreate the kernel flow later.



More information about the dev mailing list