[ovs-dev] [PATCH] datapath: More flexible kernel/userspace tunneling attribute.

Jesse Gross jesse at nicira.com
Tue Jan 15 18:38:13 UTC 2013


On Fri, Jan 11, 2013 at 5:23 PM, Pravin B Shelar <pshelar at nicira.com> wrote:
> diff --git a/datapath/datapath.c b/datapath/datapath.c
> index 30e26a7..4ed00cf 100644
> --- a/datapath/datapath.c
> +++ b/datapath/datapath.c
>  static int validate_sample(const struct nlattr *attr,
> -                               const struct sw_flow_key *key, int depth)
> +                          const struct sw_flow_key *key, int depth,
> +                          struct sw_flow_actions *sfa)
>  {
>         const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1];
>         const struct nlattr *probability, *actions;
> @@ -451,7 +453,7 @@ static int validate_sample(const struct nlattr *attr,
>         actions = attrs[OVS_SAMPLE_ATTR_ACTIONS];
>         if (!actions || (nla_len(actions) && nla_len(actions) < NLA_HDRLEN))
>                 return -EINVAL;
> -       return validate_actions(actions, key, depth + 1);
> +       return validate_and_copy_actions(actions, key, depth + 1, sfa);

I think this will result in the inner actions being output before the
sample attribute itself.

> +static int ____nla_put(struct sw_flow_actions *sfa, int attrtype, void *data, int len)

I would probably give this a name other than nla_put since it makes it
sound like it is part of the Netlink library itself.

> +{
> +       struct nlattr *a = (struct nlattr *) ((unsigned char *)sfa->actions + sfa->used);
> +
> +       if (NLA_ALIGN(nla_attr_size(len)) > (sfa->actions_len - sfa->used));
> +               return -EMSGSIZE;

What if we made actions_len the current length of the attributes that
we've put and then used ksize() to check for overflow?  That way we
wouldn't need sfa->used and actions_len will always reflect the
correct data.

> +static int validate_and_copy_set_tun(const struct nlattr *attr,
> +                                    struct sw_flow_actions *sfa)
> +{
> +       struct ovs_key_ipv4_tunnel tun_key;
> +       int err;
> +
> +       err = ipv4_tun_from_nlattr(attr, &tun_key);
> +       if (err)
> +               return err;
> +
> +       err = ____nla_put(sfa, OVS_KEY_ATTR_IPV4_TUNNEL, &tun_key, sizeof(tun_key));
> +       if (err)
> +               return err;

I think this will result in only having the OVS_KEY_ATTR_IPV4_TUNNEL
and not the wrapping set attribute.

> @@ -600,12 +653,16 @@ static int validate_actions(const struct nlattr *attr,
>                 };
>                 const struct ovs_action_push_vlan *vlan;
>                 int type = nla_type(a);
> +               bool set_tun;
> +
>

Extra blank line.

> @@ -634,13 +691,13 @@ static int validate_actions(const struct nlattr *attr,
>                         break;
>
>                 case OVS_ACTION_ATTR_SET:
> -                       err = validate_set(a, key);
> +                       err = validate_set(a, key, sfa, &set_tun);
>                         if (err)
>                                 return err;
>                         break;
>
>                 case OVS_ACTION_ATTR_SAMPLE:
> -                       err = validate_sample(a, key, depth);
> +                       err = validate_sample(a, key, depth, sfa);

I think if we have a sample attribute with a nested tunnel attribute
then we will get both the original translated tunnel key and the
original full sample attribute.

Don't we need to translate this back the other way as well?  If we
there is a query for a flow we should return the original value.

> @@ -951,9 +1013,16 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
>
>         /* Validate actions. */
>         if (a[OVS_FLOW_ATTR_ACTIONS]) {
> -               error = validate_actions(a[OVS_FLOW_ATTR_ACTIONS], &key,  0);
> -               if (error)
> +               acts = ovs_flow_actions_alloc(a[OVS_FLOW_ATTR_ACTIONS]);
> +               error = PTR_ERR(acts);
> +               if (IS_ERR(acts))
> +                       goto error;
> +
> +               error = validate_and_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], &key,  0, acts);
> +               if (error) {
> +                       ovs_flow_deferred_free_acts(acts);

We could just kfree() the actions here, no need to use RCU.

Doesn't this leak the actions if we later encounter an error?

> @@ -1026,21 +1087,14 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
>                 /* Update actions. */
>                 old_acts = rcu_dereference_protected(flow->sf_acts,
>                                                      lockdep_genl_is_held());
> -               acts_attrs = a[OVS_FLOW_ATTR_ACTIONS];
> -               if (acts_attrs &&
> -                  (old_acts->actions_len != nla_len(acts_attrs) ||
> -                  memcmp(old_acts->actions, nla_data(acts_attrs),
> -                         old_acts->actions_len))) {
> -                       struct sw_flow_actions *new_acts;
> -
> -                       new_acts = ovs_flow_actions_alloc(acts_attrs);
> -                       error = PTR_ERR(new_acts);
> -                       if (IS_ERR(new_acts))
> -                               goto error;
> -
> -                       rcu_assign_pointer(flow->sf_acts, new_acts);
> +               if (acts &&
> +                  (old_acts->actions_len != acts->actions_len ||
> +                   memcmp(old_acts->actions, acts->actions,
> +                          old_acts->actions_len))) {
> +                       rcu_assign_pointer(flow->sf_acts, acts);
>                         ovs_flow_deferred_free_acts(old_acts);
> -               }
> +               } else
> +                       ovs_flow_deferred_free_acts(acts);

I'm not sure that there's much point in this check any more.  Before
it used to save an allocation but now it's just a pointer assignment.

> diff --git a/datapath/flow.c b/datapath/flow.c
> index 63eef77..ce563f3 100644
> --- a/datapath/flow.c
> +++ b/datapath/flow.c
> @@ -213,7 +213,7 @@ struct sw_flow_actions *ovs_flow_actions_alloc(const struct nlattr *actions)
>                 return ERR_PTR(-ENOMEM);
>
>         sfa->actions_len = actions_len;
> -       memcpy(sfa->actions, nla_data(actions), actions_len);
> +       sfa->used = 0;
>         return sfa;
>  }

We should allow for the possibility that the size of fixed structure
is larger than the Netlink attributes.

> +int ipv4_tun_from_nlattr(const struct nlattr *attr,
> +                        struct ovs_key_ipv4_tunnel *tun_key)
> +{
> +       struct nlattr *a;
> +       int rem;
> +
> +       memset(tun_key, 0, sizeof(*tun_key));
> +
> +       nla_for_each_nested(a, attr, rem) {
> +               int type = nla_type(a);
> +
> +               switch (type) {
> +               case OVS_KEY_ATTR_TUN_ID:
> +                       memcpy(&tun_key->tun_id, nla_data(a), sizeof(__be64));

This should check the sizes of the attributes before copying,
otherwise we could run off the end.

It seems like we have a bunch of this type of parsing, which is
basically the same as nla_parse_attribute() but it complains if there
are unknown attributes.  Can we somehow factor it out?

> +               break;
> +               case OVS_TUNNEL_IPV4_SRC:
> +                       memcpy(&tun_key->ipv4_src, nla_data(a), sizeof(__be32));
> +               break;
> +               case OVS_TUNNEL_IPV4_DST:
> +                       memcpy(&tun_key->ipv4_dst, nla_data(a), sizeof(__be32));
> +               break;
> +               case OVS_TUNNEL_TOS:
> +                       tun_key->ipv4_tos = nla_get_u8(a);
> +               break;
> +               case OVS_TUNNEL_TTL:
> +                       tun_key->ipv4_ttl = nla_get_u8(a);

We should enforce that TTL is set since zero isn't a great default here.

> diff --git a/datapath/flow.h b/datapath/flow.h
> index 3f3624f..ae15024 100644
> --- a/datapath/flow.h
> +++ b/datapath/flow.h
> @@ -37,9 +37,20 @@ struct sk_buff;
>  struct sw_flow_actions {
>         struct rcu_head rcu;
>         u32 actions_len;
> +       u32 used;
>         struct nlattr actions[];
>  };
>
> +struct ovs_key_ipv4_tunnel {
> +       __be64 tun_id;
> +       __u32  tun_flags;
> +       __be32 ipv4_src;
> +       __be32 ipv4_dst;
> +       __u8   ipv4_tos;
> +       __u8   ipv4_ttl;
> +       __u8   pad[2];

I'm not sure that we need to explicitly include the pad at this point
any more.  Since it's no longer an interface, it should be fine to
allow the compiler to choose the padding as appropriate.

> diff --git a/datapath/tunnel.h b/datapath/tunnel.h
> index 7705475..b7de7a9 100644
> --- a/datapath/tunnel.h
> +++ b/datapath/tunnel.h
> @@ -201,7 +201,6 @@ static inline void tnl_tun_key_init(struct ovs_key_ipv4_tunnel *tun_key,
>         tun_key->ipv4_tos = iph->tos;
>         tun_key->ipv4_ttl = iph->ttl;
>         tun_key->tun_flags = tun_flags;
> -       memset(tun_key->pad, 0, sizeof(tun_key->pad));

If we do keep the padding, then I think we need to continue
initializing it here since otherwise we'll look up garbage data in the
flow table.

> diff --git a/include/linux/openvswitch.h b/include/linux/openvswitch.h
> index 5e32965..96f8bcc 100644
> --- a/include/linux/openvswitch.h
> +++ b/include/linux/openvswitch.h
> @@ -265,6 +265,21 @@ struct ovs_flow_stats {
>         __u64 n_bytes;           /* Number of matched bytes. */
>  };
>
> +

Extra blank line.

> +/* Values for OVS_TUNNEL_FLAGS */
> +#define OVS_TNL_F_DONT_FRAGMENT (1 << 0)
> +#define OVS_TNL_F_CSUM (1 << 1)
> +#define OVS_TNL_F_KEY (1 << 2)

I think at this point it's no longer necessary to define these
specially, we can just make them Netlink flag attributes.  We think we
can actually drop OVS_TNL_F_KEY now and use the presence or absence of
the corresponding attribute instead.

> +enum ovs_tunnel {

I would name this ovs_tunnel_attr for consistency.

> +       OVS_TUNNEL_IPV4_SRC,    /* Tunnel src ip address. */
> +       OVS_TUNNEL_IPV4_DST,    /* Tunnel dst ip address. */
> +       OVS_TUNNEL_TOS,         /* Tunnel ip TOS. */
> +       OVS_TUNNEL_TTL,         /* Tunnel ip TTL. */
> +       OVS_TUNNEL_FLAGS,       /* OVS_TNL_F_* flags. */

Can you include the types in the comments and also use more standard
capitalization (IP, ToS)?

> +       __OVS_TUNNEL_MAX
> +};

Can you also define OVS_TUNNEL_MAX for completeness?

I would also put this below the definition of ovs_key_attr so that it
is with the other related attributes.

>  enum ovs_key_attr {
>         OVS_KEY_ATTR_UNSPEC,
>         OVS_KEY_ATTR_ENCAP,     /* Nested set of encapsulated attributes. */
> @@ -282,8 +297,12 @@ enum ovs_key_attr {
>         OVS_KEY_ATTR_ARP,       /* struct ovs_key_arp */
>         OVS_KEY_ATTR_ND,        /* struct ovs_key_nd */
>         OVS_KEY_ATTR_SKB_MARK,  /* u32 skb mark */
> -       OVS_KEY_ATTR_IPV4_TUNNEL,  /* struct ovs_key_ipv4_tunnel */
> +       OVS_KEY_ATTR_TUNNEL,    /* Nested set of ovs_tunnel attributes */
>         OVS_KEY_ATTR_TUN_ID = 63,  /* be64 tunnel ID */
> +
> +#ifdef __KERNEL__
> +       OVS_KEY_ATTR_IPV4_TUNNEL,  /* struct ovs_key_ipv4_tunnel */
> +#endif

I think it would be better to define this before OVS_KEY_ATTR_TUN_ID
to prevent any possibility of overflowing our 64-bit data types.

> diff --git a/lib/flow.h b/lib/flow.h
> index 8e79e62..323b248 100644
> --- a/lib/flow.h
> +++ b/lib/flow.h
> @@ -63,9 +63,10 @@ struct flow_tnl {
>      ovs_be64 tun_id;
>      ovs_be32 ip_src;
>      ovs_be32 ip_dst;
> -    uint16_t flags;
> +    uint32_t flags;
>      uint8_t ip_tos;
>      uint8_t ip_ttl;
> +    uint8_t zeros[2];
>  };
>
>  /*
> @@ -103,7 +104,6 @@ struct flow {
>      uint8_t arp_tha[6];         /* ARP/ND target hardware address. */
>      uint8_t nw_ttl;             /* IP TTL/Hop Limit. */
>      uint8_t nw_frag;            /* FLOW_FRAG_* flags. */
> -    uint8_t zeros[4];

I don't think that this will build on 32-bit systems.  The compiler is
silently adding padding to the end of struct flow on 64-bit systems,
which is causing the original size to be the same and the assertion to
continue passing.  However, I'm not sure that I really see what's
being achieved here in general though.

> diff --git a/lib/odp-util.c b/lib/odp-util.c
> index e2f21da..4cb07fc 100644
> --- a/lib/odp-util.c
> +++ b/lib/odp-util.c
> @@ -95,7 +95,7 @@ ovs_key_attr_to_string(enum ovs_key_attr attr)
>      case OVS_KEY_ATTR_PRIORITY: return "skb_priority";
>      case OVS_KEY_ATTR_SKB_MARK: return "skb_mark";
>      case OVS_KEY_ATTR_TUN_ID: return "tun_id";
> -    case OVS_KEY_ATTR_IPV4_TUNNEL: return "ipv4_tunnel";
> +    case OVS_KEY_ATTR_TUNNEL: return "ipv4_tunnel";

I think this should now be "tunnel"

> +static void
> +tun_key_to_attr(struct ofpbuf *a, const struct flow_tnl *tun_key, bool odp_flags)
> +{
> +    size_t tun_key_ofs;
> +
> +    tun_key_ofs = nl_msg_start_nested(a, OVS_KEY_ATTR_TUNNEL);
> +
> +    /* layouts differ, flags has different size */

I'm not sure that this comment is really relevant any more.

> +    if (tun_key->tun_id) {
> +        nl_msg_put_be64(a, OVS_KEY_ATTR_TUN_ID, tun_key->tun_id);
> +    }
> +    if (tun_key->ip_src) {
> +        nl_msg_put_be32(a, OVS_TUNNEL_IPV4_SRC, tun_key->ip_src);
> +    }
> +    if (tun_key->ip_dst) {
> +        nl_msg_put_be32(a, OVS_TUNNEL_IPV4_DST, tun_key->ip_dst);
> +    }
> +    if (tun_key->ip_tos) {
> +        nl_msg_put_u8(a, OVS_TUNNEL_TOS, tun_key->ip_tos);
> +    }
> +    if (tun_key->ip_ttl) {
> +        nl_msg_put_u8(a, OVS_TUNNEL_TTL, tun_key->ip_ttl);
> +    }

If we start enforcing the TTL is present then we should make sure that
we put it in on both userspace and kernel sides.

> @@ -734,17 +840,17 @@ format_odp_key_attr(const struct nlattr *a, struct ds *ds)
>          ds_put_format(ds, "(%#"PRIx64")", ntohll(nl_attr_get_be64(a)));
>          break;
>
> -    case OVS_KEY_ATTR_IPV4_TUNNEL:
> -        ipv4_tun_key = nl_attr_get(a);
> +    case OVS_KEY_ATTR_TUNNEL:
> +        tun_key_from_attr(a, &tun_key);

I don't think it's really right to convert this to userspace format
first.  They do have a similar structure but it's not really
semantically correct.  I guess this is why you expanded the size of
the flags field in struct flow but we don't really want to impose
those types of requirements for things like this.

> @@ -974,23 +1080,23 @@ parse_odp_key_attr(const char *s, const struct simap *port_names,
>      {
>          char tun_id_s[32];
>          int tos, ttl;
> -        struct ovs_key_ipv4_tunnel tun_key;
> +        struct flow_tnl tun_key;
>          int n = -1;
>
>          if (sscanf(s, "ipv4_tunnel(tun_id=%31[x0123456789abcdefABCDEF],"
>                     "src="IP_SCAN_FMT",dst="IP_SCAN_FMT
>                     ",tos=%i,ttl=%i,flags%n", tun_id_s,
> -                    IP_SCAN_ARGS(&tun_key.ipv4_src),
> -                    IP_SCAN_ARGS(&tun_key.ipv4_dst), &tos, &ttl,
> +                    IP_SCAN_ARGS(&tun_key.ip_src),
> +                    IP_SCAN_ARGS(&tun_key.ip_dst), &tos, &ttl,

Similarly, we're also dealing with kernel format here.

> @@ -1905,23 +1977,17 @@ odp_flow_key_to_flow(const struct nlattr *key, size_t key_len,
>          expected_attrs |= UINT64_C(1) << OVS_KEY_ATTR_TUN_ID;
>      }
>
> -    if (present_attrs & (UINT64_C(1) << OVS_KEY_ATTR_IPV4_TUNNEL)) {
> -        const struct ovs_key_ipv4_tunnel *ipv4_tun_key;
> +    if (present_attrs & (UINT64_C(1) << OVS_KEY_ATTR_TUNNEL)) {
>
> -        ipv4_tun_key = nl_attr_get(attrs[OVS_KEY_ATTR_IPV4_TUNNEL]);
> -
> -        flow->tunnel.tun_id = ipv4_tun_key->tun_id;
> -        flow->tunnel.ip_src = ipv4_tun_key->ipv4_src;
> -        flow->tunnel.ip_dst = ipv4_tun_key->ipv4_dst;
> -        flow->tunnel.flags = odp_to_flow_flags(ipv4_tun_key->tun_flags);
> -        flow->tunnel.ip_tos = ipv4_tun_key->ipv4_tos;
> -        flow->tunnel.ip_ttl = ipv4_tun_key->ipv4_ttl;
> +        if (tun_key_from_attr(attrs[OVS_KEY_ATTR_TUNNEL], &flow->tunnel))
> +            return ODP_FIT_ERROR;

This needs to be able to check whether there were extra attributes,
otherwise we won't be able to recreate the kernel flow later.



More information about the dev mailing list