[ovs-dev] [netlink v4 27/52] datapath: Change userspace vport interface to use Netlink attributes.

Jesse Gross jesse at nicira.com
Sat Jan 15 22:47:34 UTC 2011


On Wed, Jan 12, 2011 at 12:49 AM, Ben Pfaff <blp at nicira.com> wrote:
> diff --git a/datapath/datapath.c b/datapath/datapath.c
> index 0ba8ab6..d9e6848 100644
> --- a/datapath/datapath.c
> +++ b/datapath/datapath.c
> @@ -266,11 +267,16 @@ static int create_dp(int dp_idx, const char __user *devnamep)
>                goto err_free_dp;
>
>        /* Set up our datapath device. */
> -       BUILD_BUG_ON(sizeof(internal_dev_port.devname) != sizeof(devname));
> -       strcpy(internal_dev_port.devname, devname);
> -       internal_dev_port.type = ODPVT_INTERNAL;
> -       err = new_vport(dp, &internal_dev_port, ODPP_LOCAL);
> -       if (err) {
> +       parms.name = devname;
> +       parms.type = ODPVT_INTERNAL;
> +       parms.options = NULL;
> +       parms.dp = dp;
> +       parms.port_no = ODPP_LOCAL;
> +       vport_lock();
> +       vport = new_vport(&parms);
> +       vport_unlock();

We could push down the vport lock into new_vport().  attach_vport()
holds vport lock for a longer period that would be broken by doing
this but I don't think that is conceptually correct because the new
vport is really protected by dp->mutex, not the vport lock.

> +static struct vport *new_vport(const struct vport_parms *parms)
>  {
> -       struct datapath *dp;
> -       struct odp_port port;
> -       int port_no;
> -       int err;
> -
> -       err = -EFAULT;
> -       if (copy_from_user(&port, portp, sizeof port))
> -               goto out;
> -       port.devname[IFNAMSIZ - 1] = '\0';
> -
> -       rtnl_lock();
> -       dp = get_dp_locked(dp_idx);
> -       err = -ENODEV;
> -       if (!dp)
> -               goto out_unlock_rtnl;
> -
> -       for (port_no = 1; port_no < DP_MAX_PORTS; port_no++)
> -               if (!dp->ports[port_no])
> -                       goto got_port_no;
> -       err = -EFBIG;
> -       goto out_unlock_dp;
> -
> -got_port_no:
> -       err = new_vport(dp, &port, port_no);
> -       if (err)
> -               goto out_unlock_dp;
> +       struct vport *vport = vport_add(parms);
> +       if (!IS_ERR(vport)) {
> +               struct datapath *dp = parms->dp;
>
> -       set_internal_devs_mtu(dp);
> -       dp_sysfs_add_if(get_vport_protected(dp, port_no));

Did you intend to delete these two lines?  I don't see where they get
added back.

> +static int copy_vport_to_user(void __user *dst, struct vport *vport, uint32_t total_len)
> +{
> +       struct odp_vport *odp_vport;
> +       struct sk_buff *skb;
> +       struct nlattr *nla;
> +       int ifindex, iflink;
> +       int err;
> +
> +       skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
> +       err = -ENOMEM;
> +       if (!skb)
> +               goto exit;
> +
>        rcu_read_lock();
> -       strncpy(odp_port->devname, vport_get_name(vport), sizeof(odp_port->devname));
> -       odp_port->type = vport_get_type(vport);
> -       vport_get_config(vport, odp_port->config);
> -       odp_port->port = vport->port_no;
> -       odp_port->dp_idx = vport->dp->dp_idx;
> +       odp_vport = (struct odp_vport*)__skb_put(skb, sizeof(struct odp_vport));
> +       odp_vport->dp_idx = vport->dp->dp_idx;
> +       odp_vport->port_no = vport->port_no;
> +       odp_vport->type = vport_get_type(vport);
> +       odp_vport->total_len = total_len;
> +
> +       NLA_PUT_STRING(skb, ODPPT_NAME, vport_get_name(vport));
> +
> +       nla = nla_reserve(skb, ODPPT_STATS, sizeof(struct rtnl_link_stats64));
> +       if (!nla)
> +               goto nla_put_failure;
> +       if (vport_get_stats(vport, nla_data(nla)))
> +               __skb_trim(skb, skb->len - nla->nla_len);

Everywhere else if we run into a problem we abort the operation but
here we try to keep going.  Is there a reason for this?

> -static int query_port(int dp_idx, struct odp_port __user *uport)
> +static struct sk_buff *copy_vport_from_user(struct odp_vport __user *uodp_vport, struct nlattr *a[ODPPT_MAX + 1])

Some of these lines, and the function declarations in particular, are
pretty long.

> +static struct vport *lookup_vport(struct odp_vport *odp_vport, struct nlattr *a[ODPPT_MAX + 1])
> +{
> +       struct datapath *dp;
> +       struct vport *vport;
> +
> +       if (a[ODPPT_NAME]) {
> +               int dp_idx, port_no;
> +
> +       retry:
> +               vport_lock();
> +               vport = vport_locate(nla_data(a[ODPPT_NAME]));
> +               if (!vport) {
> +                       vport_unlock();
> +                       return ERR_PTR(-ENODEV);
> +               }
> +               dp_idx = vport->dp->dp_idx;
> +               port_no = vport->port_no;
> +               vport_unlock();
>
>                dp = get_dp_locked(dp_idx);
>                if (!dp)
> -                       return -ENODEV;
> +                       goto retry;
>
> -               vport = get_vport_protected(dp, port.port);
> -               if (vport)
> -                       compose_odp_port(vport, &port);
> -               mutex_unlock(&dp->mutex);
> +               vport = get_vport_protected(dp, port_no);
> +               if (!vport || strcmp(vport_get_name(vport), nla_data(a[ODPPT_NAME]))) {
> +                       mutex_unlock(&dp->mutex);
> +                       goto retry;
> +               }
> +

The locking in this function is pretty scary.  I don't see anything
actually wrong with it and the vport lock is going away, so I guess
that it is OK for now.  Can you at least add a comment that says it
returns with dp->mutex held?

> +static int change_vport(struct vport *vport, struct nlattr *a[ODPPT_MAX + 1])
> +{
> +       int err = 0;
> +       if (a[ODPPT_STATS])
> +               err = vport_set_stats(vport, nla_data(a[ODPPT_STATS]));
> +       if (!err && a[ODPPT_ADDRESS])
> +               err = vport_set_addr(vport, nla_data(a[ODPPT_ADDRESS]));
> +       if (!err && a[ODPPT_MTU])
> +               err = vport_set_mtu(vport, nla_get_u32(a[ODPPT_MTU]));

The fact we can fail and return with a transaction half complete concerns me.

> +static int attach_vport(struct odp_vport __user *uodp_vport)
>  {
[...]
> +       port_no = odp_vport->port_no;
> +       if (port_no > 0) {
> +               vport = get_vport_protected(dp, odp_vport->port_no);
> +               err = -EBUSY;
> +               if (vport)
> +                       goto exit_unlock_dp;

Is there actually a good reason to allow userspace to request a
particular port?  It seems cleaner to just assign a port number
however we see fit.

[...]
> +       vport_lock();
> +       vport = new_vport(&parms);
> +       err = PTR_ERR(vport);
> +       if (IS_ERR(vport))
> +               goto exit_unlock_vport;
> +
> +       err = change_vport(vport, a);
> +       if (err)
> +               dp_detach_port(vport);
> +
> +exit_unlock_vport:
> +       vport_unlock();

As I said before, it's really dp->mutex that is logically providing
the protection here, so we shouldn't need to hold vport_lock() this
whole time.

> -static int dump_port(struct datapath *dp, struct odp_vport_dump __user *udump)
> +static int modify_vport(unsigned int cmd, struct odp_vport __user *uodp_vport)
>  {
> -       struct odp_vport_dump dump;
> +       struct nlattr *a[ODPPT_MAX + 1];
> +       struct datapath *dp;
> +       struct vport *vport;
> +       struct sk_buff *skb;
> +       int err;
>
> -       if (copy_from_user(&dump, udump, sizeof(dump)))
> -               return -EFAULT;
> +       skb = copy_vport_from_user(uodp_vport, a);
> +       err = PTR_ERR(skb);
> +       if (IS_ERR(skb))
> +               goto exit;
>
> -       return do_dump_port(dp, &dump);
> +       rtnl_lock();
> +       vport = lookup_vport((struct odp_vport *)skb->data, a);
> +       err = PTR_ERR(vport);
> +       if (IS_ERR(vport))
> +               goto exit_free;
> +       dp = vport->dp;

It's not clear to me why modify and delete are combined here.  They
share some code but I don't really see the logical connection.

> +
> +       if (cmd == ODP_VPORT_DEL)

Don't we need to prevent userspace from deleting the datapath device?

> diff --git a/datapath/vport.c b/datapath/vport.c
> index a391932..0fa4dd6 100644
> --- a/datapath/vport.c
> +++ b/datapath/vport.c
> -int vport_mod(struct vport *vport, struct odp_port *port)
> +int vport_set_options(struct vport *vport, struct nlattr *options)
>  {
> +       int err;
> +
>        ASSERT_RTNL();
> -       ASSERT_VPORT();
>
> -       if (vport->ops->modify)
> -               return vport->ops->modify(vport, port);
> +       vport_lock();
> +       if (vport->ops->set_options)
> +               err = vport->ops->set_options(vport, options);
>        else
> -               return -EOPNOTSUPP;
> +               err = -EOPNOTSUPP;
> +       vport_unlock();

There's actually no particular reason why this function needs to hold
vport lock.  I originally documented it that way because it seemed the
most consistent with vport_add() and vport_del().  However, RTNL is
sufficient to protect the actual modifications and it doesn't modify
the hash table of vports (which is what vport lock really protects).

> diff --git a/include/openvswitch/datapath-protocol.h b/include/openvswitch/datapath-protocol.h
> index b6ed979..ae82531 100644
> --- a/include/openvswitch/datapath-protocol.h
> +++ b/include/openvswitch/datapath-protocol.h
> +struct odp_vport {
> +       int32_t dp_idx;
> +       int32_t port_no;

Why are these signed?

> +       uint32_t type;
> +       uint32_t len;
> +       uint32_t total_len;
> +};
> +
> +enum {
> +        ODPPT_UNSPEC,
> +        ODPPT_NAME,            /* string name, up to IFNAMSIZ bytes long */
> +        ODPPT_STATS,           /* struct rtnl_link_stats64 */
> +        ODPPT_ADDRESS,         /* hardware address */
> +        ODPPT_MTU,             /* 32-bit maximum transmission unit */
> +        ODPPT_OPTIONS,         /* nested attributes, varies by vport type */
> +        ODPPT_IFINDEX,         /* 32-bit ifindex of backing netdev */
> +        ODPPT_IFLINK,          /* 32-bit ifindex on which packets are sent */
> +        __ODPPT_MAX
>  };

The rationale for putting fields in the header vs attributes is not
very clear to me (with the obvious exception of the length fields).
It's perhaps not such a big deal here because we do need the fixed
header at the beginning to hold the lengths.  However, in the later
patches where we actually use Netlink, this seems to lead unnecessary
complexity.  Having these fields in the header means that different
operations have different headers, which means that we have different
families.  The end result is that we have a proliferation of families
and structs.  I would expect it to have one family with maybe a fixed
header consisting of just the DP index.

> diff --git a/include/openvswitch/tunnel.h b/include/openvswitch/tunnel.h
> index 44facfa..1e7d2b4 100644
> --- a/include/openvswitch/tunnel.h
> +++ b/include/openvswitch/tunnel.h
> @@ -43,6 +43,15 @@
>  #include <linux/types.h>
>  #include "openvswitch/datapath-protocol.h"
>
> +/* ODPPT_OPTIONS attributes for tunnels. */
> +enum {
> +       TNLAT_UNSPEC,
> +       TNLAT_CONFIG,           /* struct tnl_port_config */

I think this really needs to be broken apart into separate attributes.
 I already have plans to change this struct, so not it's not realistic
to expect it to stay the constant.

> diff --git a/lib/dpif-linux.c b/lib/dpif-linux.c
> index 9c0974f..f931e25 100644
> --- a/lib/dpif-linux.c
> +++ b/lib/dpif-linux.c
>  dpif_linux_open(const struct dpif_class *class OVS_UNUSED, const char *name,
>                 bool create, struct dpif **dpifp)
[...]
> +
> +    dpif_linux_vport_init(&request);
> +    request.cmd = ODP_VPORT_GET;
> +    if (minor >= 0) {
> +        request.dp_idx = minor;

I guess this is relying on port_no to be initialized to 0 during the
memset?  It seems like it would be much clearer to explicitly set it
to ODPP_LOCAL.

>  static int
>  dpif_linux_port_add(struct dpif *dpif, struct netdev *netdev,
>                     uint16_t *port_nop)

The kernel can return 32-bit port numbers but userspace (and OpenFlow
1.0) only use 16-bit ports.  Is there anything to prevent overflow
other than the fact that we're unlikely to have both that 64k ports?
Right now we use linear allocation but I'm not sure that it's
something we want to guarantee.

>  static int
>  lookup_internal_device(const char *name, int *dp_idx, int *port_no)
>  {
> -    struct odp_port odp_port;
> -    static int dp0_fd = -1;
> -
> -    if (dp0_fd < 0) {
> -        int error;
> -        char *fn;
> -
> -        error = make_openvswitch_device(0, &fn);
> -        if (error) {
> -            return error;
> -        }
> -
> -        dp0_fd = open(fn, O_RDONLY | O_NONBLOCK);
> -        if (dp0_fd < 0) {
> -            VLOG_WARN_RL(&error_rl, "%s: open failed (%s)",
> -                         fn, strerror(errno));
> -            free(fn);
> -            return errno;
> -        }
> -        free(fn);
> -    }
> +    struct dpif_linux_vport request, reply;
> +    struct ofpbuf *buf;
> +    int error;
>
> -    memset(&odp_port, 0, sizeof odp_port);
> -    strncpy(odp_port.devname, name, sizeof odp_port.devname);
> -    if (ioctl(dp0_fd, ODP_VPORT_QUERY, &odp_port)) {
> -        if (errno != ENODEV) {
> +    dpif_linux_vport_init(&request);
> +    request.cmd = ODP_VPORT_GET;
> +    request.name = name;
> +    error = dpif_linux_vport_transact(&request, &reply, &buf);
> +    if (error) {
> +        if (error != ENODEV) {
>             VLOG_WARN_RL(&error_rl, "%s: vport query failed (%s)",
> -                         name, strerror(errno));
> +                         name, strerror(error));
>         }
> -        return errno;
> -    } else if (odp_port.type == ODPVT_INTERNAL) {
> -        *dp_idx = odp_port.dp_idx;
> -        *port_no = odp_port.port;
> -        return 0;
> -    } else {
> -        return EINVAL;
> +        return error;
>     }
> -}

This function seems a little overkill now that there is only a single
caller.  Why not just merge it with dpif_linux_is_internal_device()
and cut out the extra stuff?  You could also use
dpif_linux_vport_get().

> diff --git a/lib/dpif-linux.h b/lib/dpif-linux.h
> index 1dff4b7..552f432 100644
> --- a/lib/dpif-linux.h
> +++ b/lib/dpif-linux.h
> @@ -1,5 +1,5 @@
>  /*
> - * Copyright (c) 2010 Nicira Networks.
> + * Copyright (c) 2010, 2011 Nicira Networks.
>  *
>  * Licensed under the Apache License, Version 2.0 (the "License");
>  * you may not use this file except in compliance with the License.
> @@ -18,6 +18,37 @@
>  #define DPIF_LINUX_H 1
>
>  #include <stdbool.h>
> +#include "openvswitch/datapath-protocol.h"
> +
> +struct ofpbuf;
> +
> +struct dpif_linux_vport {
> +    /* ioctl command argument. */
> +    int cmd;

'cmd' feels a little out of place to me because it isn't actually part
of the port is often left uninitialized (in replies for example).
Would it make sense to just have it be a separate argument to
dpif_linux_vport_transact()?

> diff --git a/lib/netdev-vport.c b/lib/netdev-vport.c
> index 4085dd0..79c0f0a 100644
> --- a/lib/netdev-vport.c
> +++ b/lib/netdev-vport.c
[...]
> +        dpif_linux_vport_init(&request);
> +        request.cmd = ODP_VPORT_GET;
> +        request.name = name;
> +        error = dpif_linux_vport_transact(&request, &reply, &buf);

Is there a reason not to use dpif_linux_vport_get() here?

> @@ -247,24 +305,35 @@ netdev_vport_set_config(struct netdev_dev *dev_, const struct shash *args)
>         if (!error || error == ENODEV) {
>             /* Either reconfiguration succeeded or this vport is not installed
>              * in the kernel (e.g. it hasn't been added to a dpif yet with
>              * dpif_port_add()). */
> -            memcpy(dev->config, port.config, sizeof dev->config);
> +            ofpbuf_delete(dev->options);
> +            dev->options = options;
> +            options = NULL;

Should we zero out error in the ENODEV case, since it has effectively succeeded?

>  static int
>  netdev_vport_get_mtu(const struct netdev *netdev, int *mtup)
>  {
> -    struct odp_vport_mtu vport_mtu;
> -    int err;
> -
> -    ovs_strlcpy(vport_mtu.devname, netdev_get_name(netdev),
> -                sizeof vport_mtu.devname);
> +    struct dpif_linux_vport reply;
> +    struct ofpbuf *buf;
> +    int error;
>
> -    err = netdev_vport_do_ioctl(ODP_VPORT_MTU_GET, &vport_mtu);
> -    if (err) {
> -        return err;
> +    error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
> +    if (!error) {
> +        *mtup = reply.mtu;

Hmm, we probably need some way of detecting that the port doesn't have an MTU.

>  int
>  netdev_vport_get_stats(const struct netdev *netdev, struct netdev_stats *stats)
>  {
> -    const char *name = netdev_get_name(netdev);
> -    struct odp_vport_stats_req ovsr;
> -    int err;
> +    struct dpif_linux_vport reply;
> +    struct ofpbuf *buf;
> +    int error;
>
> -    ovs_strlcpy(ovsr.devname, name, sizeof ovsr.devname);
> -    err = netdev_vport_do_ioctl(ODP_VPORT_STATS_GET, &ovsr);
> -    if (err) {
> -        return err;
> +    error = dpif_linux_vport_get(netdev_get_name(netdev), &reply, &buf);
> +    if (error) {
> +        return error;
> +    } else if (!reply.stats) {
> +        ofpbuf_delete(buf);
> +        return EOPNOTSUPP;
>     }
[...]

Don't we have to free buf in the case of success after getting the stats?

> +static const struct tnl_port_config *
> +tnl_port_config_from_nlattr(const struct nlattr *options, size_t options_len)
> +{
> +    static const struct nl_policy odp_tunnel_policy[] = {
> +        [TNLAT_CONFIG] = { .type = NL_A_UNSPEC,
> +                           .min_len = sizeof(struct tnl_port_config),
> +                           .max_len = sizeof(struct tnl_port_config),
> +                           .optional = false }
> +    };
> +    struct nlattr *a[ARRAY_SIZE(odp_tunnel_policy)];
> +    struct ofpbuf buf;
> +
> +    ofpbuf_use_const(&buf, options, options_len);
> +    if (!nl_policy_parse(&buf, 0, odp_tunnel_policy,
> +                         a, ARRAY_SIZE(odp_tunnel_policy))) {

Why do we use parse in one place and find in another?  Aren't we
trying to solve essentially the same problem in both places?




More information about the dev mailing list