[ovs-dev] [PATCH v3] datapath: Add support for namespace.

Jesse Gross jesse at nicira.com
Sat Jan 7 02:27:13 UTC 2012


On Thu, Jan 5, 2012 at 7:59 PM, Pravin B Shelar <pshelar at nicira.com> wrote:
> diff --git a/datapath/datapath.c b/datapath/datapath.c
> index 17871e4..76bf8f5 100644
> --- a/datapath/datapath.c
> +++ b/datapath/datapath.c
> +static void __dp_destroy(struct datapath *dp)
> +{

Can you document the locking expectations for the function (i.e. genl mutex)?

> +       struct vport *vport, *next_vport;
>
> +       list_for_each_entry_safe(vport, next_vport, &dp->port_list, node)
> +               if (vport->port_no != OVSP_LOCAL)
> +                       ovs_dp_detach_port(vport);

I think that the above block is the only part of destruction that
needs to hold RTNL.  Can we just take the lock there and drop it from
callers?  Then we can also move call_rcu in here as well, so it really
does the full destruction process.

> @@ -2047,15 +2059,20 @@ error:
>  static int __rehash_flow_table(void *dummy)
>  {
>        struct datapath *dp;
> +       struct net *net;
>
> -       list_for_each_entry(dp, &dps, list_node) {
> -               struct flow_table *old_table = genl_dereference(dp->table);
> -               struct flow_table *new_table;
> +       for_each_net(net) {
> +               struct ovs_net *ovs_net = net_generic(net, ovs_net_id);

I think you need to take RTNL here, otherwise a namespace could
disappear while you are iterating over them.

> +static void dp_destroy_all(struct ovs_net *ovs_net)
> +{
> +       struct datapath *dp, *dp_next;
> +       LIST_HEAD(dp_kill_list);
> +
> +       rtnl_lock();
> +       list_for_each_entry_safe(dp, dp_next, &ovs_net->dps, list_node) {

This list is really protected by genl mutex, not RTNL.

> +               __dp_destroy(dp);
> +               list_add_tail(&dp->list_node, &dp_kill_list);

I'm not sure that there's much benefit to batching the calls to
call_rcu outside of RTNL.  However, if you push them both down into
__dp_destroy() then you can get the same effect for free.

> +       list_for_each_entry(dp, &dp_kill_list, list_node) {
> +               call_rcu(&dp->rcu, destroy_dp_rcu);
> +               module_put(THIS_MODULE);

I believe that we can now just drop module refcounting because this
actually achieves the goal of deleting datapaths when the module is
unloaded.  When you call unregister_pernet_device() on module unload,
it calls ovs_exit_net() on all namespaces that are currently active so
remaining dps automatically get deleted.

Can you also annotate ovs_init_net() ovs_exit_net() with __init_net
and __exit_net respectively?

> @@ -2085,21 +2154,21 @@ static int __init dp_init(void)
[...]
> +       err = register_pernet_device(&ovs_net_ops);
> +       if (err < 0)

Does this ever return a positive value?  Otherwise, we can just check
for if (err) like in other places.

> @@ -2131,9 +2200,9 @@ static void dp_cleanup(void)
>        rcu_barrier();
>        dp_unregister_genl(ARRAY_SIZE(dp_genl_families));
>        unregister_netdevice_notifier(&ovs_dp_device_notifier);
> +       unregister_pernet_device(&ovs_net_ops);

I think we need to move rcu_barrier() down because
unregister_pernet_device() can generate rcu callbacks (assuming that
we drop the refcounting) and those should be complete before the
module exists.  In general, I think the safest thing to do is just
make it last, that way we don't have to worry about whether any of the
exit functions might have callbacks.

> diff --git a/datapath/datapath.h b/datapath/datapath.h
> index 27151b9..d5a8e41 100644
> --- a/datapath/datapath.h
> +++ b/datapath/datapath.h
> @@ -29,10 +29,11 @@
>
>  #include "checksum.h"
>  #include "compat.h"
> -#include "flow.h"
>  #include "dp_sysfs.h"
> +#include "flow.h"
> +#include "tunnel.h"
>  #include "vlan.h"
> -
> +#include "vport.h"
>  struct vport;

If you include vport.h then you can drop the forward declaration of
struct vport.

> +struct ovs_net {
> +       /* Per Network namespace list of datapaths to enable dumping them
> +        * all out. Protected by genl_mutex.
> +        */
> +       struct list_head dps;
> +       /* Per network namespace data for tnl. */
> +       struct tnl_net tnl_net;
> +       /* Per network namespace data for vport. */
> +       struct vport_net vport_net;
> +};

You should use the kernel comment style to document the struct
members, as is used other places in this file.

> +extern int ovs_net_id;

Can you put this together with the other global static variables?

> diff --git a/datapath/linux/compat/include/net/genetlink.h b/datapath/linux/compat/include/net/genetlink.h
> index a1ff7c1..4e7f1b6 100644
> --- a/datapath/linux/compat/include/net/genetlink.h
> +++ b/datapath/linux/compat/include/net/genetlink.h
> +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,32)
> +#define SET_NETNSOK
> +#else
> +#define SET_NETNSOK    .netnsok = true,
> +#endif

Can you put this in compat.h since it's not going upstream?

> diff --git a/datapath/linux/compat/include/net/net_namespace.h b/datapath/linux/compat/include/net/net_namespace.h
> index 9ce9fcd..1c25cc6 100644
> --- a/datapath/linux/compat/include/net/net_namespace.h
> +++ b/datapath/linux/compat/include/net/net_namespace.h
>  #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,32)
> -#define INIT_NET_GENL_SOCK init_net.genl_sock
> +#define GENL_SOCK(net) ((net)->genl_sock)
>  #else
> -#define INIT_NET_GENL_SOCK genl_sock
> +#define GENL_SOCK(net) (genl_sock)
> +#endif

While you're at it, can you move this to compat.h as well?

> diff --git a/datapath/linux/compat/net_namespace.c b/datapath/linux/compat/net_namespace.c
> new file mode 100644
> index 0000000..31c4a6e
> --- /dev/null
> +++ b/datapath/linux/compat/net_namespace.c
> +#if LINUX_VERSION_CODE == KERNEL_VERSION(2,6,32)
> +
> +#undef pernet_operations
> +struct pernet_operations pnet_compat;
> +
> +struct rpl_pernet_operations *pnet;

These should be static.

> +static int __net_init ovs_init_net(struct net *net)

Can you name these something different from the ones in datapath.c?

> +{
> +       int err;
> +       void *ovs_net = kzalloc(pnet->size, GFP_KERNEL);
> +

Extra blank line here.

> +       if (!ovs_net)
> +               return -ENOMEM;
> +
> +       err = net_assign_generic(net, *pnet->id, ovs_net);
> +       if (err) {
> +               kfree(ovs_net);
> +               return err;
> +       }
> +
> +       err = pnet->init(net);
> +       if (err) {
> +               kfree(ovs_net);
> +               return err;

Can you make a common exit path?

> +static void __net_init ovs_exit_net(struct net *net)

This should be __net_exit, not __net_init.

> +int rpl_register_pernet_device(struct rpl_pernet_operations *rpl_pnet)
> +{
> +       pnet = rpl_pnet;
> +       pnet_compat.init = ovs_init_net;
> +       pnet_compat.exit = ovs_exit_net;

Can you just initialize these statically, like you did in the main
pernet code in datapath.c?

> +       return register_pernet_gen_subsys(pnet->id, &pnet_compat);

Shouldn't this be register_pernet_gen_device(), since that's what
you're emulating.

> +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,32)

These could just be combined into an #elsif.

> diff --git a/datapath/tunnel.c b/datapath/tunnel.c
> index 33d2fe9..1a66db3 100644
> --- a/datapath/tunnel.c
> +++ b/datapath/tunnel.c
> @@ -82,23 +84,12 @@
>  #define CACHE_DATA_ALIGN 16
>  #define PORT_TABLE_SIZE  1024
>
> -static struct hlist_head *port_table __read_mostly;
> -static int port_table_count;
>

Extra blank line.

>  static void cache_cleaner(struct work_struct *work)
[...]
> +       for_each_net(net) {

I think this needs to be for_each_net_rcu().

> +               struct tnl_net *tnl_net = get_tnl_net(net);
> +
> +               for (i = 0; i < PORT_TABLE_SIZE; i++) {
> +

Extra blank line.

> @@ -1417,7 +1438,8 @@ static int tnl_set_config(struct nlattr *options, const struct tnl_ops *tnl_ops,
>
>        mutable->tunnel_hlen += sizeof(struct iphdr);
>
> -       old_vport = port_table_lookup(&mutable->key, &old_mutable);
> +       tnl_net = get_tnl_net(net);

Can you initialize tnl_net at the beginning of the function so we
don't have wonder whether it is set?

> +int ovs_tnl_init_net(struct net *net)
[...]
> +       tnl_net->port_table = kmalloc(PORT_TABLE_SIZE * sizeof(struct hlist_head *),
> +                                     GFP_KERNEL);

I think that namespaces might affect the tradeoff of whether it makes
sense to preallocate a tunnel table at startup.  There could be a lot
of namespaces, none of which use tunnels, so the wasted memory could
actually be pretty significant.

> diff --git a/datapath/tunnel.h b/datapath/tunnel.h
> index 6865ae6..f470702 100644
> --- a/datapath/tunnel.h
> +++ b/datapath/tunnel.h
> @@ -20,6 +20,9 @@
>  #define TUNNEL_H 1
>
>  #include <linux/version.h>
> +#include <net/ip_vs.h>

We shouldn't reference anything in ip_vs.h

> +struct tnl_net {
> +       struct hlist_head *port_table;
> +
> +       /*
> +        * These are just used as an optimization: they don't require any kind
> +        * of synchronization because we could have just as easily read the
> +        * value before the port change happened.
> +        */
> +
> +       unsigned int key_local_remote_ports;
> +       unsigned int key_remote_ports;
> +       unsigned int key_multicast_ports;
> +       unsigned int local_remote_ports;
> +       unsigned int remote_ports;
> +       unsigned int multicast_ports;
> +};

Does anything outside of tunnel.c actually use struct tnl_net?

> diff --git a/datapath/vport-capwap.c b/datapath/vport-capwap.c
> index 6c1b0da..493405b 100644
> --- a/datapath/vport-capwap.c
> +++ b/datapath/vport-capwap.c
> -static int capwap_init(void)
> +static int capwap_init_net(struct net *net)

Can you annotate this with __net_init as well (actually we should
annotate all the net functions).

However, given that we already wanted to start opening capwap sockets
on demand I wonder whether it makes sense to have these vport init_net
and exit_net functions at all and just drive everything off port
creation (and ports will get automatically cleaned up with the
namespace goes away).

> diff --git a/datapath/vport-patch.c b/datapath/vport-patch.c
> index 53b24b0..a7be489 100644
> --- a/datapath/vport-patch.c
> +++ b/datapath/vport-patch.c
> -static void update_peers(const char *name, struct vport *vport)
> +static void update_peers(const char *name, struct net *net, struct vport *vport)
>  {
>        struct hlist_head *bucket = hash_bucket(name);

It would be nice if we also included the net in the hash.

> diff --git a/datapath/vport.c b/datapath/vport.c
> index e9ccdbd..83043ec 100644
> --- a/datapath/vport.c
> +++ b/datapath/vport.c
> +static void vport_exit_net(struct net *net, int max)
> +{
> +       int i;
> +
> +       for (i = 0; i < max; i++) {
> +               const struct vport_ops *ops = vport_ops_list[i];
> +
> +               if (ops->exit_net)
> +                       ops->exit_net(net);
> +       }
> +}
> +
> +int ovs_vport_init_net(struct net *net)
> +{
> +       int i;
> +
> +       for (i = 0; i < n_vport_types; i++) {
> +               const struct vport_ops *ops = vport_ops_list[i];
> +               int err;
> +
> +               err = 0;
> +
> +               if (ops->init_net)
> +                       err = ops->init_net(net);
> +
> +               if (err) {
> +                       vport_exit_net(net, i);
> +                       return err;
> +               }
> +       }
> +
> +       return 0;
> +}
> +
> +void ovs_vport_exit_net(struct net *net)
> +{
> +       vport_exit_net(net, n_vport_types);
> +}
> +

We now have a funny model where things that need to be initialized at
module load and fail will continue without the vport support but
things initialized at namespace init fail the net.  Since there's no
clear distinction (from the outside) why something is initialized in
one or the other, it's somewhat confusing.  This is probably an even
stronger reason to do things on demand and just get rid of the
initialization functions altogether.

> -struct vport *ovs_vport_locate(const char *name)
> +struct vport *ovs_vport_locate(struct net *net, const char *name)
>  {
>        struct hlist_head *bucket = hash_bucket(name);
>        struct vport *vport;
>        struct hlist_node *node;
>
>        hlist_for_each_entry_rcu(vport, node, bucket, hash_node)
> -               if (!strcmp(name, vport->ops->get_name(vport)))
> +               if (!strcmp(name, vport->ops->get_name(vport)) &&
> +                   vport->net == net)

It would be nice if we could use the net in the hash here too.

> @@ -187,6 +229,7 @@ struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops,
>
>        vport->dp = parms->dp;
>        vport->port_no = parms->port_no;
> +       vport->net = parms->dp->net;

vport->net is always exactly the same as vport->dp->net, right?

Assuming that's the case, I don't think it is a very good idea to make
a copy of it.  It's confusing if an internal device is moved to a
different namespace and generally redundant.  You could create a
function get this if you want but that probably won't save any typing
anyways.

> diff --git a/datapath/vport.h b/datapath/vport.h
> index 44cf603..44f3bf0 100644
> --- a/datapath/vport.h
> +++ b/datapath/vport.h
> @@ -20,25 +20,32 @@
>  #define VPORT_H 1
>
>  #include <linux/list.h>
> +#include <linux/netlink.h>

Is netlink.h actually used?



More information about the dev mailing list