[ovs-dev] [PATCH] datapath: Support for network namespace.

Jesse Gross jesse at nicira.com
Tue Dec 13 05:58:32 UTC 2011


On Mon, Nov 28, 2011 at 7:11 PM, Pravin B Shelar <pshelar at nicira.com> wrote:
> diff --git a/datapath/datapath.c b/datapath/datapath.c
> index 4d95e04..831318d 100644
> --- a/datapath/datapath.c
> +++ b/datapath/datapath.c
> @@ -1384,6 +1386,8 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
>                goto err_destroy_table;
>        }
>
> +       dp->net = get_net(sock_net(skb->sk));

I think this is going to prevent the namespace from ever being freed
unless the datapath is removed before exiting the namespace.

Everything else that I looked at automatically cleans itself up when
the namespace exits.  Physical devices get moved back to the root
namespace and virtual devices get deleted.  The bridge keeps all of
it's equivalent datapath state in the bridge device itself, so that's
how it gets cleaned up.

I think one way to deal with this is to delete the datapath if the
bridge device is unregistered.

> +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,33)
> +static int __net_init ovs_init_net(struct net *net)
> +{
> +       return __ovs_init_net(net);
> +}
> +
> +static void __net_init ovs_exit_net(struct net *net)
> +{
> +       __ovs_exit_net(net);
> +}
> +
> +static struct pernet_operations ovs_net_ops = {
> +       .init = ovs_init_net,
> +       .exit = ovs_exit_net,
> +       .id   = &ovs_net_id,
> +       .size = sizeof(struct ovs_net),
> +};
> +#elif LINUX_VERSION_CODE == KERNEL_VERSION(2,6,32)
> +static int __net_init ovs_init_net(struct net *net)
> +{
> +       int err;
> +       struct ovs_net *ovs_net = kzalloc(sizeof(*ovs_net), GFP_KERNEL);
> +
> +       if (!ovs_net)
> +               return -ENOMEM;
> +
> +
> +       err = net_assign_generic(net, ovs_net_id, ovs_net);
> +       if (err) {
> +               kfree(ovs_net);
> +               return err;
> +       }
> +
> +       err = __ovs_init_net(net);
> +       if (err) {
> +               kfree(ovs_net);
> +               return err;
> +       }
> +
> +       return 0;
> +}
> +
> +static void __net_init ovs_exit_net(struct net *net)
> +{
> +       struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
> +
> +       __ovs_exit_net(net);
> +       net_assign_generic(net, ovs_net_id, NULL);
> +       kfree(ovs_net);
> +}
> +
> +static struct pernet_operations ovs_net_ops = {
> +       .init = ovs_init_net,
> +       .exit = ovs_exit_net,
> +};
> +#else
> +static struct ovs_net __ovs_net;
> +
> +void *net_generic(const struct net *net, int id)
> +{
> +       return &__ovs_net;
> +}
> +#endif

Since we only have a single instance of per-net data I think it should
be possible to do all of this emulation entirely in compatibility
code.

> diff --git a/datapath/datapath.h b/datapath/datapath.h
> index 27151b9..18f7c82 100644
> --- a/datapath/datapath.h
> +++ b/datapath/datapath.h
> @@ -32,7 +32,8 @@
>  #include "flow.h"
>  #include "dp_sysfs.h"
>  #include "vlan.h"
> -
> +#include "tunnel.h"
> +#include "vport.h"
>  struct vport;

These are supposed to be in alphabetical order (even if one is out of
place), so let's try to maintain that along with proper spacing.

> @@ -87,6 +89,7 @@ struct datapath {
>
>        /* Stats. */
>        struct dp_stats_percpu __percpu *stats_percpu;
> +       struct net *net;

Please group this appropriately instead of sticking it in with "stats".

>  int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb);
> +struct ovs_net {
> +       /* Per Network namespace list of datapaths to enable dumping them
> +        * all out. Protected by genl_mutex.
> +        */
> +       struct list_head dps;
> +       struct tnl_net tnl_net;
> +       struct vport_net vport_net;
> +};

This should be grouped together with the other structures and
commented appropriately.

> diff --git a/datapath/linux/compat/include/linux/net.h b/datapath/linux/compat/include/linux/net.h
> new file mode 100644
> index 0000000..cac26cd
> --- /dev/null
> +++ b/datapath/linux/compat/include/linux/net.h
> @@ -0,0 +1,12 @@
> +#ifndef __LINUX_NET_WRAPPER_H
> +#define __LINUX_NET_WRAPPER_H 1
> +
> +#include_next <linux/net.h>
> +#include <linux/sched.h>

Shouldn't this be version.h instead of sched.h?

> diff --git a/datapath/linux/compat/include/linux/netdevice.h b/datapath/linux/compat/include/linux/netdevice.h
> index 0c2f2f4..e97dcbb 100644
> --- a/datapath/linux/compat/include/linux/netdevice.h
> +++ b/datapath/linux/compat/include/linux/netdevice.h
> +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,39)
> +static inline struct net *skb_net(const struct sk_buff *skb)
> +{
> +       return dev_net(skb->dev ? : skb_dst(skb)->dev);
> +}
> +#endif

This is part of IPVS, not generic network code.  We should just be
using dev_net(skb->dev).

> diff --git a/datapath/linux/compat/include/net/netns/generic.h b/datapath/linux/compat/include/net/netns/generic.h
> new file mode 100644
> index 0000000..70562d1
> --- /dev/null
> +++ b/datapath/linux/compat/include/net/netns/generic.h
> @@ -0,0 +1,11 @@
> +#ifndef __NET_NET_NETNS_GENERIC_WRAPPER_H
> +#define __NET_NET_NETNS_GENERIC_WRAPPER_H 1
> +
> +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,25)
> +/* <net/netns/generic.h> exists, go ahead and include it. */
> +#include_next <net/netns/generic.h>

net_generic() actually exists on these kernels, declared as a static
inline.  Do we actually get the right version?

> diff --git a/datapath/tunnel.c b/datapath/tunnel.c
> index 41907b9..e48304e 100644
> --- a/datapath/tunnel.c
> +++ b/datapath/tunnel.c
> +#ifdef HAVE_RT_GENID
> +static inline int rt_genid(struct net *net)
> +{
> +       return atomic_read(&net->ipv4.rt_genid);
> +}
> +#endif

Doesn't this belong as compatibility code?

> @@ -858,10 +866,15 @@ static void cache_cleaner(struct work_struct *work)
>                struct hlist_node *n;
>                struct hlist_head *bucket;
>                struct tnl_vport  *tnl_vport;
> +               struct net *net;
>
> -               bucket = &port_table[i];
> -               hlist_for_each_entry_rcu(tnl_vport, n, bucket, hash_node)
> -                       __cache_cleaner(tnl_vport);
> +               for_each_net(net) {
> +                       struct tnl_net *tnl_net = get_tnl_net(net);

It doesn't really matter all that much here but it's probably more
normal to loop over each net and then the table inside of that and you
get better cache locality as a bonus.

> @@ -1642,12 +1664,11 @@ void ovs_tnl_exit(void)
>                struct hlist_head *hash_head;
>                struct hlist_node *n;
>
> -               hash_head = &port_table[i];
> +               hash_head = &tnl_net->port_table[i];
>                hlist_for_each_entry(tnl_vport, n, hash_head, hash_node) {
> -                       BUG();
> -                       goto out;
> +                       pr_err("Tunnel-port-table not empty.");
> +                       return;

I think that we should just delete this block of code that checks for
non-empty tables.  The change to not free the table if there are still
entries seems particularly difficult to reason about correctness
because if we hit this, there is already a serious bug.  It also makes
static checkers unhappy.

> diff --git a/datapath/tunnel.h b/datapath/tunnel.h
> index 6865ae6..0881acf 100644
> --- a/datapath/tunnel.h
> +++ b/datapath/tunnel.h
[...]
> +struct tnl_net {
> +       struct hlist_head *port_table;
> +       int port_table_count;

I don't think that port_table_count should be per-namespace.  The only
place where it is used is to turn the cache cleaner on and off but the
cache cleaner iterates over all the namespaces so we just need one
copy of it.

> diff --git a/datapath/vport-capwap.c b/datapath/vport-capwap.c
> index 6c1b0da..dcb72b1 100644
> --- a/datapath/vport-capwap.c
> +++ b/datapath/vport-capwap.c
> +static int capwap_init_net(struct net *net)
>  {
>        int err;
> +       struct vport_net *vport_net = ovs_get_vport_net(net);
>        struct sockaddr_in sin;
>
> -       err = sock_create(AF_INET, SOCK_DGRAM, 0, &capwap_rcv_socket);
> +
> +       err = __sock_create(net, AF_INET, SOCK_DGRAM, 0,
> +                         &vport_net->capwap_rcv_socket, 1);

Double blank line.

>  const struct vport_ops ovs_capwap_vport_ops = {
>        .type           = OVS_VPORT_TYPE_CAPWAP,
> +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,37)
> +       /* Depends on __sock_create(). */
>        .flags          = VPORT_F_TUN_ID,
> +#else
> +       .flags          = VPORT_F_TUN_ID | VPORT_F_INIT_NS,
> +#endif

I don't think the concept of VPORT_F_INIT_NS is really correct because
sock_create() doesn't necessarily make a socket in init_net.  Instead,
it creates it in the namespace of the current process context, which
in this case the module loader or process creating a new namespace.
This is probably the initial namespace but it's not guaranteed to be
and if it's not then you're going to have problems with uninitialized
memory.

I think that the capwap code itself should probably do the work
internally of keeping track of which namespace it should be loaded
into since it is pretty specific to this use case.  Then we can keep
track of the namespace when the module is loaded (accessible via
current->nsproxy->net_ns) or verify that sock_create() will actually
use the initial namespace.  Either way, we should probably print out
some warnings at either compile or run time about capwap being
available only in a single namespace.

> diff --git a/datapath/vport-gre.c b/datapath/vport-gre.c
> index 4411cac..9793b5c 100644
> --- a/datapath/vport-gre.c
> +++ b/datapath/vport-gre.c

We should also mark the protocol (in struct net_protocol) as being
namespace-safe.

> diff --git a/datapath/vport-internal_dev.c b/datapath/vport-internal_dev.c
> index c56f3b2..66e6813 100644
> --- a/datapath/vport-internal_dev.c
> +++ b/datapath/vport-internal_dev.c
> @@ -203,7 +203,8 @@ static void do_setup(struct net_device *netdev)
>        netdev->tx_queue_len = 0;
>
>        netdev->features = NETIF_F_LLTX | NETIF_F_SG | NETIF_F_FRAGLIST |
> -                               NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | NETIF_F_TSO;
> +                          NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | NETIF_F_TSO |
> +                          NETIF_F_NETNS_LOCAL;

I think we should only assert NETIF_F_NETNS_LOCAL on the original
bridge device and not on additional internal devices because we want
those to be able to move into different namespaces.

> diff --git a/datapath/vport.h b/datapath/vport.h
> index 44cf603..3cc3a69 100644
> --- a/datapath/vport.h
> +++ b/datapath/vport.h
[...]
> +struct vport_net {
> +       struct hlist_head *vport_table;
> +       struct hlist_head *peer_table;

It's not clear to me that either of these tables need to be replicated
for each namespace.  Neither is performance critical so rather than
creating additional hash tables we can just compare the namespace
during lookup.

> +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26)
> +       struct socket *capwap_rcv_socket;
> +       struct netns_frags frag_state;
> +#endif

I think these should be declared in a capwap structure, which can then
have its contents be #ifdef'ed.



More information about the dev mailing list