[ovs-dev] [PATCH] datapath: Support for network namespace.

Pravin Shelar pshelar at nicira.com
Tue Dec 13 21:16:51 UTC 2011


On Mon, Dec 12, 2011 at 9:58 PM, Jesse Gross <jesse at nicira.com> wrote:
> On Mon, Nov 28, 2011 at 7:11 PM, Pravin B Shelar <pshelar at nicira.com> wrote:
>> diff --git a/datapath/datapath.c b/datapath/datapath.c
>> index 4d95e04..831318d 100644
>> --- a/datapath/datapath.c
>> +++ b/datapath/datapath.c
>> @@ -1384,6 +1386,8 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
>>                goto err_destroy_table;
>>        }
>>
>> +       dp->net = get_net(sock_net(skb->sk));
>
> I think this is going to prevent the namespace from ever being freed
> unless the datapath is removed before exiting the namespace.
>
> Everything else that I looked at automatically cleans itself up when
> the namespace exits.  Physical devices get moved back to the root
> namespace and virtual devices get deleted.  The bridge keeps all of
> it's equivalent datapath state in the bridge device itself, so that's
> how it gets cleaned up.
good point.

>
> I think one way to deal with this is to delete the datapath if the
> bridge device is unregistered.
ok. I guess i can send del_dp msg from this event.

>
>> +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,33)
>> +static int __net_init ovs_init_net(struct net *net)
>> +{
>> +       return __ovs_init_net(net);
>> +}
>> +
>> +static void __net_init ovs_exit_net(struct net *net)
>> +{
>> +       __ovs_exit_net(net);
>> +}
>> +
>> +static struct pernet_operations ovs_net_ops = {
>> +       .init = ovs_init_net,
>> +       .exit = ovs_exit_net,
>> +       .id   = &ovs_net_id,
>> +       .size = sizeof(struct ovs_net),
>> +};
>> +#elif LINUX_VERSION_CODE == KERNEL_VERSION(2,6,32)
>> +static int __net_init ovs_init_net(struct net *net)
>> +{
>> +       int err;
>> +       struct ovs_net *ovs_net = kzalloc(sizeof(*ovs_net), GFP_KERNEL);
>> +
>> +       if (!ovs_net)
>> +               return -ENOMEM;
>> +
>> +
>> +       err = net_assign_generic(net, ovs_net_id, ovs_net);
>> +       if (err) {
>> +               kfree(ovs_net);
>> +               return err;
>> +       }
>> +
>> +       err = __ovs_init_net(net);
>> +       if (err) {
>> +               kfree(ovs_net);
>> +               return err;
>> +       }
>> +
>> +       return 0;
>> +}
>> +
>> +static void __net_init ovs_exit_net(struct net *net)
>> +{
>> +       struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
>> +
>> +       __ovs_exit_net(net);
>> +       net_assign_generic(net, ovs_net_id, NULL);
>> +       kfree(ovs_net);
>> +}
>> +
>> +static struct pernet_operations ovs_net_ops = {
>> +       .init = ovs_init_net,
>> +       .exit = ovs_exit_net,
>> +};
>> +#else
>> +static struct ovs_net __ovs_net;
>> +
>> +void *net_generic(const struct net *net, int id)
>> +{
>> +       return &__ovs_net;
>> +}
>> +#endif
>
> Since we only have a single instance of per-net data I think it should
> be possible to do all of this emulation entirely in compatibility
> code.

ok.
>
>> diff --git a/datapath/datapath.h b/datapath/datapath.h
>> index 27151b9..18f7c82 100644
>> --- a/datapath/datapath.h
>> +++ b/datapath/datapath.h
>> @@ -32,7 +32,8 @@
>>  #include "flow.h"
>>  #include "dp_sysfs.h"
>>  #include "vlan.h"
>> -
>> +#include "tunnel.h"
>> +#include "vport.h"
>>  struct vport;
>
> These are supposed to be in alphabetical order (even if one is out of
> place), so let's try to maintain that along with proper spacing.
>
ok

>> @@ -87,6 +89,7 @@ struct datapath {
>>
>>        /* Stats. */
>>        struct dp_stats_percpu __percpu *stats_percpu;
>> +       struct net *net;
>
> Please group this appropriately instead of sticking it in with "stats".
>
>>  int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb);
>> +struct ovs_net {
>> +       /* Per Network namespace list of datapaths to enable dumping them
>> +        * all out. Protected by genl_mutex.
>> +        */
>> +       struct list_head dps;
>> +       struct tnl_net tnl_net;
>> +       struct vport_net vport_net;
>> +};
>
> This should be grouped together with the other structures and
> commented appropriately.
>
ok
>> diff --git a/datapath/linux/compat/include/linux/net.h b/datapath/linux/compat/include/linux/net.h
>> new file mode 100644
>> index 0000000..cac26cd
>> --- /dev/null
>> +++ b/datapath/linux/compat/include/linux/net.h
>> @@ -0,0 +1,12 @@
>> +#ifndef __LINUX_NET_WRAPPER_H
>> +#define __LINUX_NET_WRAPPER_H 1
>> +
>> +#include_next <linux/net.h>
>> +#include <linux/sched.h>
>
> Shouldn't this be version.h instead of sched.h?
>
right.

>> diff --git a/datapath/linux/compat/include/linux/netdevice.h b/datapath/linux/compat/include/linux/netdevice.h
>> index 0c2f2f4..e97dcbb 100644
>> --- a/datapath/linux/compat/include/linux/netdevice.h
>> +++ b/datapath/linux/compat/include/linux/netdevice.h
>> +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,39)
>> +static inline struct net *skb_net(const struct sk_buff *skb)
>> +{
>> +       return dev_net(skb->dev ? : skb_dst(skb)->dev);
>> +}
>> +#endif
>
> This is part of IPVS, not generic network code.  We should just be
> using dev_net(skb->dev).
>
ok.

>> diff --git a/datapath/linux/compat/include/net/netns/generic.h b/datapath/linux/compat/include/net/netns/generic.h
>> new file mode 100644
>> index 0000000..70562d1
>> --- /dev/null
>> +++ b/datapath/linux/compat/include/net/netns/generic.h
>> @@ -0,0 +1,11 @@
>> +#ifndef __NET_NET_NETNS_GENERIC_WRAPPER_H
>> +#define __NET_NET_NETNS_GENERIC_WRAPPER_H 1
>> +
>> +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,25)
>> +/* <net/netns/generic.h> exists, go ahead and include it. */
>> +#include_next <net/netns/generic.h>
>
> net_generic() actually exists on these kernels, declared as a static
> inline.  Do we actually get the right version?
>
I need to overload it to return ovs_net for compatibility with older kernel.

>> diff --git a/datapath/tunnel.c b/datapath/tunnel.c
>> index 41907b9..e48304e 100644
>> --- a/datapath/tunnel.c
>> +++ b/datapath/tunnel.c
>> +#ifdef HAVE_RT_GENID
>> +static inline int rt_genid(struct net *net)
>> +{
>> +       return atomic_read(&net->ipv4.rt_genid);
>> +}
>> +#endif
>
> Doesn't this belong as compatibility code?
>
ok

>> @@ -858,10 +866,15 @@ static void cache_cleaner(struct work_struct *work)
>>                struct hlist_node *n;
>>                struct hlist_head *bucket;
>>                struct tnl_vport  *tnl_vport;
>> +               struct net *net;
>>
>> -               bucket = &port_table[i];
>> -               hlist_for_each_entry_rcu(tnl_vport, n, bucket, hash_node)
>> -                       __cache_cleaner(tnl_vport);
>> +               for_each_net(net) {
>> +                       struct tnl_net *tnl_net = get_tnl_net(net);
>
> It doesn't really matter all that much here but it's probably more
> normal to loop over each net and then the table inside of that and you
> get better cache locality as a bonus.
>
right.

>> @@ -1642,12 +1664,11 @@ void ovs_tnl_exit(void)
>>                struct hlist_head *hash_head;
>>                struct hlist_node *n;
>>
>> -               hash_head = &port_table[i];
>> +               hash_head = &tnl_net->port_table[i];
>>                hlist_for_each_entry(tnl_vport, n, hash_head, hash_node) {
>> -                       BUG();
>> -                       goto out;
>> +                       pr_err("Tunnel-port-table not empty.");
>> +                       return;
>
> I think that we should just delete this block of code that checks for
> non-empty tables.  The change to not free the table if there are still
> entries seems particularly difficult to reason about correctness
> because if we hit this, there is already a serious bug.  It also makes
> static checkers unhappy.
>
ok

>> diff --git a/datapath/tunnel.h b/datapath/tunnel.h
>> index 6865ae6..0881acf 100644
>> --- a/datapath/tunnel.h
>> +++ b/datapath/tunnel.h
> [...]
>> +struct tnl_net {
>> +       struct hlist_head *port_table;
>> +       int port_table_count;
>
> I don't think that port_table_count should be per-namespace.  The only
> place where it is used is to turn the cache cleaner on and off but the
> cache cleaner iterates over all the namespaces so we just need one
> copy of it.
>
ok

>> diff --git a/datapath/vport-capwap.c b/datapath/vport-capwap.c
>> index 6c1b0da..dcb72b1 100644
>> --- a/datapath/vport-capwap.c
>> +++ b/datapath/vport-capwap.c
>> +static int capwap_init_net(struct net *net)
>>  {
>>        int err;
>> +       struct vport_net *vport_net = ovs_get_vport_net(net);
>>        struct sockaddr_in sin;
>>
>> -       err = sock_create(AF_INET, SOCK_DGRAM, 0, &capwap_rcv_socket);
>> +
>> +       err = __sock_create(net, AF_INET, SOCK_DGRAM, 0,
>> +                         &vport_net->capwap_rcv_socket, 1);
>
> Double blank line.
>
>>  const struct vport_ops ovs_capwap_vport_ops = {
>>        .type           = OVS_VPORT_TYPE_CAPWAP,
>> +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,37)
>> +       /* Depends on __sock_create(). */
>>        .flags          = VPORT_F_TUN_ID,
>> +#else
>> +       .flags          = VPORT_F_TUN_ID | VPORT_F_INIT_NS,
>> +#endif
>
> I don't think the concept of VPORT_F_INIT_NS is really correct because
> sock_create() doesn't necessarily make a socket in init_net.  Instead,
> it creates it in the namespace of the current process context, which
> in this case the module loader or process creating a new namespace.
> This is probably the initial namespace but it's not guaranteed to be
> and if it's not then you're going to have problems with uninitialized
> memory.
>
> I think that the capwap code itself should probably do the work
> internally of keeping track of which namespace it should be loaded
> into since it is pretty specific to this use case.  Then we can keep
> track of the namespace when the module is loaded (accessible via
> current->nsproxy->net_ns) or verify that sock_create() will actually
> use the initial namespace.  Either way, we should probably print out
> some warnings at either compile or run time about capwap being
> available only in a single namespace.
>
good idea. I can use per-net cap-wap sock ptr.

>> diff --git a/datapath/vport-gre.c b/datapath/vport-gre.c
>> index 4411cac..9793b5c 100644
>> --- a/datapath/vport-gre.c
>> +++ b/datapath/vport-gre.c
>
> We should also mark the protocol (in struct net_protocol) as being
> namespace-safe.
ok
>
>> diff --git a/datapath/vport-internal_dev.c b/datapath/vport-internal_dev.c
>> index c56f3b2..66e6813 100644
>> --- a/datapath/vport-internal_dev.c
>> +++ b/datapath/vport-internal_dev.c
>> @@ -203,7 +203,8 @@ static void do_setup(struct net_device *netdev)
>>        netdev->tx_queue_len = 0;
>>
>>        netdev->features = NETIF_F_LLTX | NETIF_F_SG | NETIF_F_FRAGLIST |
>> -                               NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | NETIF_F_TSO;
>> +                          NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | NETIF_F_TSO |
>> +                          NETIF_F_NETNS_LOCAL;
>
> I think we should only assert NETIF_F_NETNS_LOCAL on the original
> bridge device and not on additional internal devices because we want
> those to be able to move into different namespaces.
>
ok

>> diff --git a/datapath/vport.h b/datapath/vport.h
>> index 44cf603..3cc3a69 100644
>> --- a/datapath/vport.h
>> +++ b/datapath/vport.h
> [...]
>> +struct vport_net {
>> +       struct hlist_head *vport_table;
>> +       struct hlist_head *peer_table;
>
> It's not clear to me that either of these tables need to be replicated
> for each namespace.  Neither is performance critical so rather than
> creating additional hash tables we can just compare the namespace
> during lookup.
>
I thought about that, but if we have global table then it will need
more locking.
if you want I can make it global table.

>> +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26)
>> +       struct socket *capwap_rcv_socket;
>> +       struct netns_frags frag_state;
>> +#endif
>
> I think these should be declared in a capwap structure, which can then
> have its contents be #ifdef'ed.

ok.



More information about the dev mailing list