[ovs-dev] [PATCH] datapath: Support for network namespace.
Pravin Shelar
pshelar at nicira.com
Tue Dec 13 21:16:51 UTC 2011
On Mon, Dec 12, 2011 at 9:58 PM, Jesse Gross <jesse at nicira.com> wrote:
> On Mon, Nov 28, 2011 at 7:11 PM, Pravin B Shelar <pshelar at nicira.com> wrote:
>> diff --git a/datapath/datapath.c b/datapath/datapath.c
>> index 4d95e04..831318d 100644
>> --- a/datapath/datapath.c
>> +++ b/datapath/datapath.c
>> @@ -1384,6 +1386,8 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
>> goto err_destroy_table;
>> }
>>
>> + dp->net = get_net(sock_net(skb->sk));
>
> I think this is going to prevent the namespace from ever being freed
> unless the datapath is removed before exiting the namespace.
>
> Everything else that I looked at automatically cleans itself up when
> the namespace exits. Physical devices get moved back to the root
> namespace and virtual devices get deleted. The bridge keeps all of
> it's equivalent datapath state in the bridge device itself, so that's
> how it gets cleaned up.
good point.
>
> I think one way to deal with this is to delete the datapath if the
> bridge device is unregistered.
ok. I guess i can send del_dp msg from this event.
>
>> +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,33)
>> +static int __net_init ovs_init_net(struct net *net)
>> +{
>> + return __ovs_init_net(net);
>> +}
>> +
>> +static void __net_init ovs_exit_net(struct net *net)
>> +{
>> + __ovs_exit_net(net);
>> +}
>> +
>> +static struct pernet_operations ovs_net_ops = {
>> + .init = ovs_init_net,
>> + .exit = ovs_exit_net,
>> + .id = &ovs_net_id,
>> + .size = sizeof(struct ovs_net),
>> +};
>> +#elif LINUX_VERSION_CODE == KERNEL_VERSION(2,6,32)
>> +static int __net_init ovs_init_net(struct net *net)
>> +{
>> + int err;
>> + struct ovs_net *ovs_net = kzalloc(sizeof(*ovs_net), GFP_KERNEL);
>> +
>> + if (!ovs_net)
>> + return -ENOMEM;
>> +
>> +
>> + err = net_assign_generic(net, ovs_net_id, ovs_net);
>> + if (err) {
>> + kfree(ovs_net);
>> + return err;
>> + }
>> +
>> + err = __ovs_init_net(net);
>> + if (err) {
>> + kfree(ovs_net);
>> + return err;
>> + }
>> +
>> + return 0;
>> +}
>> +
>> +static void __net_init ovs_exit_net(struct net *net)
>> +{
>> + struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
>> +
>> + __ovs_exit_net(net);
>> + net_assign_generic(net, ovs_net_id, NULL);
>> + kfree(ovs_net);
>> +}
>> +
>> +static struct pernet_operations ovs_net_ops = {
>> + .init = ovs_init_net,
>> + .exit = ovs_exit_net,
>> +};
>> +#else
>> +static struct ovs_net __ovs_net;
>> +
>> +void *net_generic(const struct net *net, int id)
>> +{
>> + return &__ovs_net;
>> +}
>> +#endif
>
> Since we only have a single instance of per-net data I think it should
> be possible to do all of this emulation entirely in compatibility
> code.
ok.
>
>> diff --git a/datapath/datapath.h b/datapath/datapath.h
>> index 27151b9..18f7c82 100644
>> --- a/datapath/datapath.h
>> +++ b/datapath/datapath.h
>> @@ -32,7 +32,8 @@
>> #include "flow.h"
>> #include "dp_sysfs.h"
>> #include "vlan.h"
>> -
>> +#include "tunnel.h"
>> +#include "vport.h"
>> struct vport;
>
> These are supposed to be in alphabetical order (even if one is out of
> place), so let's try to maintain that along with proper spacing.
>
ok
>> @@ -87,6 +89,7 @@ struct datapath {
>>
>> /* Stats. */
>> struct dp_stats_percpu __percpu *stats_percpu;
>> + struct net *net;
>
> Please group this appropriately instead of sticking it in with "stats".
>
>> int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb);
>> +struct ovs_net {
>> + /* Per Network namespace list of datapaths to enable dumping them
>> + * all out. Protected by genl_mutex.
>> + */
>> + struct list_head dps;
>> + struct tnl_net tnl_net;
>> + struct vport_net vport_net;
>> +};
>
> This should be grouped together with the other structures and
> commented appropriately.
>
ok
>> diff --git a/datapath/linux/compat/include/linux/net.h b/datapath/linux/compat/include/linux/net.h
>> new file mode 100644
>> index 0000000..cac26cd
>> --- /dev/null
>> +++ b/datapath/linux/compat/include/linux/net.h
>> @@ -0,0 +1,12 @@
>> +#ifndef __LINUX_NET_WRAPPER_H
>> +#define __LINUX_NET_WRAPPER_H 1
>> +
>> +#include_next <linux/net.h>
>> +#include <linux/sched.h>
>
> Shouldn't this be version.h instead of sched.h?
>
right.
>> diff --git a/datapath/linux/compat/include/linux/netdevice.h b/datapath/linux/compat/include/linux/netdevice.h
>> index 0c2f2f4..e97dcbb 100644
>> --- a/datapath/linux/compat/include/linux/netdevice.h
>> +++ b/datapath/linux/compat/include/linux/netdevice.h
>> +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,39)
>> +static inline struct net *skb_net(const struct sk_buff *skb)
>> +{
>> + return dev_net(skb->dev ? : skb_dst(skb)->dev);
>> +}
>> +#endif
>
> This is part of IPVS, not generic network code. We should just be
> using dev_net(skb->dev).
>
ok.
>> diff --git a/datapath/linux/compat/include/net/netns/generic.h b/datapath/linux/compat/include/net/netns/generic.h
>> new file mode 100644
>> index 0000000..70562d1
>> --- /dev/null
>> +++ b/datapath/linux/compat/include/net/netns/generic.h
>> @@ -0,0 +1,11 @@
>> +#ifndef __NET_NET_NETNS_GENERIC_WRAPPER_H
>> +#define __NET_NET_NETNS_GENERIC_WRAPPER_H 1
>> +
>> +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,25)
>> +/* <net/netns/generic.h> exists, go ahead and include it. */
>> +#include_next <net/netns/generic.h>
>
> net_generic() actually exists on these kernels, declared as a static
> inline. Do we actually get the right version?
>
I need to overload it to return ovs_net for compatibility with older kernel.
>> diff --git a/datapath/tunnel.c b/datapath/tunnel.c
>> index 41907b9..e48304e 100644
>> --- a/datapath/tunnel.c
>> +++ b/datapath/tunnel.c
>> +#ifdef HAVE_RT_GENID
>> +static inline int rt_genid(struct net *net)
>> +{
>> + return atomic_read(&net->ipv4.rt_genid);
>> +}
>> +#endif
>
> Doesn't this belong as compatibility code?
>
ok
>> @@ -858,10 +866,15 @@ static void cache_cleaner(struct work_struct *work)
>> struct hlist_node *n;
>> struct hlist_head *bucket;
>> struct tnl_vport *tnl_vport;
>> + struct net *net;
>>
>> - bucket = &port_table[i];
>> - hlist_for_each_entry_rcu(tnl_vport, n, bucket, hash_node)
>> - __cache_cleaner(tnl_vport);
>> + for_each_net(net) {
>> + struct tnl_net *tnl_net = get_tnl_net(net);
>
> It doesn't really matter all that much here but it's probably more
> normal to loop over each net and then the table inside of that and you
> get better cache locality as a bonus.
>
right.
>> @@ -1642,12 +1664,11 @@ void ovs_tnl_exit(void)
>> struct hlist_head *hash_head;
>> struct hlist_node *n;
>>
>> - hash_head = &port_table[i];
>> + hash_head = &tnl_net->port_table[i];
>> hlist_for_each_entry(tnl_vport, n, hash_head, hash_node) {
>> - BUG();
>> - goto out;
>> + pr_err("Tunnel-port-table not empty.");
>> + return;
>
> I think that we should just delete this block of code that checks for
> non-empty tables. The change to not free the table if there are still
> entries seems particularly difficult to reason about correctness
> because if we hit this, there is already a serious bug. It also makes
> static checkers unhappy.
>
ok
>> diff --git a/datapath/tunnel.h b/datapath/tunnel.h
>> index 6865ae6..0881acf 100644
>> --- a/datapath/tunnel.h
>> +++ b/datapath/tunnel.h
> [...]
>> +struct tnl_net {
>> + struct hlist_head *port_table;
>> + int port_table_count;
>
> I don't think that port_table_count should be per-namespace. The only
> place where it is used is to turn the cache cleaner on and off but the
> cache cleaner iterates over all the namespaces so we just need one
> copy of it.
>
ok
>> diff --git a/datapath/vport-capwap.c b/datapath/vport-capwap.c
>> index 6c1b0da..dcb72b1 100644
>> --- a/datapath/vport-capwap.c
>> +++ b/datapath/vport-capwap.c
>> +static int capwap_init_net(struct net *net)
>> {
>> int err;
>> + struct vport_net *vport_net = ovs_get_vport_net(net);
>> struct sockaddr_in sin;
>>
>> - err = sock_create(AF_INET, SOCK_DGRAM, 0, &capwap_rcv_socket);
>> +
>> + err = __sock_create(net, AF_INET, SOCK_DGRAM, 0,
>> + &vport_net->capwap_rcv_socket, 1);
>
> Double blank line.
>
>> const struct vport_ops ovs_capwap_vport_ops = {
>> .type = OVS_VPORT_TYPE_CAPWAP,
>> +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,37)
>> + /* Depends on __sock_create(). */
>> .flags = VPORT_F_TUN_ID,
>> +#else
>> + .flags = VPORT_F_TUN_ID | VPORT_F_INIT_NS,
>> +#endif
>
> I don't think the concept of VPORT_F_INIT_NS is really correct because
> sock_create() doesn't necessarily make a socket in init_net. Instead,
> it creates it in the namespace of the current process context, which
> in this case the module loader or process creating a new namespace.
> This is probably the initial namespace but it's not guaranteed to be
> and if it's not then you're going to have problems with uninitialized
> memory.
>
> I think that the capwap code itself should probably do the work
> internally of keeping track of which namespace it should be loaded
> into since it is pretty specific to this use case. Then we can keep
> track of the namespace when the module is loaded (accessible via
> current->nsproxy->net_ns) or verify that sock_create() will actually
> use the initial namespace. Either way, we should probably print out
> some warnings at either compile or run time about capwap being
> available only in a single namespace.
>
good idea. I can use per-net cap-wap sock ptr.
>> diff --git a/datapath/vport-gre.c b/datapath/vport-gre.c
>> index 4411cac..9793b5c 100644
>> --- a/datapath/vport-gre.c
>> +++ b/datapath/vport-gre.c
>
> We should also mark the protocol (in struct net_protocol) as being
> namespace-safe.
ok
>
>> diff --git a/datapath/vport-internal_dev.c b/datapath/vport-internal_dev.c
>> index c56f3b2..66e6813 100644
>> --- a/datapath/vport-internal_dev.c
>> +++ b/datapath/vport-internal_dev.c
>> @@ -203,7 +203,8 @@ static void do_setup(struct net_device *netdev)
>> netdev->tx_queue_len = 0;
>>
>> netdev->features = NETIF_F_LLTX | NETIF_F_SG | NETIF_F_FRAGLIST |
>> - NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | NETIF_F_TSO;
>> + NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | NETIF_F_TSO |
>> + NETIF_F_NETNS_LOCAL;
>
> I think we should only assert NETIF_F_NETNS_LOCAL on the original
> bridge device and not on additional internal devices because we want
> those to be able to move into different namespaces.
>
ok
>> diff --git a/datapath/vport.h b/datapath/vport.h
>> index 44cf603..3cc3a69 100644
>> --- a/datapath/vport.h
>> +++ b/datapath/vport.h
> [...]
>> +struct vport_net {
>> + struct hlist_head *vport_table;
>> + struct hlist_head *peer_table;
>
> It's not clear to me that either of these tables need to be replicated
> for each namespace. Neither is performance critical so rather than
> creating additional hash tables we can just compare the namespace
> during lookup.
>
I thought about that, but if we have global table then it will need
more locking.
if you want I can make it global table.
>> +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26)
>> + struct socket *capwap_rcv_socket;
>> + struct netns_frags frag_state;
>> +#endif
>
> I think these should be declared in a capwap structure, which can then
> have its contents be #ifdef'ed.
ok.
More information about the dev
mailing list