[ovs-dev] [PATCH v3] datapath: Simplify datapath locking.

Jesse Gross jesse at nicira.com
Wed Apr 10 01:41:58 UTC 2013


On Tue, Apr 9, 2013 at 3:03 PM, Pravin B Shelar <pshelar at nicira.com> wrote:
> diff --git a/datapath/datapath.c b/datapath/datapath.c
> index 9cd4b0e..f0191ce 100644
> --- a/datapath/datapath.c
> +++ b/datapath/datapath.c
>  /**
>   * DOC: Locking:
>   *
> - * Writes to device state (add/remove datapath, port, set operations on vports,
> - * etc.) are protected by RTNL.
> - *
> - * Writes to other state (flow table modifications, set miscellaneous datapath
> - * parameters, etc.) are protected by genl_mutex.  The RTNL lock nests inside
> - * genl_mutex.
> + * All writes e.g. Writes to device state (add/remove datapath, port, set
> + * operations on vports, etc.), Writes to other state (flow table
> + * modifications, set miscellaneous datapath parameters, etc.) are protected
> + * by ovs_lock.
>   *
>   * Reads are protected by RCU.

I think we should probably add a comment about the role and nesting of
RTNL here.

> -/* Called with genl_mutex and optionally with RTNL lock also. */
> +/* Called with ovs_mutex and optionally with ovs_mutex also. */
>  static struct datapath *lookup_datapath(struct net *net,

I think we can simplify the comment to say that this is protected with
ovs_mutex.

> @@ -1681,14 +1720,7 @@ static void __dp_destroy(struct datapath *dp)
>         }
>
>         list_del(&dp->list_node);
> -       ovs_dp_detach_port(ovs_vport_rtnl(dp, OVSP_LOCAL));
> -
> -       /* rtnl_unlock() will wait until all the references to devices that
> -        * are pending unregistration have been dropped.  We do it here to
> -        * ensure that any internal devices (which contain DP pointers) are
> -        * fully destroyed before freeing the datapath.
> -        */
> -       rtnl_unlock();
> +       ovs_dp_detach_port(ovs_vport_ovsl(dp, OVSP_LOCAL));

I think it would be good to have a comment in place of the one that is
being deleted explaining why OVSP_LOCAL has to be handled last.

> -static int __rehash_flow_table(void *dummy)
> +static int __rehash_flow_table(void)
>  {
>         struct datapath *dp;
>         struct net *net;
>
> +       ovs_lock();
>         rtnl_lock();
>         for_each_net(net) {
>                 struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
>
>                 list_for_each_entry(dp, &ovs_net->dps, list_node) {
> -                       struct flow_table *old_table = genl_dereference(dp->table);
> +                       struct flow_table *old_table = ovsl_dereference(dp->table);
>                         struct flow_table *new_table;
>
>                         new_table = ovs_flow_tbl_rehash(old_table);
> @@ -2290,22 +2347,24 @@ static int __rehash_flow_table(void *dummy)
>                 }
>         }
>         rtnl_unlock();
> +       ovs_unlock();
>         return 0;
>  }
>
>  static void rehash_flow_table(struct work_struct *work)
>  {
> -       genl_exec(__rehash_flow_table, NULL);
> +       __rehash_flow_table();

I think we can now merge together the two rehash_flow_table functions
to simplify things and make it look more like upstream.

> -static int dp_destroy_all(void *data)
> +static int dp_destroy_all(struct ovs_net *ovs_net)
>  {
>         struct datapath *dp, *dp_next;
> -       struct ovs_net *ovs_net = data;
>
> +       ovs_lock();
>         list_for_each_entry_safe(dp, dp_next, &ovs_net->dps, list_node)
>                 __dp_destroy(dp);
> +       ovs_unlock();
>
>         return 0;
>  }
> @@ -2322,7 +2382,8 @@ static void __net_exit ovs_exit_net(struct net *net)
>  {
>         struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
>
> -       genl_exec(dp_destroy_all, ovs_net);
> +       dp_destroy_all(ovs_net);
> +       cancel_delayed_work_sync(&ovs_net->work);
>  }

Similarly, I think we can merge dp_destroy_all into ovs_exit_net.

> diff --git a/datapath/datapath.h b/datapath/datapath.h
> index 9bc98fb..58d7169 100644
> --- a/datapath/datapath.h
> +++ b/datapath/datapath.h
> @@ -142,9 +142,18 @@ struct dp_upcall_info {
>  struct ovs_net {
>         struct list_head dps;
>         struct vport_net vport_net;
> +       struct delayed_work work;

We should probably give the work struct a name that makes it more
obvious that it is for notifications.

> +extern struct mutex ovs_mutex;

I think we might just want to provide a lockdep_ovsl_is_held()
function rather than exposing ovs_mutex directly.

> -static inline struct vport *ovs_vport_rtnl_rcu(const struct datapath *dp, int port_no)
> +static inline struct vport *ovs_vport_ovsl_rcu(const struct datapath *dp, int port_no)
>  {
> -       WARN_ON_ONCE(!rcu_read_lock_held() && !rtnl_is_locked());
> +       WARN_ON_ONCE(!rcu_read_lock_held() && !ovs_is_locked());
>         return ovs_lookup_vport(dp, port_no);
>  }
>
> -static inline struct vport *ovs_vport_rtnl(const struct datapath *dp, int port_no)
> +static inline struct vport *ovs_vport_ovsl(const struct datapath *dp, int port_no)
>  {
> -       ASSERT_RTNL();
> +       ASSERT_OVSL();

For these two functions it seems like it might be better to use the
lockdep version of the check rather than unconditionally doing it.

> +void dp_notify_wq(struct work_struct *work);

We should prefix all exposed symbols with ovs_.

> diff --git a/datapath/dp_notify.c b/datapath/dp_notify.c
> index c9eeafe..1e19125 100644
> --- a/datapath/dp_notify.c
> +++ b/datapath/dp_notify.c
> +static void vport_destroy(struct vport *vport)

We should give this a more distinctive name - right now it sounds like
it belongs in vport.c.

> +static atomic_t unreg_events = ATOMIC_INIT(0);

We should probably include the right headers for this.

> +void dp_notify_wq(struct work_struct *work)
> +{
> +       struct ovs_net *ovs_net = container_of(work, struct ovs_net, work.work);
> +       struct datapath *dp;
> +       int count;

I think we probably need a comment explaining the rationale behind all of this.

> +redo:
> +       count = atomic_read(&unreg_events);
> +       if (!count)
> +               return;

If there are devices unregistering in a different namespace, this will
cause us to loop until everything has been unregistered everywhere.
However, since we'll decrement the counter for each pass where nothing
changes we can actually prevent the devices in the other namespace from
unregistering. It seems like a per-namespace counter would be safer.

I think there is potentially another race condition here: if a
notification comes after we retrieve the value of count but before we
exit the workqueue. The counter will be incremented but another work
instance won't be scheduled since one is already running. Essentially
we're trying to emulate a semaphore here but actually doing that
directly would require introducing a new thread, which I really don't
want to do.

> +       ovs_lock();
> +       list_for_each_entry(dp, &ovs_net->dps, list_node) {
> +               int i;
> +
> +               for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) {
> +                       struct vport *vport;
> +                       struct hlist_node *n;
> +
> +                       hlist_for_each_entry_safe(vport, n, &dp->ports[i], dp_hash_node) {
> +                               struct netdev_vport *netdev_vport;
> +
> +                               if (vport->ops->type != OVS_VPORT_TYPE_NETDEV)
> +                                       continue;
> +
> +                               netdev_vport = netdev_vport_priv(vport);
> +                               if (netdev_vport->dev->reg_state != NETREG_UNREGISTERED)

I think we also have to check for NETREG_UNREGISTERING.  This is set
while the notifiers are running and if there are a lot of them they
could still be running when we get here.

I think we need a flag to check whether we have deleted any ports
rather than just looking at the count. Otherwise we could have
decremented the count and someone else incremented it.

> @@ -39,26 +103,11 @@ static int dp_device_event(struct notifier_block *unused, unsigned long event,
>
>         switch (event) {
>         case NETDEV_UNREGISTER:

At this point, we probably could just simplify this into an if statement.

> +               ovs_net = net_generic(dev_net(dev), ovs_net_id);
> +               schedule_delayed_work(&ovs_net->work, 0);

Can we just use queue_work() here?

> diff --git a/datapath/vport-netdev.c b/datapath/vport-netdev.c
> index 727194e..e556b2c 100644
> --- a/datapath/vport-netdev.c
> +++ b/datapath/vport-netdev.c
> @@ -162,10 +163,12 @@ static struct vport *netdev_create(const struct vport_parms *parms)
>         dev_disable_lro(netdev_vport->dev);
>  #endif
>         netdev_vport->dev->priv_flags |= IFF_OVS_DATAPATH;
> +       rtnl_unlock();
>
>         return vport;
>
>  error_put:
> +       rtnl_unlock();

We can jump to error_put from the check for IFF_LOOPBACK,
ARPHRD_ETHER, and is_internal_device.  We aren't yet holding RTNL so
this could try to unlock a mutex that we don't hold.

> diff --git a/datapath/vport.c b/datapath/vport.c
> index e9e1444..0a03799 100644
> --- a/datapath/vport.c
> +++ b/datapath/vport.c
> @@ -222,7 +222,7 @@ struct vport *ovs_vport_add(const struct vport_parms *parms)
>         int err = 0;
>         int i;
>
> -       ASSERT_RTNL();
> +       ASSERT_OVSL();

Now that OVS mutex is common to the whole module, I'm not sure that we
really need all of these assertions.

If you haven't already, can you please also do some testing with lockdep?



More information about the dev mailing list