[ovs-dev] [PATCH RFC] dpif-netlink: distribute polling to discrete handlers

Flavio Leitner fbl at sysclose.org
Tue Jun 30 20:37:09 UTC 2020


On Wed, Jun 24, 2020 at 09:48:00AM -0400, Aaron Conole wrote:
> Currently, the channel handlers are polled globally.  On some
> systems, this causes a thundering herd issue where multiple
> handler threads become active, only to do no work and immediately
> sleep.
> 
> The approach here is to push the netlink socket channels to discrete
> handler threads to process, rather than polling on every thread.
> This will eliminate the need to wake multiple threads.
> 
> To check:
> 
>   ip netns add left
>   ip netns add right
>   ip link add center-left type veth peer name left0
>   ip link add center-right type veth peer name right0
>   ip link set left0 netns left
>   ip link set right0 netns right
>   ip link set center-left up
>   ip link set center-right up
>   ip netns exec left ip link set left0 up
>   ip netns exec left ip addr add 172.31.110.10/24 dev left0
>   ip netns exec right ip link set right0 up
>   ip netns exec right ip addr add 172.31.110.11/24 dev right0
> 
>   ovs-vsctl add-br br0
>   ovs-vsctl add-port br0 center-right
>   ovs-vsctl add-port br0 center-left
> 
>   # in one terminal
>   perf record -e sched:sched_wakeup,irq:softirq_entry -ag
> 
>   # in a separate terminal
>   ip netns exec left arping -I left0 -c 1 172.31.110.11
> 
>   # in the perf terminal after exiting
>   perf script
> 
> Look for the number of 'handler' threads which were made active.

I am surprised to find out that in 2020 Linux still doesn't provide
a reasonable solution for the thundering herd issue. I wonder how
many applications out there have this issue. It looks like the
devil is in the details.

Anyway, I think the previous model didn't work because the number
of attached ports keeps increasing and so does the number of cores.
The number of sockets escalates quickly, and it had the packet
re-ordering issue.

Moving forward, we switched to one socket per port with all threads
polling the same socket, in the hope that the kernel would wake only
one or very few threads (see the sketch below). It looks like that
doesn't work, and packet re-ordering is still an issue.
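
For reference, here is a minimal standalone sketch of that model,
assuming each handler thread owns a private epoll fd and registers
the same shared fd with EPOLLIN | EPOLLEXCLUSIVE, which matches what
the code does with the per-port netlink sockets. The eventfd and all
names below are mine, just to make the wakeups observable:

/* Minimal sketch: N threads, each with a private epoll fd, all
 * registering the same fd with EPOLLIN | EPOLLEXCLUSIVE.  The
 * kernel should then wake one (or at least few) of them per event
 * instead of all N. */
#include <sys/epoll.h>
#include <sys/eventfd.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#define N_HANDLERS 4

static int shared_fd;           /* Stands in for a port's NL socket. */

static void *
handler_main(void *arg)
{
    int id = (int) (intptr_t) arg;
    int epfd = epoll_create1(0);
    struct epoll_event ev = { .events = EPOLLIN | EPOLLEXCLUSIVE };

    epoll_ctl(epfd, EPOLL_CTL_ADD, shared_fd, &ev);
    for (;;) {
        struct epoll_event out;

        if (epoll_wait(epfd, &out, 1, -1) > 0) {
            uint64_t val;

            /* Non-blocking read: only a thread that actually gets
             * the event consumes it; spurious wakeups see EAGAIN. */
            if (read(shared_fd, &val, sizeof val) == sizeof val) {
                printf("handler %d woke up and consumed\n", id);
            }
        }
    }
    return NULL;
}

int
main(void)
{
    pthread_t threads[N_HANDLERS];
    uint64_t one = 1;
    int i;

    shared_fd = eventfd(0, EFD_NONBLOCK);
    for (i = 0; i < N_HANDLERS; i++) {
        pthread_create(&threads[i], NULL, handler_main,
                       (void *) (intptr_t) i);
    }
    sleep(1);
    if (write(shared_fd, &one, sizeof one) < 0) {  /* One event. */
        return 1;
    }
    sleep(1);
    return 0;
}

In theory you would see a single "handler N" line per event; the
report above suggests that in practice more handler threads wake up
than that.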

Therefore this approach, where there is one socket per port but only
one thread polling it, would solve both the thundering herd issue
and the packet re-ordering issue.

One concern is load capacity, since now only one thread is
responsible for processing a given port, but I think that's what we
had initially, correct? So, it's not an issue.

It looks like it now allocates enough memory for every port_idx in
each handler thread even though each handler will use only some of
them. Perhaps that can be fixed? For instance, instead of indexing
the array with the global port_idx, use a handler-local identifier
such as handler->port_idx? One possible shape is sketched below.
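
This is an untested sketch of that idea (one way among several;
'port_map', 'port_slot_node' and the helper name are mine, built on
the existing hmap library): each handler keeps only its own channels
in a dense array, plus a small map from the global port_idx to the
handler-local slot.

/* Untested sketch against dpif-netlink.c: memory scales with the
 * channels a handler actually owns instead of uc_array_size.
 * 'struct dpif_channel' is the existing one; the epoll fields of
 * the handler are elided. */
#include <stddef.h>
#include <stdint.h>
#include "openvswitch/hmap.h"
#include "hash.h"

struct port_slot_node {
    struct hmap_node node;      /* In 'port_map', hashed on port_idx. */
    uint32_t port_idx;          /* Global datapath port index. */
    size_t slot;                /* Index into 'channels'. */
};

struct dpif_handler {
    struct dpif_channel *channels;  /* Dense: this handler's ports only. */
    size_t n_channels;              /* Slots in use. */
    struct hmap port_map;           /* port_idx -> slot in 'channels'. */
    /* ... epoll fields as before ... */
};

static struct dpif_channel *
handler_lookup_channel(struct dpif_handler *handler, uint32_t port_idx)
{
    struct port_slot_node *n;

    HMAP_FOR_EACH_WITH_HASH (n, node, hash_int(port_idx, 0),
                             &handler->port_map) {
        if (n->port_idx == port_idx) {
            return &handler->channels[n->slot];
        }
    }
    return NULL;
}

That trades the O(1) array lookup for a hash lookup on the upcall
path, so it may only be worth it once uc_array_size gets large.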

More inline

> Reported-by: David Ahern <dsahern at gmail.com>
> Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2019-December/365857.html
> Cc: Matteo Croce <technoboy85 at gmail.com>
> Cc: Flavio Leitner <fbl at sysclose.org>
> Signed-off-by: Aaron Conole <aconole at redhat.com>
> ---
> I haven't finished all my testing, but I wanted to send this
> for early feedback in case I went completely off course.
> 
>  lib/dpif-netlink.c | 190 ++++++++++++++++++++++++++-------------------
>  1 file changed, 112 insertions(+), 78 deletions(-)
> 
> diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
> index 1817e9f849..d59375cf00 100644
> --- a/lib/dpif-netlink.c
> +++ b/lib/dpif-netlink.c
> @@ -179,9 +179,11 @@ struct dpif_windows_vport_sock {
>  
>  struct dpif_handler {
>      struct epoll_event *epoll_events;
> -    int epoll_fd;                 /* epoll fd that includes channel socks. */
> -    int n_events;                 /* Num events returned by epoll_wait(). */
> -    int event_offset;             /* Offset into 'epoll_events'. */
> +    struct dpif_channel *channels; /* Array of channels for each port. */
> +    size_t n_channels;             /* Count of channels for each port. */
> +    int epoll_fd;                  /* epoll fd that includes channel socks. */
> +    int n_events;                  /* Num events returned by epoll_wait(). */
> +    int event_offset;              /* Offset into 'epoll_events'. */
>  
>  #ifdef _WIN32
>      /* Pool of sockets. */
> @@ -201,7 +203,6 @@ struct dpif_netlink {
>      struct fat_rwlock upcall_lock;
>      struct dpif_handler *handlers;
>      uint32_t n_handlers;           /* Num of upcall handlers. */
> -    struct dpif_channel *channels; /* Array of channels for each port. */
>      int uc_array_size;             /* Size of 'handler->channels' and */
>                                     /* 'handler->epoll_events'. */
>  
> @@ -452,15 +453,22 @@ static bool
>  vport_get_pid(struct dpif_netlink *dpif, uint32_t port_idx,
>                uint32_t *upcall_pid)
>  {
> -    /* Since the nl_sock can only be assigned in either all
> -     * or none "dpif" channels, the following check
> -     * would suffice. */
> -    if (!dpif->channels[port_idx].sock) {
> -        return false;
> -    }
> +    size_t i;
>      ovs_assert(!WINDOWS || dpif->n_handlers <= 1);
>  
> -    *upcall_pid = nl_sock_pid(dpif->channels[port_idx].sock);
> +    /* The 'port_idx' should only be valid for a single handler. */
> +    for (i = 0; i < dpif->n_handlers; ++i) {
> +
> +        if (port_idx < dpif->handlers[i].n_channels &&
> +            dpif->handlers[i].channels[port_idx].sock) {
> +            *upcall_pid =
> +                nl_sock_pid(dpif->handlers[i].channels[port_idx].sock);
> +            break;
> +        }
> +    }


Perhaps

     /* The 'port_idx' should only be valid for a single handler. */
-    for (i = 0; i < dpif->n_handlers; ++i) {
+    for (i = 0; i < dpif->n_handlers; i++) {
+        struct dpif_handler *handler = &dpif->handlers[i];
 
-        if (port_idx < dpif->handlers[i].n_channels &&
-            dpif->handlers[i].channels[port_idx].sock) {
+        if (port_idx < handler->n_channels &&
+            handler->channels[port_idx].sock) {
             *upcall_pid =
-                nl_sock_pid(dpif->handlers[i].channels[port_idx].sock);
+                nl_sock_pid(handler->channels[port_idx].sock);
             break;
         }
     }

The same could be done in the similar loops below, e.g. in
dpif_netlink_port_get_pid__() and dpif_netlink_recv_purge__().


> +
> +    if (i == dpif->n_handlers)
> +        return false;
>  
>      return true;
>  }
> @@ -473,15 +481,23 @@ vport_add_channel(struct dpif_netlink *dpif, odp_port_t port_no,
>      uint32_t port_idx = odp_to_u32(port_no);
>      size_t i;
>      int error;
> +    struct dpif_handler *handler = NULL;
>  
>      if (dpif->handlers == NULL) {
>          close_nl_sock(sock);
>          return 0;
>      }
>  
> +    /* choose a handler by finding the least populated handler. */
> +    handler = &dpif->handlers[0];
> +    for (i = 0; i < dpif->n_handlers; ++i) {
> +        if (dpif->handlers[i].n_channels < handler->n_channels)
> +            handler = &dpif->handlers[i];
> +    }
> +
>      /* We assume that the datapath densely chooses port numbers, which can
>       * therefore be used as an index into 'channels' and 'epoll_events' of
> -     * 'dpif'. */
> +     * 'dpif->handler'. */
>      if (port_idx >= dpif->uc_array_size) {
>          uint32_t new_size = port_idx + 1;
>  
> @@ -491,40 +507,33 @@ vport_add_channel(struct dpif_netlink *dpif, odp_port_t port_no,
>              return EFBIG;
>          }
>  
> -        dpif->channels = xrealloc(dpif->channels,
> -                                  new_size * sizeof *dpif->channels);
> +        handler->channels = xrealloc(handler->channels,
> +                                     new_size * sizeof *handler->channels);
>  
> -        for (i = dpif->uc_array_size; i < new_size; i++) {
> -            dpif->channels[i].sock = NULL;
> +        for (i = handler->n_channels; i < new_size; i++) {
> +            handler->channels[i].sock = NULL;
>          }
>  
> -        for (i = 0; i < dpif->n_handlers; i++) {
> -            struct dpif_handler *handler = &dpif->handlers[i];
> -
> -            handler->epoll_events = xrealloc(handler->epoll_events,
> -                new_size * sizeof *handler->epoll_events);
> -
> -        }
> +        handler->epoll_events = xrealloc(handler->epoll_events,
> +                                         new_size * sizeof *handler->epoll_events);
>          dpif->uc_array_size = new_size;
> +        handler->n_channels = new_size;
>      }
>  
>      memset(&event, 0, sizeof event);
>      event.events = EPOLLIN | EPOLLEXCLUSIVE;
>      event.data.u32 = port_idx;
>  
> -    for (i = 0; i < dpif->n_handlers; i++) {
> -        struct dpif_handler *handler = &dpif->handlers[i];
> -
>  #ifndef _WIN32
> -        if (epoll_ctl(handler->epoll_fd, EPOLL_CTL_ADD, nl_sock_fd(sock),
> -                      &event) < 0) {
> -            error = errno;
> -            goto error;
> -        }
> -#endif
> +    if (epoll_ctl(handler->epoll_fd, EPOLL_CTL_ADD, nl_sock_fd(sock),
> +                  &event) < 0) {
> +        error = errno;
> +        goto error;
>      }
> -    dpif->channels[port_idx].sock = sock;
> -    dpif->channels[port_idx].last_poll = LLONG_MIN;
> +#endif
> +
> +    handler->channels[port_idx].sock = sock;
> +    handler->channels[port_idx].last_poll = LLONG_MIN;
>  
>      return 0;
>  
> @@ -535,34 +544,53 @@ error:
>                    nl_sock_fd(sock), NULL);
>      }
>  #endif
> -    dpif->channels[port_idx].sock = NULL;
> +    handler->channels[port_idx].sock = NULL;
>  
>      return error;
>  }
>  
> +static void
> +vport_del_channel(struct dpif_handler *handler, uint32_t port_idx)
> +{
> +    if (port_idx < handler->n_channels &&
> +        handler->channels[port_idx].sock) {
> +#ifndef _WIN32
> +        epoll_ctl(handler->epoll_fd, EPOLL_CTL_DEL,
> +                  nl_sock_fd(handler->channels[port_idx].sock), NULL);
> +#endif
> +        handler->event_offset = handler->n_events = 0;
> +
> +#ifndef _WIN32
> +        nl_sock_destroy(handler->channels[port_idx].sock);
> +#endif
> +        handler->channels[port_idx].sock = NULL;
> +    }
> +}
> +
>  static void
>  vport_del_channels(struct dpif_netlink *dpif, odp_port_t port_no)
>  {
>      uint32_t port_idx = odp_to_u32(port_no);
> +    struct dpif_handler *handler = NULL;
>      size_t i;
>  
> -    if (!dpif->handlers || port_idx >= dpif->uc_array_size
> -        || !dpif->channels[port_idx].sock) {
> +    if (!dpif->handlers || port_idx >= dpif->uc_array_size) {
>          return;
>      }
>  
>      for (i = 0; i < dpif->n_handlers; i++) {
> -        struct dpif_handler *handler = &dpif->handlers[i];
> -#ifndef _WIN32
> -        epoll_ctl(handler->epoll_fd, EPOLL_CTL_DEL,
> -                  nl_sock_fd(dpif->channels[port_idx].sock), NULL);
> -#endif
> -        handler->event_offset = handler->n_events = 0;
> +        handler = &dpif->handlers[i];
> +
> +        if (port_idx >= handler->n_channels ||
> +            !handler->channels[port_idx].sock) {
> +            handler = NULL;
> +            continue;
> +        }
> +    }
> +
> +    if (handler) {
> +        vport_del_channel(handler, port_idx);
>      }
> -#ifndef _WIN32
> -    nl_sock_destroy(dpif->channels[port_idx].sock);
> -#endif
> -    dpif->channels[port_idx].sock = NULL;
>  }
>  
>  static void
> @@ -575,36 +603,38 @@ destroy_all_channels(struct dpif_netlink *dpif)
>          return;
>      }
>  
> -    for (i = 0; i < dpif->uc_array_size; i++ ) {
> -        struct dpif_netlink_vport vport_request;
> -        uint32_t upcall_pids = 0;
> -
> -        if (!dpif->channels[i].sock) {
> -            continue;
> -        }
> +    for (i = 0; i < dpif->n_handlers; i++) {
> +        struct dpif_handler *handler = &dpif->handlers[i];
> +        size_t j;
>  
> -        /* Turn off upcalls. */
> -        dpif_netlink_vport_init(&vport_request);
> -        vport_request.cmd = OVS_VPORT_CMD_SET;
> -        vport_request.dp_ifindex = dpif->dp_ifindex;
> -        vport_request.port_no = u32_to_odp(i);
> -        vport_request.n_upcall_pids = 1;
> -        vport_request.upcall_pids = &upcall_pids;
> -        dpif_netlink_vport_transact(&vport_request, NULL, NULL);
> +        for (j = 0; j < handler->n_channels; j++ ) {
> +            struct dpif_netlink_vport vport_request;
> +            uint32_t upcall_pids = 0;
>  
> -        vport_del_channels(dpif, u32_to_odp(i));
> -    }
> +            if (!handler->channels[j].sock) {
> +                continue;
> +            }
>  
> -    for (i = 0; i < dpif->n_handlers; i++) {
> -        struct dpif_handler *handler = &dpif->handlers[i];
> +            /* Turn off upcalls. */
> +            dpif_netlink_vport_init(&vport_request);
> +            vport_request.cmd = OVS_VPORT_CMD_SET;
> +            vport_request.dp_ifindex = dpif->dp_ifindex;
> +            vport_request.port_no = u32_to_odp(j);
> +            vport_request.n_upcall_pids = 1;
> +            vport_request.upcall_pids = &upcall_pids;
> +            dpif_netlink_vport_transact(&vport_request, NULL, NULL);
>  
> +            vport_del_channel(handler, j);
> +        }
> +        handler->n_channels = 0;
> +        free(handler->channels);
> +        handler->channels = NULL;
>          dpif_netlink_handler_uninit(handler);
>          free(handler->epoll_events);
> +        handler->epoll_events = NULL;
>      }
> -    free(dpif->channels);
>      free(dpif->handlers);
>      dpif->handlers = NULL;
> -    dpif->channels = NULL;
>      dpif->n_handlers = 0;
>      dpif->uc_array_size = 0;
>  }
> @@ -1091,13 +1121,17 @@ dpif_netlink_port_get_pid__(const struct dpif_netlink *dpif,
>          /* The ODPP_NONE "reserved" port number uses the "ovs-system"'s
>           * channel, since it is not heavily loaded. */
>          uint32_t idx = port_idx >= dpif->uc_array_size ? 0 : port_idx;
> +        size_t i;
>  
>          /* Needs to check in case the socket pointer is changed in between
>           * the holding of upcall_lock.  A known case happens when the main
>           * thread deletes the vport while the handler thread is handling
>           * the upcall from that port. */
> -        if (dpif->channels[idx].sock) {
> -            pid = nl_sock_pid(dpif->channels[idx].sock);
> +        for (i = 0; i < dpif->n_handlers; ++i) {
> +            if (idx < dpif->handlers[i].n_channels &&
> +                dpif->handlers[i].channels[idx].sock) {
> +                pid = nl_sock_pid(dpif->handlers[i].channels[idx].sock);
> +            }
>          }
>      }
>  
> @@ -2738,7 +2772,7 @@ dpif_netlink_recv__(struct dpif_netlink *dpif, uint32_t handler_id,
>  
>      while (handler->event_offset < handler->n_events) {
>          int idx = handler->epoll_events[handler->event_offset].data.u32;
> -        struct dpif_channel *ch = &dpif->channels[idx];
> +        struct dpif_channel *ch = &handler->channels[idx];
>  
>          handler->event_offset++;
>  
> @@ -2840,14 +2874,14 @@ dpif_netlink_recv_purge__(struct dpif_netlink *dpif)
>      OVS_REQ_WRLOCK(dpif->upcall_lock)
>  {
>      if (dpif->handlers) {
> -        size_t i;
> +        size_t i, j;
>  
> -        if (!dpif->channels[0].sock) {
> -            return;
> -        }
> -        for (i = 0; i < dpif->uc_array_size; i++ ) {
> -
> -            nl_sock_drain(dpif->channels[i].sock);
> +        for (j = 0; j < dpif->n_handlers; ++j) {
> +            for (i = 0; i < dpif->handlers[j].n_channels; i++ ) {
> +                if (dpif->handlers[j].channels[i].sock) {
> +                    nl_sock_drain(dpif->handlers[j].channels[i].sock);
> +                }
> +            }
>          }
>      }
>  }
> -- 
> 2.25.1
> 

-- 
fbl

