[ovs-dev] [PATCH v2] dpif-netlink: distribute polling to discreet handlers

Flavio Leitner fbl at sysclose.org
Thu Aug 6 20:35:06 UTC 2020


Hi Aaron,

Thanks for the patch. I ran some basic tests here
and they passed. I could see only one handler thread
becoming active with a single upcall.

See my comment below.

On Tue, Jul 21, 2020 at 07:27:41PM -0400, Aaron Conole wrote:
> Currently, the channel handlers are polled globally.  On some
> systems, this causes a thundering herd issue where multiple
> handler threads become active, only to do no work and immediately
> sleep.
> 
> The approach here is to push the netlink socket channels to discreet
> handler threads to process, rather than polling on every thread.
> This will eliminate the need to wake multiple threads.
> 
> To check:
> 
>   ip netns add left
>   ip netns add right
>   ip link add center-left type veth peer name left0
>   ip link add center-right type veth peer name right0
>   ip link set left0 netns left
>   ip link set right0 netns right
>   ip link set center-left up
>   ip link set center-right up
>   ip -n left ip link set left0 up
>   ip -n left ip addr add 172.31.110.10/24 dev left0
>   ip -n right ip link set right0 up
>   ip -n right ip addr add 172.31.110.11/24 dev right0
> 
>   ovs-vsctl add-br br0
>   ovs-vsctl add-port br0 center-right
>   ovs-vsctl add-port br0 center-left
> 
>   # in one terminal
>   perf record -e sched:sched_wakeup,irq:softirq_entry -ag
> 
>   # in a separate terminal
>   ip netns exec left arping -I left0 -c 1 172.31.110.11
> 
>   # in the perf terminal after exiting
>   perf script
> 
> Look for the number of 'handler' threads which were made active.
> 
> Suggested-by: Ben Pfaff <blp at ovn.org>
> Reported-by: David Ahern <dsahern at gmail.com>
> Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2019-December/365857.html
> Cc: Matteo Croce <technoboy85 at gmail.com>
> Cc: Flavio Leitner <fbl at sysclose.org>
> Fixes: 69c51582f ("dpif-netlink: don't allocate per thread netlink sockets")
> Signed-off-by: Aaron Conole <aconole at redhat.com>
> ---
> v2: Oops - forgot to commit my whitespace cleanups.
> 
> lib/dpif-netlink.c | 289 ++++++++++++++++++++++++++++-----------------
>  1 file changed, 179 insertions(+), 110 deletions(-)
> 
> diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
> index 7da4fb54d9..71d2805427 100644
> --- a/lib/dpif-netlink.c
> +++ b/lib/dpif-netlink.c
> @@ -183,6 +183,10 @@ struct dpif_handler {
>      int n_events;                 /* Num events returned by epoll_wait(). */
>      int event_offset;             /* Offset into 'epoll_events'. */
>  
> +    struct dpif_channel *channels; /* Array of channels for each port. */
> +    uint32_t *port_idx_map;        /* Port idx to channel idx. */
> +    size_t n_channels;
> +
>  #ifdef _WIN32
>      /* Pool of sockets. */
>      struct dpif_windows_vport_sock *vport_sock_pool;
> @@ -202,8 +206,6 @@ struct dpif_netlink {
>      struct dpif_handler *handlers;
>      uint32_t n_handlers;           /* Num of upcall handlers. */
>      struct dpif_channel *channels; /* Array of channels for each port. */


Shouldn't the above channels be removed?

> -    int uc_array_size;             /* Size of 'handler->channels' and */
> -                                   /* 'handler->epoll_events'. */
>  
>      /* Change notification. */
>      struct nl_sock *port_notifier; /* vport multicast group subscriber. */
> @@ -287,6 +289,55 @@ close_nl_sock(struct nl_sock *sock)
>  #endif
>  }
>  
> +static uint32_t
> +dpif_handler_port_idx_max(const struct dpif_handler *handler,
> +                          struct dpif_channel **out_chn)

Looks like there is no caller passing out_chn, so why
that is needed?


> +{
> +    uint32_t max = 0;
> +    size_t i;
> +
> +    for (i = 0; i < handler->n_channels; ++i) {

OVS does have some code using '++i', but the majority of
it uses 'i++', can you please swap them?


> +        if (handler->channels[i].sock &&
> +            handler->port_idx_map[i] > max) {
> +            max = handler->port_idx_map[i];
> +            if (out_chn) {
> +                *out_chn = &handler->channels[i];
> +            }
> +        }
> +    }
> +
> +    return max;
> +}
> +
> +static size_t
> +dpif_handler_active_channels(const struct dpif_handler *handler)
> +{
> +    size_t i, ret = 0;

Perhaps 'count' or 'num'  would be a better name.

> +
> +    for (i = 0; i < handler->n_channels; ++i) {
> +        if (handler->channels[i].sock) {
> +            ret++;
> +        }
> +    }
> +
> +    return ret;
> +}
> +
> +static struct dpif_channel *
> +dpif_handler_has_port_idx(const struct dpif_handler *handler, uint32_t idx)
> +{
> +    size_t i;
> +
> +    for (i = 0; i < handler->n_channels; ++i) {
> +        if (handler->channels[i].sock &&
> +            handler->port_idx_map[i] == idx) {
> +            return &handler->channels[i];
> +        }
> +    }
> +
> +    return NULL;
> +}
> +
>  static struct dpif_netlink *
>  dpif_netlink_cast(const struct dpif *dpif)
>  {
> @@ -452,90 +503,86 @@ static bool
>  vport_get_pid(struct dpif_netlink *dpif, uint32_t port_idx,
>                uint32_t *upcall_pid)
>  {
> -    /* Since the nl_sock can only be assigned in either all
> -     * or none "dpif" channels, the following check
> -     * would suffice. */
> -    if (!dpif->channels[port_idx].sock) {
> -        return false;
> -    }
> +    size_t i;
> +
>      ovs_assert(!WINDOWS || dpif->n_handlers <= 1);
>  
> -    *upcall_pid = nl_sock_pid(dpif->channels[port_idx].sock);
> +    /* The 'port_idx' should only be valid for a single handler. */
> +    for (i = 0; i < dpif->n_handlers; ++i) {
> +        struct dpif_channel *channel =
> +            dpif_handler_has_port_idx(&dpif->handlers[i], port_idx);
> +
> +        if (channel) {
> +            *upcall_pid = nl_sock_pid(channel->sock);
> +            return true;
> +        }
> +    }
>  
> -    return true;
> +    return false;
>  }
>  
>  static int
>  vport_add_channel(struct dpif_netlink *dpif, odp_port_t port_no,
>                    struct nl_sock *sock)
>  {
> -    struct epoll_event event;
>      uint32_t port_idx = odp_to_u32(port_no);
> +    struct dpif_handler *handler;
> +    struct dpif_channel *channel;
> +    struct epoll_event event;
> +    int error = 0;
>      size_t i;
> -    int error;
>  
>      if (dpif->handlers == NULL) {
>          close_nl_sock(sock);
> -        return 0;
> +        return error;
>      }
>  
> -    /* We assume that the datapath densely chooses port numbers, which can
> -     * therefore be used as an index into 'channels' and 'epoll_events' of
> -     * 'dpif'. */
> -    if (port_idx >= dpif->uc_array_size) {
> -        uint32_t new_size = port_idx + 1;
> +    handler = &dpif->handlers[0];
>  
> -        if (new_size > MAX_PORTS) {
> -            VLOG_WARN_RL(&error_rl, "%s: datapath port %"PRIu32" too big",
> -                         dpif_name(&dpif->dpif), port_no);
> -            return EFBIG;
> -        }


Could you please explain why checking MAX_PORTS is not
important anymore? If it isn't, then this should remove
MAX_PORT declaration as that was the only use.


> -
> -        dpif->channels = xrealloc(dpif->channels,
> -                                  new_size * sizeof *dpif->channels);
> +    for (i = 0; i < dpif->n_handlers; ++i) {
> +        if (dpif_handler_active_channels(&dpif->handlers[i]) <
> +            dpif_handler_active_channels(handler))
> +            handler = &dpif->handlers[i];
> +    }
>  
> -        for (i = dpif->uc_array_size; i < new_size; i++) {
> -            dpif->channels[i].sock = NULL;
> +    channel = NULL;
> +    for (i = 0; i < handler->n_channels; ++i) {
> +        if (!handler->channels[i].sock) {
> +            channel = &handler->channels[i];
> +            handler->port_idx_map[i] = port_idx;
>          }
> +    }
>  
> -        for (i = 0; i < dpif->n_handlers; i++) {
> -            struct dpif_handler *handler = &dpif->handlers[i];
> -
> -            handler->epoll_events = xrealloc(handler->epoll_events,
> -                new_size * sizeof *handler->epoll_events);
> -
> -        }
> -        dpif->uc_array_size = new_size;
> +    if (channel == NULL) {
> +        size_t new_size = handler->n_channels + 1;
> +        handler->channels = xrealloc(handler->channels,
> +                                     new_size * sizeof *handler->channels);
> +        handler->epoll_events = xrealloc(handler->epoll_events,
> +                                         new_size *
> +                                         sizeof *handler->epoll_events);
> +        handler->port_idx_map = xrealloc(handler->port_idx_map,
> +                                         new_size *
> +                                         sizeof *handler->port_idx_map);
> +        channel = &handler->channels[handler->n_channels];
> +        handler->port_idx_map[handler->n_channels] = port_idx;
> +        i = handler->n_channels;
> +        handler->n_channels += 1;

perhaps handler->n_channels = new_size to remove the magic number?

>      }
>  
> +    channel->sock = sock;
> +    channel->last_poll = LLONG_MIN;
> +
>      memset(&event, 0, sizeof event);
>      event.events = EPOLLIN | EPOLLEXCLUSIVE;
> -    event.data.u32 = port_idx;
> -
> -    for (i = 0; i < dpif->n_handlers; i++) {
> -        struct dpif_handler *handler = &dpif->handlers[i];
> +    event.data.u32 = i;
>  
>  #ifndef _WIN32
> -        if (epoll_ctl(handler->epoll_fd, EPOLL_CTL_ADD, nl_sock_fd(sock),
> -                      &event) < 0) {
> -            error = errno;
> -            goto error;
> -        }
> -#endif
> -    }
> -    dpif->channels[port_idx].sock = sock;
> -    dpif->channels[port_idx].last_poll = LLONG_MIN;
> -
> -    return 0;
> -
> -error:
> -#ifndef _WIN32
> -    while (i--) {
> -        epoll_ctl(dpif->handlers[i].epoll_fd, EPOLL_CTL_DEL,
> -                  nl_sock_fd(sock), NULL);
> +    if (epoll_ctl(handler->epoll_fd, EPOLL_CTL_ADD, nl_sock_fd(sock),
> +                  &event) < 0) {
> +        error = errno;
> +        channel->sock = NULL;
>      }
>  #endif
> -    dpif->channels[port_idx].sock = NULL;
>  
>      return error;
>  }
> @@ -544,69 +591,81 @@ static void
>  vport_del_channels(struct dpif_netlink *dpif, odp_port_t port_no)
>  {
>      uint32_t port_idx = odp_to_u32(port_no);
> +    struct dpif_handler *handler = NULL;
> +    struct dpif_channel *channel = NULL;
>      size_t i;
>  
> -    if (!dpif->handlers || port_idx >= dpif->uc_array_size
> -        || !dpif->channels[port_idx].sock) {
> +    if (!dpif->handlers) {
>          return;
>      }
>  
> -    for (i = 0; i < dpif->n_handlers; i++) {
> -        struct dpif_handler *handler = &dpif->handlers[i];
> +    for (i = 0; i < dpif->n_handlers && channel == NULL; i++) {
> +        handler = &dpif->handlers[i];
> +        channel = dpif_handler_has_port_idx(handler, port_idx);
> +
> +        if (channel == NULL) {
> +            continue;
> +        }
> +
>  #ifndef _WIN32
>          epoll_ctl(handler->epoll_fd, EPOLL_CTL_DEL,
> -                  nl_sock_fd(dpif->channels[port_idx].sock), NULL);
> +                  nl_sock_fd(channel->sock), NULL);
>  #endif
>          handler->event_offset = handler->n_events = 0;
>      }
> +
> +    if (channel) {
>  #ifndef _WIN32
> -    nl_sock_destroy(dpif->channels[port_idx].sock);
> +        nl_sock_destroy(channel->sock);
>  #endif
> -    dpif->channels[port_idx].sock = NULL;
> +        channel->sock = NULL;

Ok, so it doesn't resize the memory when ports are removed.

I was wondering if it wouldn't scale better if we use hmaps and
id_pools instead:

--- a/lib/dpif-netlink.c
+++ b/lib/dpif-netlink.c
@@ -183,9 +183,7 @@ struct dpif_handler {
     int n_events;                 /* Num events returned by epoll_wait(). */
     int event_offset;             /* Offset into 'epoll_events'. */
 
-    struct dpif_channel *channels; /* Array of channels for each port. */
-    uint32_t *port_idx_map;        /* Port idx to channel idx. */
-    size_t n_channels;
+    struct hmap channels;         /* Channels for each port. */
 
 #ifdef _WIN32
     /* Pool of sockets. */
@@ -205,7 +203,7 @@ struct dpif_netlink {
     struct fat_rwlock upcall_lock;
     struct dpif_handler *handlers;
     uint32_t n_handlers;           /* Num of upcall handlers. */
-    struct dpif_channel *channels; /* Array of channels for each port. */
+    struct id_pool *port_ids;      /* Pool of port ids. */
 
     /* Change notification. */
     struct nl_sock *port_notifier; /* vport multicast group subscriber. */
 
For instance, we can find the least loaded with just hmap_count() instead
of iterating on all handlers and ports. Also when we add/remove a port, we
could use HMAP_FOR_EACH_WITH_HASH() using port_id as hash to find the
channel.

What do you think?

> +    }
>  }
>  
>  static void
>  destroy_all_channels(struct dpif_netlink *dpif)
>      OVS_REQ_WRLOCK(dpif->upcall_lock)
>  {
> -    unsigned int i;
> +    size_t i, j;
>  
>      if (!dpif->handlers) {
>          return;
>      }
>  
> -    for (i = 0; i < dpif->uc_array_size; i++ ) {
> -        struct dpif_netlink_vport vport_request;
> -        uint32_t upcall_pids = 0;
> +    for (i = 0; i < dpif->n_handlers; ++i) {

nitpick: Same comment as before to change to 'i++'

> +        struct dpif_handler *handler = &dpif->handlers[i];
>  
> -        if (!dpif->channels[i].sock) {
> -            continue;
> -        }
> +        for (j = 0; j < handler->n_channels; ++j) {
and here

> +            struct dpif_netlink_vport vport_request;
> +            uint32_t upcall_pids = 0;
>  
> -        /* Turn off upcalls. */
> -        dpif_netlink_vport_init(&vport_request);
> -        vport_request.cmd = OVS_VPORT_CMD_SET;
> -        vport_request.dp_ifindex = dpif->dp_ifindex;
> -        vport_request.port_no = u32_to_odp(i);
> -        vport_request.n_upcall_pids = 1;
> -        vport_request.upcall_pids = &upcall_pids;
> -        dpif_netlink_vport_transact(&vport_request, NULL, NULL);
> +            if (!handler->channels[j].sock) {
> +                continue;
> +            }
>  
> -        vport_del_channels(dpif, u32_to_odp(i));
> -    }
> +            /* Turn off upcalls. */
> +            dpif_netlink_vport_init(&vport_request);
> +            vport_request.cmd = OVS_VPORT_CMD_SET;
> +            vport_request.dp_ifindex = dpif->dp_ifindex;
> +            vport_request.port_no = u32_to_odp(handler->port_idx_map[j]);
> +            vport_request.n_upcall_pids = 1;
> +            vport_request.upcall_pids = &upcall_pids;
> +            dpif_netlink_vport_transact(&vport_request, NULL, NULL);
>  
> -    for (i = 0; i < dpif->n_handlers; i++) {
> -        struct dpif_handler *handler = &dpif->handlers[i];
> +            vport_del_channels(dpif, u32_to_odp(handler->port_idx_map[j]));
> +        }
>  
>          dpif_netlink_handler_uninit(handler);
> +        handler->n_channels = 0;
> +
>          free(handler->epoll_events);
> +        free(handler->port_idx_map);
> +        free(handler->channels);
>      }
> -    free(dpif->channels);
> +
>      free(dpif->handlers);
>      dpif->handlers = NULL;
> -    dpif->channels = NULL;
>      dpif->n_handlers = 0;
> -    dpif->uc_array_size = 0;
>  }
>  
>  static void
> @@ -1084,23 +1143,31 @@ dpif_netlink_port_get_pid__(const struct dpif_netlink *dpif,
>                              odp_port_t port_no)
>      OVS_REQ_RDLOCK(dpif->upcall_lock)
>  {
> +    struct dpif_channel *min_channel = NULL, *found_channel = NULL;
>      uint32_t port_idx = odp_to_u32(port_no);
>      uint32_t pid = 0;
>  
> -    if (dpif->handlers && dpif->uc_array_size > 0) {
> -        /* The ODPP_NONE "reserved" port number uses the "ovs-system"'s
> -         * channel, since it is not heavily loaded. */
> -        uint32_t idx = port_idx >= dpif->uc_array_size ? 0 : port_idx;
> -
> -        /* Needs to check in case the socket pointer is changed in between
> -         * the holding of upcall_lock.  A known case happens when the main
> -         * thread deletes the vport while the handler thread is handling
> -         * the upcall from that port. */
> -        if (dpif->channels[idx].sock) {
> -            pid = nl_sock_pid(dpif->channels[idx].sock);
> +    if (dpif->handlers) {
> +        size_t i;
> +        for (i = 0; i < dpif->n_handlers; ++i) {
> +            if (dpif_handler_has_port_idx(&dpif->handlers[i], port_idx)) {
> +                found_channel = dpif_handler_has_port_idx(&dpif->handlers[i],
> +                                                          port_idx);
> +            }
> +
> +            if (dpif_handler_has_port_idx(&dpif->handlers[i], 0)) {
> +                min_channel = dpif_handler_has_port_idx(&dpif->handlers[i],
> +                                                        0);
> +            }
>          }

The comment about the ODPP_NONE reserved port seems still relevant.
Also, if it finds a channel, couldn't it just break the loop?

>      }
>  
> +    if (found_channel) {
> +        pid = nl_sock_pid(found_channel->sock);
> +    } else if (min_channel) {
> +        pid = nl_sock_pid(min_channel->sock);
> +    }
> +
>      return pid;
>  }
>  
> @@ -2376,7 +2443,7 @@ dpif_netlink_refresh_channels(struct dpif_netlink *dpif, uint32_t n_handlers)
>  {
>      unsigned long int *keep_channels;
>      struct dpif_netlink_vport vport;
> -    size_t keep_channels_nbits;
> +    size_t keep_channels_nbits = 0;
>      struct nl_dump dump;
>      uint64_t reply_stub[NL_DUMP_BUFSIZE / 8];
>      struct ofpbuf buf;
> @@ -2414,9 +2481,11 @@ dpif_netlink_refresh_channels(struct dpif_netlink *dpif, uint32_t n_handlers)
>          struct dpif_handler *handler = &dpif->handlers[i];
>  
>          handler->event_offset = handler->n_events = 0;
> +        if (keep_channels_nbits < dpif_handler_port_idx_max(handler, NULL)) {
> +            keep_channels_nbits = dpif_handler_port_idx_max(handler, NULL);
> +        }
>      }
>  
> -    keep_channels_nbits = dpif->uc_array_size;
>      keep_channels = bitmap_allocate(keep_channels_nbits);
>  
>      ofpbuf_use_stub(&buf, reply_stub, sizeof reply_stub);
> @@ -2426,8 +2495,7 @@ dpif_netlink_refresh_channels(struct dpif_netlink *dpif, uint32_t n_handlers)
>          uint32_t upcall_pid;
>          int error;
>  
> -        if (port_no >= dpif->uc_array_size
> -            || !vport_get_pid(dpif, port_no, &upcall_pid)) {
> +        if (!vport_get_pid(dpif, port_no, &upcall_pid)) {
>              struct nl_sock *sock;
>              error = create_nl_sock(dpif, &sock);
>  
> @@ -2721,14 +2789,14 @@ dpif_netlink_recv__(struct dpif_netlink *dpif, uint32_t handler_id,
>      }
>  
>      handler = &dpif->handlers[handler_id];
> -    if (handler->event_offset >= handler->n_events) {
> +    if (handler->event_offset >= handler->n_events && handler->n_channels) {
>          int retval;
>  
>          handler->event_offset = handler->n_events = 0;
>  
>          do {
>              retval = epoll_wait(handler->epoll_fd, handler->epoll_events,
> -                                dpif->uc_array_size, 0);
> +                                handler->n_channels, 0);
>          } while (retval < 0 && errno == EINTR);
>  
>          if (retval < 0) {
> @@ -2741,7 +2809,7 @@ dpif_netlink_recv__(struct dpif_netlink *dpif, uint32_t handler_id,
>  
>      while (handler->event_offset < handler->n_events) {
>          int idx = handler->epoll_events[handler->event_offset].data.u32;
> -        struct dpif_channel *ch = &dpif->channels[idx];
> +        struct dpif_channel *ch = &handler->channels[idx];
>  
>          handler->event_offset++;
>  
> @@ -2845,12 +2913,13 @@ dpif_netlink_recv_purge__(struct dpif_netlink *dpif)
>      if (dpif->handlers) {
>          size_t i;
>  
> -        if (!dpif->channels[0].sock) {
> -            return;
> -        }
> -        for (i = 0; i < dpif->uc_array_size; i++ ) {
> +        for (i = 0; i < dpif->n_handlers; i++) {
> +            struct dpif_handler *handler = &dpif->handlers[i];
> +            size_t j;
>  
> -            nl_sock_drain(dpif->channels[i].sock);
> +            for (j = 0; j < handler->n_channels; ++j) {
> +                nl_sock_drain(handler->channels[j].sock);
> +            }
>          }
>      }
>  }
> -- 
> 2.25.4
> 
> _______________________________________________
> dev mailing list
> dev at openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev

-- 
fbl


More information about the dev mailing list