[ovs-dev] [PATCH v4 2/3] dpif-netlink: distribute polling to discreet handlers
Flavio Leitner
fbl at sysclose.org
Tue Sep 1 23:22:07 UTC 2020
Hi Mark,
Thanks for following up with this patch.
See my comments below.
On Fri, Aug 28, 2020 at 06:03:04PM +0100, Mark Gray wrote:
> From: Aaron Conole <aconole at redhat.com>
>
> Currently, the channel handlers are polled globally. On some
> systems, this causes a thundering herd issue where multiple
> handler threads become active, only to do no work and immediately
> sleep.
>
> The approach here is to push the netlink socket channels to discreet
> handler threads to process, rather than polling on every thread.
> This will eliminate the need to wake multiple threads.
>
> To check:
>
> ip netns add left
> ip netns add right
> ip link add center-left type veth peer name left0 netns left
> ip link add center-right type veth peer name right0 netns right
> ip link set center-left up
> ip link set center-right up
> ip -n left ip link set left0 up
> ip -n left ip addr add 172.31.110.10/24 dev left0
> ip -n right ip link set right0 up
> ip -n right ip addr add 172.31.110.11/24 dev right0
>
> ovs-vsctl add-br br0
> ovs-vsctl add-port br0 center-right
> ovs-vsctl add-port br0 center-left
>
> # in one terminal
> perf record -e sched:sched_wakeup,irq:softirq_entry -ag
>
> # in a separate terminal
> ip netns exec left arping -I left0 -c 1 172.31.110.11
>
> # in the perf terminal after exiting
> perf script
>
> Look for the number of 'handler' threads which were made active.
>
> Suggested-by: Ben Pfaff <blp at ovn.org>
> Co-authored-by: Mark Gray <mark.d.gray at redhat.com>
> Reported-by: David Ahern <dsahern at gmail.com>
> Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2019-December/365857.html
> Cc: Matteo Croce <technoboy85 at gmail.com>
> Cc: Flavio Leitner <fbl at sysclose.org>
> Fixes: 69c51582f ("dpif-netlink: don't allocate per thread netlink sockets")
> Signed-off-by: Aaron Conole <aconole at redhat.com>
> Signed-off-by: Mark Gray <mark.d.gray at redhat.com>
> ---
> v2: Oops - forgot to commit my whitespace cleanups.
> v3: one port latency results as per Matteo's comments
>
> Stock:
> min/avg/max/mdev = 21.5/36.5/96.5/1.0 us
> With Patch:
> min/avg/max/mdev = 5.3/9.7/98.4/0.5 us
> v4: Oops - first email did not send
>
> lib/dpif-netlink.c | 311 ++++++++++++++++++++++++++-------------------
> 1 file changed, 182 insertions(+), 129 deletions(-)
>
> diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
> index 7da4fb54d..d928d7aa1 100644
> --- a/lib/dpif-netlink.c
> +++ b/lib/dpif-netlink.c
> @@ -37,6 +37,7 @@
> #include "dpif-provider.h"
> #include "fat-rwlock.h"
> #include "flow.h"
> +#include "id-pool.h"
> #include "netdev-linux.h"
> #include "netdev-offload.h"
> #include "netdev-provider.h"
> @@ -160,6 +161,8 @@ static void dpif_netlink_flow_to_dpif_flow(struct dpif_flow *,
>
> /* One of the dpif channels between the kernel and userspace. */
> struct dpif_channel {
> + struct hmap_node hmap_node;
> + uint32_t port_idx;
> struct nl_sock *sock; /* Netlink socket. */
> long long int last_poll; /* Last time this channel was polled. */
> };
> @@ -183,6 +186,8 @@ struct dpif_handler {
> int n_events; /* Num events returned by epoll_wait(). */
> int event_offset; /* Offset into 'epoll_events'. */
>
> + struct hmap channels; /* map of channels for each port. */
> +
> #ifdef _WIN32
> /* Pool of sockets. */
> struct dpif_windows_vport_sock *vport_sock_pool;
> @@ -201,9 +206,8 @@ struct dpif_netlink {
> struct fat_rwlock upcall_lock;
> struct dpif_handler *handlers;
> uint32_t n_handlers; /* Num of upcall handlers. */
> - struct dpif_channel *channels; /* Array of channels for each port. */
> - int uc_array_size; /* Size of 'handler->channels' and */
> - /* 'handler->epoll_events'. */
> +
> + struct id_pool *port_ids; /* Set of added port ids */
>
> /* Change notification. */
> struct nl_sock *port_notifier; /* vport multicast group subscriber. */
> @@ -287,6 +291,53 @@ close_nl_sock(struct nl_sock *sock)
> #endif
> }
>
> +static size_t
> +dpif_handler_channels_count(const struct dpif_handler *handler)
> +{
> + return hmap_count(&handler->channels);
> +}
> +
> +static struct dpif_channel *
> +dpif_handler_channel_get(struct dpif_handler *handler, uint32_t idx)
Perhaps _find() or lookup() ?
> +{
> + struct dpif_channel *channel;
> +
> + HMAP_FOR_EACH_WITH_HASH (channel, hmap_node, hash_int(idx, 0),
> + &handler->channels) {
> + if (channel->port_idx == idx) {
> + return channel;
> + }
> + }
> +
> + return NULL;
> +}
> +
> +static void
> +dpif_handler_channel_add(struct dpif_handler *handler, uint32_t idx,
> + struct nl_sock *sock)
> +{
> + struct dpif_channel *channel = xmalloc(sizeof *channel);
> +
> + channel->port_idx = idx;
> + channel->sock = sock;
> + channel->last_poll = LLONG_MIN;
> +
> + hmap_insert(&handler->channels, &channel->hmap_node, hash_int(idx,0));
Please add a space before that 0.
> +}
> +
> +static void
> +dpif_handler_channel_del(struct dpif_handler *handler, uint32_t port_idx)
> +{
> + struct dpif_channel *channel = dpif_handler_channel_get(handler, port_idx);
> +
> + hmap_remove(&(handler->channels), &(channel->hmap_node));
Extra line.
> +
> + free(channel);
> +}
> +
> +
> +
> +
Too many extra lines.
> static struct dpif_netlink *
> dpif_netlink_cast(const struct dpif *dpif)
> {
> @@ -385,6 +436,11 @@ open_dpif(const struct dpif_netlink_dp *dp, struct dpif **dpifp)
>
> dpif->dp_ifindex = dp->dp_ifindex;
> dpif->user_features = dp->user_features;
> + dpif->port_ids = id_pool_create(0, MAX_PORTS);
> + if (!dpif->port_ids) {
> + return ENOMEM;
> + }
> +
No need to check the return of id_pool_create().
> *dpifp = &dpif->dpif;
>
> return 0;
> @@ -452,161 +508,144 @@ static bool
> vport_get_pid(struct dpif_netlink *dpif, uint32_t port_idx,
> uint32_t *upcall_pid)
> {
> - /* Since the nl_sock can only be assigned in either all
> - * or none "dpif" channels, the following check
> - * would suffice. */
> - if (!dpif->channels[port_idx].sock) {
> + size_t i;
> + ovs_assert(!WINDOWS || dpif->n_handlers <= 1);
> +
> + if (!id_pool_check(dpif->port_ids, port_idx)) {
> return false;
> }
> - ovs_assert(!WINDOWS || dpif->n_handlers <= 1);
>
> - *upcall_pid = nl_sock_pid(dpif->channels[port_idx].sock);
> + /* The 'port_idx' should only be valid for a single handler. */
> + for (i = 0; i < dpif->n_handlers; i++) {
> + struct dpif_channel *channel =
> + dpif_handler_channel_get(&dpif->handlers[i], port_idx);
> +
> + if (channel) {
> + *upcall_pid = nl_sock_pid(channel->sock);
> + return true;
> + }
> + }
>
> - return true;
> + return false;
> }
>
> static int
> vport_add_channel(struct dpif_netlink *dpif, odp_port_t port_no,
> struct nl_sock *sock)
> {
> - struct epoll_event event;
> uint32_t port_idx = odp_to_u32(port_no);
> + struct dpif_handler *handler;
> + struct epoll_event event;
> size_t i;
> - int error;
>
> - if (dpif->handlers == NULL) {
> + if (dpif->handlers == NULL ||
> + id_pool_check(dpif->port_ids, port_idx)) {
> close_nl_sock(sock);
> - return 0;
> + return EINVAL;
> }
>
> - /* We assume that the datapath densely chooses port numbers, which can
> - * therefore be used as an index into 'channels' and 'epoll_events' of
> - * 'dpif'. */
> - if (port_idx >= dpif->uc_array_size) {
> - uint32_t new_size = port_idx + 1;
> -
> - if (new_size > MAX_PORTS) {
> + if (port_idx >= MAX_PORTS) {
> VLOG_WARN_RL(&error_rl, "%s: datapath port %"PRIu32" too big",
> dpif_name(&dpif->dpif), port_no);
VLOG_ is indented twice?
> return EFBIG;
> - }
> -
> - dpif->channels = xrealloc(dpif->channels,
> - new_size * sizeof *dpif->channels);
> + }
>
> - for (i = dpif->uc_array_size; i < new_size; i++) {
> - dpif->channels[i].sock = NULL;
> - }
> + handler = &dpif->handlers[0];
>
> - for (i = 0; i < dpif->n_handlers; i++) {
> - struct dpif_handler *handler = &dpif->handlers[i];
> + for (i = 0; i < dpif->n_handlers; i++) {
> + if (dpif_handler_channels_count(&dpif->handlers[i]) <
> + dpif_handler_channels_count(handler))
> + handler = &dpif->handlers[i];
Missing {}
> + }
>
> - handler->epoll_events = xrealloc(handler->epoll_events,
> - new_size * sizeof *handler->epoll_events);
> + dpif_handler_channel_add(handler, port_idx, sock);
>
> - }
> - dpif->uc_array_size = new_size;
> - }
> + handler->epoll_events = xrealloc(handler->epoll_events,
> + dpif_handler_channels_count(handler) *
> + sizeof *handler->epoll_events);
Please fix the indentation.
> memset(&event, 0, sizeof event);
> event.events = EPOLLIN | EPOLLEXCLUSIVE;
> event.data.u32 = port_idx;
>
> - for (i = 0; i < dpif->n_handlers; i++) {
> - struct dpif_handler *handler = &dpif->handlers[i];
> -
> #ifndef _WIN32
> - if (epoll_ctl(handler->epoll_fd, EPOLL_CTL_ADD, nl_sock_fd(sock),
> - &event) < 0) {
> - error = errno;
> - goto error;
> - }
> -#endif
> - }
> - dpif->channels[port_idx].sock = sock;
> - dpif->channels[port_idx].last_poll = LLONG_MIN;
> -
> - return 0;
> -
> -error:
> -#ifndef _WIN32
> - while (i--) {
> - epoll_ctl(dpif->handlers[i].epoll_fd, EPOLL_CTL_DEL,
> - nl_sock_fd(sock), NULL);
> + if (epoll_ctl(handler->epoll_fd, EPOLL_CTL_ADD, nl_sock_fd(sock),
> + &event) < 0) {
> + dpif_handler_channel_del(handler, port_idx);
> + id_pool_free_id(dpif->port_ids, port_idx);
> + return errno;
> }
> #endif
> - dpif->channels[port_idx].sock = NULL;
>
> - return error;
> + id_pool_add(dpif->port_ids, port_idx);
> + return 0;
> }
>
> static void
> vport_del_channels(struct dpif_netlink *dpif, odp_port_t port_no)
Please rename the function. The relation is 1:1 between ports and
channels with this patch, so this function only deletes one channel.
> {
> uint32_t port_idx = odp_to_u32(port_no);
> + struct dpif_handler *handler = NULL;
> + struct dpif_channel *channel = NULL;
> size_t i;
>
> - if (!dpif->handlers || port_idx >= dpif->uc_array_size
> - || !dpif->channels[port_idx].sock) {
> + if (!dpif->handlers) {
> return;
> }
>
> for (i = 0; i < dpif->n_handlers; i++) {
> - struct dpif_handler *handler = &dpif->handlers[i];
> + handler = &dpif->handlers[i];
> + channel = dpif_handler_channel_get(handler, port_idx);
> +
> + if (channel) {
> #ifndef _WIN32
> - epoll_ctl(handler->epoll_fd, EPOLL_CTL_DEL,
> - nl_sock_fd(dpif->channels[port_idx].sock), NULL);
> + epoll_ctl(handler->epoll_fd, EPOLL_CTL_DEL,
> + nl_sock_fd(channel->sock), NULL);
wrong indentation?
> + nl_sock_destroy(channel->sock);
> #endif
> - handler->event_offset = handler->n_events = 0;
> + dpif_handler_channel_del(handler, port_idx);
> + id_pool_free_id(dpif->port_ids, port_idx);
> + handler->event_offset = handler->n_events = 0;
> + break;
> + }
> }
> -#ifndef _WIN32
> - nl_sock_destroy(dpif->channels[port_idx].sock);
> -#endif
> - dpif->channels[port_idx].sock = NULL;
> }
>
> static void
> destroy_all_channels(struct dpif_netlink *dpif)
> OVS_REQ_WRLOCK(dpif->upcall_lock)
> {
> - unsigned int i;
> + size_t i;
>
> if (!dpif->handlers) {
> return;
> }
>
> - for (i = 0; i < dpif->uc_array_size; i++ ) {
> - struct dpif_netlink_vport vport_request;
> - uint32_t upcall_pids = 0;
> -
> - if (!dpif->channels[i].sock) {
> - continue;
> - }
> -
> - /* Turn off upcalls. */
> - dpif_netlink_vport_init(&vport_request);
> - vport_request.cmd = OVS_VPORT_CMD_SET;
> - vport_request.dp_ifindex = dpif->dp_ifindex;
> - vport_request.port_no = u32_to_odp(i);
> - vport_request.n_upcall_pids = 1;
> - vport_request.upcall_pids = &upcall_pids;
> - dpif_netlink_vport_transact(&vport_request, NULL, NULL);
> -
> - vport_del_channels(dpif, u32_to_odp(i));
> - }
> -
> for (i = 0; i < dpif->n_handlers; i++) {
> struct dpif_handler *handler = &dpif->handlers[i];
> + struct dpif_channel *channel, *next;
> +
> + HMAP_FOR_EACH_SAFE (channel, next, hmap_node, &handler->channels) {
> + struct dpif_netlink_vport vport_request;
> + uint32_t upcall_pids = 0;
> + /* Turn off upcalls. */
> + dpif_netlink_vport_init(&vport_request);
> + vport_request.cmd = OVS_VPORT_CMD_SET;
> + vport_request.dp_ifindex = dpif->dp_ifindex;
> + vport_request.port_no = u32_to_odp(channel->port_idx);
> + vport_request.n_upcall_pids = 1;
> + vport_request.upcall_pids = &upcall_pids;
> + dpif_netlink_vport_transact(&vport_request, NULL, NULL);
> +
> + vport_del_channels(dpif, u32_to_odp(channel->port_idx));
> + }
>
> dpif_netlink_handler_uninit(handler);
I am afraid we can't do that here because it will destroy the hmap.
Then, during the next handler processing, it will do:
vport_del_channels()
for (i = 0; i < dpif->n_handlers; i++) <-- start from 0:
dpif_handler_channel_get()
The dpif_handler_channel_get() will use a destroyed hmap.
Also, the call path iterates over the same objects few times:
dpif_netlink_close()
+ destroy_all_channels()
foreach n_handlers:
HMAP_FOR_EACH_SAFE(channels)
+ vport_del_channels()
foreach n_handlers:
+ dpif_handler_channel_get
HMAP_FOR_EACH_SAFE(channels)
+ dpif_handler_channel_del
+ dpif_handler_channel_get
HMAP_FOR_EACH_SAFE(channels)
Therefore, it iterates 2x on handlers and 3x on the hmaps to
remove a single channel.
> free(handler->epoll_events);
> }
> - free(dpif->channels);
> - free(dpif->handlers);
> +
> dpif->handlers = NULL;
> - dpif->channels = NULL;
> dpif->n_handlers = 0;
> - dpif->uc_array_size = 0;
> }
>
> static void
> @@ -618,9 +657,11 @@ dpif_netlink_close(struct dpif *dpif_)
>
> fat_rwlock_wrlock(&dpif->upcall_lock);
> destroy_all_channels(dpif);
> + id_pool_destroy(dpif->port_ids);
> fat_rwlock_unlock(&dpif->upcall_lock);
>
> fat_rwlock_destroy(&dpif->upcall_lock);
> +
> free(dpif);
> }
>
> @@ -1084,23 +1125,40 @@ dpif_netlink_port_get_pid__(const struct dpif_netlink *dpif,
> odp_port_t port_no)
> OVS_REQ_RDLOCK(dpif->upcall_lock)
> {
> + struct dpif_channel *min_channel = NULL, *found_channel = NULL;
> uint32_t port_idx = odp_to_u32(port_no);
> uint32_t pid = 0;
> + size_t i;
> +
> + /* The ODPP_NONE "reserved" port number uses the "ovs-system"'s
> + * channel, since it is not heavily loaded. */
> + uint32_t idx = port_no == ODPP_NONE ? 0 : port_idx;
> +
> + if (!id_pool_check(dpif->port_ids, port_idx)) {
> + return 0;
> + }
> +
> + if (dpif->handlers) {
> + for (i = 0; i < dpif->n_handlers; i++) {
> + if (dpif_handler_channel_get(&dpif->handlers[i], idx)) {
> + found_channel = dpif_handler_channel_get(&dpif->handlers[i],
> + idx);
> + break;
> + }
>
> - if (dpif->handlers && dpif->uc_array_size > 0) {
> - /* The ODPP_NONE "reserved" port number uses the "ovs-system"'s
> - * channel, since it is not heavily loaded. */
> - uint32_t idx = port_idx >= dpif->uc_array_size ? 0 : port_idx;
> -
> - /* Needs to check in case the socket pointer is changed in between
> - * the holding of upcall_lock. A known case happens when the main
> - * thread deletes the vport while the handler thread is handling
> - * the upcall from that port. */
> - if (dpif->channels[idx].sock) {
> - pid = nl_sock_pid(dpif->channels[idx].sock);
> + if (dpif_handler_channel_get(&dpif->handlers[i], 0)) {
> + min_channel = dpif_handler_channel_get(&dpif->handlers[i],
> + 0);
No need to break the line.
> + }
> }
> }
>
> + if (found_channel) {
> + pid = nl_sock_pid(found_channel->sock);
> + } else if (min_channel) {
> + pid = nl_sock_pid(min_channel->sock);
> + }
> +
> return pid;
> }
>
> @@ -2342,12 +2400,14 @@ dpif_netlink_operate(struct dpif *dpif_, struct dpif_op **ops, size_t n_ops,
> static void
> dpif_netlink_handler_uninit(struct dpif_handler *handler)
> {
> + hmap_destroy(&handler->channels);
> vport_delete_sock_pool(handler);
> }
>
> static int
> dpif_netlink_handler_init(struct dpif_handler *handler)
> {
> + hmap_init(&handler->channels);
> return vport_create_sock_pool(handler);
> }
> #else
> @@ -2355,6 +2415,7 @@ dpif_netlink_handler_init(struct dpif_handler *handler)
> static int
> dpif_netlink_handler_init(struct dpif_handler *handler)
> {
> + hmap_init(&handler->channels);
> handler->epoll_fd = epoll_create(10);
> return handler->epoll_fd < 0 ? errno : 0;
> }
> @@ -2362,6 +2423,7 @@ dpif_netlink_handler_init(struct dpif_handler *handler)
> static void
> dpif_netlink_handler_uninit(struct dpif_handler *handler)
> {
> + hmap_destroy(&handler->channels);
> close(handler->epoll_fd);
> }
> #endif
> @@ -2374,9 +2436,7 @@ static int
> dpif_netlink_refresh_channels(struct dpif_netlink *dpif, uint32_t n_handlers)
> OVS_REQ_WRLOCK(dpif->upcall_lock)
> {
> - unsigned long int *keep_channels;
> struct dpif_netlink_vport vport;
> - size_t keep_channels_nbits;
> struct nl_dump dump;
> uint64_t reply_stub[NL_DUMP_BUFSIZE / 8];
> struct ofpbuf buf;
> @@ -2416,9 +2476,6 @@ dpif_netlink_refresh_channels(struct dpif_netlink *dpif, uint32_t n_handlers)
> handler->event_offset = handler->n_events = 0;
> }
>
> - keep_channels_nbits = dpif->uc_array_size;
> - keep_channels = bitmap_allocate(keep_channels_nbits);
> -
> ofpbuf_use_stub(&buf, reply_stub, sizeof reply_stub);
> dpif_netlink_port_dump_start__(dpif, &dump);
> while (!dpif_netlink_port_dump_next__(dpif, &dump, &vport, &buf)) {
> @@ -2426,8 +2483,7 @@ dpif_netlink_refresh_channels(struct dpif_netlink *dpif, uint32_t n_handlers)
> uint32_t upcall_pid;
> int error;
>
> - if (port_no >= dpif->uc_array_size
> - || !vport_get_pid(dpif, port_no, &upcall_pid)) {
> + if (!vport_get_pid(dpif, port_no, &upcall_pid)) {
> struct nl_sock *sock;
> error = create_nl_sock(dpif, &sock);
>
> @@ -2475,9 +2531,6 @@ dpif_netlink_refresh_channels(struct dpif_netlink *dpif, uint32_t n_handlers)
> }
> }
>
> - if (port_no < keep_channels_nbits) {
> - bitmap_set1(keep_channels, port_no);
> - }
> continue;
>
> error:
> @@ -2486,14 +2539,6 @@ dpif_netlink_refresh_channels(struct dpif_netlink *dpif, uint32_t n_handlers)
> nl_dump_done(&dump);
> ofpbuf_uninit(&buf);
>
> - /* Discard any saved channels that we didn't reuse. */
> - for (i = 0; i < keep_channels_nbits; i++) {
> - if (!bitmap_is_set(keep_channels, i)) {
> - vport_del_channels(dpif, u32_to_odp(i));
> - }
> - }
> - free(keep_channels);
> -
> return retval;
> }
>
> @@ -2714,6 +2759,8 @@ dpif_netlink_recv__(struct dpif_netlink *dpif, uint32_t handler_id,
> OVS_REQ_RDLOCK(dpif->upcall_lock)
> {
> struct dpif_handler *handler;
> + size_t n_channels;
> +
> int read_tries = 0;
>
> if (!dpif->handlers || handler_id >= dpif->n_handlers) {
> @@ -2721,14 +2768,15 @@ dpif_netlink_recv__(struct dpif_netlink *dpif, uint32_t handler_id,
> }
>
> handler = &dpif->handlers[handler_id];
> - if (handler->event_offset >= handler->n_events) {
> + n_channels = dpif_handler_channels_count(handler);
> + if (handler->event_offset >= handler->n_events && n_channels) {
> int retval;
>
> handler->event_offset = handler->n_events = 0;
>
> do {
> retval = epoll_wait(handler->epoll_fd, handler->epoll_events,
> - dpif->uc_array_size, 0);
> + n_channels, 0);
> } while (retval < 0 && errno == EINTR);
>
> if (retval < 0) {
> @@ -2741,7 +2789,10 @@ dpif_netlink_recv__(struct dpif_netlink *dpif, uint32_t handler_id,
>
> while (handler->event_offset < handler->n_events) {
> int idx = handler->epoll_events[handler->event_offset].data.u32;
> - struct dpif_channel *ch = &dpif->channels[idx];
> + struct dpif_channel *ch = dpif_handler_channel_get(handler, idx);
> + if (!ch) {
> + return EAGAIN;
> + }
>
> handler->event_offset++;
>
> @@ -2845,12 +2896,14 @@ dpif_netlink_recv_purge__(struct dpif_netlink *dpif)
> if (dpif->handlers) {
> size_t i;
>
> - if (!dpif->channels[0].sock) {
> - return;
> - }
> - for (i = 0; i < dpif->uc_array_size; i++ ) {
> + for (i = 0; i < dpif->n_handlers; i++) {
> + struct dpif_handler *handler = &dpif->handlers[i];
> + struct dpif_channel *channel;
> +
> + HMAP_FOR_EACH (channel, hmap_node, &handler->channels) {
> + nl_sock_drain(channel->sock);
> + }
>
> - nl_sock_drain(dpif->channels[i].sock);
> }
> }
> }
> --
> 2.26.2
>
> _______________________________________________
> dev mailing list
> dev at openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
--
fbl
More information about the dev
mailing list