[ovs-dev] [PATCH 1/6] Make polling fds persistent

anton.ivanov at cambridgegreys.com anton.ivanov at cambridgegreys.com
Mon Jul 6 08:20:09 UTC 2020


From: Anton Ivanov <anton.ivanov at cambridgegreys.com>

Saves on:

1. Allocation and disposal of a hash map per iteration in all threads
2. Re-population of the hashmap with all fds per iteration
3. Walking of the hashmap to construct a pollfd array per iteration
4. Allocating/deallocating the pollfd array per iteration
5. The cost of various lookups

Compared to older attempts to do this, this emulates strictly the old
behaviour and is 100% backwards compatible with the old approach.

Unix only - the unix poll loop has been pulled to a new file.

Signed-off-by: Anton Ivanov <anton.ivanov at cambridgegreys.com>
---
 lib/automake.mk      |   3 +-
 lib/poll-loop-unix.c | 415 +++++++++++++++++++++++++++++++++++++++++++
 lib/poll-loop.c      |  19 +-
 3 files changed, 418 insertions(+), 19 deletions(-)
 create mode 100644 lib/poll-loop-unix.c

diff --git a/lib/automake.mk b/lib/automake.mk
index 86940ccd2..39ff70650 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -242,7 +242,6 @@ lib_libopenvswitch_la_SOURCES = \
 	lib/perf-counter.c \
 	lib/stopwatch.h \
 	lib/stopwatch.c \
-	lib/poll-loop.c \
 	lib/process.c \
 	lib/process.h \
 	lib/pvector.c \
@@ -349,6 +348,7 @@ lib_libopenvswitch_la_SOURCES += \
 	lib/route-table-stub.c \
 	lib/if-notifier-stub.c \
 	lib/stream-windows.c \
+	lib/poll-loop.c \
 	lib/strsep.c
 else
 lib_libopenvswitch_la_SOURCES += \
@@ -357,6 +357,7 @@ lib_libopenvswitch_la_SOURCES += \
 	lib/signals.c \
 	lib/signals.h \
 	lib/socket-util-unix.c \
+	lib/poll-loop-unix.c \
 	lib/stream-unix.c
 endif
 
diff --git a/lib/poll-loop-unix.c b/lib/poll-loop-unix.c
new file mode 100644
index 000000000..0fb137855
--- /dev/null
+++ b/lib/poll-loop-unix.c
@@ -0,0 +1,415 @@
+/*
+ * Copyright (c) 2020 Red Hat Inc
+ * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+#include "openvswitch/poll-loop.h"
+#include <errno.h>
+#include <inttypes.h>
+#include <poll.h>
+#include <stdlib.h>
+#include <string.h>
+#include "coverage.h"
+#include "openvswitch/dynamic-string.h"
+#include "fatal-signal.h"
+#include "openvswitch/list.h"
+#include "ovs-thread.h"
+#include "seq.h"
+#include "socket-util.h"
+#include "timeval.h"
+#include "openvswitch/vlog.h"
+#include "openvswitch/hmap.h"
+#include "hash.h"
+
+VLOG_DEFINE_THIS_MODULE(poll_loop);
+
+COVERAGE_DEFINE(poll_create_node);
+COVERAGE_DEFINE(poll_zero_timeout);
+
+#define POLLFD_INCREMENT 16  /* Initial size and growth step of 'watched'. */
+
+/* Metadata for one slot of the loop's persistent pollfd array.  The fd and
+ * its event masks live in 'watched[index]' of the owning poll_loop; that
+ * array is kept across iterations and does not need to be regenerated
+ * every time poll_block() runs.
+ */
+
+struct poll_node {
+    struct hmap_node hmap_node; /* In 'poll_nodes' of the owning poll_loop. */
+    int index;                  /* Index of this node's slot in 'watched'. */
+    const char *where;          /* Where the fd was last registered. */
+};
+
+struct poll_loop {
+    /* Metadata for the used slots of 'watched', hashed on fd. */
+    struct hmap poll_nodes;
+
+    /* Time at which to wake up the next call to poll_block(), LLONG_MIN to
+     * wake up immediately, or LLONG_MAX to wait forever. */
+    long long int timeout_when; /* In msecs as returned by time_msec(). */
+    const char *timeout_where;  /* Where 'timeout_when' was set. */
+    struct pollfd * watched;    /* Persistent pollfd array given to poll(). */
+    int watched_size;           /* Number of slots allocated in 'watched'. */
+};
+
+static struct poll_loop *poll_loop(void);
+
+/* Returns the poll_node in 'loop' that tracks 'fd', or NULL if 'fd' is not
+ * registered.  Every fd number, including 0, is a valid key. */
+static struct poll_node *
+find_poll_node(struct poll_loop *loop, int fd)
+{
+    struct poll_node *node;
+
+    HMAP_FOR_EACH_WITH_HASH (node, hmap_node, hash_2words(fd, 0),
+                             &loop->poll_nodes) {
+        if (loop->watched[node->index].fd == fd) {
+            return node;
+        }
+    }
+    return NULL;
+}
+
+/* Registers 'fd' in the calling thread's poll loop as waiting for 'events'
+ * (which should be POLLIN or POLLOUT or POLLIN | POLLOUT).  The following
+ * call to poll_block() hands 'fd' to poll() and wakes up when it becomes
+ * ready for one or more of the requested events.
+ *
+ * If 'fd' already has a slot in the persistent pollfd array, 'events' is
+ * or-ed into that slot; otherwise a new slot is appended, and the array is
+ * grown by POLLFD_INCREMENT slots first if it is full.
+ *
+ * The event registration is one-shot: it must be repeated after poll_block()
+ * is called if it is to persist.
+ *
+ * ('where' is used in debug logging.) */
+static void
+poll_create_node(int fd, short int events, const char *where)
+{
+    struct poll_loop *loop = poll_loop();
+    struct poll_node *node;
+
+    COVERAGE_INC(poll_create_node);
+
+    /* Check for duplicate.  If found, "or" the events. */
+    node = find_poll_node(loop, fd);
+    if (node) {
+        loop->watched[node->index].events |= events;
+        /* Wakeup logs name the most recent registration. */
+        node->where = where;
+    } else {
+        size_t n_watched = hmap_count(&loop->poll_nodes);
+
+        /* Grow the pollfd array once every slot is in use, copying only the
+         * 'n_watched' slots that the old allocation actually holds. */
+        if (n_watched >= loop->watched_size) {
+            struct pollfd *resized_watched;
+
+            loop->watched_size += POLLFD_INCREMENT;
+            resized_watched = xzalloc(loop->watched_size
+                                      * sizeof *resized_watched);
+            memcpy(resized_watched, loop->watched,
+                   n_watched * sizeof *resized_watched);
+            free(loop->watched);
+            loop->watched = resized_watched;
+        }
+
+        /* Append the new record at the end of the pollfd array.  'revents'
+         * is cleared because a reaped record may have left a stale value. */
+        node = xzalloc(sizeof *node);
+        node->index = n_watched;
+        node->where = where;
+        hmap_insert(&loop->poll_nodes, &node->hmap_node, hash_2words(fd, 0));
+        loop->watched[node->index].fd = fd;
+        loop->watched[node->index].events = events | POLLHUP | POLLERR;
+        loop->watched[node->index].revents = 0;
+    }
+}
+
+/* Registers 'fd' as waiting for the specified 'events' (which should be POLLIN
+ * or POLLOUT or POLLIN | POLLOUT).  The following call to poll_block() will
+ * wake up when 'fd' becomes ready for one or more of the requested events.
+ *
+ * The event registration is one-shot: only the following call to poll_block()
+ * is affected, so it must be repeated afterwards if it is to persist.
+ * Registering the same 'fd' again before then ors 'events' into the request.
+ *
+ * ('where' is used in debug logging.  Commonly one would use poll_fd_wait() to
+ * automatically provide the caller's source file and line number for
+ * 'where'.) */
+void
+poll_fd_wait_at(int fd, short int events, const char *where)
+{
+    poll_create_node(fd, events, where);
+}
+
+/* Limits how long the following call to poll_block() may block to at most
+ * 'msec' milliseconds from now.  A nonpositive 'msec' keeps that call from
+ * blocking at all, and a 'msec' so large that the wakeup time would overflow
+ * is registered as LLONG_MAX instead.
+ *
+ * The timer registration is one-shot: only the following call to poll_block()
+ * is affected.  The timer will need to be re-registered after poll_block() is
+ * called if it is to persist.
+ *
+ * ('where' is used in debug logging.  Commonly one would use poll_timer_wait()
+ * to automatically provide the caller's source file and line number for
+ * 'where'.) */
+void
+poll_timer_wait_at(long long int msec, const char *where)
+{
+    const long long int now = time_msec();
+    long long int deadline;
+
+    if (msec > 0) {
+        /* Unsigned arithmetic detects the overflow without invoking it. */
+        unsigned long long int sum = (unsigned long long int) now + msec;
+
+        deadline = sum <= LLONG_MAX ? now + msec : LLONG_MAX;
+    } else {
+        /* Wake up immediately. */
+        deadline = LLONG_MIN;
+    }
+
+    poll_timer_wait_until_at(deadline, where);
+}
+
+/* Causes the following call to poll_block() to wake up no later than the
+ * time, as returned by time_msec(), reaches 'when'.  If 'when' is earlier
+ * than the current time, that call will not block at all.  A 'when' that is
+ * not earlier than a wakeup time already registered has no effect.
+ *
+ * The timer registration is one-shot: only the following call to poll_block()
+ * is affected, so it must be repeated afterwards if it is to persist.
+ *
+ * ('where' is used in debug logging.  Commonly one would use
+ * poll_timer_wait_until() to provide the caller's source file and line.) */
+void
+poll_timer_wait_until_at(long long int when, const char *where)
+{
+    struct poll_loop *loop = poll_loop();
+
+    if (when >= loop->timeout_when) {
+        return;                 /* Existing wakeup is at least as early. */
+    }
+    loop->timeout_when = when;
+    loop->timeout_where = where;
+}
+
+/* Causes the following call to poll_block() to wake up immediately, without
+ * blocking.  Implemented as a timer registration with a zero timeout.
+ *
+ * ('where' is used in debug logging.  Commonly one would use
+ * poll_immediate_wake() to automatically provide the caller's source file and
+ * line number for 'where'.) */
+void
+poll_immediate_wake_at(const char *where)
+{
+    poll_timer_wait_at(0, where);
+}
+
+/* Logs, if appropriate, that the poll loop was awakened by an event
+ * registered at 'where' (typically a source file and line number): at debug
+ * level when that is enabled, otherwise rate-limited at info level when CPU
+ * usage is above 50% outside a PMD thread.  The other arguments have two
+ * possible interpretations:
+ *
+ *   - If 'pollfd' is nonnull then it should be the "struct pollfd" that caused
+ *     the wakeup.  'timeout' is ignored.
+ *
+ *   - If 'pollfd' is NULL then 'timeout' is the number of milliseconds after
+ *     which the poll loop woke up. */
+static void
+log_wakeup(const char *where, const struct pollfd *pollfd, int timeout)
+{
+    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(10, 10);
+    static const struct {
+        short int bit;
+        const char *tag;
+    } revent_tags[] = {
+        { POLLIN, "[POLLIN]" },
+        { POLLOUT, "[POLLOUT]" },
+        { POLLERR, "[POLLERR]" },
+        { POLLHUP, "[POLLHUP]" },
+        { POLLNVAL, "[POLLNVAL]" },
+    };
+    const int cpu_usage = get_cpu_usage();
+    enum vlog_level level;
+    struct ds s;
+
+    if (VLOG_IS_DBG_ENABLED()) {
+        level = VLL_DBG;
+    } else if (cpu_usage > 50 && !thread_is_pmd() && !VLOG_DROP_INFO(&rl)) {
+        level = VLL_INFO;
+    } else {
+        return;
+    }
+
+    ds_init(&s);
+    ds_put_cstr(&s, "wakeup due to ");
+    if (!pollfd) {
+        ds_put_format(&s, "%d-ms timeout", timeout);
+    } else {
+        char *description = describe_fd(pollfd->fd);
+        size_t i;
+
+        for (i = 0; i < sizeof revent_tags / sizeof revent_tags[0]; i++) {
+            if (pollfd->revents & revent_tags[i].bit) {
+                ds_put_cstr(&s, revent_tags[i].tag);
+            }
+        }
+        ds_put_format(&s, " on fd %d (%s)", pollfd->fd, description);
+        free(description);
+    }
+    if (where) {
+        ds_put_format(&s, " at %s", where);
+    }
+    if (cpu_usage >= 0) {
+        ds_put_format(&s, " (%d%% CPU usage)", cpu_usage);
+    }
+    VLOG(level, "%s", ds_cstr(&s));
+    ds_destroy(&s);
+}
+
+/* Removes every poll_node from 'loop->poll_nodes' and frees it. */
+static void
+free_poll_nodes(struct poll_loop *loop)
+{
+    struct poll_node *node;
+
+    HMAP_FOR_EACH_POP (node, hmap_node, &loop->poll_nodes) {
+        free(node);
+    }
+}
+
+/* Blocks until one or more of the events registered with poll_fd_wait()
+ * occurs, or until the minimum duration registered with poll_timer_wait()
+ * elapses, or not at all if poll_immediate_wake() has been called.
+ *
+ * On return all registrations are spent: the requested events of every fd
+ * are cleared, the timeout is reset, and fds that reported POLLHUP or
+ * POLLNVAL are dropped from the persistent pollfd array. */
+void
+poll_block(void)
+{
+    struct poll_loop *loop = poll_loop();
+    struct poll_node *node, *moved_node;
+    int elapsed;
+    int retval;
+    int i;
+
+    /* Register fatal signal events before actually doing any real work for
+     * poll_block. */
+    fatal_signal_wait();
+
+    if (loop->timeout_when == LLONG_MIN) {
+        COVERAGE_INC(poll_zero_timeout);
+    }
+
+    timewarp_run();
+
+    /* The persistent pollfd array is passed to poll() as it stands. */
+    retval = time_poll(loop->watched, hmap_count(&loop->poll_nodes), NULL,
+                       loop->timeout_when, &elapsed);
+    if (retval < 0) {
+        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
+
+        VLOG_ERR_RL(&rl, "poll: %s", ovs_strerror(-retval));
+    } else if (!retval) {
+        log_wakeup(loop->timeout_where, NULL, elapsed);
+    } else if (get_cpu_usage() > 50 || VLOG_IS_DBG_ENABLED()) {
+        for (i = 0; i < hmap_count(&loop->poll_nodes); i++) {
+            if (loop->watched[i].revents) {
+                node = find_poll_node(loop, loop->watched[i].fd);
+                ovs_assert(node != NULL);
+                log_wakeup(node->where, &loop->watched[i], 0);
+            }
+        }
+    }
+
+    /* Update the pollfd array to reproduce one-shot behaviour and reap any
+     * fds which have been closed in the meantime. */
+    i = 0;
+    while (i < hmap_count(&loop->poll_nodes)) {
+        node = find_poll_node(loop, loop->watched[i].fd);
+        ovs_assert(node != NULL);
+        if (loop->watched[i].revents & (POLLHUP | POLLNVAL)) {
+            int last = hmap_count(&loop->poll_nodes) - 1;
+
+            /* FD was closed - reap.  Move the last record, if it is a
+             * different one, into the hole at this index position. */
+            if (i < last) {
+                moved_node = find_poll_node(loop, loop->watched[last].fd);
+                ovs_assert(moved_node != NULL);
+                loop->watched[i] = loop->watched[last];
+                moved_node->index = i;
+            }
+            hmap_remove(&loop->poll_nodes, &node->hmap_node);
+            free(node);         /* Removed nodes are not freed elsewhere. */
+            /* note - we do not i++ here as we have not processed the node
+             * which we have moved from the tail of the array into the hole
+             */
+        } else {
+            /* Clear events to replicate one-shot behaviour.  Leave file
+             * close related events set so we can track file closures. */
+            loop->watched[i].events = POLLHUP | POLLERR;
+            i++;
+        }
+    }
+
+    loop->timeout_when = LLONG_MAX;
+    loop->timeout_where = NULL;
+
+    /* Handle any pending signals before doing anything else. */
+    fatal_signal_run();
+
+    seq_woke();
+}
+
+static void
+free_poll_loop(void *loop_)
+{
+    struct poll_loop *doomed = loop_;   /* pthread key destructor argument. */
+
+    free(doomed->watched);
+    free_poll_nodes(doomed);
+    hmap_destroy(&doomed->poll_nodes);
+    free(doomed);
+}
+
+static struct poll_loop *
+poll_loop(void)
+{
+    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
+    static pthread_key_t key;   /* Thread-specific slot for the loop. */
+    struct poll_loop *loop;
+
+    if (ovsthread_once_start(&once)) {
+        xpthread_key_create(&key, free_poll_loop);
+        ovsthread_once_done(&once);
+    }
+
+    loop = pthread_getspecific(key);
+    if (!loop) {                /* Nothing stored for this thread yet. */
+        loop = xzalloc(sizeof *loop);
+        loop->timeout_when = LLONG_MAX;     /* No timer registered. */
+        hmap_init(&loop->poll_nodes);
+        loop->watched_size = POLLFD_INCREMENT;
+        loop->watched = xzalloc(sizeof(struct pollfd) * loop->watched_size);
+        xpthread_setspecific(key, loop);
+    }
+    return loop;
+}
diff --git a/lib/poll-loop.c b/lib/poll-loop.c
index 4e751ff2c..0eef47ae8 100644
--- a/lib/poll-loop.c
+++ b/lib/poll-loop.c
@@ -77,14 +77,7 @@ find_poll_node(struct poll_loop *loop, int fd, HANDLE wevent)
     return NULL;
 }
 
-/* On Unix based systems:
- *
- *     Registers 'fd' as waiting for the specified 'events' (which should be
- *     POLLIN or POLLOUT or POLLIN | POLLOUT).  The following call to
- *     poll_block() will wake up when 'fd' becomes ready for one or more of the
- *     requested events. The 'fd's are given to poll() function later.
- *
- * On Windows system:
+/* On Windows system:
  *
  *     If 'fd' is specified, create a new 'wevent'. Association of 'fd' and
  *     'wevent' for 'events' happens in poll_block(). If 'wevent' is specified,
@@ -120,11 +113,9 @@ poll_create_node(int fd, HANDLE wevent, short int events, const char *where)
                     hash_2words(fd, (uint32_t)wevent));
         node->pollfd.fd = fd;
         node->pollfd.events = events;
-#ifdef _WIN32
         if (!wevent) {
             wevent = CreateEvent(NULL, FALSE, FALSE, NULL);
         }
-#endif
         node->wevent = wevent;
         node->where = where;
     }
@@ -149,7 +140,6 @@ poll_fd_wait_at(int fd, short int events, const char *where)
     poll_create_node(fd, 0, events, where);
 }
 
-#ifdef _WIN32
 /* Registers for the next call to poll_block() to wake up when 'wevent' is
  * signaled.
  *
@@ -165,7 +155,6 @@ poll_wevent_wait_at(HANDLE wevent, const char *where)
 {
     poll_create_node(0, wevent, 0, where);
 }
-#endif /* _WIN32 */
 
 /* Causes the following call to poll_block() to block for no more than 'msec'
  * milliseconds.  If 'msec' is nonpositive, the following call to poll_block()
@@ -302,12 +291,10 @@ free_poll_nodes(struct poll_loop *loop)
 
     HMAP_FOR_EACH_SAFE (node, next, hmap_node, &loop->poll_nodes) {
         hmap_remove(&loop->poll_nodes, &node->hmap_node);
-#ifdef _WIN32
         if (node->wevent && node->pollfd.fd) {
             WSAEventSelect(node->pollfd.fd, NULL, 0);
             CloseHandle(node->wevent);
         }
-#endif
         free(node);
     }
 }
@@ -337,15 +324,12 @@ poll_block(void)
     timewarp_run();
     pollfds = xmalloc(hmap_count(&loop->poll_nodes) * sizeof *pollfds);
 
-#ifdef _WIN32
     wevents = xmalloc(hmap_count(&loop->poll_nodes) * sizeof *wevents);
-#endif
 
     /* Populate with all the fds and events. */
     i = 0;
     HMAP_FOR_EACH (node, hmap_node, &loop->poll_nodes) {
         pollfds[i] = node->pollfd;
-#ifdef _WIN32
         wevents[i] = node->wevent;
         if (node->pollfd.fd && node->wevent) {
             short int wsa_events = 0;
@@ -357,7 +341,6 @@ poll_block(void)
             }
             WSAEventSelect(node->pollfd.fd, node->wevent, wsa_events);
         }
-#endif
         i++;
     }
 
-- 
2.20.1



More information about the dev mailing list