[ovs-dev] [PATCH v3 2/3] EPOLL Support for persistent fds

anton.ivanov at cambridgegreys.com anton.ivanov at cambridgegreys.com
Wed Feb 26 11:52:38 UTC 2020


From: Anton Ivanov <anton.ivanov at cambridgegreys.com>

On Linux, switch fds that are marked as persistent in persistent
poll loops to use epoll instead of poll.

Signed-off-by: Anton Ivanov <anton.ivanov at cambridgegreys.com>
---
 lib/poll-loop.c | 103 +++++++++++++++++++++++++++++++++++++++++++++++-
 lib/timeval.c   |  86 ++++++++++++++++++++++++++++++++++++++++
 lib/timeval.h   |   7 ++++
 3 files changed, 194 insertions(+), 2 deletions(-)

diff --git a/lib/poll-loop.c b/lib/poll-loop.c
index 68e44eba2..5ce5977d5 100644
--- a/lib/poll-loop.c
+++ b/lib/poll-loop.c
@@ -38,6 +38,14 @@ VLOG_DEFINE_THIS_MODULE(poll_loop);
 COVERAGE_DEFINE(poll_create_node);
 COVERAGE_DEFINE(poll_zero_timeout);
 
+#define MAX_EPOLL_EVENTS 64
+
+#ifdef __linux__
+#define USE_EPOLL
+#include <unistd.h>
+#include <sys/epoll.h>
+#endif
+
 struct poll_node {
     struct hmap_node hmap_node;
     struct pollfd pollfd;       /* Events to pass to time_poll(). */
@@ -45,7 +53,6 @@ struct poll_node {
     const char *where;          /* Where poll_node was created. */
     bool valid;                 /* Marked invalid if we got a HUP/NVAL from poll */
 };
-
 struct poll_loop {
     /* All active poll waiters. */
     struct hmap poll_nodes;
@@ -55,10 +62,52 @@ struct poll_loop {
     long long int timeout_when; /* In msecs as returned by time_msec(). */
     const char *timeout_where;  /* Where 'timeout_when' was set. */
     bool persist;
+#ifdef USE_EPOLL
+    int epoll_fd;
+    struct epoll_event epoll_events[MAX_EPOLL_EVENTS];
+#endif
 };
 
 static struct poll_loop *poll_loop(void);
 
+#ifdef USE_EPOLL
+static inline int poll_to_epoll_events(short events) {
+    int ret = 0;        /* epoll_ctl() equivalent of poll 'events'. */
+    if (events & POLLIN) {
+        ret |= EPOLLIN;
+    }
+    if (events & POLLOUT) {
+        ret |= EPOLLOUT;
+    }
+    /* POLLERR needs no mapping: epoll always reports error conditions.
+     * epoll has no POLLNVAL and splits hangup into EPOLLHUP and
+     * EPOLLRDHUP, so a request for either poll flag subscribes to both.
+     */
+    if (events & (POLLHUP | POLLNVAL)) {
+        ret |= (EPOLLHUP | EPOLLRDHUP);
+    }
+    return ret;
+}
+
+static inline short epoll_to_poll_events(int events) {
+    short ret = 0;      /* poll() 'revents' equivalent of 'events'. */
+    if (events & EPOLLIN) {
+        ret |= POLLIN;
+    }
+    if (events & EPOLLOUT) {
+        ret |= POLLOUT;
+    }
+    if (events & EPOLLERR) {
+        ret |= POLLERR;     /* Always reported by epoll, as by poll(). */
+    }
+    /* Both epoll hangup flags are folded into POLLHUP; epoll has no NVAL. */
+    if (events & (EPOLLHUP | EPOLLRDHUP)) {
+        ret |= POLLHUP;
+    }
+    return ret;
+}
+#endif
+
 /* Look up the node with same fd or wevent. */
 static struct poll_node *
 find_poll_node(struct poll_loop *loop, int fd, HANDLE wevent)
@@ -106,6 +155,9 @@ static struct poll_node
 {
     struct poll_loop *loop = poll_loop();
     struct poll_node *node;
+#ifdef USE_EPOLL
+    struct epoll_event event;
+#endif
 
     COVERAGE_INC(poll_create_node);
 
@@ -115,6 +167,13 @@ static struct poll_node
     /* Check for duplicate.  If found, "or" the events. */
     node = find_poll_node(loop, fd, wevent);
     if (node) {
+#ifdef USE_EPOLL
+        if (loop->persist && (node->pollfd.events != events)) {
+            event.events = poll_to_epoll_events(node->pollfd.events | events);
+            event.data.ptr = node;
+            epoll_ctl(loop->epoll_fd, EPOLL_CTL_MOD, fd, &event);
+        }
+#endif
         node->pollfd.events |= events;
     } else {
         node = xzalloc(sizeof *node);
@@ -130,6 +189,13 @@ static struct poll_node
         node->wevent = wevent;
         node->where = where;
         node->valid = true;
+#ifdef USE_EPOLL
+        if (loop->persist) {
+            event.events = poll_to_epoll_events(events);
+            event.data.ptr = node;
+            epoll_ctl(loop->epoll_fd, EPOLL_CTL_ADD, fd, &event);
+        }
+#endif
     }
     return node;
 }
@@ -186,6 +252,11 @@ poll_fd_deregister_at(int fd, const char *where) {
 
     node = find_poll_node(loop, fd, 0);
     if (node) {
+#ifdef USE_EPOLL
+        if (loop->persist) {
+            epoll_ctl(loop->epoll_fd, EPOLL_CTL_DEL, node->pollfd.fd, NULL);
+        }
+#endif
         hmap_remove(&loop->poll_nodes, &node->hmap_node);
     }
 }
@@ -344,6 +415,11 @@ free_poll_nodes(struct poll_loop *loop)
 
     HMAP_FOR_EACH_SAFE (node, next, hmap_node, &loop->poll_nodes) {
         hmap_remove(&loop->poll_nodes, &node->hmap_node);
+#ifdef USE_EPOLL
+        if (loop->persist) {
+            epoll_ctl(loop->epoll_fd, EPOLL_CTL_DEL, node->pollfd.fd, NULL);
+        }
+#endif
 #ifdef _WIN32
         if (node->wevent && node->pollfd.fd) {
             WSAEventSelect(node->pollfd.fd, NULL, 0);
@@ -455,6 +531,7 @@ persist_poll_block(struct poll_loop *loop)
 
     /* Populate with all the fds and events. */
     counter = 0;
+#ifndef USE_EPOLL
     HMAP_FOR_EACH (node, hmap_node, &loop->poll_nodes) {
         if (node->pollfd.events && node->valid) {
             pollfds[counter] = node->pollfd;
@@ -478,6 +555,12 @@ persist_poll_block(struct poll_loop *loop)
 
     retval = time_poll(pollfds, hmap_count(&loop->poll_nodes), wevents,
                        loop->timeout_when, &elapsed);
+#else
+    retval = time_epoll_wait(loop->epoll_fd,
+        (struct epoll_event *) &loop->epoll_events, MAX_EPOLL_EVENTS, loop->timeout_when, &elapsed);
+    counter = retval;
+#endif
+
     if (retval < 0) {
         static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
         VLOG_ERR_RL(&rl, "poll: %s", ovs_strerror(-retval));
@@ -485,7 +568,13 @@ persist_poll_block(struct poll_loop *loop)
         log_wakeup(loop->timeout_where, NULL, elapsed);
     } else {
         for (i = 0; i < counter; i++) {
+#ifdef USE_EPOLL
+            node = loop->epoll_events[i].data.ptr;
+            pollfds[i] = node->pollfd;
+            pollfds[i].revents = epoll_to_poll_events(loop->epoll_events[i].events);
+#else
             node = find_poll_node(loop, pollfds[i].fd, 0);
+#endif
             if (!node) {
                 VLOG_FATAL("poll: persistence state corrupted, no hash entry for %d", pollfds[i].fd);
             }
@@ -546,12 +635,19 @@ free_poll_loop(void *loop_)
     free_poll_nodes(loop);
     hmap_destroy(&loop->poll_nodes);
+#ifdef USE_EPOLL
+    if (loop->persist) {
+        close(loop->epoll_fd);  /* Must precede free(loop): reads 'loop'. */
+    }
+#endif
     free(loop);
 }
 
 void poll_enable_persist(void) {
     struct poll_loop *loop = poll_loop();
-
     loop->persist = true;
+#ifdef USE_EPOLL
+    loop->epoll_fd = epoll_create(MAX_EPOLL_EVENTS); /* XXX unchecked. */
+#endif
 }
 
 static struct poll_loop *
@@ -573,6 +669,9 @@ poll_loop(void)
         hmap_init(&loop->poll_nodes);
         xpthread_setspecific(key, loop);
         loop->persist = false;
+#ifdef USE_EPOLL
+        loop->epoll_fd = -1;
+#endif
     }
     return loop;
 }
diff --git a/lib/timeval.c b/lib/timeval.c
index 193c7bab1..6b1f1cf5a 100644
--- a/lib/timeval.c
+++ b/lib/timeval.c
@@ -38,6 +38,9 @@
 #include "unixctl.h"
 #include "util.h"
 #include "openvswitch/vlog.h"
+#ifdef __linux__
+#include <sys/epoll.h>
+#endif
 
 VLOG_DEFINE_THIS_MODULE(timeval);
 
@@ -270,6 +273,89 @@ time_alarm(unsigned int secs)
     deadline = now < LLONG_MAX - msecs ? now + msecs : LLONG_MAX;
 }
 
+#ifdef __linux__
+
+/* Like epoll_wait() on 'epoll_fd', 'events' and 'max', except:
+ *
+ *      - The timeout 'timeout_when' is specified as an absolute time, as
+ *        defined by time_msec(), instead of a duration.
+ *
+ *      - On error, returns a negative error code (instead of setting errno).
+ *
+ *      - If interrupted by a signal, retries automatically until the original
+ *        timeout is reached.  (Because of this property, this function will
+ *        never return -EINTR.)
+ *
+ * Stores the number of milliseconds elapsed during the call in '*elapsed'. */
+int
+time_epoll_wait(int epoll_fd, struct epoll_event *events, int max,
+          long long int timeout_when, int *elapsed)
+{
+    long long int *last_wakeup = last_wakeup_get();
+    long long int start;
+    bool quiescent;
+    int retval = 0;
+
+    time_init();
+    coverage_clear();
+    coverage_run();
+    if (*last_wakeup && !thread_is_pmd()) {
+        log_poll_interval(*last_wakeup);
+    }
+    start = time_msec();
+
+    timeout_when = MIN(timeout_when, deadline); /* Cap at 'deadline'. */
+    quiescent = ovsrcu_is_quiescent();
+
+    for (;;) {
+        long long int now = time_msec();
+        int time_left;
+
+        if (now >= timeout_when) {
+            time_left = 0;
+        } else if ((unsigned long long int) timeout_when - now > INT_MAX) {
+            time_left = INT_MAX;    /* epoll_wait() takes an int. */
+        } else {
+            time_left = timeout_when - now;
+        }
+
+        if (!quiescent) {
+            if (!time_left) {       /* Zero timeout: will not block. */
+                ovsrcu_quiesce();
+            } else {
+                ovsrcu_quiesce_start();
+            }
+        }
+
+        retval = epoll_wait(epoll_fd, events, max, time_left);
+        if (retval < 0) {
+            retval = -errno;        /* Report as negative errno. */
+        }
+
+        if (!quiescent && time_left) {
+            ovsrcu_quiesce_end();
+        }
+
+        if (deadline <= time_msec()) {
+            fatal_signal_handler(SIGALRM);
+            if (retval < 0) {
+                retval = 0;         /* Error is dropped here. */
+            }
+            break;
+        }
+
+        if (retval != -EINTR) {     /* Retry only after a signal. */
+            break;
+        }
+    }
+    *last_wakeup = time_msec();
+    refresh_rusage();
+    *elapsed = *last_wakeup - start;
+    return retval;
+}
+#endif
+
+
 /* Like poll(), except:
  *
  *      - The timeout is specified as an absolute time, as defined by
diff --git a/lib/timeval.h b/lib/timeval.h
index 502f703d4..d640eab17 100644
--- a/lib/timeval.h
+++ b/lib/timeval.h
@@ -20,6 +20,9 @@
 #include <time.h>
 #include "openvswitch/type-props.h"
 #include "util.h"
+#ifdef __linux__
+#include <sys/epoll.h>
+#endif
 
 #ifdef  __cplusplus
 extern "C" {
@@ -59,6 +62,10 @@ long long int time_wall_usec(void);
 void time_timespec(struct timespec *);
 void time_wall_timespec(struct timespec *);
 void time_alarm(unsigned int secs);
+#ifdef __linux__
+int time_epoll_wait(int epoll_fd, struct epoll_event *events, int max,
+          long long int timeout_when, int *elapsed);
+#endif
 int time_poll(struct pollfd *, int n_pollfds, HANDLE *handles,
               long long int timeout_when, int *elapsed);
 
-- 
2.20.1



More information about the dev mailing list