[ovs-dev] [PATCH 2/2 v3] lib/dpif-netlink.c: add support for packet receive on Windows

Nithin Raju nithin at vmware.com
Thu Oct 23 15:27:34 UTC 2014


In this patch, we add support in dpif-netlink.c to receive packets on
Windows. Windows does not natively support epoll(). Even though there
are mechanisms/interfaces that provide functionality similar to epoll(),
we take a simple approach of using a pool of sockets.

Here are some details of the implementaion to aid review:
1. There's pool of sockets per upcall handler.
2. The pool of sockets is initialized while setting up the handler in
dpif_netlink_refresh_channels() primarily.
3. When sockets are to be allocated for a vport, we walk through the
pool of sockets for all handlers and pick one of the sockets in each of
the pool. Within a handler's pool, sockets are picked in a round-robin
fashion.
4. We currently support only 1 handler, since there are some kernel
changes needed for support more than 1 handler per vport.
5. The pool size is also set to 1 currently.

The restructions imposed by #4 and #5 can be removed in the future
without much code churn.

Validation:
1. With a hacked up kernel which figures out the netlink socket that is
designated to receive packets, we are cable to perform pings between 2
VMs on the same Hyper-V host.
2. Compiled the code in Linux as well.
3. Tested with pool size == 2 as well, though in this patch we set the
pool size = 1.

Signed-off-by: Nithin Raju <nithin at vmware.com>
---
 lib/dpif-netlink.c |  348 +++++++++++++++++++++++++++++++++++++++++++--------
 1 files changed, 293 insertions(+), 55 deletions(-)

diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
index 67c2814..9ed7d73 100644
--- a/lib/dpif-netlink.c
+++ b/lib/dpif-netlink.c
@@ -56,6 +56,11 @@
 #include "vlog.h"
 
 VLOG_DEFINE_THIS_MODULE(dpif_netlink);
+#ifdef _WIN32
+enum { WINDOWS = 1 };
+#else
+enum { WINDOWS = 0 };
+#endif
 enum { MAX_PORTS = USHRT_MAX };
 
 /* This ethtool flag was introduced in Linux 2.6.24, so it might be
@@ -137,12 +142,32 @@ struct dpif_channel {
     long long int last_poll;    /* Last time this channel was polled. */
 };
 
+#ifdef _WIN32
+#define VPORT_SOCK_POOL_SIZE 1
+/* On Windows, there is no native support for epoll.  There are equivalent
+ * interfaces though, that are not used currently.  For simpicity, a pool of
+ * netlink sockets is used.  Each socket is represented by 'struct
+ * dpif_windows_vport_sock'.  Since it is a pool, multiple OVS ports may be
+ * sharing the same socket.  In the future, we can add a reference count and
+ * such fields. */
+struct dpif_windows_vport_sock {
+    struct nl_sock *nl_sock;    /* netlink socket. */
+};
+#endif
+
 struct dpif_handler {
     struct dpif_channel *channels;/* Array of channels for each handler. */
     struct epoll_event *epoll_events;
     int epoll_fd;                 /* epoll fd that includes channel socks. */
     int n_events;                 /* Num events returned by epoll_wait(). */
     int event_offset;             /* Offset into 'epoll_events'. */
+
+#ifdef _WIN32
+    /* Pool of sockets. */
+    struct dpif_windows_vport_sock *vport_sock_pool;
+    size_t last_used_pool_idx; /* Index to aid in allocating a
+                                  socket in the pool to a port. */
+#endif
 };
 
 /* Datapath interface for the openvswitch Linux kernel module. */
@@ -184,6 +209,7 @@ static int dpif_netlink_init(void);
 static int open_dpif(const struct dpif_netlink_dp *, struct dpif **);
 static uint32_t dpif_netlink_port_get_pid(const struct dpif *,
                                           odp_port_t port_no, uint32_t hash);
+static void dpif_netlink_handler_uninit(struct dpif_handler *handler);
 static int dpif_netlink_refresh_channels(struct dpif_netlink *,
                                          uint32_t n_handlers);
 static void dpif_netlink_vport_to_ofpbuf(const struct dpif_netlink_vport *,
@@ -283,7 +309,7 @@ open_dpif(const struct dpif_netlink_dp *dp, struct dpif **dpifp)
 /* Destroys the netlink sockets pointed by the elements in 'socksp'
  * and frees the 'socksp'.  */
 static void
-vport_del_socksp(struct nl_sock **socksp, uint32_t n_socks)
+vport_del_socksp__(struct nl_sock **socksp, uint32_t n_socks)
 {
     size_t i;
 
@@ -297,7 +323,7 @@ vport_del_socksp(struct nl_sock **socksp, uint32_t n_socks)
 /* Creates an array of netlink sockets.  Returns an array of the
  * corresponding pointers.  Records the error in 'error'. */
 static struct nl_sock **
-vport_create_socksp(uint32_t n_socks, int *error)
+vport_create_socksp__(uint32_t n_socks, int *error)
 {
     struct nl_sock **socksp = xzalloc(n_socks * sizeof *socksp);
     size_t i;
@@ -312,11 +338,131 @@ vport_create_socksp(uint32_t n_socks, int *error)
     return socksp;
 
 error:
-    vport_del_socksp(socksp, n_socks);
+    vport_del_socksp__(socksp, n_socks);
 
     return NULL;
 }
 
+#ifdef _WIN32
+static void
+vport_delete_sock_pool(struct dpif_handler *handler)
+    OVS_REQ_WRLOCK(dpif->upcall_lock)
+{
+    if (handler->vport_sock_pool) {
+        uint32_t i;
+        struct dpif_windows_vport_sock *sock_pool =
+            handler->vport_sock_pool;
+
+        for (i = 0; i < VPORT_SOCK_POOL_SIZE; i++) {
+            if (sock_pool[i].nl_sock) {
+                nl_sock_unsubscribe_packets(sock_pool[i].nl_sock);
+                nl_sock_destroy(sock_pool[i].nl_sock);
+                sock_pool[i].nl_sock = NULL;
+            }
+        }
+
+        free(handler->vport_sock_pool);
+        handler->vport_sock_pool = NULL;
+    }
+}
+
+static int
+vport_create_sock_pool(struct dpif_handler *handler)
+    OVS_REQ_WRLOCK(dpif->upcall_lock)
+{
+    struct dpif_windows_vport_sock *sock_pool;
+    size_t i;
+    int error = 0;
+
+    sock_pool = xzalloc(VPORT_SOCK_POOL_SIZE * sizeof *sock_pool);
+    for (i = 0; i < VPORT_SOCK_POOL_SIZE; i++) {
+        error = nl_sock_create(NETLINK_GENERIC, &sock_pool[i].nl_sock);
+        if (error) {
+            goto error;
+        }
+
+        /* Enable the netlink socket to receive packets.  This is equivalent to
+         * calling nl_sock_join_mcgroup() to receive events. */
+        error = nl_sock_subscribe_packets(sock_pool[i].nl_sock);
+        if (error) {
+           goto error;
+        }
+    }
+
+    handler->vport_sock_pool = sock_pool;
+    handler->last_used_pool_idx = 0;
+    return 0;
+
+error:
+    vport_delete_sock_pool(handler);
+    return error;
+}
+
+/* Returns an array pointers to netlink sockets.  The sockets are picked from a
+ * pool. Records the error in 'error'. */
+static struct nl_sock **
+vport_create_socksp_windows(struct dpif_netlink *dpif, int *error)
+    OVS_REQ_WRLOCK(dpif->upcall_lock)
+{
+    uint32_t n_socks = dpif->n_handlers;
+    struct nl_sock **socksp;
+    size_t i;
+
+    ovs_assert(n_socks <= 1);
+    socksp = xzalloc(n_socks * sizeof *socksp);
+
+    /* Pick netlink sockets to use in a round-robin fashion from each
+     * handler's pool of sockets. */
+    for (i = 0; i < n_socks; i++) {
+        struct dpif_handler *handler = &dpif->handlers[i];
+        struct dpif_windows_vport_sock *sock_pool = handler->vport_sock_pool;
+        size_t index = handler->last_used_pool_idx;
+
+        /* A pool of sockets is allocated when the handler is initialized. */
+        if (sock_pool == NULL) {
+            free(socksp);
+            *error = EINVAL;
+            return NULL;
+        }
+
+        ovs_assert(index < VPORT_SOCK_POOL_SIZE);
+        socksp[i] = sock_pool[index].nl_sock;
+        socksp[i] = sock_pool[index].nl_sock;
+        ovs_assert(socksp[i]);
+        index = (index == VPORT_SOCK_POOL_SIZE - 1) ? 0 : index + 1;
+        handler->last_used_pool_idx = index;
+    }
+
+    return socksp;
+}
+
+static void
+vport_del_socksp_windows(struct dpif_netlink *dpif, struct nl_sock **socksp)
+{
+    free(socksp);
+}
+#endif /* _WIN32 */
+
+static struct nl_sock **
+vport_create_socksp(struct dpif_netlink *dpif, int *error)
+{
+#ifdef _WIN32
+    return vport_create_socksp_windows(dpif, error);
+#else
+    return vport_create_socksp__(dpif->n_handlers, error);
+#endif
+}
+
+static void
+vport_del_socksp(struct dpif_netlink *dpif, struct nl_sock **socksp)
+{
+#ifdef _WIN32
+    vport_del_socksp_windows(dpif, socksp);
+#else
+    vport_del_socksp__(socksp, dpif->n_handlers);
+#endif
+}
+
 /* Given the array of pointers to netlink sockets 'socksp', returns
  * the array of corresponding pids. If the 'socksp' is NULL, returns
  * a single-element array of value 0. */
@@ -354,6 +500,7 @@ vport_get_pids(struct dpif_netlink *dpif, uint32_t port_idx,
     if (!dpif->handlers[0].channels[port_idx].sock) {
         return false;
     }
+    ovs_assert(!WINDOWS && dpif->n_handlers <= 1);
 
     pids = xzalloc(dpif->n_handlers * sizeof *pids);
 
@@ -415,11 +562,7 @@ vport_add_channels(struct dpif_netlink *dpif, odp_port_t port_no,
     for (i = 0; i < dpif->n_handlers; i++) {
         struct dpif_handler *handler = &dpif->handlers[i];
 
-#ifdef _WIN32
-        /*
-         * XXX : Map appropiate Windows handle
-         */
-#else
+#ifndef _WIN32
         if (epoll_ctl(handler->epoll_fd, EPOLL_CTL_ADD, nl_sock_fd(socksp[i]),
                       &event) < 0) {
             error = errno;
@@ -434,11 +577,7 @@ vport_add_channels(struct dpif_netlink *dpif, odp_port_t port_no,
 
 error:
     for (j = 0; j < i; j++) {
-#ifdef _WIN32
-        /*
-         * XXX : Map appropiate Windows handle
-         */
-#else
+#ifndef _WIN32
         epoll_ctl(dpif->handlers[j].epoll_fd, EPOLL_CTL_DEL,
                   nl_sock_fd(socksp[j]), NULL);
 #endif
@@ -467,16 +606,11 @@ vport_del_channels(struct dpif_netlink *dpif, odp_port_t port_no)
 
     for (i = 0; i < dpif->n_handlers; i++) {
         struct dpif_handler *handler = &dpif->handlers[i];
-
-#ifdef _WIN32
-        /*
-         * XXX : Map appropiate Windows handle
-         */
-#else
+#ifndef _WIN32
         epoll_ctl(handler->epoll_fd, EPOLL_CTL_DEL,
                   nl_sock_fd(handler->channels[port_idx].sock), NULL);
-#endif
         nl_sock_destroy(handler->channels[port_idx].sock);
+#endif
         handler->channels[port_idx].sock = NULL;
         handler->event_offset = handler->n_events = 0;
     }
@@ -517,13 +651,7 @@ destroy_all_channels(struct dpif_netlink *dpif)
     for (i = 0; i < dpif->n_handlers; i++) {
         struct dpif_handler *handler = &dpif->handlers[i];
 
-#ifdef _WIN32
-        /*
-         * XXX : Map appropiate Windows handle
-         */
-#else
-        close(handler->epoll_fd);
-#endif
+        dpif_netlink_handler_uninit(handler);
         free(handler->epoll_events);
         free(handler->channels);
     }
@@ -688,7 +816,7 @@ dpif_netlink_port_add__(struct dpif_netlink *dpif, struct netdev *netdev,
     int error = 0;
 
     if (dpif->handlers) {
-        socksp = vport_create_socksp(dpif->n_handlers, &error);
+        socksp = vport_create_socksp(dpif, &error);
         if (!socksp) {
             return error;
         }
@@ -702,16 +830,14 @@ dpif_netlink_port_add__(struct dpif_netlink *dpif, struct netdev *netdev,
         VLOG_WARN_RL(&error_rl, "%s: cannot create port `%s' because it has "
                      "unsupported type `%s'",
                      dpif_name(&dpif->dpif), name, type);
-        vport_del_socksp(socksp, dpif->n_handlers);
+        vport_del_socksp(dpif, socksp);
         return EINVAL;
     }
     request.name = name;
 
     if (request.type == OVS_VPORT_TYPE_NETDEV) {
 #ifdef _WIN32
-        /*
-         * XXX : Map appropiate Windows handle
-         */
+        /* XXX : Map appropiate Windows handle */
 #else
         netdev_linux_ethtool_set_flag(netdev, ETH_FLAG_LRO, "LRO", false);
 #endif
@@ -740,7 +866,7 @@ dpif_netlink_port_add__(struct dpif_netlink *dpif, struct netdev *netdev,
                       dpif_name(&dpif->dpif), *port_nop);
         }
 
-        vport_del_socksp(socksp, dpif->n_handlers);
+        vport_del_socksp(dpif, socksp);
         goto exit;
     }
 
@@ -756,7 +882,7 @@ dpif_netlink_port_add__(struct dpif_netlink *dpif, struct netdev *netdev,
             request.dp_ifindex = dpif->dp_ifindex;
             request.port_no = *port_nop;
             dpif_netlink_vport_transact(&request, NULL, NULL);
-            vport_del_socksp(socksp, dpif->n_handlers);
+            vport_del_socksp(dpif, socksp);
             goto exit;
         }
     }
@@ -1500,6 +1626,34 @@ dpif_netlink_operate(struct dpif *dpif_, struct dpif_op **ops, size_t n_ops)
     }
 }
 
+#if _WIN32
+static void
+dpif_netlink_handler_uninit(struct dpif_handler *handler)
+{
+    vport_delete_sock_pool(handler);
+}
+
+static int
+dpif_netlink_handler_init(struct dpif_handler *handler)
+{
+    return vport_create_sock_pool(handler);
+}
+#else
+
+static int
+dpif_netlink_handler_init(struct dpif_handler *handler)
+{
+    handler->epoll_fd = epoll_create(10);
+    return handler->epoll_fd < 0 ? errno : 0;
+}
+
+static void
+dpif_netlink_handler_uninit(struct dpif_handler *handler)
+{
+    close(handler->epoll_fd);
+}
+#endif
+
 /* Synchronizes 'channels' in 'dpif->handlers'  with the set of vports
  * currently in 'dpif' in the kernel, by adding a new set of channels for
  * any kernel vport that lacks one and deleting any channels that have no
@@ -1517,30 +1671,30 @@ dpif_netlink_refresh_channels(struct dpif_netlink *dpif, uint32_t n_handlers)
     int retval = 0;
     size_t i;
 
+    ovs_assert(!WINDOWS || n_handlers <= 1);
+    ovs_assert(!WINDOWS || dpif->n_handlers <= 1);
+
     if (dpif->n_handlers != n_handlers) {
         destroy_all_channels(dpif);
         dpif->handlers = xzalloc(n_handlers * sizeof *dpif->handlers);
         for (i = 0; i < n_handlers; i++) {
+            int error;
             struct dpif_handler *handler = &dpif->handlers[i];
 
-#ifdef _WIN32
-        /*
-         * XXX : Map appropiate Windows handle
-         */
-#else
-            handler->epoll_fd = epoll_create(10);
-            if (handler->epoll_fd < 0) {
+            error = dpif_netlink_handler_init(handler);
+            if (error) {
                 size_t j;
+                struct dpif_handler *tmp = &dpif->handlers[i];
+
 
                 for (j = 0; j < i; j++) {
-                    close(dpif->handlers[j].epoll_fd);
+                    dpif_netlink_handler_uninit(tmp);
                 }
                 free(dpif->handlers);
                 dpif->handlers = NULL;
 
-                return errno;
+                return error;
             }
-#endif
         }
         dpif->n_handlers = n_handlers;
     }
@@ -1563,8 +1717,7 @@ dpif_netlink_refresh_channels(struct dpif_netlink *dpif, uint32_t n_handlers)
 
         if (port_no >= dpif->uc_array_size
             || !vport_get_pids(dpif, port_no, &upcall_pids)) {
-            struct nl_sock **socksp = vport_create_socksp(dpif->n_handlers,
-                                                          &error);
+            struct nl_sock **socksp = vport_create_socksp(dpif, &error);
 
             if (!socksp) {
                 goto error;
@@ -1574,7 +1727,7 @@ dpif_netlink_refresh_channels(struct dpif_netlink *dpif, uint32_t n_handlers)
             if (error) {
                 VLOG_INFO("%s: could not add channels for port %s",
                           dpif_name(&dpif->dpif), vport.name);
-                vport_del_socksp(socksp, dpif->n_handlers);
+                vport_del_socksp(dpif, socksp);
                 retval = error;
                 goto error;
             }
@@ -1669,6 +1822,14 @@ dpif_netlink_handlers_set(struct dpif *dpif_, uint32_t n_handlers)
     struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
     int error = 0;
 
+#ifdef _WIN32
+    /* Multiple upcall handlers will be supported once kernel datpaath supports
+     * it. */
+    if (n_handlers > 1) {
+        return error;
+    }
+#endif
+
     fat_rwlock_wrlock(&dpif->upcall_lock);
     if (dpif->handlers) {
         error = dpif_netlink_refresh_channels(dpif, n_handlers);
@@ -1754,6 +1915,73 @@ parse_odp_packet(struct ofpbuf *buf, struct dpif_upcall *upcall,
     return 0;
 }
 
+#ifdef _WIN32
+#define PACKET_RECV_BATCH_SIZE 50
+static int
+dpif_netlink_recv_windows(struct dpif_netlink *dpif, uint32_t handler_id,
+                          struct dpif_upcall *upcall, struct ofpbuf *buf)
+    OVS_REQ_RDLOCK(dpif->upcall_lock)
+{
+    struct dpif_handler *handler;
+    int read_tries = 0;
+    struct dpif_windows_vport_sock *sock_pool;
+    uint32_t i;
+
+    if (!dpif->handlers) {
+        return EAGAIN;
+    }
+
+    /* Only one handler is supported currently. */
+    if (handler_id >= 1) {
+        return EAGAIN;
+    }
+
+    if (handler_id >= dpif->n_handlers) {
+        return EAGAIN;
+    }
+
+    handler = &dpif->handlers[handler_id];
+    sock_pool = handler->vport_sock_pool;
+
+    for (i = 0; i < VPORT_SOCK_POOL_SIZE; i++) {
+        for (;;) {
+            int dp_ifindex;
+            int error;
+
+            if (++read_tries > PACKET_RECV_BATCH_SIZE) {
+                return EAGAIN;
+            }
+
+            error = nl_sock_recv(sock_pool[i].nl_sock, buf, false);
+            if (error == ENOBUFS) {
+                /* ENOBUFS typically means that we've received so many
+                 * packets that the buffer overflowed.  Try again
+                 * immediately because there's almost certainly a packet
+                 * waiting for us. */
+                /* XXX: report_loss(dpif, ch, idx, handler_id); */
+                continue;
+            }
+
+            /* XXX: ch->last_poll = time_msec(); */
+            if (error) {
+                if (error == EAGAIN) {
+                    break;
+                }
+                return error;
+            }
+
+            error = parse_odp_packet(buf, upcall, &dp_ifindex);
+            if (!error && dp_ifindex == dpif->dp_ifindex) {
+                return 0;
+            } else if (error) {
+                return error;
+            }
+        }
+    }
+
+    return EAGAIN;
+}
+#else
 static int
 dpif_netlink_recv__(struct dpif_netlink *dpif, uint32_t handler_id,
                     struct dpif_upcall *upcall, struct ofpbuf *buf)
@@ -1772,15 +2000,11 @@ dpif_netlink_recv__(struct dpif_netlink *dpif, uint32_t handler_id,
 
         handler->event_offset = handler->n_events = 0;
 
-#ifdef _WIN32
-        retval = dpif->uc_array_size;
-        handler->event_offset = 0;
-#else
         do {
             retval = epoll_wait(handler->epoll_fd, handler->epoll_events,
                                 dpif->uc_array_size, 0);
         } while (retval < 0 && errno == EINTR);
-#endif
+
         if (retval < 0) {
             static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
             VLOG_WARN_RL(&rl, "epoll_wait failed (%s)", ovs_strerror(errno));
@@ -1832,6 +2056,7 @@ dpif_netlink_recv__(struct dpif_netlink *dpif, uint32_t handler_id,
 
     return EAGAIN;
 }
+#endif
 
 static int
 dpif_netlink_recv(struct dpif *dpif_, uint32_t handler_id,
@@ -1841,7 +2066,11 @@ dpif_netlink_recv(struct dpif *dpif_, uint32_t handler_id,
     int error;
 
     fat_rwlock_rdlock(&dpif->upcall_lock);
+#ifdef _WIN32
+    error = dpif_netlink_recv_windows(dpif, handler_id, upcall, buf);;
+#else
     error = dpif_netlink_recv__(dpif, handler_id, upcall, buf);
+#endif
     fat_rwlock_unlock(&dpif->upcall_lock);
 
     return error;
@@ -1852,9 +2081,18 @@ dpif_netlink_recv_wait__(struct dpif_netlink *dpif, uint32_t handler_id)
     OVS_REQ_RDLOCK(dpif->upcall_lock)
 {
 #ifdef _WIN32
-        /*
-         * XXX : Map appropiate Windows handle
-         */
+    uint32_t i;
+    struct dpif_windows_vport_sock *sock_pool =
+        dpif->handlers[handler_id].vport_sock_pool;
+
+    /* Only one handler is supported currently. */
+    if (handler_id >= 1) {
+        return;
+    }
+
+    for (i = 0; i < VPORT_SOCK_POOL_SIZE; i++) {
+        nl_sock_wait(sock_pool[i].nl_sock, POLLIN);
+    }
 #else
     if (dpif->handlers && handler_id < dpif->n_handlers) {
         struct dpif_handler *handler = &dpif->handlers[handler_id];
-- 
1.7.4.1




More information about the dev mailing list