[ovs-dev] [RFC 1/4] netlink: Support for memory mapped Netlink sockets

Thomas Graf tgraf at suug.ch
Thu May 22 23:15:58 UTC 2014
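
Add support for memory mapped I/O on Netlink sockets based on the
NETLINK_RX_RING/NETLINK_TX_RING socket options.  nl_sock_create_mmap()
sets up an RX and a TX ring shared between kernel and user space and
transparently falls back to a regular socket if the kernel does not
support memory mapped Netlink I/O.  Messages that do not fit into a ring
frame continue to use the linear send()/recvmsg() path, as do frames the
kernel marks NL_MMAP_STATUS_COPY.  A new nl_sock_recv_events() variant
takes the epoll events reported for the socket so that EPOLLERR can force
a linear receive to pick up error notifications; dpif-linux is converted
to pass the events along for upcall reception.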


Signed-off-by: Thomas Graf <tgraf at suug.ch>
---
 lib/dpif-linux.c       |   3 +-
 lib/netlink-protocol.h |  39 ++++++
 lib/netlink-socket.c   | 345 ++++++++++++++++++++++++++++++++++++++++++++-----
 lib/netlink-socket.h   |  10 +-
 4 files changed, 361 insertions(+), 36 deletions(-)
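
For reference, a rough usage sketch of the new API follows; it is not part
of the patch.  It assumes an nl_sock_fd() style accessor (not shown here) to
register the socket with epoll, abbreviates error handling, and uses the
existing ofpbuf_new()/ofpbuf_delete() helpers for the receive buffer.

    #include <sys/epoll.h>
    #include "netlink-socket.h"
    #include "ofpbuf.h"

    /* Illustrative only: receive messages over a memory mapped Netlink
     * socket, transparently falling back to copy mode on kernels without
     * NETLINK_RX_RING/NETLINK_TX_RING support. */
    static int
    mmap_recv_loop(int protocol)
    {
        struct nl_sock *sock;
        struct ofpbuf *buf = ofpbuf_new(2048);
        struct epoll_event ev, event;
        int epfd, error;

        error = nl_sock_create_mmap(protocol, &sock);
        if (error) {
            return error;
        }

        epfd = epoll_create(1);
        ev.events = EPOLLIN;
        ev.data.u32 = 0;
        /* nl_sock_fd() is assumed here; any way of obtaining the socket's
         * file descriptor for epoll registration would do. */
        epoll_ctl(epfd, EPOLL_CTL_ADD, nl_sock_fd(sock), &ev);

        for (;;) {
            if (epoll_wait(epfd, &event, 1, -1) < 1) {
                continue;
            }
            /* Pass the reported events down so that EPOLLERR forces a
             * linear receive of the error notification. */
            error = nl_sock_recv_events(sock, buf, false, event.events);
            if (error && error != EAGAIN) {
                break;
            }
            /* ... process ofpbuf_data(buf) / ofpbuf_size(buf) ... */
        }

        ofpbuf_delete(buf);
        nl_sock_destroy(sock);
        return error;
    }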

diff --git a/lib/dpif-linux.c b/lib/dpif-linux.c
index 63e66f3..42a3f72 100644
--- a/lib/dpif-linux.c
+++ b/lib/dpif-linux.c
@@ -1777,6 +1777,7 @@ dpif_linux_recv__(struct dpif_linux *dpif, uint32_t handler_id,
     while (handler->event_offset < handler->n_events) {
         int idx = handler->epoll_events[handler->event_offset].data.u32;
         struct dpif_channel *ch = &dpif->handlers[handler_id].channels[idx];
+        uint32_t events = handler->epoll_events[handler->event_offset].events;
 
         handler->event_offset++;
 
@@ -1788,7 +1789,7 @@ dpif_linux_recv__(struct dpif_linux *dpif, uint32_t handler_id,
                 return EAGAIN;
             }
 
-            error = nl_sock_recv(ch->sock, buf, false);
+            error = nl_sock_recv_events(ch->sock, buf, false, events);
             if (error == ENOBUFS) {
                 /* ENOBUFS typically means that we've received so many
                  * packets that the buffer overflowed.  Try again
diff --git a/lib/netlink-protocol.h b/lib/netlink-protocol.h
index 3009fc5..13e1216 100644
--- a/lib/netlink-protocol.h
+++ b/lib/netlink-protocol.h
@@ -161,6 +161,45 @@ enum {
 #define NETLINK_DROP_MEMBERSHIP 2
 #endif
 
+#ifndef __ALIGN_KERNEL
+#define __ALIGN_KERNEL_MASK(x, mask)	(((x) + (mask)) & ~(mask))
+#define __ALIGN_KERNEL(x, a)		__ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 1)
+#endif
+
+#ifndef NETLINK_RX_RING
+#define NETLINK_RX_RING		6
+#define NETLINK_TX_RING		7
+
+struct nl_mmap_req {
+	unsigned int	nm_block_size;
+	unsigned int	nm_block_nr;
+	unsigned int	nm_frame_size;
+	unsigned int	nm_frame_nr;
+};
+
+struct nl_mmap_hdr {
+	unsigned int	nm_status;
+	unsigned int	nm_len;
+	__u32		nm_group;
+	/* credentials */
+	__u32		nm_pid;
+	__u32		nm_uid;
+	__u32		nm_gid;
+};
+
+enum nl_mmap_status {
+	NL_MMAP_STATUS_UNUSED,
+	NL_MMAP_STATUS_RESERVED,
+	NL_MMAP_STATUS_VALID,
+	NL_MMAP_STATUS_COPY,
+	NL_MMAP_STATUS_SKIP,
+};
+
+#define NL_MMAP_MSG_ALIGNMENT		NLMSG_ALIGNTO
+#define NL_MMAP_MSG_ALIGN(sz)		__ALIGN_KERNEL(sz, NL_MMAP_MSG_ALIGNMENT)
+#define NL_MMAP_HDRLEN			NL_MMAP_MSG_ALIGN(sizeof(struct nl_mmap_hdr))
+#endif /* NETLINK_RX_RING */
+
 /* These were introduced all together in 2.6.23.  (We want our programs to
  * support the newer kernel features even if compiled with older headers.) */
 #ifndef CTRL_ATTR_MCAST_GRP_MAX
diff --git a/lib/netlink-socket.c b/lib/netlink-socket.c
index e4cc4ad..a264a0a 100644
--- a/lib/netlink-socket.c
+++ b/lib/netlink-socket.c
@@ -21,6 +21,8 @@
 #include <stdlib.h>
 #include <sys/types.h>
 #include <sys/uio.h>
+#include <sys/mman.h>
+#include <sys/epoll.h>
 #include <unistd.h>
 #include "coverage.h"
 #include "dynamic-string.h"
@@ -41,7 +43,9 @@ VLOG_DEFINE_THIS_MODULE(netlink_socket);
 COVERAGE_DEFINE(netlink_overflow);
 COVERAGE_DEFINE(netlink_received);
 COVERAGE_DEFINE(netlink_recv_jumbo);
+COVERAGE_DEFINE(netlink_recv_mmap);
 COVERAGE_DEFINE(netlink_sent);
+COVERAGE_DEFINE(netlink_sent_mmap);
 
 /* Linux header file confusion causes this to be undefined. */
 #ifndef SOL_NETLINK
@@ -59,12 +63,30 @@ static void log_nlmsg(const char *function, int error,
 
 /* Netlink sockets. */
 
+/* Memory mapped ring buffer for Netlink messages
+ *
+ * Total memory consumption per ring:
+ * PAGE_SIZE * NL_BLOCK_NPAGES * NL_NBLOCKS */
+#define NL_BLOCK_NPAGES       4 /* Number of pages per block */
+#define NL_NBLOCKS           64 /* Number of blocks per ring */
+#define NL_FRAME_SIZE     16384 /* Maximum message size to be carried */
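+/* With a 4 KiB page size this amounts to 16 KiB blocks, i.e. a 1 MiB ring
+ * of 64 frames per direction, or 2 MiB of mapped memory per socket. */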
+
+struct nl_ring {
+    unsigned int head;          /* Index of the next frame to use. */
+    void *ring;                 /* Start of the memory mapped ring. */
+};
+
 struct nl_sock {
     int fd;
     uint32_t next_seq;
     uint32_t pid;
     int protocol;
     unsigned int rcvbuf;        /* Receive buffer size (SO_RCVBUF). */
+    unsigned int frame_size;    /* Size of each ring frame in bytes. */
+    unsigned int frame_nr;      /* Index of the last frame in a ring. */
+    size_t ring_size;           /* Size of one ring (RX or TX) in bytes. */
+    struct nl_ring rx_ring;     /* Memory mapped RX ring, if any. */
+    struct nl_ring tx_ring;     /* Memory mapped TX ring, if any. */
 };
 
 /* Compile-time limit on iovecs, so that we can allocate a maximum-size array
@@ -112,7 +134,7 @@ nl_sock_create(int protocol, struct nl_sock **sockp)
     }
 
     *sockp = NULL;
-    sock = xmalloc(sizeof *sock);
+    sock = xzalloc(sizeof *sock);
 
     sock->fd = socket(AF_NETLINK, SOCK_RAW, protocol);
     if (sock->fd < 0) {
@@ -179,13 +201,85 @@ error:
     return retval;
 }
 
+static int
+nl_sock_set_ring(struct nl_sock *sock)
+{
+    size_t block_size = NL_BLOCK_NPAGES * getpagesize();
+    size_t ring_size;
+    void *ring;
+    struct nl_mmap_req req = {
+        .nm_block_size          = block_size,
+        .nm_block_nr            = NL_NBLOCKS,
+        .nm_frame_size          = NL_FRAME_SIZE,
+    };
+
+    req.nm_frame_nr = req.nm_block_nr * block_size / req.nm_frame_size;
+
+    if (setsockopt(sock->fd, SOL_NETLINK, NETLINK_RX_RING, &req, sizeof(req)) < 0
+        || setsockopt(sock->fd, SOL_NETLINK, NETLINK_TX_RING, &req, sizeof(req)) < 0) {
+        VLOG_INFO("mmap netlink is not supported");
+        return 0;
+    }
+
+    ring_size = req.nm_block_nr * req.nm_block_size;
+    ring = mmap(NULL, 2 * ring_size, PROT_READ | PROT_WRITE,
+                MAP_SHARED, sock->fd, 0);
+    if (ring == MAP_FAILED) {
+        VLOG_ERR("netlink mmap: %s", ovs_strerror(errno));
+        return errno;
+    }
+
+    sock->frame_size = req.nm_frame_size;
+    sock->frame_nr = req.nm_frame_nr - 1;
+    sock->ring_size = ring_size;
+    sock->rx_ring.ring = ring;
+    sock->rx_ring.head = 0;
+    sock->tx_ring.ring = (char *) ring + ring_size;
+    sock->tx_ring.head = 0;
+
+    return 0;
+}
+
+/* Creates a new memory mapped netlink socket for the given netlink 'protocol'
+ * (NETLINK_ROUTE, NETLINK_GENERIC, ...).  Falls back to a regular, unmapped
+ * socket if the kernel does not support memory mapped Netlink I/O.  Returns 0
+ * and sets '*sockp' to the new socket if successful, otherwise returns a
+ * positive errno value. */
+int
+nl_sock_create_mmap(int protocol, struct nl_sock **sockp)
+{
+    int retval;
+
+    if ((retval = nl_sock_create(protocol, sockp)) != 0) {
+        return retval;
+    }
+
+    if ((retval = nl_sock_set_ring(*sockp)) != 0) {
+        VLOG_ERR("failed to initialize memory mapped netlink socket");
+        nl_sock_destroy(*sockp);
+        return retval;
+    }
+
+    return retval;
+}
+
+/* Returns true if the netlink socket 'sock' is memory mapped. */
+bool
+nl_sock_is_mapped(const struct nl_sock *sock)
+{
+    return sock->rx_ring.ring != NULL;
+}
+
 /* Creates a new netlink socket for the same protocol as 'src'.  Returns 0 and
  * sets '*sockp' to the new socket if successful, otherwise returns a positive
  * errno value.  */
 int
 nl_sock_clone(const struct nl_sock *src, struct nl_sock **sockp)
 {
-    return nl_sock_create(src->protocol, sockp);
+    if (nl_sock_is_mapped(src)) {
+        return nl_sock_create_mmap(src->protocol, sockp);
+    } else {
+        return nl_sock_create(src->protocol, sockp);
+    }
 }
 
 /* Destroys netlink socket 'sock'. */
@@ -193,6 +287,9 @@ void
 nl_sock_destroy(struct nl_sock *sock)
 {
     if (sock) {
+        if (nl_sock_is_mapped(sock)) {
+            munmap(sock->rx_ring.ring, 2 * sock->ring_size);
+        }
         close(sock->fd);
         free(sock);
     }
@@ -243,6 +340,105 @@ nl_sock_leave_mcgroup(struct nl_sock *sock, unsigned int multicast_group)
     return 0;
 }
 
+static struct nl_mmap_hdr *
+mmap_frame(struct nl_sock *sock, struct nl_ring *r)
+{
+    char *start = r->ring;
+
+    return (struct nl_mmap_hdr *)(void *)(start + r->head * sock->frame_size);
+}
+
+static struct nl_mmap_hdr *
+mmap_next_rx_frame(struct nl_sock *sock)
+{
+    return mmap_frame(sock, &sock->rx_ring);
+}
+
+static struct nl_mmap_hdr *
+mmap_next_tx_frame(struct nl_sock *sock)
+{
+    return mmap_frame(sock, &sock->tx_ring);
+}
+
+static void
+mmap_advance_ring(struct nl_sock *sock, struct nl_ring *r)
+{
+    if (r->head != sock->frame_nr) {
+        r->head++;
+    } else {
+        r->head = 0;
+    }
+}
+
+static void
+mmap_advance_rx_ring(struct nl_sock *sock)
+{
+    mmap_advance_ring(sock, &sock->rx_ring);
+}
+
+static void
+mmap_advance_tx_ring(struct nl_sock *sock)
+{
+    mmap_advance_ring(sock, &sock->tx_ring);
+}
+
+static int
+nl_sock_send_linear(struct nl_sock *sock, const struct ofpbuf *msg,
+                    bool wait)
+{
+    int retval, error;
+
+    do {
+        retval = send(sock->fd, ofpbuf_data(msg), ofpbuf_size(msg), wait ? 0 : MSG_DONTWAIT);
+        error = retval < 0 ? errno : 0;
+    } while (error == EINTR);
+
+    return error;
+}
+
+static int
+nl_sock_send_mmap(struct nl_sock *sock, const struct ofpbuf *msg,
+                  bool wait)
+{
+    struct nl_mmap_hdr *hdr;
+    struct sockaddr_nl addr = {
+        .nl_family      = AF_NETLINK,
+    };
+    int retval, error;
+
+    if ((ofpbuf_size(msg) + NL_MMAP_HDRLEN) > sock->frame_size) {
+        return nl_sock_send_linear(sock, msg, wait);
+    }
+
+    hdr = mmap_next_tx_frame(sock);
+
+    while (hdr->nm_status != NL_MMAP_STATUS_UNUSED) {
+        /* No free frame in the TX ring.  Wait for the kernel to release
+         * one, or give up if the caller does not want to block. */
+        if (!wait) {
+            return EAGAIN;
+        }
+        nl_sock_wait(sock, POLLOUT | POLLERR);
+        poll_block();
+    }
+
+    memcpy((char *) hdr + NL_MMAP_HDRLEN, ofpbuf_data(msg), ofpbuf_size(msg));
+    hdr->nm_len     = ofpbuf_size(msg);
+    hdr->nm_status  = NL_MMAP_STATUS_VALID;
+
+    mmap_advance_tx_ring(sock);
+
+    do {
+        retval = sendto(sock->fd, NULL, 0, 0, (struct sockaddr *)&addr, sizeof(addr));
+        error = retval < 0 ? errno : 0;
+    } while (error == EINTR);
+
+    if (!error) {
+        COVERAGE_INC(netlink_sent_mmap);
+    }
+
+    return error;
+}
+
 static int
 nl_sock_send__(struct nl_sock *sock, const struct ofpbuf *msg,
                uint32_t nlmsg_seq, bool wait)
@@ -253,11 +449,11 @@ nl_sock_send__(struct nl_sock *sock, const struct ofpbuf *msg,
     nlmsg->nlmsg_len = ofpbuf_size(msg);
     nlmsg->nlmsg_seq = nlmsg_seq;
     nlmsg->nlmsg_pid = sock->pid;
-    do {
-        int retval;
-        retval = send(sock->fd, ofpbuf_data(msg), ofpbuf_size(msg), wait ? 0 : MSG_DONTWAIT);
-        error = retval < 0 ? errno : 0;
-    } while (error == EINTR);
+    if (nl_sock_is_mapped(sock)) {
+        error = nl_sock_send_mmap(sock, msg, wait);
+    } else {
+        error = nl_sock_send_linear(sock, msg, wait);
+    }
     log_nlmsg(__func__, error, ofpbuf_data(msg), ofpbuf_size(msg), sock->protocol);
     if (!error) {
         COVERAGE_INC(netlink_sent);
@@ -298,26 +494,17 @@ nl_sock_send_seq(struct nl_sock *sock, const struct ofpbuf *msg,
 }
 
 static int
-nl_sock_recv__(struct nl_sock *sock, struct ofpbuf *buf, bool wait)
+nl_sock_recv_linear(struct nl_sock *sock, struct ofpbuf *buf, bool wait,
+                    uint8_t *tail, size_t taillen)
 {
-    /* We can't accurately predict the size of the data to be received.  The
-     * caller is supposed to have allocated enough space in 'buf' to handle the
-     * "typical" case.  To handle exceptions, we make available enough space in
-     * 'tail' to allow Netlink messages to be up to 64 kB long (a reasonable
-     * figure since that's the maximum length of a Netlink attribute). */
-    struct nlmsghdr *nlmsghdr;
-    uint8_t tail[65536];
     struct iovec iov[2];
     struct msghdr msg;
-    ssize_t retval;
-
-    ovs_assert(buf->allocated >= sizeof *nlmsghdr);
-    ofpbuf_clear(buf);
+    int retval;
 
     iov[0].iov_base = ofpbuf_base(buf);
     iov[0].iov_len = buf->allocated;
     iov[1].iov_base = tail;
-    iov[1].iov_len = sizeof tail;
+    iov[1].iov_len = taillen;
 
     memset(&msg, 0, sizeof msg);
     msg.msg_iov = iov;
@@ -343,21 +530,100 @@ nl_sock_recv__(struct nl_sock *sock, struct ofpbuf *buf, bool wait)
         return E2BIG;
     }
 
-    nlmsghdr = ofpbuf_data(buf);
-    if (retval < sizeof *nlmsghdr
-        || nlmsghdr->nlmsg_len < sizeof *nlmsghdr
-        || nlmsghdr->nlmsg_len > retval) {
-        VLOG_ERR_RL(&rl, "received invalid nlmsg (%"PRIuSIZE"d bytes < %"PRIuSIZE")",
-                    retval, sizeof *nlmsghdr);
-        return EPROTO;
-    }
-
     ofpbuf_set_size(buf, MIN(retval, buf->allocated));
     if (retval > buf->allocated) {
         COVERAGE_INC(netlink_recv_jumbo);
         ofpbuf_put(buf, tail, retval - buf->allocated);
     }
 
+    return 0;
+}
+
+static int
+nl_sock_recv_mmap(struct nl_sock *sock, struct ofpbuf *buf, bool wait,
+                  uint8_t *tail, size_t taillen)
+{
+    struct nl_mmap_hdr *hdr;
+    int retval = 0;
+
+restart:
+    hdr = mmap_next_rx_frame(sock);
+
+    switch (hdr->nm_status) {
+    case NL_MMAP_STATUS_VALID:
+        if (hdr->nm_len == 0) {
+            /* An error occurred while the kernel constructed the message. */
+            hdr->nm_status = NL_MMAP_STATUS_UNUSED;
+            mmap_advance_rx_ring(sock);
+            goto restart;
+        }
+
+        ofpbuf_put(buf, (char *) hdr + NL_MMAP_HDRLEN, hdr->nm_len);
+        COVERAGE_INC(netlink_recv_mmap);
+        break;
+
+    case NL_MMAP_STATUS_COPY:
+        /* The kernel could not place the message in the ring frame (e.g. it
+         * was too large) and queued it on the regular socket receive queue
+         * instead, so it can be picked up without blocking. */
+        retval = nl_sock_recv_linear(sock, buf, false, tail, taillen);
+        if (retval) {
+            return retval;
+        }
+        break;
+
+    case NL_MMAP_STATUS_UNUSED:
+    case NL_MMAP_STATUS_RESERVED:
+    default:
+        if (wait) {
+            nl_sock_wait(sock, POLLIN | POLLERR);
+            poll_block();
+            goto restart;
+        }
+
+        return EAGAIN;
+    }
+
+    hdr->nm_status = NL_MMAP_STATUS_UNUSED;
+    mmap_advance_rx_ring(sock);
+
+    return retval;
+}
+
+static int
+nl_sock_recv__(struct nl_sock *sock, struct ofpbuf *buf, bool wait,
+               uint32_t events)
+{
+    /* We can't accurately predict the size of the data to be received.  The
+     * caller is supposed to have allocated enough space in 'buf' to handle the
+     * "typical" case.  To handle exceptions, we make available enough space in
+     * 'tail' to allow Netlink messages to be up to 64 kB long (a reasonable
+     * figure since that's the maximum length of a Netlink attribute). */
+    struct nlmsghdr *nlmsghdr;
+    uint8_t tail[65536];
+    int retval;
+
+    ovs_assert(buf->allocated >= sizeof *nlmsghdr);
+    ofpbuf_clear(buf);
+
+    /* nl_sock_recv_mmap() cannot handle EPOLLERR events, so force a linear
+     * receive to retrieve the error notification. */
+    if (!nl_sock_is_mapped(sock) || (events & EPOLLERR)) {
+        retval = nl_sock_recv_linear(sock, buf, wait, tail, sizeof(tail));
+    } else {
+        retval = nl_sock_recv_mmap(sock, buf, wait, tail, sizeof(tail));
+    }
+
+    if (retval != 0) {
+        return retval;
+    }
+
+    nlmsghdr = ofpbuf_data(buf);
+    if (ofpbuf_size(buf) < sizeof *nlmsghdr
+        || nlmsghdr->nlmsg_len < sizeof *nlmsghdr
+        || nlmsghdr->nlmsg_len > ofpbuf_size(buf)) {
+        VLOG_ERR_RL(&rl, "received invalid nlmsg (%u bytes < %"PRIuSIZE")",
+                    ofpbuf_size(buf), sizeof *nlmsghdr);
+        return EPROTO;
+    }
+
     log_nlmsg(__func__, 0, ofpbuf_data(buf), ofpbuf_size(buf), sock->protocol);
     COVERAGE_INC(netlink_received);
 
@@ -384,7 +650,22 @@ nl_sock_recv__(struct nl_sock *sock, struct ofpbuf *buf, bool wait)
 int
 nl_sock_recv(struct nl_sock *sock, struct ofpbuf *buf, bool wait)
 {
-    return nl_sock_recv__(sock, buf, wait);
+    return nl_sock_recv__(sock, buf, wait, 0);
+}
+
+/* Variation of nl_sock_recv() to be used in combination with event polling.
+ * Behaves the same but takes the additional 'events' bitmask reported by
+ * poll() or epoll_wait() for the socket.
+ *
+ * This is required to receive error notifications on memory mapped sockets:
+ * the ring buffer based receive path does not necessarily invoke recvmsg(),
+ * so an EPOLLERR event forces a fallback to a linear receive. */
+int
+nl_sock_recv_events(struct nl_sock *sock, struct ofpbuf *buf, bool wait,
+                    uint32_t events)
+{
+    return nl_sock_recv__(sock, buf, wait, events);
 }
 
 static void
@@ -472,7 +753,7 @@ nl_sock_transact_multiple__(struct nl_sock *sock,
         }
 
         /* Receive a reply. */
-        error = nl_sock_recv__(sock, buf_txn->reply, false);
+        error = nl_sock_recv__(sock, buf_txn->reply, false, 0);
         if (error) {
             if (error == EAGAIN) {
                 nl_sock_record_errors__(transactions, n, 0);
@@ -745,7 +1026,7 @@ nl_dump_next(struct nl_dump *dump, struct ofpbuf *reply, struct ofpbuf *buffer)
             return false;
         }
 
-        retval = nl_sock_recv__(dump->sock, buffer, false);
+        retval = nl_sock_recv__(dump->sock, buffer, false, 0);
         if (retval) {
             ofpbuf_clear(buffer);
             if (retval == EAGAIN) {
diff --git a/lib/netlink-socket.h b/lib/netlink-socket.h
index dd32409..fbb2b3f 100644
--- a/lib/netlink-socket.h
+++ b/lib/netlink-socket.h
@@ -38,9 +38,9 @@
  * Most of the netlink functions are not fully thread-safe: Only a single
  * thread may use a given nl_sock or nl_dump at one time. The exceptions are:
  *
- *    - nl_sock_recv() is conditionally thread-safe: it may be called from
- *      different threads with the same nl_sock, but each caller must provide
- *      an independent receive buffer.
+ *    - nl_sock_recv() and nl_sock_recv_events() are conditionally thread-safe:
+ *      they may be called from different threads with the same nl_sock, but
+ *      each caller must provide an independent receive buffer.
  *
  *    - nl_dump_next() is conditionally thread-safe: it may be called from
  *      different threads with the same nl_dump, but each caller must provide
@@ -61,6 +61,8 @@ struct nl_sock;
 
 /* Netlink sockets. */
 int nl_sock_create(int protocol, struct nl_sock **);
+int nl_sock_create_mmap(int protocol, struct nl_sock **);
+bool nl_sock_is_mapped(const struct nl_sock *);
 int nl_sock_clone(const struct nl_sock *, struct nl_sock **);
 void nl_sock_destroy(struct nl_sock *);
 
@@ -71,6 +73,8 @@ int nl_sock_send(struct nl_sock *, const struct ofpbuf *, bool wait);
 int nl_sock_send_seq(struct nl_sock *, const struct ofpbuf *,
                      uint32_t nlmsg_seq, bool wait);
 int nl_sock_recv(struct nl_sock *, struct ofpbuf *, bool wait);
+int nl_sock_recv_events(struct nl_sock *, struct ofpbuf *, bool wait,
+                        uint32_t events);
 int nl_sock_transact(struct nl_sock *, const struct ofpbuf *request,
                      struct ofpbuf **replyp);
 
-- 
1.8.3.1



