[ovs-dev] [RFC 1/4] netlink: Support for memory mapped Netlink sockets
Thomas Graf
tgraf at suug.ch
Thu May 22 23:15:58 UTC 2014
Signed-off-by: Thomas Graf <tgraf at suug.ch>
---
lib/dpif-linux.c | 3 +-
lib/netlink-protocol.h | 39 ++++++
lib/netlink-socket.c | 345 ++++++++++++++++++++++++++++++++++++++++++++-----
lib/netlink-socket.h | 10 +-
4 files changed, 361 insertions(+), 36 deletions(-)
diff --git a/lib/dpif-linux.c b/lib/dpif-linux.c
index 63e66f3..42a3f72 100644
--- a/lib/dpif-linux.c
+++ b/lib/dpif-linux.c
@@ -1777,6 +1777,7 @@ dpif_linux_recv__(struct dpif_linux *dpif, uint32_t handler_id,
while (handler->event_offset < handler->n_events) {
int idx = handler->epoll_events[handler->event_offset].data.u32;
struct dpif_channel *ch = &dpif->handlers[handler_id].channels[idx];
+ int events = handler->epoll_events[handler->event_offset].events;
handler->event_offset++;
@@ -1788,7 +1789,7 @@ dpif_linux_recv__(struct dpif_linux *dpif, uint32_t handler_id,
return EAGAIN;
}
- error = nl_sock_recv(ch->sock, buf, false);
+ error = nl_sock_recv_events(ch->sock, buf, false, events);
if (error == ENOBUFS) {
/* ENOBUFS typically means that we've received so many
* packets that the buffer overflowed. Try again
diff --git a/lib/netlink-protocol.h b/lib/netlink-protocol.h
index 3009fc5..13e1216 100644
--- a/lib/netlink-protocol.h
+++ b/lib/netlink-protocol.h
@@ -161,6 +161,45 @@ enum {
#define NETLINK_DROP_MEMBERSHIP 2
#endif
+#ifndef __ALIGN_KERNEL
+#define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask))
+#define __ALIGN_KERNEL(x, a) __ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 1)
+#endif
+
+#ifndef NETLINK_RX_RING
+#define NETLINK_RX_RING 6
+#define NETLINK_TX_RING 7
+
+struct nl_mmap_req {
+ unsigned int nm_block_size;
+ unsigned int nm_block_nr;
+ unsigned int nm_frame_size;
+ unsigned int nm_frame_nr;
+};
+
+struct nl_mmap_hdr {
+ unsigned int nm_status;
+ unsigned int nm_len;
+ __u32 nm_group;
+ /* credentials */
+ __u32 nm_pid;
+ __u32 nm_uid;
+ __u32 nm_gid;
+};
+
+enum nl_mmap_status {
+ NL_MMAP_STATUS_UNUSED,
+ NL_MMAP_STATUS_RESERVED,
+ NL_MMAP_STATUS_VALID,
+ NL_MMAP_STATUS_COPY,
+ NL_MMAP_STATUS_SKIP,
+};
+
+#define NL_MMAP_MSG_ALIGNMENT NLMSG_ALIGNTO
+#define NL_MMAP_MSG_ALIGN(sz) __ALIGN_KERNEL(sz, NL_MMAP_MSG_ALIGNMENT)
+#define NL_MMAP_HDRLEN NL_MMAP_MSG_ALIGN(sizeof(struct nl_mmap_hdr))
+#endif /* NETLINK_RX_RING */
+
/* These were introduced all together in 2.6.23. (We want our programs to
* support the newer kernel features even if compiled with older headers.) */
#ifndef CTRL_ATTR_MCAST_GRP_MAX
diff --git a/lib/netlink-socket.c b/lib/netlink-socket.c
index e4cc4ad..a264a0a 100644
--- a/lib/netlink-socket.c
+++ b/lib/netlink-socket.c
@@ -21,6 +21,8 @@
#include <stdlib.h>
#include <sys/types.h>
#include <sys/uio.h>
+#include <sys/mman.h>
+#include <sys/epoll.h>
#include <unistd.h>
#include "coverage.h"
#include "dynamic-string.h"
@@ -41,7 +43,9 @@ VLOG_DEFINE_THIS_MODULE(netlink_socket);
COVERAGE_DEFINE(netlink_overflow);
COVERAGE_DEFINE(netlink_received);
COVERAGE_DEFINE(netlink_recv_jumbo);
+COVERAGE_DEFINE(netlink_recv_mmap);
COVERAGE_DEFINE(netlink_sent);
+COVERAGE_DEFINE(netlink_sent_mmap);
/* Linux header file confusion causes this to be undefined. */
#ifndef SOL_NETLINK
@@ -59,12 +63,30 @@ static void log_nlmsg(const char *function, int error,
/* Netlink sockets. */
+/* Memory mapped ring buffer for Netlink messages
+ *
+ * Total memory consumption per ring:
+ * PAGE_SIZE * NM_BLOCK_NPAGES * NM_BLOCK_NPAGES */
+#define NL_BLOCK_NPAGES 4 /* Number of pages per block */
+#define NL_NBLOCKS 64 /* Number of blocks per ring */
+#define NL_FRAME_SIZE 16384 /* Maximum message size to be carried */
+
+struct nl_ring {
+ unsigned int head;
+ void *ring;
+};
+
struct nl_sock {
int fd;
uint32_t next_seq;
uint32_t pid;
int protocol;
unsigned int rcvbuf; /* Receive buffer size (SO_RCVBUF). */
+ unsigned int frame_size;
+ unsigned int frame_nr;
+ size_t ring_size;
+ struct nl_ring rx_ring;
+ struct nl_ring tx_ring;
};
/* Compile-time limit on iovecs, so that we can allocate a maximum-size array
@@ -112,7 +134,7 @@ nl_sock_create(int protocol, struct nl_sock **sockp)
}
*sockp = NULL;
- sock = xmalloc(sizeof *sock);
+ sock = xzalloc(sizeof *sock);
sock->fd = socket(AF_NETLINK, SOCK_RAW, protocol);
if (sock->fd < 0) {
@@ -179,13 +201,85 @@ error:
return retval;
}
+static int
+nl_sock_set_ring(struct nl_sock *sock)
+{
+ size_t block_size = NL_BLOCK_NPAGES * getpagesize();
+ size_t ring_size;
+ void *ring;
+ struct nl_mmap_req req = {
+ .nm_block_size = block_size,
+ .nm_block_nr = NL_NBLOCKS,
+ .nm_frame_size = NL_FRAME_SIZE,
+ };
+
+ req.nm_frame_nr = req.nm_block_nr * block_size / req.nm_frame_size;
+
+ if (setsockopt(sock->fd, SOL_NETLINK, NETLINK_RX_RING, &req, sizeof(req)) < 0
+ || setsockopt(sock->fd, SOL_NETLINK, NETLINK_TX_RING, &req, sizeof(req)) < 0) {
+ VLOG_INFO("mmap netlink is not supported");
+ return 0;
+ }
+
+ ring_size = req.nm_block_nr * req.nm_block_size;
+ ring = mmap(NULL, 2 * ring_size, PROT_READ | PROT_WRITE,
+ MAP_SHARED, sock->fd, 0);
+ if (ring == MAP_FAILED) {
+ VLOG_ERR("netlink mmap: %s", ovs_strerror(errno));
+ return errno;
+ }
+
+ sock->frame_size = req.nm_frame_size;
+ sock->frame_nr = req.nm_frame_nr - 1;
+ sock->ring_size = ring_size;
+ sock->rx_ring.ring = ring;
+ sock->rx_ring.head = 0;
+ sock->tx_ring.ring = (char *) ring + ring_size;
+ sock->tx_ring.head = 0;
+
+ return 0;
+}
+
+/* Creates a new memory mapped netlink socket for the given netlink 'protocol'
+ * (NETLINK_ROUTE, NETLINK_GENERIC, ...). Falls back to unmapped socket if
+ * kernel side does not support it. Returns 0 and sets '*sockp' to the new
+ * socket if successful, otherwise returns a positive errno value. */
+int
+nl_sock_create_mmap(int protocol, struct nl_sock **sockp)
+{
+ int retval;
+
+ if ((retval = nl_sock_create(protocol, sockp)) < 0) {
+ return retval;
+ }
+
+ if ((retval = nl_sock_set_ring(*sockp)) < 0) {
+ VLOG_ERR("failed to initialize memory mapped netlink socket");
+ nl_sock_destroy(*sockp);
+ return retval;
+ }
+
+ return retval;
+}
+
+/* Returns true if netlink socked is memory mapped */
+bool
+nl_sock_is_mapped(const struct nl_sock *sock)
+{
+ return sock->rx_ring.ring != NULL;
+}
+
/* Creates a new netlink socket for the same protocol as 'src'. Returns 0 and
* sets '*sockp' to the new socket if successful, otherwise returns a positive
* errno value. */
int
nl_sock_clone(const struct nl_sock *src, struct nl_sock **sockp)
{
- return nl_sock_create(src->protocol, sockp);
+ if (nl_sock_is_mapped(src)) {
+ return nl_sock_create_mmap(src->protocol, sockp);
+ } else {
+ return nl_sock_create(src->protocol, sockp);
+ }
}
/* Destroys netlink socket 'sock'. */
@@ -193,6 +287,9 @@ void
nl_sock_destroy(struct nl_sock *sock)
{
if (sock) {
+ if (nl_sock_is_mapped(sock)) {
+ munmap(sock->rx_ring.ring, 2 * sock->ring_size);
+ }
close(sock->fd);
free(sock);
}
@@ -243,6 +340,105 @@ nl_sock_leave_mcgroup(struct nl_sock *sock, unsigned int multicast_group)
return 0;
}
+static struct nl_mmap_hdr *
+mmap_frame(struct nl_sock *sock, struct nl_ring *r)
+{
+ char *start = r->ring;
+
+ return (struct nl_mmap_hdr *)(void *)(start + r->head * sock->frame_size);
+}
+
+static struct nl_mmap_hdr *
+mmap_next_rx_frame(struct nl_sock *sock)
+{
+ return mmap_frame(sock, &sock->rx_ring);
+}
+
+static struct nl_mmap_hdr *
+mmap_next_tx_frame(struct nl_sock *sock)
+{
+ return mmap_frame(sock, &sock->tx_ring);
+}
+
+static void
+mmap_advance_ring(struct nl_sock *sock, struct nl_ring *r)
+{
+ if (r->head != sock->frame_nr) {
+ r->head++;
+ } else {
+ r->head = 0;
+ }
+}
+
+static void
+mmap_advance_rx_ring(struct nl_sock *sock)
+{
+ mmap_advance_ring(sock, &sock->rx_ring);
+}
+
+static void
+mmap_advance_tx_ring(struct nl_sock *sock)
+{
+ mmap_advance_ring(sock, &sock->tx_ring);
+}
+
+static int
+nl_sock_send_linear(struct nl_sock *sock, const struct ofpbuf *msg,
+ bool wait)
+{
+ int retval, error;
+
+ do {
+ retval = send(sock->fd, ofpbuf_data(msg), ofpbuf_size(msg), wait ? 0 : MSG_DONTWAIT);
+ error = retval < 0 ? errno : 0;
+ } while (error == EINTR);
+
+ return error;
+}
+
+static int
+nl_sock_send_mmap(struct nl_sock *sock, const struct ofpbuf *msg,
+ bool wait)
+{
+ struct nl_mmap_hdr *hdr;
+ struct sockaddr_nl addr = {
+ .nl_family = AF_NETLINK,
+ };
+ int retval, error;
+
+ if ((ofpbuf_size(msg) + NL_MMAP_HDRLEN) > sock->frame_size)
+ return nl_sock_send_linear(sock, msg, wait);
+
+ hdr = mmap_next_tx_frame(sock);
+
+ if (hdr->nm_status != NL_MMAP_STATUS_UNUSED) {
+ /* No frame available. Block? */
+ if (wait) {
+ nl_sock_wait(sock, POLLOUT | POLLERR);
+ poll_block();
+ } else {
+ return EAGAIN;
+ }
+ }
+
+ memcpy((char *) hdr + NL_MMAP_HDRLEN, ofpbuf_data(msg), ofpbuf_size(msg));
+ hdr->nm_len = ofpbuf_size(msg);
+ hdr->nm_status = NL_MMAP_STATUS_VALID;
+
+ mmap_advance_tx_ring(sock);
+
+ do {
+ retval = sendto(sock->fd, NULL, 0, 0, (struct sockaddr *)&addr, sizeof(addr));
+ error = retval < 0 ? errno : 0;
+ } while (error == EINTR);
+
+ if (!error) {
+ COVERAGE_INC(netlink_sent_mmap);
+ }
+
+ return error;
+}
+
static int
nl_sock_send__(struct nl_sock *sock, const struct ofpbuf *msg,
uint32_t nlmsg_seq, bool wait)
@@ -253,11 +449,11 @@ nl_sock_send__(struct nl_sock *sock, const struct ofpbuf *msg,
nlmsg->nlmsg_len = ofpbuf_size(msg);
nlmsg->nlmsg_seq = nlmsg_seq;
nlmsg->nlmsg_pid = sock->pid;
- do {
- int retval;
- retval = send(sock->fd, ofpbuf_data(msg), ofpbuf_size(msg), wait ? 0 : MSG_DONTWAIT);
- error = retval < 0 ? errno : 0;
- } while (error == EINTR);
+ if (nl_sock_is_mapped(sock)) {
+ error = nl_sock_send_mmap(sock, msg, wait);
+ } else {
+ error = nl_sock_send_linear(sock, msg, wait);
+ }
log_nlmsg(__func__, error, ofpbuf_data(msg), ofpbuf_size(msg), sock->protocol);
if (!error) {
COVERAGE_INC(netlink_sent);
@@ -298,26 +494,17 @@ nl_sock_send_seq(struct nl_sock *sock, const struct ofpbuf *msg,
}
static int
-nl_sock_recv__(struct nl_sock *sock, struct ofpbuf *buf, bool wait)
+nl_sock_recv_linear(struct nl_sock *sock, struct ofpbuf *buf, bool wait,
+ uint8_t *tail, size_t taillen)
{
- /* We can't accurately predict the size of the data to be received. The
- * caller is supposed to have allocated enough space in 'buf' to handle the
- * "typical" case. To handle exceptions, we make available enough space in
- * 'tail' to allow Netlink messages to be up to 64 kB long (a reasonable
- * figure since that's the maximum length of a Netlink attribute). */
- struct nlmsghdr *nlmsghdr;
- uint8_t tail[65536];
struct iovec iov[2];
struct msghdr msg;
- ssize_t retval;
-
- ovs_assert(buf->allocated >= sizeof *nlmsghdr);
- ofpbuf_clear(buf);
+ int retval;
iov[0].iov_base = ofpbuf_base(buf);
iov[0].iov_len = buf->allocated;
iov[1].iov_base = tail;
- iov[1].iov_len = sizeof tail;
+ iov[1].iov_len = taillen;
memset(&msg, 0, sizeof msg);
msg.msg_iov = iov;
@@ -343,21 +530,100 @@ nl_sock_recv__(struct nl_sock *sock, struct ofpbuf *buf, bool wait)
return E2BIG;
}
- nlmsghdr = ofpbuf_data(buf);
- if (retval < sizeof *nlmsghdr
- || nlmsghdr->nlmsg_len < sizeof *nlmsghdr
- || nlmsghdr->nlmsg_len > retval) {
- VLOG_ERR_RL(&rl, "received invalid nlmsg (%"PRIuSIZE"d bytes < %"PRIuSIZE")",
- retval, sizeof *nlmsghdr);
- return EPROTO;
- }
-
ofpbuf_set_size(buf, MIN(retval, buf->allocated));
if (retval > buf->allocated) {
COVERAGE_INC(netlink_recv_jumbo);
ofpbuf_put(buf, tail, retval - buf->allocated);
}
+ return 0;
+}
+
+static int
+nl_sock_recv_mmap(struct nl_sock *sock, struct ofpbuf *buf, bool wait,
+ uint8_t *tail, size_t taillen)
+{
+ struct nl_mmap_hdr *hdr;
+ int retval = 0;
+
+restart:
+ hdr = mmap_next_rx_frame(sock);
+
+ switch (hdr->nm_status) {
+ case NL_MMAP_STATUS_VALID:
+ if (hdr->nm_len == 0) {
+ /* error occured while constructing message on other side */
+ hdr->nm_status = NL_MMAP_STATUS_UNUSED;
+ mmap_advance_rx_ring(sock);
+ goto restart;
+ }
+
+ ofpbuf_put(buf, (char *) hdr + NL_MMAP_HDRLEN, hdr->nm_len);
+ COVERAGE_INC(netlink_recv_mmap);
+ break;
+
+ case NL_MMAP_STATUS_COPY:
+ retval = nl_sock_recv_linear(sock, buf, MSG_DONTWAIT, tail, taillen);
+ if (retval < 0) {
+ return retval;
+ }
+ break;
+
+ case NL_MMAP_STATUS_UNUSED:
+ case NL_MMAP_STATUS_RESERVED:
+ default:
+ if (wait) {
+ nl_sock_wait(sock, POLLIN | POLLERR);
+ poll_block();
+ goto restart;
+ }
+
+ return EAGAIN;
+ }
+
+ hdr->nm_status = NL_MMAP_STATUS_UNUSED;
+ mmap_advance_rx_ring(sock);
+
+ return retval;
+}
+
+static int
+nl_sock_recv__(struct nl_sock *sock, struct ofpbuf *buf, bool wait,
+ uint32_t events)
+{
+ /* We can't accurately predict the size of the data to be received. The
+ * caller is supposed to have allocated enough space in 'buf' to handle the
+ * "typical" case. To handle exceptions, we make available enough space in
+ * 'tail' to allow Netlink messages to be up to 64 kB long (a reasonable
+ * figure since that's the maximum length of a Netlink attribute). */
+ struct nlmsghdr *nlmsghdr;
+ uint8_t tail[65536];
+ int retval;
+
+ ovs_assert(buf->allocated >= sizeof *nlmsghdr);
+ ofpbuf_clear(buf);
+
+ /* nl_sock_recv_mmap() cannot handle EPOLLERR events, force linear receive
+ * to retrieve error notification. */
+ if (!nl_sock_is_mapped(sock) || events == EPOLLERR) {
+ retval = nl_sock_recv_linear(sock, buf, wait, tail, sizeof(tail));
+ } else {
+ retval = nl_sock_recv_mmap(sock, buf, wait, tail, sizeof(tail));
+ }
+
+ if (retval != 0) {
+ return retval;
+ }
+
+ nlmsghdr = ofpbuf_data(buf);
+ if (ofpbuf_size(buf) < sizeof *nlmsghdr
+ || nlmsghdr->nlmsg_len < sizeof *nlmsghdr
+ || nlmsghdr->nlmsg_len > ofpbuf_size(buf)) {
+ VLOG_ERR_RL(&rl, "received invalid nlmsg (%u bytes < %"PRIuSIZE")",
+ ofpbuf_size(buf), sizeof *nlmsghdr);
+ return EPROTO;
+ }
+
log_nlmsg(__func__, 0, ofpbuf_data(buf), ofpbuf_size(buf), sock->protocol);
COVERAGE_INC(netlink_received);
@@ -384,7 +650,22 @@ nl_sock_recv__(struct nl_sock *sock, struct ofpbuf *buf, bool wait)
int
nl_sock_recv(struct nl_sock *sock, struct ofpbuf *buf, bool wait)
{
- return nl_sock_recv__(sock, buf, wait);
+ return nl_sock_recv__(sock, buf, wait, 0);
+}
+
+/* Variation of nl_sock_recv() to be used in combination with event polling.
+ * Behaves the same but takes the additional 'events' provided by poll() or
+ * epoll_wait().
+ *
+ * Required to receive error notifications on the socket for shared memory
+ * based ring buffer implementations which don't necessarily invoke recvmsg()
+ * on the socket.
+ */
+int
+nl_sock_recv_events(struct nl_sock *sock, struct ofpbuf *buf, bool wait,
+ uint32_t events)
+{
+ return nl_sock_recv__(sock, buf, wait, events);
}
static void
@@ -472,7 +753,7 @@ nl_sock_transact_multiple__(struct nl_sock *sock,
}
/* Receive a reply. */
- error = nl_sock_recv__(sock, buf_txn->reply, false);
+ error = nl_sock_recv__(sock, buf_txn->reply, false, 0);
if (error) {
if (error == EAGAIN) {
nl_sock_record_errors__(transactions, n, 0);
@@ -745,7 +1026,7 @@ nl_dump_next(struct nl_dump *dump, struct ofpbuf *reply, struct ofpbuf *buffer)
return false;
}
- retval = nl_sock_recv__(dump->sock, buffer, false);
+ retval = nl_sock_recv__(dump->sock, buffer, false, 0);
if (retval) {
ofpbuf_clear(buffer);
if (retval == EAGAIN) {
diff --git a/lib/netlink-socket.h b/lib/netlink-socket.h
index dd32409..fbb2b3f 100644
--- a/lib/netlink-socket.h
+++ b/lib/netlink-socket.h
@@ -38,9 +38,9 @@
* Most of the netlink functions are not fully thread-safe: Only a single
* thread may use a given nl_sock or nl_dump at one time. The exceptions are:
*
- * - nl_sock_recv() is conditionally thread-safe: it may be called from
- * different threads with the same nl_sock, but each caller must provide
- * an independent receive buffer.
+ * - nl_sock_recv() and nl_sock_recv_events() are conditionally thread-safe:
+ * it may be called from different threads with the same nl_sock, but each
+ * caller must provide an independent receive buffer.
*
* - nl_dump_next() is conditionally thread-safe: it may be called from
* different threads with the same nl_dump, but each caller must provide
@@ -61,6 +61,8 @@ struct nl_sock;
/* Netlink sockets. */
int nl_sock_create(int protocol, struct nl_sock **);
+int nl_sock_create_mmap(int protocol, struct nl_sock **);
+bool nl_sock_is_mapped(const struct nl_sock *);
int nl_sock_clone(const struct nl_sock *, struct nl_sock **);
void nl_sock_destroy(struct nl_sock *);
@@ -71,6 +73,8 @@ int nl_sock_send(struct nl_sock *, const struct ofpbuf *, bool wait);
int nl_sock_send_seq(struct nl_sock *, const struct ofpbuf *,
uint32_t nlmsg_seq, bool wait);
int nl_sock_recv(struct nl_sock *, struct ofpbuf *, bool wait);
+int nl_sock_recv_events(struct nl_sock *, struct ofpbuf *, bool wait,
+ uint32_t events);
int nl_sock_transact(struct nl_sock *, const struct ofpbuf *request,
struct ofpbuf **replyp);
--
1.8.3.1
More information about the dev
mailing list