[ovs-dev] [optimize 08/13] netlink-socket: New function nl_sock_transact_multiple().

Ethan Jackson ethan at nicira.com
Thu Oct 13 20:22:28 UTC 2011


+/* Compile-time limit on iovecs, so that we can allocate a
maximum-size array+ * of iovecs on the stack. */+#define MAX_IOVS
128++/* Maximum number of iovecs that may be passed to sendmsg, capped
at a+ * minimum of _XOPEN_IOV_MAX (16) and a maximum of MAX_IOVS.
I don't know a great deal about IOV maximums so the next comment may
not bereasonable:I found _XOPEN_IOV_MAX a surprising minimum.
Wouldn't sysconf's opinion ofmax_iovs be less than _XOPEN_IOV_MAX in
rare error conditions?  In this casewouldn't 1 be a safer minimum
choice?

+        } else if (max_iovs > MAX_IOVS) {+            max_iovs = 128;
I think you want max_iovs = MAX_IOVS.
+    max_batch_count = MAX(sock->rcvbuf / 4096, 1);+
max_batch_count = MIN(max_batch_count, MAX_IOVS / 2);
Did you choose 4096 here because it's the typical page size?  Would it
makesense to use the PAGESIZE macro if defined?
+        if (error == ENOBUFS) {+            VLOG_DBG_RL(&rl, "receive
buffer overflow, resending request");
Is this code supposed to resend the request itself, or is the
callerresponsible for handling that?

+        } else if (error) {+            VLOG_ERR_RL(&rl, "transaction
error (%s)", strerror(error));+
nl_sock_record_errors__(transactions, n, error);+        }+    }+}
We may want to undef MAX_BATCH_BYTES.  Probably doesn't matter, but
seemsstylistically cleaner.
Ethan

On Tue, Sep 27, 2011 at 16:27, Ben Pfaff <blp at nicira.com> wrote:
> This will be used in an upcoming commit.
> ---
>  lib/netlink-socket.c |  302 ++++++++++++++++++++++++++++++++++++++++----------
>  lib/netlink-socket.h |   14 +++
>  2 files changed, 259 insertions(+), 57 deletions(-)
>
> diff --git a/lib/netlink-socket.c b/lib/netlink-socket.c
> index 0f4e3e7..98697e2 100644
> --- a/lib/netlink-socket.c
> +++ b/lib/netlink-socket.c
> @@ -21,6 +21,7 @@
>  #include <inttypes.h>
>  #include <stdlib.h>
>  #include <sys/types.h>
> +#include <sys/uio.h>
>  #include <unistd.h>
>  #include "coverage.h"
>  #include "dynamic-string.h"
> @@ -32,6 +33,7 @@
>  #include "poll-loop.h"
>  #include "socket-util.h"
>  #include "stress.h"
> +#include "util.h"
>  #include "vlog.h"
>
>  VLOG_DEFINE_THIS_MODULE(netlink_socket);
> @@ -63,8 +65,19 @@ struct nl_sock
>     uint32_t pid;
>     int protocol;
>     struct nl_dump *dump;
> +    unsigned int rcvbuf;        /* Receive buffer size (SO_RCVBUF). */
>  };
>
> +/* Compile-time limit on iovecs, so that we can allocate a maximum-size array
> + * of iovecs on the stack. */
> +#define MAX_IOVS 128
> +
> +/* Maximum number of iovecs that may be passed to sendmsg, capped at a
> + * minimum of _XOPEN_IOV_MAX (16) and a maximum of MAX_IOVS.
> + *
> + * Initialized by nl_sock_create(). */
> +static int max_iovs;
> +
>  static int alloc_pid(uint32_t *);
>  static void free_pid(uint32_t);
>  static int nl_sock_cow__(struct nl_sock *);
> @@ -79,6 +92,23 @@ nl_sock_create(int protocol, struct nl_sock **sockp)
>     struct sockaddr_nl local, remote;
>     int retval = 0;
>
> +    if (!max_iovs) {
> +        int save_errno = errno;
> +        errno = 0;
> +
> +        max_iovs = sysconf(_SC_UIO_MAXIOV);
> +        if (max_iovs < _XOPEN_IOV_MAX) {
> +            if (max_iovs == -1 && errno) {
> +                VLOG_WARN("sysconf(_SC_UIO_MAXIOV): %s", strerror(errno));
> +            }
> +            max_iovs = _XOPEN_IOV_MAX;
> +        } else if (max_iovs > MAX_IOVS) {
> +            max_iovs = 128;
> +        }
> +
> +        errno = save_errno;
> +    }
> +
>     *sockp = NULL;
>     sock = malloc(sizeof *sock);
>     if (sock == NULL) {
> @@ -93,6 +123,13 @@ nl_sock_create(int protocol, struct nl_sock **sockp)
>     sock->protocol = protocol;
>     sock->dump = NULL;
>
> +    retval = get_socket_rcvbuf(sock->fd);
> +    if (retval < 0) {
> +        retval = -retval;
> +        goto error;
> +    }
> +    sock->rcvbuf = retval;
> +
>     retval = alloc_pid(&sock->pid);
>     if (retval) {
>         goto error;
> @@ -354,6 +391,204 @@ nl_sock_recv(struct nl_sock *sock, struct ofpbuf **bufp, bool wait)
>     return nl_sock_recv__(sock, bufp, wait);
>  }
>
> +static int
> +find_nl_transaction_by_seq(struct nl_transaction **transactions, size_t n,
> +                           uint32_t seq)
> +{
> +    int i;
> +
> +    for (i = 0; i < n; i++) {
> +        struct nl_transaction *t = transactions[i];
> +
> +        if (seq == nl_msg_nlmsghdr(t->request)->nlmsg_seq) {
> +            return i;
> +        }
> +    }
> +
> +    return -1;
> +}
> +
> +static void
> +nl_sock_record_errors__(struct nl_transaction **transactions, size_t n,
> +                        int error)
> +{
> +    size_t i;
> +
> +    for (i = 0; i < n; i++) {
> +        transactions[i]->error = error;
> +        transactions[i]->reply = NULL;
> +    }
> +}
> +
> +static int
> +nl_sock_transact_multiple__(struct nl_sock *sock,
> +                            struct nl_transaction **transactions, size_t n,
> +                            size_t *done)
> +{
> +    struct iovec iovs[MAX_IOVS];
> +    struct msghdr msg;
> +    int error;
> +    int i;
> +
> +    *done = 0;
> +    for (i = 0; i < n; i++) {
> +        struct ofpbuf *request = transactions[i]->request;
> +        struct nlmsghdr *nlmsg = nl_msg_nlmsghdr(request);
> +
> +        nlmsg->nlmsg_len = request->size;
> +        nlmsg->nlmsg_pid = sock->pid;
> +        if (i == n - 1) {
> +            /* Ensure that we get a reply even if the final request doesn't
> +             * ordinarily call for one. */
> +            nlmsg->nlmsg_flags |= NLM_F_ACK;
> +        }
> +
> +        iovs[i].iov_base = request->data;
> +        iovs[i].iov_len = request->size;
> +    }
> +
> +    memset(&msg, 0, sizeof msg);
> +    msg.msg_iov = iovs;
> +    msg.msg_iovlen = n;
> +    do {
> +        error = sendmsg(sock->fd, &msg, 0) < 0 ? errno : 0;
> +    } while (error == EINTR);
> +
> +    for (i = 0; i < n; i++) {
> +        struct ofpbuf *request = transactions[i]->request;
> +
> +        log_nlmsg(__func__, error, request->data, request->size,
> +                  sock->protocol);
> +    }
> +    if (!error) {
> +        COVERAGE_ADD(netlink_sent, n);
> +    }
> +
> +    if (error) {
> +        return error;
> +    }
> +
> +    while (n > 0) {
> +        struct ofpbuf *reply;
> +
> +        error = nl_sock_recv__(sock, &reply, true);
> +        if (error) {
> +            return error;
> +        }
> +
> +        i = find_nl_transaction_by_seq(transactions, n,
> +                                       nl_msg_nlmsghdr(reply)->nlmsg_seq);
> +        if (i < 0) {
> +            VLOG_DBG_RL(&rl, "ignoring unexpected seq %#"PRIx32,
> +                        nl_msg_nlmsghdr(reply)->nlmsg_seq);
> +            ofpbuf_delete(reply);
> +            continue;
> +        }
> +
> +        nl_sock_record_errors__(transactions, i, 0);
> +        if (nl_msg_nlmsgerr(reply, &error)) {
> +            transactions[i]->reply = NULL;
> +            transactions[i]->error = error;
> +            if (error) {
> +                VLOG_DBG_RL(&rl, "received NAK error=%d (%s)",
> +                            error, strerror(error));
> +            }
> +            ofpbuf_delete(reply);
> +        } else {
> +            transactions[i]->reply = reply;
> +            transactions[i]->error = 0;
> +        }
> +
> +        *done += i + 1;
> +        transactions += i + 1;
> +        n -= i + 1;
> +    }
> +
> +    return 0;
> +}
> +
> +/* Sends the 'request' member of the 'n' transactions in 'transactions' to the
> + * kernel, in order, and waits for responses to all of them.  Fills in the
> + * 'error' member of each transaction with 0 if it was successful, otherwise
> + * with a positive errno value.  'reply' will be NULL on error or if the
> + * transaction was successful but had no reply beyond an indication of success.
> + * For a successful transaction that did have a more detailed reply, 'reply'
> + * will be set to the reply message.
> + *
> + * The caller is responsible for destroying each request and reply, and the
> + * transactions array itself.
> + *
> + * Before sending each message, this function will finalize nlmsg_len in each
> + * 'request' to match the ofpbuf's size, and set nlmsg_pid to 'sock''s pid.
> + * NLM_F_ACK will be added to some requests' nlmsg_flags.
> + *
> + * Bare Netlink is an unreliable transport protocol.  This function layers
> + * reliable delivery and reply semantics on top of bare Netlink.  See
> + * nl_sock_transact() for some caveats.
> + */
> +void
> +nl_sock_transact_multiple(struct nl_sock *sock,
> +                          struct nl_transaction **transactions, size_t n)
> +{
> +    int max_batch_count;
> +    int error;
> +
> +    if (!n) {
> +        return;
> +    }
> +
> +    error = nl_sock_cow__(sock);
> +    if (error) {
> +        nl_sock_record_errors__(transactions, n, error);
> +        return;
> +    }
> +
> +    /* In theory, every request could have a 64 kB reply.  But the default and
> +     * maximum socket rcvbuf size with typical Dom0 memory sizes both tend to
> +     * be a bit below 128 kB, so that would only allow a single message in a
> +     * "batch".  So we assume that replies average (at most) 4 kB, which allows
> +     * a good deal of batching.
> +     *
> +     * Each request uses 2 iovecs so we cap batching at MAX_IOVS / 2.
> +     *
> +     * In practice, most of the requests that we batch either have no reply at
> +     * all or a brief reply. */
> +    max_batch_count = MAX(sock->rcvbuf / 4096, 1);
> +    max_batch_count = MIN(max_batch_count, MAX_IOVS / 2);
> +
> +    while (n > 0) {
> +        size_t count, bytes;
> +        size_t done;
> +
> +        /* Batch up to 'max_batch_count' transactions.  But cap it at about a
> +         * page of requests total because big skbuffs are expensive to
> +         * allocate in the kernel.  */
> +#if defined(PAGESIZE)
> +#define MAX_BATCH_BYTES MAX(1, PAGESIZE - 512)
> +#else
> +#define MAX_BATCH_BYTES (4096 - 512)
> +#endif
> +        bytes = transactions[0]->request->size;
> +        for (count = 1; count < n && count < max_batch_count; count++) {
> +            if (bytes + transactions[count]->request->size > MAX_BATCH_BYTES) {
> +                break;
> +            }
> +            bytes += transactions[count]->request->size;
> +        }
> +
> +        error = nl_sock_transact_multiple__(sock, transactions, count, &done);
> +        transactions += done;
> +        n -= done;
> +
> +        if (error == ENOBUFS) {
> +            VLOG_DBG_RL(&rl, "receive buffer overflow, resending request");
> +        } else if (error) {
> +            VLOG_ERR_RL(&rl, "transaction error (%s)", strerror(error));
> +            nl_sock_record_errors__(transactions, n, error);
> +        }
> +    }
> +}
> +
>  /* Sends 'request' to the kernel via 'sock' and waits for a response.  If
>  * successful, returns 0.  On failure, returns a positive errno value.
>  *
> @@ -395,68 +630,21 @@ nl_sock_recv(struct nl_sock *sock, struct ofpbuf **bufp, bool wait)
>  *         needs to be idempotent.
>  */
>  int
> -nl_sock_transact(struct nl_sock *sock,
> -                 const struct ofpbuf *request, struct ofpbuf **replyp)
> +nl_sock_transact(struct nl_sock *sock, const struct ofpbuf *request,
> +                 struct ofpbuf **replyp)
>  {
> -    uint32_t seq = nl_msg_nlmsghdr(request)->nlmsg_seq;
> -    struct nlmsghdr *nlmsghdr;
> -    struct ofpbuf *reply;
> -    int retval;
> +    struct nl_transaction *transactionp;
> +    struct nl_transaction transaction;
>
> +    transaction.request = (struct ofpbuf *) request;
> +    transactionp = &transaction;
> +    nl_sock_transact_multiple(sock, &transactionp, 1);
>     if (replyp) {
> -        *replyp = NULL;
> -    }
> -
> -    /* Ensure that we get a reply even if this message doesn't ordinarily call
> -     * for one. */
> -    nl_msg_nlmsghdr(request)->nlmsg_flags |= NLM_F_ACK;
> -
> -send:
> -    retval = nl_sock_send(sock, request, true);
> -    if (retval) {
> -        return retval;
> -    }
> -
> -recv:
> -    retval = nl_sock_recv(sock, &reply, true);
> -    if (retval) {
> -        if (retval == ENOBUFS) {
> -            COVERAGE_INC(netlink_overflow);
> -            VLOG_DBG_RL(&rl, "receive buffer overflow, resending request");
> -            goto send;
> -        } else {
> -            return retval;
> -        }
> -    }
> -    nlmsghdr = nl_msg_nlmsghdr(reply);
> -    if (seq != nlmsghdr->nlmsg_seq) {
> -        VLOG_DBG_RL(&rl, "ignoring seq %#"PRIx32" != expected %#"PRIx32,
> -                    nl_msg_nlmsghdr(reply)->nlmsg_seq, seq);
> -        ofpbuf_delete(reply);
> -        goto recv;
> -    }
> -
> -    /* If the reply is an error, discard the reply and return the error code.
> -     *
> -     * Except: if the reply is just an acknowledgement (error code of 0), and
> -     * the caller is interested in the reply (replyp != NULL), pass the reply
> -     * up to the caller.  Otherwise the caller will get a return value of 0
> -     * and null '*replyp', which makes unwary callers likely to segfault. */
> -    if (nl_msg_nlmsgerr(reply, &retval) && (retval || !replyp)) {
> -        ofpbuf_delete(reply);
> -        if (retval) {
> -            VLOG_DBG_RL(&rl, "received NAK error=%d (%s)",
> -                        retval, strerror(retval));
> -        }
> -        return retval != EAGAIN ? retval : EPROTO;
> -    }
> -
> -    if (replyp) {
> -        *replyp = reply;
> +        *replyp = transaction.reply;
>     } else {
> -        ofpbuf_delete(reply);
> +        ofpbuf_delete(transaction.reply);
>     }
> -    return 0;
> +    return transaction.error;
>  }
>
>  /* Drain all the messages currently in 'sock''s receive queue. */
> diff --git a/lib/netlink-socket.h b/lib/netlink-socket.h
> index d789f41..7e01acb 100644
> --- a/lib/netlink-socket.h
> +++ b/lib/netlink-socket.h
> @@ -35,6 +35,7 @@
>  #include <stdbool.h>
>  #include <stddef.h>
>  #include <stdint.h>
> +#include "list.h"
>
>  struct ofpbuf;
>  struct nl_sock;
> @@ -63,6 +64,19 @@ short int nl_sock_woke(const struct nl_sock *);
>
>  uint32_t nl_sock_pid(const struct nl_sock *);
>
> +/* Batching transactions. */
> +struct nl_transaction {
> +    /* Filled in by client. */
> +    struct ofpbuf *request;     /* Request to send. */
> +
> +    /* Filled in by nl_sock_transact_batch(). */
> +    struct ofpbuf *reply;       /* Reply (NULL if reply was an error code). */
> +    int error;                  /* Positive errno value, 0 if no error. */
> +};
> +
> +void nl_sock_transact_multiple(struct nl_sock *,
> +                               struct nl_transaction **, size_t n);
> +
>  /* Table dumping. */
>  struct nl_dump {
>     struct nl_sock *sock;       /* Socket being dumped. */
> --
> 1.7.4.4
>
> _______________________________________________
> dev mailing list
> dev at openvswitch.org
> http://openvswitch.org/mailman/listinfo/dev
>



More information about the dev mailing list