[ovs-dev] [PATCH v3] ovs-hyperv: make kernel return values netlink socket like

Nithin Raju nithin at vmware.com
Tue Apr 28 21:35:37 UTC 2015


In this patch, we make changes to usersapce as well as
kernel datapath on hyperv to make it more netlink socket
like. Previously, the kernel datapath did not distinguish
between "transport errors" and other errors. Netlink
semantics dictate that netlink functions should only
return an error only in the case of a "transport error"
which is generally something fatal. Eg. failure to
communicate with the OVS module, or an invalid command
altogether. Other errors such as an unsupported action,
or an invalid flow key is not considered a "transport
error", and in such cases, netlink functions are to return
success with a 'struct nlmsgerr' populated in the output
buffer.

This patch implements these semantics.

Signed-off-by: Nithin Raju <nithin at vmware.com>
Acked-by: Sorin Vinturis <svinturis at cloudbasesolutions.com>
Reported-at: https://github.com/openvswitch/ovs-issues/issues/72
---
v2: addressed Sorin's comments
v3: rebase to HEAD

 datapath-windows/ovsext/Datapath.c |  61 +++++++++++++++++++---
 datapath-windows/ovsext/Datapath.h |   2 -
 lib/netlink-socket.c               | 102 ++++++++++++++++++++++---------------
 3 files changed, 114 insertions(+), 51 deletions(-)

diff --git a/datapath-windows/ovsext/Datapath.c b/datapath-windows/ovsext/Datapath.c
index 1dead33..7646f0a 100644
--- a/datapath-windows/ovsext/Datapath.c
+++ b/datapath-windows/ovsext/Datapath.c
@@ -78,10 +78,11 @@ typedef struct _NETLINK_CMD {
 /* A netlink family is a group of commands. */
 typedef struct _NETLINK_FAMILY {
     CHAR *name;
-    UINT32 id;
+    UINT16 id;
     UINT8 version;
-    UINT8 pad;
+    UINT8 pad1;
     UINT16 maxAttr;
+    UINT16 pad2;
     NETLINK_CMD *cmds;          /* Array of netlink commands and handlers. */
     UINT16 opsCount;
 } NETLINK_FAMILY, *PNETLINK_FAMILY;
@@ -143,12 +144,12 @@ NETLINK_CMD nlControlFamilyCmdOps[] = {
     },
     { .cmd = OVS_CTRL_CMD_EVENT_NOTIFY,
       .handler = OvsReadEventCmdHandler,
-      .supportedDevOp = OVS_READ_EVENT_DEV_OP,
+      .supportedDevOp = OVS_READ_DEV_OP,
       .validateDpIndex = FALSE,
     },
     { .cmd = OVS_CTRL_CMD_READ_NOTIFY,
       .handler = OvsReadPacketCmdHandler,
-      .supportedDevOp = OVS_READ_PACKET_DEV_OP,
+      .supportedDevOp = OVS_READ_DEV_OP,
       .validateDpIndex = FALSE,
     }
 };
@@ -799,12 +800,17 @@ OvsDeviceControl(PDEVICE_OBJECT deviceObject,
         inputBufferLen = 0;
 
         ovsMsg = &ovsMsgReadOp;
-        ovsMsg->nlMsg.nlmsgType = OVS_WIN_NL_CTRL_FAMILY_ID;
+        RtlZeroMemory(ovsMsg, sizeof *ovsMsg);
+        ovsMsg->nlMsg.nlmsgLen = sizeof *ovsMsg;
+        ovsMsg->nlMsg.nlmsgType = nlControlFamilyOps.id;
         ovsMsg->nlMsg.nlmsgPid = instance->pid;
+
         /* An "artificial" command so we can use NL family function table*/
         ovsMsg->genlMsg.cmd = (code == OVS_IOCTL_READ_EVENT) ?
                               OVS_CTRL_CMD_EVENT_NOTIFY :
                               OVS_CTRL_CMD_READ_NOTIFY;
+        ovsMsg->genlMsg.version = nlControlFamilyOps.version;
+
         devOp = OVS_READ_DEV_OP;
         break;
 
@@ -895,8 +901,8 @@ OvsDeviceControl(PDEVICE_OBJECT deviceObject,
     }
 
     /*
-     * For read operation, the netlink command has already been validated
-     * previously.
+     * For read operation, avoid duplicate validation since 'ovsMsg' is either
+     * "artificial" or was copied from a previously validated 'ovsMsg'.
      */
     if (devOp != OVS_READ_DEV_OP) {
         status = ValidateNetlinkCmd(devOp, instance, ovsMsg, nlFamilyOps);
@@ -982,7 +988,9 @@ done:
 
 /*
  * --------------------------------------------------------------------------
- * Function to invoke the netlink command handler.
+ * Function to invoke the netlink command handler. The function also stores
+ * the return value of the handler function to construct a 'NL_ERROR' message,
+ * and in turn returns success to the caller.
  * --------------------------------------------------------------------------
  */
 static NTSTATUS
@@ -1004,6 +1012,43 @@ InvokeNetlinkCmdHandler(POVS_USER_PARAMS_CONTEXT usrParamsCtx,
         }
     }
 
+    /*
+     * Netlink socket semantics dictate that the return value of the netlink
+     * function should be an error ONLY under fatal conditions. If the message
+     * made it all the way to the handler function, it is not a fatal condition.
+     * Absorb the error returned by the handler function into a 'struct
+     * NL_ERROR' and populate the 'output buffer' to return to userspace.
+     *
+     * This behavior is obviously applicable only to netlink commands that
+     * specify an 'output buffer'. For other commands, we return the error as
+     * is.
+     *
+     * 'STATUS_PENDING' is a special return value and userspace is equipped to
+     * handle it.
+     */
+    if (status != STATUS_SUCCESS && status != STATUS_PENDING) {
+        if (usrParamsCtx->devOp != OVS_WRITE_DEV_OP && *replyLen == 0) {
+            NL_ERROR nlError = NlMapStatusToNlErr(status);
+            POVS_MESSAGE msgIn = (POVS_MESSAGE)usrParamsCtx->inputBuffer;
+            POVS_MESSAGE_ERROR msgError = (POVS_MESSAGE_ERROR)
+                usrParamsCtx->outputBuffer;
+
+            ASSERT(msgError);
+            NlBuildErrorMsg(msgIn, msgError, nlError);
+            *replyLen = msgError->nlMsg.nlmsgLen;
+        }
+
+        if (*replyLen != 0) {
+            status = STATUS_SUCCESS;
+        }
+    }
+
+#ifdef DBG
+    if (usrParamsCtx->devOp != OVS_WRITE_DEV_OP) {
+        ASSERT(status == STATUS_PENDING || *replyLen != 0 || status == STATUS_SUCCESS);
+    }
+#endif
+
     return status;
 }
 
diff --git a/datapath-windows/ovsext/Datapath.h b/datapath-windows/ovsext/Datapath.h
index 863afc4..dbc9dea 100644
--- a/datapath-windows/ovsext/Datapath.h
+++ b/datapath-windows/ovsext/Datapath.h
@@ -32,8 +32,6 @@
 #define OVS_READ_DEV_OP          (1 << 0)
 #define OVS_WRITE_DEV_OP         (1 << 1)
 #define OVS_TRANSACTION_DEV_OP   (1 << 2)
-#define OVS_READ_EVENT_DEV_OP    (1 << 3)
-#define OVS_READ_PACKET_DEV_OP   (1 << 4)
 
 typedef struct _OVS_DEVICE_EXTENSION {
     INT numberOpenInstance;
diff --git a/lib/netlink-socket.c b/lib/netlink-socket.c
index a809b1e..42eb232 100644
--- a/lib/netlink-socket.c
+++ b/lib/netlink-socket.c
@@ -475,6 +475,8 @@ nl_sock_send__(struct nl_sock *sock, const struct ofpbuf *msg,
             retval = -1;
             /* XXX: Map to a more appropriate error based on GetLastError(). */
             errno = EINVAL;
+            VLOG_DBG_RL(&rl, "fatal driver failure in write: %s",
+                ovs_lasterror_to_string());
         } else {
             retval = msg->size;
         }
@@ -564,7 +566,10 @@ nl_sock_recv__(struct nl_sock *sock, struct ofpbuf *buf, bool wait)
         DWORD bytes;
         if (!DeviceIoControl(sock->handle, sock->read_ioctl,
                              NULL, 0, tail, sizeof tail, &bytes, NULL)) {
+            VLOG_DBG_RL(&rl, "fatal driver failure in transact: %s",
+                ovs_lasterror_to_string());
             retval = -1;
+            /* XXX: Map to a more appropriate error. */
             errno = EINVAL;
         } else {
             retval = bytes;
@@ -789,14 +794,27 @@ nl_sock_transact_multiple__(struct nl_sock *sock,
     uint8_t reply_buf[65536];
     for (i = 0; i < n; i++) {
         DWORD reply_len;
+        bool ret;
         struct nl_transaction *txn = transactions[i];
         struct nlmsghdr *request_nlmsg, *reply_nlmsg;
 
-        if (!DeviceIoControl(sock->handle, OVS_IOCTL_TRANSACT,
-                             txn->request->data,
-                             txn->request->size,
-                             reply_buf, sizeof reply_buf,
-                             &reply_len, NULL)) {
+        ret = DeviceIoControl(sock->handle, OVS_IOCTL_TRANSACT,
+                              txn->request->data,
+                              txn->request->size,
+                              reply_buf, sizeof reply_buf,
+                              &reply_len, NULL);
+
+        if (ret && reply_len == 0) {
+            /*
+             * The current transaction did not produce any data to read and that
+             * is not an error as such. Continue with the remainder of the
+             * transactions.
+             */
+            txn->error = 0;
+            if (txn->reply) {
+                ofpbuf_clear(txn->reply);
+            }
+        } else if (!ret) {
             /* XXX: Map to a more appropriate error. */
             error = EINVAL;
             VLOG_DBG_RL(&rl, "fatal driver failure: %s",
@@ -804,48 +822,50 @@ nl_sock_transact_multiple__(struct nl_sock *sock,
             break;
         }
 
-        if (reply_len < sizeof *reply_nlmsg) {
-            nl_sock_record_errors__(transactions, n, 0);
-            VLOG_DBG_RL(&rl, "insufficient length of reply %#"PRIu32
-                " for seq: %#"PRIx32, reply_len, request_nlmsg->nlmsg_seq);
-            break;
-        }
-
-        /* Validate the sequence number in the reply. */
-        request_nlmsg = nl_msg_nlmsghdr(txn->request);
-        reply_nlmsg = (struct nlmsghdr *)reply_buf;
+        if (reply_len != 0) {
+            if (reply_len < sizeof *reply_nlmsg) {
+                nl_sock_record_errors__(transactions, n, 0);
+                VLOG_DBG_RL(&rl, "insufficient length of reply %#"PRIu32
+                    " for seq: %#"PRIx32, reply_len, request_nlmsg->nlmsg_seq);
+                break;
+            }
 
-        if (request_nlmsg->nlmsg_seq != reply_nlmsg->nlmsg_seq) {
-            ovs_assert(request_nlmsg->nlmsg_seq == reply_nlmsg->nlmsg_seq);
-            VLOG_DBG_RL(&rl, "mismatched seq request %#"PRIx32
-                ", reply %#"PRIx32, request_nlmsg->nlmsg_seq,
-                reply_nlmsg->nlmsg_seq);
-            break;
-        }
+            /* Validate the sequence number in the reply. */
+            request_nlmsg = nl_msg_nlmsghdr(txn->request);
+            reply_nlmsg = (struct nlmsghdr *)reply_buf;
 
-        /* Handle errors embedded within the netlink message. */
-        ofpbuf_use_stub(&tmp_reply, reply_buf, sizeof reply_buf);
-        tmp_reply.size = sizeof reply_buf;
-        if (nl_msg_nlmsgerr(&tmp_reply, &txn->error)) {
-            if (txn->reply) {
-                ofpbuf_clear(txn->reply);
-            }
-            if (txn->error) {
-                VLOG_DBG_RL(&rl, "received NAK error=%d (%s)",
-                            error, ovs_strerror(txn->error));
+            if (request_nlmsg->nlmsg_seq != reply_nlmsg->nlmsg_seq) {
+                ovs_assert(request_nlmsg->nlmsg_seq == reply_nlmsg->nlmsg_seq);
+                VLOG_DBG_RL(&rl, "mismatched seq request %#"PRIx32
+                    ", reply %#"PRIx32, request_nlmsg->nlmsg_seq,
+                    reply_nlmsg->nlmsg_seq);
+                break;
             }
-        } else {
-            txn->error = 0;
-            if (txn->reply) {
-                /* Copy the reply to the buffer specified by the caller. */
-                if (reply_len > txn->reply->allocated) {
-                    ofpbuf_reinit(txn->reply, reply_len);
+
+            /* Handle errors embedded within the netlink message. */
+            ofpbuf_use_stub(&tmp_reply, reply_buf, sizeof reply_buf);
+            tmp_reply.size = sizeof reply_buf;
+            if (nl_msg_nlmsgerr(&tmp_reply, &txn->error)) {
+                if (txn->reply) {
+                    ofpbuf_clear(txn->reply);
+                }
+                if (txn->error) {
+                    VLOG_DBG_RL(&rl, "received NAK error=%d (%s)",
+                                error, ovs_strerror(txn->error));
+                }
+            } else {
+                txn->error = 0;
+                if (txn->reply) {
+                    /* Copy the reply to the buffer specified by the caller. */
+                    if (reply_len > txn->reply->allocated) {
+                        ofpbuf_reinit(txn->reply, reply_len);
+                    }
+                    memcpy(txn->reply->data, reply_buf, reply_len);
+                    txn->reply->size = reply_len;
                 }
-                memcpy(txn->reply->data, reply_buf, reply_len);
-                txn->reply->size = reply_len;
             }
+            ofpbuf_uninit(&tmp_reply);
         }
-        ofpbuf_uninit(&tmp_reply);
 
         /* Count the number of successful transactions. */
         (*done)++;
-- 
1.9.4.msysgit.2




More information about the dev mailing list