[ovs-dev] [PATCH] ovs-hyperv: make kernel return values netlink socket like

Sorin Vinturis svinturis at cloudbasesolutions.com
Tue Apr 28 09:43:19 UTC 2015


Hi Nithin,

The patch looks good to me. I have some minor comments. Please see inline.

Acked-by: Sorin Vinturis <svinturis at cloudbasesolutions.com>

Thanks,
Sorin

-----Original Message-----
From: dev [mailto:dev-bounces at openvswitch.org] On Behalf Of Nithin Raju
Sent: Saturday, 18 April, 2015 00:37
To: dev at openvswitch.org
Subject: [ovs-dev] [PATCH] ovs-hyperv: make kernel return values netlink socket like

In this patch, we make changes to usersapce as well as kernel datapath on hyperv to make it more netlink socket like. Previously, the kernel datapath did not distinguish between "transport errors" and other errors. Netlink semantics dictate that netlink functions should only return an error only in the case of a "transport error"
which is generally something fatal. Eg. failure to communicate with the OVS module, or an invalid command altogether. Other errors such as an unsupported action, or an invalid flow key is not considered a "transport error", and in such cases, netlink functions are to return success with a 'struct nlmsgerr' populated in the output buffer.

This patch implements these semantics.

Signed-off-by: Nithin Raju <nithin at vmware.com>
Reported-at: https://github.com/openvswitch/ovs-issues/issues/72
---
 datapath-windows/ovsext/Datapath.c |  60 +++++++++++++++++++---
 datapath-windows/ovsext/Datapath.h |   2 -
 lib/netlink-socket.c               | 102 ++++++++++++++++++++++---------------
 3 files changed, 113 insertions(+), 51 deletions(-)

diff --git a/datapath-windows/ovsext/Datapath.c b/datapath-windows/ovsext/Datapath.c
index fea7d3a..b1b359d 100644
--- a/datapath-windows/ovsext/Datapath.c
+++ b/datapath-windows/ovsext/Datapath.c
@@ -78,10 +78,11 @@ typedef struct _NETLINK_CMD {
 /* A netlink family is a group of commands. */  typedef struct _NETLINK_FAMILY {
     CHAR *name;
-    UINT32 id;
+    UINT16 id;
     UINT8 version;
-    UINT8 pad;
+    UINT8 pad1;
     UINT16 maxAttr;
+    UINT16 pad2;
SV: Do we need the new 'pad' members, 'pad1' & 'pad2'? I didn't saw them referenced in any part of the code.

     NETLINK_CMD *cmds;          /* Array of netlink commands and handlers. */
     UINT16 opsCount;
 } NETLINK_FAMILY, *PNETLINK_FAMILY;
@@ -143,12 +144,12 @@ NETLINK_CMD nlControlFamilyCmdOps[] = {
     },
     { .cmd = OVS_CTRL_CMD_EVENT_NOTIFY,
       .handler = OvsReadEventCmdHandler,
-      .supportedDevOp = OVS_READ_EVENT_DEV_OP,
+      .supportedDevOp = OVS_READ_DEV_OP,
       .validateDpIndex = FALSE,
     },
     { .cmd = OVS_CTRL_CMD_READ_NOTIFY,
       .handler = OvsReadPacketCmdHandler,
-      .supportedDevOp = OVS_READ_PACKET_DEV_OP,
SV: The two operations, 'OVS_READ_EVENT_DEV_OP' and 'OVS_READ_PACKET_DEV_OP' are not used anymore. Shouldn't we remove them from the header as well?

+      .supportedDevOp = OVS_READ_DEV_OP,
       .validateDpIndex = FALSE,
     }
 };
@@ -799,12 +800,17 @@ OvsDeviceControl(PDEVICE_OBJECT deviceObject,
         inputBufferLen = 0;
 
         ovsMsg = &ovsMsgReadOp;
-        ovsMsg->nlMsg.nlmsgType = OVS_WIN_NL_CTRL_FAMILY_ID;
+        RtlZeroMemory(ovsMsg, sizeof *ovsMsg);
+        ovsMsg->nlMsg.nlmsgLen = sizeof *ovsMsg;
+        ovsMsg->nlMsg.nlmsgType = nlControlFamilyOps.id;
         ovsMsg->nlMsg.nlmsgPid = instance->pid;
+
         /* An "artificial" command so we can use NL family function table*/
         ovsMsg->genlMsg.cmd = (code == OVS_IOCTL_READ_EVENT) ?
                               OVS_CTRL_CMD_EVENT_NOTIFY :
                               OVS_CTRL_CMD_READ_NOTIFY;
+        ovsMsg->genlMsg.version = nlControlFamilyOps.version;
+
         devOp = OVS_READ_DEV_OP;
         break;
 
@@ -895,8 +901,8 @@ OvsDeviceControl(PDEVICE_OBJECT deviceObject,
     }
 
     /*
-     * For read operation, the netlink command has already been validated
-     * previously.
+     * For read operation, avoid duplicate validation since 'ovsMsg' is either
+     * "artificial" or was copied from a previously validated 'ovsMsg'.
      */
     if (devOp != OVS_READ_DEV_OP) {
         status = ValidateNetlinkCmd(devOp, instance, ovsMsg, nlFamilyOps); @@ -985,7 +991,8 @@ done:
 
 /*
  * --------------------------------------------------------------------------
- * Function to invoke the netlink command handler.
+ * Function to invoke the netlink command handler. The function also 
+ massages
+ * the return value of the handler function to construct a 'NL_ERROR' 
+ message
SV: Please rephrase the function description above - 'massage' is not appropriate.

  * --------------------------------------------------------------------------
  */
 static NTSTATUS
@@ -1007,6 +1014,43 @@ InvokeNetlinkCmdHandler(POVS_USER_PARAMS_CONTEXT usrParamsCtx,
         }
     }
 
+    /*
+     * Netlink socket semantics dictate that the return value of the netlink
+     * function should be an error ONLY under fatal conditions. If the message
+     * made it all the way to the handler function, it is not a fatal condition.
+     * Absorb the error returned by the handler function into a 'struct
+     * NL_ERROR' and populate the 'output buffer' to return to userspace.
+     *
+     * This behavior is obviously applicable only to netlink commands that
+     * specify an 'output buffer'. For other commands, we return the error as
+     * is.
+     *
+     * 'STATUS_PENDING' is a special return value and userspace is equipped to
+     * handle it.
+     */
+    if (status != STATUS_SUCCESS && status != STATUS_PENDING) {
+        if (usrParamsCtx->devOp != OVS_WRITE_DEV_OP && *replyLen == 0) {
+            NL_ERROR nlError = NlMapStatusToNlErr(status);
+            POVS_MESSAGE msgIn = (POVS_MESSAGE)usrParamsCtx->inputBuffer;
+            POVS_MESSAGE_ERROR msgError = (POVS_MESSAGE_ERROR)
+                usrParamsCtx->outputBuffer;
+
+            ASSERT(msgError);
+            NlBuildErrorMsg(msgIn, msgError, nlError);
+            *replyLen = msgError->nlMsg.nlmsgLen;
+        }
+
+        if (*replyLen != 0) {
+            status = STATUS_SUCCESS;
+        }
+    }
+
+#ifdef DBG
+    if (usrParamsCtx->devOp != OVS_WRITE_DEV_OP) {
+        ASSERT(status == STATUS_PENDING || *replyLen != 0 || status == STATUS_SUCCESS);
+    }
+#endif
+
     return status;
 }
 
diff --git a/datapath-windows/ovsext/Datapath.h b/datapath-windows/ovsext/Datapath.h
index 863afc4..dbc9dea 100644
--- a/datapath-windows/ovsext/Datapath.h
+++ b/datapath-windows/ovsext/Datapath.h
@@ -32,8 +32,6 @@
 #define OVS_READ_DEV_OP          (1 << 0)
 #define OVS_WRITE_DEV_OP         (1 << 1)
 #define OVS_TRANSACTION_DEV_OP   (1 << 2)
-#define OVS_READ_EVENT_DEV_OP    (1 << 3)
-#define OVS_READ_PACKET_DEV_OP   (1 << 4)
 
 typedef struct _OVS_DEVICE_EXTENSION {
     INT numberOpenInstance;
diff --git a/lib/netlink-socket.c b/lib/netlink-socket.c index 91de6b4..ef1493b 100644
--- a/lib/netlink-socket.c
+++ b/lib/netlink-socket.c
@@ -475,6 +475,8 @@ nl_sock_send__(struct nl_sock *sock, const struct ofpbuf *msg,
             retval = -1;
             /* XXX: Map to a more appropriate error based on GetLastError(). */
             errno = EINVAL;
+            VLOG_DBG_RL(&rl, "fatal driver failure in write: %s",
+                ovs_lasterror_to_string());
         } else {
             retval = msg->size;
         }
@@ -564,7 +566,10 @@ nl_sock_recv__(struct nl_sock *sock, struct ofpbuf *buf, bool wait)
         DWORD bytes;
         if (!DeviceIoControl(sock->handle, sock->read_ioctl,
                              NULL, 0, tail, sizeof tail, &bytes, NULL)) {
+            VLOG_DBG_RL(&rl, "fatal driver failure in transact: %s",
+                ovs_lasterror_to_string());
             retval = -1;
+            /* XXX: Map to a more appropriate error. */
             errno = EINVAL;
         } else {
             retval = bytes;
@@ -789,14 +794,27 @@ nl_sock_transact_multiple__(struct nl_sock *sock,
     uint8_t reply_buf[65536];
     for (i = 0; i < n; i++) {
         DWORD reply_len;
+        bool ret;
         struct nl_transaction *txn = transactions[i];
         struct nlmsghdr *request_nlmsg, *reply_nlmsg;
 
-        if (!DeviceIoControl(sock->handle, OVS_IOCTL_TRANSACT,
-                             txn->request->data,
-                             txn->request->size,
-                             reply_buf, sizeof reply_buf,
-                             &reply_len, NULL)) {
+        ret = DeviceIoControl(sock->handle, OVS_IOCTL_TRANSACT,
+                              txn->request->data,
+                              txn->request->size,
+                              reply_buf, sizeof reply_buf,
+                              &reply_len, NULL);
+
+        if (ret && reply_len == 0) {
+            /*
+             * The current transaction did not produce any data to read and that
+             * is not an error as such. Continue with the remainder of the
+             * transactions.
+             */
+            txn->error = 0;
+            if (txn->reply) {
+                ofpbuf_clear(txn->reply);
+            }
+        } else if (!ret) {
             /* XXX: Map to a more appropriate error. */
             error = EINVAL;
             VLOG_DBG_RL(&rl, "fatal driver failure: %s", @@ -804,48 +822,50 @@ nl_sock_transact_multiple__(struct nl_sock *sock,
             break;
         }
 
-        if (reply_len < sizeof *reply_nlmsg) {
-            nl_sock_record_errors__(transactions, n, 0);
-            VLOG_DBG_RL(&rl, "insufficient length of reply %#"PRIu32
-                " for seq: %#"PRIx32, reply_len, request_nlmsg->nlmsg_seq);
-            break;
-        }
-
-        /* Validate the sequence number in the reply. */
-        request_nlmsg = nl_msg_nlmsghdr(txn->request);
-        reply_nlmsg = (struct nlmsghdr *)reply_buf;
+        if (reply_len != 0) {
+            if (reply_len < sizeof *reply_nlmsg) {
+                nl_sock_record_errors__(transactions, n, 0);
+                VLOG_DBG_RL(&rl, "insufficient length of reply %#"PRIu32
+                    " for seq: %#"PRIx32, reply_len, request_nlmsg->nlmsg_seq);
+                break;
+            }
 
-        if (request_nlmsg->nlmsg_seq != reply_nlmsg->nlmsg_seq) {
-            ovs_assert(request_nlmsg->nlmsg_seq == reply_nlmsg->nlmsg_seq);
-            VLOG_DBG_RL(&rl, "mismatched seq request %#"PRIx32
-                ", reply %#"PRIx32, request_nlmsg->nlmsg_seq,
-                reply_nlmsg->nlmsg_seq);
-            break;
-        }
+            /* Validate the sequence number in the reply. */
+            request_nlmsg = nl_msg_nlmsghdr(txn->request);
+            reply_nlmsg = (struct nlmsghdr *)reply_buf;
 
-        /* Handle errors embedded within the netlink message. */
-        ofpbuf_use_stub(&tmp_reply, reply_buf, sizeof reply_buf);
-        tmp_reply.size = sizeof reply_buf;
-        if (nl_msg_nlmsgerr(&tmp_reply, &txn->error)) {
-            if (txn->reply) {
-                ofpbuf_clear(txn->reply);
-            }
-            if (txn->error) {
-                VLOG_DBG_RL(&rl, "received NAK error=%d (%s)",
-                            error, ovs_strerror(txn->error));
+            if (request_nlmsg->nlmsg_seq != reply_nlmsg->nlmsg_seq) {
+                ovs_assert(request_nlmsg->nlmsg_seq == reply_nlmsg->nlmsg_seq);
+                VLOG_DBG_RL(&rl, "mismatched seq request %#"PRIx32
+                    ", reply %#"PRIx32, request_nlmsg->nlmsg_seq,
+                    reply_nlmsg->nlmsg_seq);
+                break;
             }
-        } else {
-            txn->error = 0;
-            if (txn->reply) {
-                /* Copy the reply to the buffer specified by the caller. */
-                if (reply_len > txn->reply->allocated) {
-                    ofpbuf_reinit(txn->reply, reply_len);
+
+            /* Handle errors embedded within the netlink message. */
+            ofpbuf_use_stub(&tmp_reply, reply_buf, sizeof reply_buf);
+            tmp_reply.size = sizeof reply_buf;
+            if (nl_msg_nlmsgerr(&tmp_reply, &txn->error)) {
+                if (txn->reply) {
+                    ofpbuf_clear(txn->reply);
+                }
+                if (txn->error) {
+                    VLOG_DBG_RL(&rl, "received NAK error=%d (%s)",
+                                error, ovs_strerror(txn->error));
+                }
+            } else {
+                txn->error = 0;
+                if (txn->reply) {
+                    /* Copy the reply to the buffer specified by the caller. */
+                    if (reply_len > txn->reply->allocated) {
+                        ofpbuf_reinit(txn->reply, reply_len);
+                    }
+                    memcpy(txn->reply->data, reply_buf, reply_len);
+                    txn->reply->size = reply_len;
                 }
-                memcpy(txn->reply->data, reply_buf, reply_len);
-                txn->reply->size = reply_len;
             }
+            ofpbuf_uninit(&tmp_reply);
         }
-        ofpbuf_uninit(&tmp_reply);
 
         /* Count the number of successful transactions. */
         (*done)++;
--
1.9.4.msysgit.2

_______________________________________________
dev mailing list
dev at openvswitch.org
http://openvswitch.org/mailman/listinfo/dev


More information about the dev mailing list