[ovs-dev] [PATCH v3] datapath-windows: Compute checksums for VXLAN inner packets

Alin Serdean aserdean at cloudbasesolutions.com
Wed Sep 30 21:16:55 UTC 2015


Windows does not support VXLAN hardware offloading.

Currently we do not compute IP/TCP/UDP checksums for the inner packet. This
patch computes the checksums mentioned above in regards with the enabled
settings.

i.e. if IP checksum offloading is enabled for the inner packet we compute it.
The same applies for TCP and UDP packets.

This patch also revizes the computation of ones' complement over different
memory blocks, in the case the lengths are odd.

Also per documentation:
https://msdn.microsoft.com/en-us/library/windows/hardware/ff568840%28v=vs.85%29.aspx
set the TCP flags FIN and PSH only for the last segment in the case LSO is
enabled.

Signed-off-by: Alin Gabriel Serdean <aserdean at cloudbasesolutions.com>
Acked-by: Sairam Venugopal <vsairam at vmware.com>
Acked-by: Sorin Vinturis <svinturis at cloudbasesolutions.com>
---
This patch is meant for branch-2.4 as well
v3 add Acked-by's
v2 address comments
---
 datapath-windows/ovsext/BufferMgmt.c | 37 ++++++++++++----
 datapath-windows/ovsext/Checksum.c   | 64 +++++++++++++++++----------
 datapath-windows/ovsext/Vxlan.c      | 84 ++++++++++++++++++++++++++++++++----
 3 files changed, 145 insertions(+), 40 deletions(-)

diff --git a/datapath-windows/ovsext/BufferMgmt.c b/datapath-windows/ovsext/BufferMgmt.c
index 3550e20..9a1cf96 100644
--- a/datapath-windows/ovsext/BufferMgmt.c
+++ b/datapath-windows/ovsext/BufferMgmt.c
@@ -1116,7 +1116,8 @@ GetSegmentHeaderInfo(PNET_BUFFER_LIST nbl,
  * --------------------------------------------------------------------------
  */
 static NDIS_STATUS
-FixSegmentHeader(PNET_BUFFER nb, UINT16 segmentSize, UINT32 seqNumber)
+FixSegmentHeader(PNET_BUFFER nb, UINT16 segmentSize, UINT32 seqNumber,
+                 BOOLEAN lastPacket, UINT16 packetCounter)
 {
     EthHdr *dstEth;
     IPHdr *dstIP;
@@ -1141,18 +1142,34 @@ FixSegmentHeader(PNET_BUFFER nb, UINT16 segmentSize, UINT32 seqNumber)
     /* Fix IP length and checksum */
     ASSERT(dstIP->protocol == IPPROTO_TCP);
     dstIP->tot_len = htons(segmentSize + dstIP->ihl * 4 + TCP_HDR_LEN(dstTCP));
+    dstIP->id += packetCounter;
     dstIP->check = 0;
     dstIP->check = IPChecksum((UINT8 *)dstIP, dstIP->ihl * 4, 0);
 
     /* Fix TCP checksum */
     dstTCP->seq = htonl(seqNumber);
-    dstTCP->check =
-        IPPseudoChecksum((UINT32 *)&dstIP->saddr,
-                         (UINT32 *)&dstIP->daddr,
-                         IPPROTO_TCP, segmentSize + TCP_HDR_LEN(dstTCP));
+
+    /*
+     * Set the TCP FIN and PSH bit only for the last packet
+     * More information can be found under:
+     * https://msdn.microsoft.com/en-us/library/windows/hardware/ff568840%28v=vs.85%29.aspx
+     */
+    if (dstTCP->fin) {
+        dstTCP->fin = lastPacket;
+    }
+    if (dstTCP->psh) {
+        dstTCP->psh = lastPacket;
+    }
+
+    UINT16 csumLength = segmentSize + TCP_HDR_LEN(dstTCP);
+    dstTCP->check = IPPseudoChecksum(&dstIP->saddr,
+                                     &dstIP->daddr,
+                                     IPPROTO_TCP,
+                                     csumLength);
     dstTCP->check = CalculateChecksumNB(nb,
-            (UINT16)(NET_BUFFER_DATA_LENGTH(nb) - sizeof *dstEth - dstIP->ihl * 4),
-            sizeof *dstEth + dstIP->ihl * 4);
+                                        csumLength,
+                                        sizeof *dstEth + dstIP->ihl * 4);
+
     return STATUS_SUCCESS;
 }
 
@@ -1190,6 +1207,7 @@ OvsTcpSegmentNBL(PVOID ovsContext,
     NDIS_STATUS status;
     UINT16 segmentSize;
     ULONG copiedSize;
+    UINT16 packetCounter = 0;
 
     srcCtx = (POVS_BUFFER_CONTEXT)NET_BUFFER_LIST_CONTEXT_DATA_START(nbl);
     if (srcCtx == NULL || srcCtx->magic != OVS_CTX_MAGIC) {
@@ -1232,7 +1250,9 @@ OvsTcpSegmentNBL(PVOID ovsContext,
             goto nblcopy_error;
         }
 
-        status = FixSegmentHeader(newNb, segmentSize, seqNumber);
+        status = FixSegmentHeader(newNb, segmentSize, seqNumber,
+                                  NET_BUFFER_NEXT_NB(newNb) == NULL,
+                                  packetCounter);
         if (status != NDIS_STATUS_SUCCESS) {
             goto nblcopy_error;
         }
@@ -1241,6 +1261,7 @@ OvsTcpSegmentNBL(PVOID ovsContext,
         /* Move on to the next segment */
         size -= segmentSize;
         seqNumber += segmentSize;
+        packetCounter++;
     }
 
     status = OvsAllocateNBLContext(context, newNbl);
diff --git a/datapath-windows/ovsext/Checksum.c b/datapath-windows/ovsext/Checksum.c
index 510a094..5d9b035 100644
--- a/datapath-windows/ovsext/Checksum.c
+++ b/datapath-windows/ovsext/Checksum.c
@@ -68,34 +68,48 @@ CalculateOnesComplement(UINT8 *start,
 {
     UINT64  sum = 0, val;
     UINT64  *src = (UINT64 *)start;
-    union {
-        UINT32 val;
-        UINT8  b8[4];
-    } tmp;
-
     while (totalLength > 7) {
         val = *src;
-        sum += (val >> 32) + (val & 0xffffffff);
+        sum += val;
+        if (sum < val) sum++;
         src++;
         totalLength -= 8;
     }
+
+    start = (UINT8 *)src;
+
     if (totalLength > 3) {
-        sum += *(UINT32 *)src;
-        src = (UINT64 *)((UINT8 *)src + 4);
+        UINT32 val = *(UINT32 *)start;
+        sum += val;
+        if (sum < val) sum++;
+        start += 4;
         totalLength -= 4;
     }
-    start = (UINT8 *)src;
-    tmp.val = 0;
-    switch (totalLength) {
-    case 3:
-        tmp.b8[2] = start[2];
-    case 2:
-        tmp.b8[1] = start[1];
-    case 1:
-        tmp.b8[0] = start[0];
-        sum += tmp.val;
+
+    if (totalLength > 1) {
+        UINT16 val = *(UINT16 *)start;
+        sum += val;
+        if (sum < val) sum++;
+        start += 2;
+        totalLength -= 2;
     }
-    sum = (isEvenStart ? sum : swap64(sum)) + initial;
+
+    if (totalLength > 0) {
+        UINT8 val = *start;
+        sum += val;
+        if (sum < val) sum++;
+        start += 1;
+        totalLength -= 1;
+    }
+    ASSERT(totalLength == 0);
+
+    if (!isEvenStart) {
+        sum = _byteswap_uint64(sum);
+    }
+
+    sum += initial;
+    if (sum < initial) sum++;
+
     return sum;
 }
 
@@ -428,6 +442,7 @@ CalculateChecksumNB(const PNET_BUFFER nb,
     ULONG firstMdlLen;
     /* Running count of bytes in remainder of the MDLs including current. */
     ULONG packetLen;
+    BOOLEAN swapEnd = 1 & csumDataLen;
 
     if ((nb == NULL) || (csumDataLen == 0)
             || (offset >= NET_BUFFER_DATA_LENGTH(nb))
@@ -482,10 +497,8 @@ CalculateChecksumNB(const PNET_BUFFER nb,
     while (csumDataLen && (currentMdl != NULL)) {
         ASSERT(mdlLen < 65536);
         csLen = MIN((UINT16) mdlLen, csumDataLen);
-        //XXX Not handling odd bytes yet.
-        ASSERT(((csLen & 0x1) == 0) || csumDataLen <= mdlLen);
 
-        csum = CalculateOnesComplement(src, csLen, csum, TRUE);
+        csum = CalculateOnesComplement(src, csLen, csum, !(1 & csumDataLen));
         fold64(csum);
 
         csumDataLen -= csLen;
@@ -504,9 +517,14 @@ CalculateChecksumNB(const PNET_BUFFER nb,
         }
     }
 
+    fold64(csum);
     ASSERT(csumDataLen == 0);
     ASSERT((csum & ~0xffff) == 0);
-    return (UINT16) ~csum;
+    csum = (UINT16)~csum;
+    if (swapEnd) {
+        return _byteswap_ushort((UINT16)csum);
+    }
+    return (UINT16)csum;
 }
 
 /*
diff --git a/datapath-windows/ovsext/Vxlan.c b/datapath-windows/ovsext/Vxlan.c
index 2364f28..c0611da 100644
--- a/datapath-windows/ovsext/Vxlan.c
+++ b/datapath-windows/ovsext/Vxlan.c
@@ -152,9 +152,9 @@ OvsCleanupVxlanTunnel(PIRP irp,
 
     if (vxlanPort->filterID != 0) {
         status = OvsTunnelFilterDelete(irp,
-				       vxlanPort->filterID,
-				       callback,
-				       tunnelContext);
+                                       vxlanPort->filterID,
+                                       callback,
+                                       tunnelContext);
     } else {
         OvsFreeMemoryWithTag(vport->priv, OVS_VXLAN_POOL_TAG);
         vport->priv = NULL;
@@ -198,20 +198,24 @@ OvsDoEncapVxlan(POVS_VPORT_ENTRY vport,
      */
     curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
     packetLength = NET_BUFFER_DATA_LENGTH(curNb);
+
     if (layers->isTcp) {
         NDIS_TCP_LARGE_SEND_OFFLOAD_NET_BUFFER_LIST_INFO tsoInfo;
 
         tsoInfo.Value = NET_BUFFER_LIST_INFO(curNbl,
-                TcpLargeSendNetBufferListInfo);
-        OVS_LOG_TRACE("MSS %u packet len %u", tsoInfo.LsoV1Transmit.MSS, packetLength);
+                                             TcpLargeSendNetBufferListInfo);
+        OVS_LOG_TRACE("MSS %u packet len %u", tsoInfo.LsoV1Transmit.MSS,
+                      packetLength);
         if (tsoInfo.LsoV1Transmit.MSS) {
             OVS_LOG_TRACE("l4Offset %d", layers->l4Offset);
             *newNbl = OvsTcpSegmentNBL(switchContext, curNbl, layers,
-                        tsoInfo.LsoV1Transmit.MSS, headRoom);
+                                       tsoInfo.LsoV1Transmit.MSS, headRoom);
             if (*newNbl == NULL) {
                 OVS_LOG_ERROR("Unable to segment NBL");
                 return NDIS_STATUS_FAILURE;
             }
+            /* Clear out LSO flags after this point */
+            NET_BUFFER_LIST_INFO(*newNbl, TcpLargeSendNetBufferListInfo) = 0;
         }
     }
 
@@ -226,6 +230,70 @@ OvsDoEncapVxlan(POVS_VPORT_ENTRY vport,
             OVS_LOG_ERROR("Unable to copy NBL");
             return NDIS_STATUS_FAILURE;
         }
+        /*
+         * To this point we do not have VXLAN offloading.
+         * Apply defined checksums
+         */
+        curNb = NET_BUFFER_LIST_FIRST_NB(*newNbl);
+        curMdl = NET_BUFFER_CURRENT_MDL(curNb);
+        bufferStart = (PUINT8)MmGetSystemAddressForMdlSafe(curMdl, LowPagePriority);
+        if (!bufferStart) {
+            status = NDIS_STATUS_RESOURCES;
+            goto ret_error;
+        }
+
+        NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO csumInfo;
+        csumInfo.Value = NET_BUFFER_LIST_INFO(curNbl,
+                                              TcpIpChecksumNetBufferListInfo);
+
+        bufferStart += NET_BUFFER_CURRENT_MDL_OFFSET(curNb);
+
+        if (layers->isIPv4) {
+            IPHdr *ip = (IPHdr *)(bufferStart + layers->l3Offset);
+
+            if (csumInfo.Transmit.IpHeaderChecksum) {
+                ip->check = 0;
+                ip->check = IPChecksum((UINT8 *)ip, 4 * ip->ihl, 0);
+            }
+
+            if (layers->isTcp && csumInfo.Transmit.TcpChecksum) {
+                UINT16 csumLength = (UINT16)(packetLength - layers->l4Offset);
+                TCPHdr *tcp = (TCPHdr *)(bufferStart + layers->l4Offset);
+                tcp->check = IPPseudoChecksum(&ip->saddr, &ip->daddr,
+                                              IPPROTO_TCP, csumLength);
+                tcp->check = CalculateChecksumNB(curNb, csumLength,
+                                                 (UINT32)(layers->l4Offset));
+            } else if (layers->isUdp && csumInfo.Transmit.UdpChecksum) {
+                UINT16 csumLength = (UINT16)(packetLength - layers->l4Offset);
+                UDPHdr *udp = (UDPHdr *)((PCHAR)ip + sizeof *ip);
+                udp->check = IPPseudoChecksum(&ip->saddr, &ip->daddr,
+                                              IPPROTO_UDP, csumLength);
+                udp->check = CalculateChecksumNB(curNb, csumLength,
+                                                 (UINT32)(layers->l4Offset));
+            }
+        } else if (layers->isIPv6) {
+            IPv6Hdr *ip = (IPv6Hdr *)(bufferStart + layers->l3Offset);
+
+            if (layers->isTcp && csumInfo.Transmit.TcpChecksum) {
+                UINT16 csumLength = (UINT16)(packetLength - layers->l4Offset);
+                TCPHdr *tcp = (TCPHdr *)(bufferStart + layers->l4Offset);
+                tcp->check = IPv6PseudoChecksum((UINT32 *) &ip->saddr,
+                                                (UINT32 *) &ip->daddr,
+                                                IPPROTO_TCP, csumLength);
+                tcp->check = CalculateChecksumNB(curNb, csumLength,
+                                                 (UINT32)(layers->l4Offset));
+            } else if (layers->isUdp && csumInfo.Transmit.UdpChecksum) {
+                UINT16 csumLength = (UINT16)(packetLength - layers->l4Offset);
+                UDPHdr *udp = (UDPHdr *)((PCHAR)ip + sizeof *ip);
+                udp->check = IPv6PseudoChecksum((UINT32 *) &ip->saddr,
+                                                (UINT32 *) &ip->daddr,
+                                                IPPROTO_UDP, csumLength);
+                udp->check = CalculateChecksumNB(curNb, csumLength,
+                                                 (UINT32)(layers->l4Offset));
+            }
+        }
+        /* Clear out TcpIpChecksumNetBufferListInfo flag */
+        NET_BUFFER_LIST_INFO(*newNbl, TcpIpChecksumNetBufferListInfo) = 0;
     }
 
     curNbl = *newNbl;
@@ -257,9 +325,6 @@ OvsDoEncapVxlan(POVS_VPORT_ENTRY vport,
                        sizeof ethHdr->Destination + sizeof ethHdr->Source);
         ethHdr->Type = htons(ETH_TYPE_IPV4);
 
-        // XXX: question: there are fields in the OvsIPv4TunnelKey for ttl and such,
-        // should we use those values instead? or will they end up being
-        // uninitialized;
         /* IP header */
         ipHdr = (IPHdr *)((PCHAR)ethHdr + sizeof *ethHdr);
 
@@ -277,6 +342,7 @@ OvsDoEncapVxlan(POVS_VPORT_ENTRY vport,
         ASSERT(tunKey->src == fwdInfo->srcIpAddr || tunKey->src == 0);
         ipHdr->saddr = fwdInfo->srcIpAddr;
         ipHdr->daddr = fwdInfo->dstIpAddr;
+
         ipHdr->check = 0;
         ipHdr->check = IPChecksum((UINT8 *)ipHdr, sizeof *ipHdr, 0);
 
-- 
1.9.5.msysgit.0



More information about the dev mailing list