[ovs-dev] [PATCH] [RFC] datapath: Implement vxlan-soe

Han Zhou zhouhan at gmail.com
Mon May 12 08:04:26 UTC 2014


This patch implements vxlan-soe:
    http://tools.ietf.org/html/draft-zhou-li-vxlan-soe-01
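
In short: instead of segmenting the inner GSO packet on the sending hypervisor, the
sender records the inner MSS in two otherwise-reserved bytes of the VXLAN header and
sets a new flag bit (0x80); the receiver reads the MSS back, restores the skb's GSO
metadata, and lets its own stack do the segmentation. Below is a minimal standalone
sketch of that header usage, written as ordinary userspace C purely for illustration;
the struct layout and bit values mirror the patch further down, but the helper names
here are illustrative only and not part of the patch.

/* Standalone userspace illustration (not kernel code): how the patch
 * carries the sender's GSO MSS in the VXLAN header. Field names mirror
 * the struct in the patch; helper names are made up for this sketch. */
#include <stdint.h>
#include <stdio.h>

struct soe_vxlanhdr {
    uint8_t  vx_flags;      /* 0x08 = VNI valid, 0x80 = SOE/GSO flag */
    uint8_t  vx_mss_hi;     /* high byte of inner MSS */
    uint16_t vx_protocol;   /* reserved / VXLAN-GPE next protocol */
    uint8_t  vx_vni[3];     /* 24-bit VNI */
    uint8_t  vx_mss_lo;     /* low byte of inner MSS */
};

/* Sender side: record the MSS and set the SOE flag instead of segmenting. */
static void soe_pack_mss(struct soe_vxlanhdr *vxh, uint16_t mss)
{
    vxh->vx_flags |= 0x80;
    vxh->vx_mss_hi = (uint8_t)(mss >> 8);
    vxh->vx_mss_lo = (uint8_t)mss;
}

/* Receiver side: restore the MSS so the local stack can resegment. */
static uint16_t soe_unpack_mss(const struct soe_vxlanhdr *vxh)
{
    return (uint16_t)((vxh->vx_mss_hi << 8) | vxh->vx_mss_lo);
}

int main(void)
{
    struct soe_vxlanhdr vxh = { .vx_flags = 0x08 };

    soe_pack_mss(&vxh, 1448);   /* typical TCP MSS on a 1500-byte MTU */
    printf("restored MSS: %u\n", soe_unpack_mss(&vxh));   /* prints 1448 */
    return 0;
}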

I tested VXLAN throughput between two hypervisors, and the
performance gain from vxlan-soe is significant.
netperf TCP_STREAM test results:

Before the change: 2.62 Gbits/sec
After the change: 6.68 Gbits/sec
That is roughly a 2.5x speedup.

I hope this feature is useful for those who rely on VXLAN.
Let me know your thoughts; any comments are welcome!

Signed-off-by: Han Zhou <zhouhan at gmail.com>
---
 datapath/linux/compat/include/net/vxlan.h |  28 +++++-
 datapath/linux/compat/vxlan.c             | 153 ++++++++++++++++++++++++++----
 datapath/vport-vxlan.c                    |   9 +-
 3 files changed, 165 insertions(+), 25 deletions(-)

diff --git a/datapath/linux/compat/include/net/vxlan.h b/datapath/linux/compat/include/net/vxlan.h
index 414a497..7ba5291 100644
--- a/datapath/linux/compat/include/net/vxlan.h
+++ b/datapath/linux/compat/include/net/vxlan.h
@@ -10,8 +10,32 @@
 #include_next <net/vxlan.h>
 #else

+#define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr))
+
+#define VXLAN_FLAG_GSO 0x80 /* VXLAN-SOE */
+#define VXLAN_FLAGS 0x08 /* struct vxlanhdr.vx_flags required value. */
+
+/* VXLAN protocol header */
+struct vxlanhdr {
+    __u8     vx_flags;
+    __u8     vx_mss_hi;
+    __be16     vx_protocol; /* VXLAN-GPE */
+    __u8    vx_vni[3];
+    __u8    vx_mss_lo;
+};
+
+static inline void vxh_set_vni(struct vxlanhdr *vxh, __u32 vni)
+{
+    *((__u32*)&vxh->vx_vni) = htonl(vxh->vx_mss_lo | (vni << 8));
+}
+
+static inline __u32 vxh_get_vni(struct vxlanhdr *vxh)
+{
+    return ((ntohl(*(__u32*)&vxh->vx_vni) & 0xffffff00) >> 8);
+}
+
 struct vxlan_sock;
-typedef void (vxlan_rcv_t)(struct vxlan_sock *vs, struct sk_buff *skb, __be32 key);
+typedef void (vxlan_rcv_t)(struct vxlan_sock *vs, struct sk_buff *skb, __u32 key);

 /* per UDP socket information */
 struct vxlan_sock {
@@ -32,7 +56,7 @@ void vxlan_sock_release(struct vxlan_sock *vs);
 int vxlan_xmit_skb(struct vxlan_sock *vs,
            struct rtable *rt, struct sk_buff *skb,
            __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df,
-           __be16 src_port, __be16 dst_port, __be32 vni);
+           __be16 src_port, __be16 dst_port, __u32 vni);

 __be16 vxlan_src_port(__u16 port_min, __u16 port_max, struct sk_buff *skb);

diff --git a/datapath/linux/compat/vxlan.c b/datapath/linux/compat/vxlan.c
index b8b8fa7..80fa233 100644
--- a/datapath/linux/compat/vxlan.c
+++ b/datapath/linux/compat/vxlan.c
@@ -59,15 +59,102 @@
 #include "gso.h"
 #include "vlan.h"

-#define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr))

-#define VXLAN_FLAGS 0x08000000    /* struct vxlanhdr.vx_flags required value. */
+static inline int vxlan_parse_inner_hdr(struct sk_buff *skb, u16 *l3_type, u8 *l4_type, u16 *l4_offset)
+{
+    struct ethhdr *ethh = (struct ethhdr*)skb->data;
+    unsigned char *p = (unsigned char *)(ethh + 1);
+    u16 l2_hdr_size, l3_hdr_size;
+    u16 ethertype;
+    u8 l4_proto;
+    struct iphdr *iph;
+    struct ipv6hdr *ipv6;
+
+
+    ethertype = ntohs(ethh->h_proto);
+    if (ethertype == ETH_P_8021Q) {
+        ethertype = ntohs(*(__be16*)(p + 2));
+        p += 4;
+    }
+    l2_hdr_size = p - skb->data;
+
+    if (ethertype == ETH_P_IP) {
+        iph = (struct iphdr *)p;
+        l3_hdr_size = iph->ihl << 2;
+        l4_proto = iph->protocol;
+
+    } else if (ethertype == ETH_P_IPV6) {
+        ipv6 = (struct ipv6hdr *)p;
+        l3_hdr_size = sizeof(struct ipv6hdr);
+        l4_proto = ipv6->nexthdr;
+
+    } else {
+        return -1;
+    }
+    if (l4_proto != IPPROTO_TCP && l4_proto != IPPROTO_UDP) {
+        return -1;
+    }
+
+    *l3_type = ethertype;
+    *l4_type = l4_proto;
+    *l4_offset = l2_hdr_size + l3_hdr_size;
+    return 0;
+
+}
+
+static inline int vxlan_handle_soe(struct sk_buff *skb, struct vxlanhdr *vxh)
+{
+    u16 ethertype;
+    u8 ipproto;
+    u16 csum_offset, l4_offset;
+    unsigned short gso_type;
+
+    if (unlikely(skb_unclone(skb, GFP_ATOMIC))) {
+        return -1;
+    }
+
+    skb_shinfo(skb)->gso_size = (((__u16)vxh->vx_mss_hi) << 8) +
+        vxh->vx_mss_lo;
+    skb_shinfo(skb)->gso_segs = 0;

-/* VXLAN protocol header */
-struct vxlanhdr {
-    __be32 vx_flags;
-    __be32 vx_vni;
-};
+
+    if (unlikely(skb_linearize(skb)))
+        return -1;
+
+    if (unlikely(vxlan_parse_inner_hdr(skb, &ethertype, &ipproto, &l4_offset))) {
+        return -1;
+    }
+    if (ethertype == ETH_P_IP) {
+        if (ipproto == IPPROTO_TCP) {
+            gso_type = SKB_GSO_TCPV4;
+            csum_offset = offsetof(struct tcphdr, check);
+        } else if (ipproto == IPPROTO_UDP) {
+            gso_type = SKB_GSO_UDP;
+            csum_offset = offsetof(struct udphdr, check);
+        } else {
+            BUG();
+        }
+    } else if (ethertype == ETH_P_IPV6) {
+        if (ipproto == IPPROTO_TCP) {
+            gso_type = SKB_GSO_TCPV6;
+            csum_offset = offsetof(struct tcphdr, check);
+        } else if (ipproto == IPPROTO_UDP) {
+            gso_type = SKB_GSO_UDP;
+            csum_offset = offsetof(struct udphdr, check);
+        } else {
+            BUG();
+        }
+    } else {
+        BUG();
+    }
+
+    skb_shinfo(skb)->gso_type = gso_type;
+    skb->ip_summed = CHECKSUM_PARTIAL;
+    skb->csum_start = skb_headroom(skb) + l4_offset;
+    skb->csum_offset = csum_offset;
+
+    return 0;
+}

 /* Callback from net/ipv4/udp.c to receive packets */
 static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
@@ -81,13 +168,13 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)

     /* Return packets with reserved bits set */
     vxh = (struct vxlanhdr *)(udp_hdr(skb) + 1);
-    if (vxh->vx_flags != htonl(VXLAN_FLAGS) ||
+/*    if (vxh->vx_flags != htonl(VXLAN_FLAGS) ||
         (vxh->vx_vni & htonl(0xff))) {
         pr_warn("invalid vxlan flags=%#x vni=%#x\n",
             ntohl(vxh->vx_flags), ntohl(vxh->vx_vni));
         goto error;
     }
-
+*/
     if (iptunnel_pull_header(skb, VXLAN_HLEN, htons(ETH_P_TEB)))
         goto drop;

@@ -95,7 +182,13 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
     if (!vs)
         goto drop;

-    vs->rcv(vs, skb, vxh->vx_vni);
+    if (vxh->vx_flags & VXLAN_FLAG_GSO) {
+        if (unlikely(vxlan_handle_soe(skb, vxh)))
+            goto drop;
+    }
+
+    vs->rcv(vs, skb, vxh_get_vni(vxh));
+
     return 0;

 drop:
@@ -153,10 +246,10 @@ static void vxlan_gso(struct sk_buff *skb)
         struct iphdr *iph = ip_hdr(skb);

         uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
-                           skb->len - udp_offset,
-                           IPPROTO_UDP, 0);
+                    skb->len - udp_offset,
+                    IPPROTO_UDP, 0);
         uh->check = csum_fold(skb_checksum(skb, udp_offset,
-                      skb->len - udp_offset, 0));
+                    skb->len - udp_offset, 0));

         if (uh->check == 0)
             uh->check = CSUM_MANGLED_0;
@@ -165,10 +258,31 @@ static void vxlan_gso(struct sk_buff *skb)
     skb->ip_summed = CHECKSUM_NONE;
 }

-static int handle_offloads(struct sk_buff *skb)
+
+static int handle_offloads(struct sk_buff *skb, struct vxlanhdr* vxh)
 {
+    int err;
     if (skb_is_gso(skb)) {
-        OVS_GSO_CB(skb)->fix_segment = vxlan_gso;
+        /* Offload with vxlan-soe if the encapsulated packet
+           fits in the maximum IP packet size; otherwise fall
+           back to local GSO. */
+        if (skb->len + sizeof(struct iphdr) > 65535) {
+            OVS_GSO_CB(skb)->fix_segment = vxlan_gso;
+        } else {
+
+            vxh->vx_flags |= VXLAN_FLAG_GSO;
+            vxh->vx_mss_hi = (__u8)(skb_shinfo(skb)->gso_size >> 8);
+            vxh->vx_mss_lo = (__u8)skb_shinfo(skb)->gso_size;
+
+            err = skb_unclone(skb, GFP_ATOMIC);
+            if (unlikely(err))
+                return err;
+
+            skb_shinfo(skb)->gso_type = 0;
+            skb_shinfo(skb)->gso_size = 0;
+            skb_shinfo(skb)->gso_segs = 0;
+        }
+
     } else {
         if (skb->ip_summed != CHECKSUM_PARTIAL)
             skb->ip_summed = CHECKSUM_NONE;
@@ -179,7 +293,7 @@ static int handle_offloads(struct sk_buff *skb)
 int vxlan_xmit_skb(struct vxlan_sock *vs,
            struct rtable *rt, struct sk_buff *skb,
            __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df,
-           __be16 src_port, __be16 dst_port, __be32 vni)
+           __be16 src_port, __be16 dst_port, __u32 vni)
 {
     struct vxlanhdr *vxh;
     struct udphdr *uh;
@@ -207,8 +321,9 @@ int vxlan_xmit_skb(struct vxlan_sock *vs,
     skb_reset_inner_headers(skb);

     vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
-    vxh->vx_flags = htonl(VXLAN_FLAGS);
-    vxh->vx_vni = vni;
+    memset(vxh, 0, sizeof(*vxh));
+    vxh->vx_flags = VXLAN_FLAGS;
+    vxh_set_vni(vxh, vni);

     __skb_push(skb, sizeof(*uh));
     skb_reset_transport_header(skb);
@@ -222,7 +337,7 @@ int vxlan_xmit_skb(struct vxlan_sock *vs,

     vxlan_set_owner(vs->sock->sk, skb);

-    err = handle_offloads(skb);
+    err = handle_offloads(skb, vxh);
     if (err)
         return err;

diff --git a/datapath/vport-vxlan.c b/datapath/vport-vxlan.c
index cc9477d..ef0cd06 100644
--- a/datapath/vport-vxlan.c
+++ b/datapath/vport-vxlan.c
@@ -58,16 +58,17 @@ static inline struct vxlan_port *vxlan_vport(const struct vport *vport)
 }

 /* Called with rcu_read_lock and BH disabled. */
-static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __be32 vx_vni)
+static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __u32 vx_vni)
 {
     struct ovs_key_ipv4_tunnel tun_key;
     struct vport *vport = vs->data;
     struct iphdr *iph;
     __be64 key;
-
+
+
     /* Save outer tunnel values */
     iph = ip_hdr(skb);
-    key = cpu_to_be64(ntohl(vx_vni) >> 8);
+    key = cpu_to_be64(vx_vni);
     ovs_flow_tun_key_init(&tun_key, iph, key, TUNNEL_KEY);

     ovs_vport_receive(vport, skb, &tun_key);
@@ -181,7 +182,7 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb)
                  OVS_CB(skb)->tun_key->ipv4_tos,
                  OVS_CB(skb)->tun_key->ipv4_ttl, df,
                  src_port, dst_port,
-                 htonl(be64_to_cpu(OVS_CB(skb)->tun_key->tun_id) << 8));
+                 (__u32)be64_to_cpu(OVS_CB(skb)->tun_key->tun_id));
     if (err < 0)
         ip_rt_put(rt);
 error:
-- 
1.9.0


