[ovs-dev] [PATCH] [RFC] datapath: Implement vxlan-soe
Han Zhou
zhouhan at gmail.com
Mon May 12 08:04:26 UTC 2014
This patch implements vxlan-soe:
http://tools.ietf.org/html/draft-zhou-li-vxlan-soe-01
Tested VXLAN throughput between two hypervisors, and the performance
gain of vxlan-soe is significant.
netperf TCP_STREAM test result:
Before the change: 2.62 Gbits/sec
After the change: 6.68 Gbits/sec
Speedup is ~2.5x.
Hope this feature is useful for those who rely on VXLAN.
Let me know your thoughts and any comments are welcome!
Signed-off-by: Han Zhou <zhouhan at gmail.com>
---
datapath/linux/compat/include/net/vxlan.h | 28 +++++-
datapath/linux/compat/vxlan.c | 153 ++++++++++++++++++++++++++----
datapath/vport-vxlan.c | 9 +-
3 files changed, 165 insertions(+), 25 deletions(-)
diff --git a/datapath/linux/compat/include/net/vxlan.h
b/datapath/linux/compat/include/net/vxlan.h
index 414a497..7ba5291 100644
--- a/datapath/linux/compat/include/net/vxlan.h
+++ b/datapath/linux/compat/include/net/vxlan.h
@@ -10,8 +10,32 @@
#include_next <net/vxlan.h>
#else
+#define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr))
+
+#define VXLAN_FLAG_GSO 0x80 /* VXLAN-SOE */
+#define VXLAN_FLAGS 0x08 /* struct vxlanhdr.vx_flags required value. */
+
+/* VXLAN protocol header */
+struct vxlanhdr {
+ __u8 vx_flags;
+ __u8 vx_mss_hi;
+ __be16 vx_protocol; /* VXLAN-GPE */
+ __u8 vx_vni[3];
+ __u8 vx_mss_lo;
+};
+
+static inline void vxh_set_vni(struct vxlanhdr *vxh, __u32 vni)
+{
+ *((__u32*)&vxh->vx_vni) = htonl(vxh->vx_mss_lo | (vni << 8));
+}
+
+static inline __u32 vxh_get_vni(struct vxlanhdr *vxh)
+{
+ return ((ntohl(*(__u32*)&vxh->vx_vni) & 0xffffff00) >> 8);
+}
+
struct vxlan_sock;
-typedef void (vxlan_rcv_t)(struct vxlan_sock *vs, struct sk_buff
*skb, __be32 key);
+typedef void (vxlan_rcv_t)(struct vxlan_sock *vs, struct sk_buff
*skb, __u32 key);
/* per UDP socket information */
struct vxlan_sock {
@@ -32,7 +56,7 @@ void vxlan_sock_release(struct vxlan_sock *vs);
int vxlan_xmit_skb(struct vxlan_sock *vs,
struct rtable *rt, struct sk_buff *skb,
__be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df,
- __be16 src_port, __be16 dst_port, __be32 vni);
+ __be16 src_port, __be16 dst_port, __u32 vni);
__be16 vxlan_src_port(__u16 port_min, __u16 port_max, struct sk_buff *skb);
diff --git a/datapath/linux/compat/vxlan.c b/datapath/linux/compat/vxlan.c
index b8b8fa7..80fa233 100644
--- a/datapath/linux/compat/vxlan.c
+++ b/datapath/linux/compat/vxlan.c
@@ -59,15 +59,102 @@
#include "gso.h"
#include "vlan.h"
-#define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr))
-#define VXLAN_FLAGS 0x08000000 /* struct vxlanhdr.vx_flags
required value. */
+static inline int vxlan_parse_inner_hdr(struct sk_buff *skb, u16
*l3_type, u8 *l4_type, u16 *l4_offset)
+{
+ struct ethhdr *ethh = (struct ethhdr*)skb->data;
+ unsigned char *p = (unsigned char *)(ethh + 1);
+ u16 l2_hdr_size, l3_hdr_size;
+ u16 ethertype;
+ u8 l4_proto;
+ struct iphdr *iph;
+ struct ipv6hdr *ipv6;
+
+
+ ethertype = ntohs(ethh->h_proto);
+ if (ethertype == ETH_P_8021Q) {
+ ethertype = ntohs(*(__be16*)(p + 2));
+ p += 4;
+ }
+ l2_hdr_size = p - skb->data;
+
+ if (ethertype == ETH_P_IP) {
+ iph = (struct iphdr *)p;
+ l3_hdr_size = iph->ihl << 2;
+ l4_proto = iph->protocol;
+
+ } else if (ethertype == ETH_P_IPV6) {
+ ipv6 = (struct ipv6hdr *)p;
+ l3_hdr_size = sizeof(struct ipv6hdr);
+ l4_proto = ipv6->nexthdr;
+
+ } else {
+ return -1;
+ }
+ if (l4_proto != IPPROTO_TCP && l4_proto != IPPROTO_UDP) {
+ return -1;
+ }
+
+ *l3_type = ethertype;
+ *l4_type = l4_proto;
+ *l4_offset = l2_hdr_size + l3_hdr_size;
+ return 0;
+
+}
+
+static inline int vxlan_handle_soe(struct sk_buff *skb, struct vxlanhdr *vxh)
+{
+ u16 ethertype;
+ u8 ipproto;
+ u16 csum_offset, l4_offset;
+ unsigned short gso_type;
+
+ if (unlikely(skb_unclone(skb, GFP_ATOMIC))) {
+ return -1;
+ }
+
+ skb_shinfo(skb)->gso_size = (((__u16)vxh->vx_mss_hi) << 8) +
+ vxh->vx_mss_lo;
+ skb_shinfo(skb)->gso_segs = 0;
-/* VXLAN protocol header */
-struct vxlanhdr {
- __be32 vx_flags;
- __be32 vx_vni;
-};
+
+ if (unlikely(skb_linearize(skb)))
+ return -1;
+
+ if (unlikely(vxlan_parse_inner_hdr(skb, &ethertype, &ipproto,
&l4_offset))) {
+ return -1;
+ }
+ if (ethertype == ETH_P_IP) {
+ if (ipproto == IPPROTO_TCP) {
+ gso_type = SKB_GSO_TCPV4;
+ csum_offset = offsetof(struct tcphdr, check);
+ } else if (ipproto == IPPROTO_UDP) {
+ gso_type = SKB_GSO_UDP;
+ csum_offset = offsetof(struct udphdr, check);
+ } else {
+ BUG();
+ }
+ } else if (ethertype == ETH_P_IPV6) {
+ if (ipproto == IPPROTO_TCP) {
+ gso_type = SKB_GSO_TCPV6;
+ csum_offset = offsetof(struct tcphdr, check);
+ } else if (ipproto == IPPROTO_UDP) {
+ gso_type = SKB_GSO_UDP;
+ csum_offset = offsetof(struct udphdr, check);
+ } else {
+ BUG();
+ }
+ } else {
+ BUG();
+ }
+
+ skb_shinfo(skb)->gso_type = gso_type;
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum_start = skb_headroom(skb) + l4_offset;
+ skb->csum_offset = offsetof(struct tcphdr, check);
+
+ return 0;
+}
/* Callback from net/ipv4/udp.c to receive packets */
static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
@@ -81,13 +168,13 @@ static int vxlan_udp_encap_recv(struct sock *sk,
struct sk_buff *skb)
/* Return packets with reserved bits set */
vxh = (struct vxlanhdr *)(udp_hdr(skb) + 1);
- if (vxh->vx_flags != htonl(VXLAN_FLAGS) ||
+/* if (vxh->vx_flags != htonl(VXLAN_FLAGS) ||
(vxh->vx_vni & htonl(0xff))) {
pr_warn("invalid vxlan flags=%#x vni=%#x\n",
ntohl(vxh->vx_flags), ntohl(vxh->vx_vni));
goto error;
}
-
+*/
if (iptunnel_pull_header(skb, VXLAN_HLEN, htons(ETH_P_TEB)))
goto drop;
@@ -95,7 +182,13 @@ static int vxlan_udp_encap_recv(struct sock *sk,
struct sk_buff *skb)
if (!vs)
goto drop;
- vs->rcv(vs, skb, vxh->vx_vni);
+ if (vxh->vx_flags & VXLAN_FLAG_GSO) {
+ if (unlikely(vxlan_handle_soe(skb, vxh)))
+ goto drop;
+ }
+
+ vs->rcv(vs, skb, vxh_get_vni(vxh));
+
return 0;
drop:
@@ -153,10 +246,10 @@ static void vxlan_gso(struct sk_buff *skb)
struct iphdr *iph = ip_hdr(skb);
uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
- skb->len - udp_offset,
- IPPROTO_UDP, 0);
+ skb->len - udp_offset,
+ IPPROTO_UDP, 0);
uh->check = csum_fold(skb_checksum(skb, udp_offset,
- skb->len - udp_offset, 0));
+ skb->len - udp_offset, 0));
if (uh->check == 0)
uh->check = CSUM_MANGLED_0;
@@ -165,10 +258,31 @@ static void vxlan_gso(struct sk_buff *skb)
skb->ip_summed = CHECKSUM_NONE;
}
-static int handle_offloads(struct sk_buff *skb)
+
+static int handle_offloads(struct sk_buff *skb, struct vxlanhdr* vxh)
{
+ int err;
if (skb_is_gso(skb)) {
- OVS_GSO_CB(skb)->fix_segment = vxlan_gso;
+ /* offload with vxlan-soe if encapsulated packet
+ fits in MAX IP packet size, otherwise fallback to
+ local GSO */
+ if (skb->len + sizeof(struct iphdr) > 65535) {
+ OVS_GSO_CB(skb)->fix_segment = vxlan_gso;
+ } else {
+
+ vxh->vx_flags |= VXLAN_FLAG_GSO;
+ vxh->vx_mss_hi = (__u8)(skb_shinfo(skb)->gso_size >> 8);
+ vxh->vx_mss_lo = (__u8)skb_shinfo(skb)->gso_size;
+
+ err = skb_unclone(skb, GFP_ATOMIC);
+ if (unlikely(err))
+ return err;
+
+ skb_shinfo(skb)->gso_type = 0;
+ skb_shinfo(skb)->gso_size = 0;
+ skb_shinfo(skb)->gso_segs = 0;
+ }
+
} else {
if (skb->ip_summed != CHECKSUM_PARTIAL)
skb->ip_summed = CHECKSUM_NONE;
@@ -179,7 +293,7 @@ static int handle_offloads(struct sk_buff *skb)
int vxlan_xmit_skb(struct vxlan_sock *vs,
struct rtable *rt, struct sk_buff *skb,
__be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df,
- __be16 src_port, __be16 dst_port, __be32 vni)
+ __be16 src_port, __be16 dst_port, __u32 vni)
{
struct vxlanhdr *vxh;
struct udphdr *uh;
@@ -207,8 +321,9 @@ int vxlan_xmit_skb(struct vxlan_sock *vs,
skb_reset_inner_headers(skb);
vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
- vxh->vx_flags = htonl(VXLAN_FLAGS);
- vxh->vx_vni = vni;
+ memset(vxh, 0, sizeof(*vxh));
+ vxh->vx_flags = VXLAN_FLAGS;
+ vxh_set_vni(vxh, vni);
__skb_push(skb, sizeof(*uh));
skb_reset_transport_header(skb);
@@ -222,7 +337,7 @@ int vxlan_xmit_skb(struct vxlan_sock *vs,
vxlan_set_owner(vs->sock->sk, skb);
- err = handle_offloads(skb);
+ err = handle_offloads(skb, vxh);
if (err)
return err;
diff --git a/datapath/vport-vxlan.c b/datapath/vport-vxlan.c
index cc9477d..ef0cd06 100644
--- a/datapath/vport-vxlan.c
+++ b/datapath/vport-vxlan.c
@@ -58,16 +58,17 @@ static inline struct vxlan_port *vxlan_vport(const
struct vport *vport)
}
/* Called with rcu_read_lock and BH disabled. */
-static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
__be32 vx_vni)
+static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __u32 vx_vni)
{
struct ovs_key_ipv4_tunnel tun_key;
struct vport *vport = vs->data;
struct iphdr *iph;
__be64 key;
-
+
+
/* Save outer tunnel values */
iph = ip_hdr(skb);
- key = cpu_to_be64(ntohl(vx_vni) >> 8);
+ key = cpu_to_be64(vx_vni);
ovs_flow_tun_key_init(&tun_key, iph, key, TUNNEL_KEY);
ovs_vport_receive(vport, skb, &tun_key);
@@ -181,7 +182,7 @@ static int vxlan_tnl_send(struct vport *vport,
struct sk_buff *skb)
OVS_CB(skb)->tun_key->ipv4_tos,
OVS_CB(skb)->tun_key->ipv4_ttl, df,
src_port, dst_port,
- htonl(be64_to_cpu(OVS_CB(skb)->tun_key->tun_id) << 8));
+ (__u32)be64_to_cpu(OVS_CB(skb)->tun_key->tun_id));
if (err < 0)
ip_rt_put(rt);
error:
--
1.9.0
More information about the dev
mailing list