[ovs-dev] [PATCH] [RFC] datapath: Implement vxlan-soe
Han Zhou
zhouhan at gmail.com
Wed May 14 01:59:05 UTC 2014
Reposting this, since the original was blocked by some spam filters.
@Jesse, could you help review it from a technical point of view? If this
looks good I can then make it optional and configurable. Thanks a lot!
-Han
On Mon, May 12, 2014 at 4:04 PM, Han Zhou <zhouhan at gmail.com> wrote:
> This patch implements vxlan-soe:
> http://tools.ietf.org/html/draft-zhou-li-vxlan-soe-01
>
> Tested VXLAN throughput between two hypervisors, and the performance
> gain of vxlan-soe is significant.
> netperf TCP_STREAM test result:
>
> Before the change: 2.62 Gbits/sec
> After the change: 6.68 Gbits/sec
> Speedup is ~250%.
>
> Hope this feature is useful for those who rely on VXLAN.
>
>
> Signed-off-by: Han Zhou <zhouhan at gmail.com>
> ---
> datapath/linux/compat/include/net/vxlan.h | 28 +++++-
> datapath/linux/compat/vxlan.c | 153 ++++++++++++++++++++++++++----
> datapath/vport-vxlan.c | 9 +-
> 3 files changed, 165 insertions(+), 25 deletions(-)
>
> diff --git a/datapath/linux/compat/include/net/vxlan.h
> b/datapath/linux/compat/include/net/vxlan.h
> index 414a497..7ba5291 100644
> --- a/datapath/linux/compat/include/net/vxlan.h
> +++ b/datapath/linux/compat/include/net/vxlan.h
> @@ -10,8 +10,32 @@
> #include_next <net/vxlan.h>
> #else
>
> +#define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr))
> +
> +#define VXLAN_FLAG_GSO 0x80 /* VXLAN-SOE */
> +#define VXLAN_FLAGS 0x08 /* struct vxlanhdr.vx_flags required value. */
> +
> +/* VXLAN protocol header */
> +struct vxlanhdr {
> + __u8 vx_flags;
> + __u8 vx_mss_hi;
> + __be16 vx_protocol; /* VXLAN-GPE */
> + __u8 vx_vni[3];
> + __u8 vx_mss_lo;
> +};
> +
> +static inline void vxh_set_vni(struct vxlanhdr *vxh, __u32 vni)
> +{
> + *((__u32*)&vxh->vx_vni) = htonl(vxh->vx_mss_lo | (vni << 8));
> +}
> +
> +static inline __u32 vxh_get_vni(struct vxlanhdr *vxh)
> +{
> + return ((ntohl(*(__u32*)&vxh->vx_vni) & 0xffffff00) >> 8);
> +}
> +
> struct vxlan_sock;
> -typedef void (vxlan_rcv_t)(struct vxlan_sock *vs, struct sk_buff
> *skb, __be32 key);
> +typedef void (vxlan_rcv_t)(struct vxlan_sock *vs, struct sk_buff
> *skb, __u32 key);
>
> /* per UDP socket information */
> struct vxlan_sock {
> @@ -32,7 +56,7 @@ void vxlan_sock_release(struct vxlan_sock *vs);
> int vxlan_xmit_skb(struct vxlan_sock *vs,
> struct rtable *rt, struct sk_buff *skb,
> __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df,
> - __be16 src_port, __be16 dst_port, __be32 vni);
> + __be16 src_port, __be16 dst_port, __u32 vni);
>
> __be16 vxlan_src_port(__u16 port_min, __u16 port_max, struct sk_buff *skb);
>
> diff --git a/datapath/linux/compat/vxlan.c b/datapath/linux/compat/vxlan.c
> index b8b8fa7..80fa233 100644
> --- a/datapath/linux/compat/vxlan.c
> +++ b/datapath/linux/compat/vxlan.c
> @@ -59,15 +59,102 @@
> #include "gso.h"
> #include "vlan.h"
>
> -#define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr))
>
> -#define VXLAN_FLAGS 0x08000000 /* struct vxlanhdr.vx_flags
> required value. */
> +static inline int vxlan_parse_inner_hdr(struct sk_buff *skb, u16
> *l3_type, u8 *l4_type, u16 *l4_offset)
> +{
> + struct ethhdr *ethh = (struct ethhdr*)skb->data;
> + unsigned char *p = (unsigned char *)(ethh + 1);
> + u16 l2_hdr_size, l3_hdr_size;
> + u16 ethertype;
> + u8 l4_proto;
> + struct iphdr *iph;
> + struct ipv6hdr *ipv6;
> +
> +
> + ethertype = ntohs(ethh->h_proto);
> + if (ethertype == ETH_P_8021Q) {
> + ethertype = ntohs(*(__be16*)(p + 2));
> + p += 4;
> + }
> + l2_hdr_size = p - skb->data;
> +
> + if (ethertype == ETH_P_IP) {
> + iph = (struct iphdr *)p;
> + l3_hdr_size = iph->ihl << 2;
> + l4_proto = iph->protocol;
> +
> + } else if (ethertype == ETH_P_IPV6) {
> + ipv6 = (struct ipv6hdr *)p;
> + l3_hdr_size = sizeof(struct ipv6hdr);
> + l4_proto = ipv6->nexthdr;
> +
> + } else {
> + return -1;
> + }
> + if (l4_proto != IPPROTO_TCP && l4_proto != IPPROTO_UDP) {
> + return -1;
> + }
> +
> + *l3_type = ethertype;
> + *l4_type = l4_proto;
> + *l4_offset = l2_hdr_size + l3_hdr_size;
> + return 0;
> +
> +}
> +
> +static inline int vxlan_handle_soe(struct sk_buff *skb, struct vxlanhdr *vxh)
> +{
> + u16 ethertype;
> + u8 ipproto;
> + u16 csum_offset, l4_offset;
> + unsigned short gso_type;
> +
> + if (unlikely(skb_unclone(skb, GFP_ATOMIC))) {
> + return -1;
> + }
> +
> + skb_shinfo(skb)->gso_size = (((__u16)vxh->vx_mss_hi) << 8) +
> + vxh->vx_mss_lo;
> + skb_shinfo(skb)->gso_segs = 0;
>
> -/* VXLAN protocol header */
> -struct vxlanhdr {
> - __be32 vx_flags;
> - __be32 vx_vni;
> -};
> +
> + if (unlikely(skb_linearize(skb)))
> + return -1;
> +
> + if (unlikely(vxlan_parse_inner_hdr(skb, &ethertype, &ipproto,
> &l4_offset))) {
> + return -1;
> + }
> + if (ethertype == ETH_P_IP) {
> + if (ipproto == IPPROTO_TCP) {
> + gso_type = SKB_GSO_TCPV4;
> + csum_offset = offsetof(struct tcphdr, check);
> + } else if (ipproto == IPPROTO_UDP) {
> + gso_type = SKB_GSO_UDP;
> + csum_offset = offsetof(struct udphdr, check);
> + } else {
> + BUG();
> + }
> + } else if (ethertype == ETH_P_IPV6) {
> + if (ipproto == IPPROTO_TCP) {
> + gso_type = SKB_GSO_TCPV6;
> + csum_offset = offsetof(struct tcphdr, check);
> + } else if (ipproto == IPPROTO_UDP) {
> + gso_type = SKB_GSO_UDP;
> + csum_offset = offsetof(struct udphdr, check);
> + } else {
> + BUG();
> + }
> + } else {
> + BUG();
> + }
> +
> + skb_shinfo(skb)->gso_type = gso_type;
> + skb->ip_summed = CHECKSUM_PARTIAL;
> + skb->csum_start = skb_headroom(skb) + l4_offset;
> + skb->csum_offset = offsetof(struct tcphdr, check);
> +
> + return 0;
> +}
>
> /* Callback from net/ipv4/udp.c to receive packets */
> static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
> @@ -81,13 +168,13 @@ static int vxlan_udp_encap_recv(struct sock *sk,
> struct sk_buff *skb)
>
> /* Return packets with reserved bits set */
> vxh = (struct vxlanhdr *)(udp_hdr(skb) + 1);
> - if (vxh->vx_flags != htonl(VXLAN_FLAGS) ||
> +/* if (vxh->vx_flags != htonl(VXLAN_FLAGS) ||
> (vxh->vx_vni & htonl(0xff))) {
> pr_warn("invalid vxlan flags=%#x vni=%#x\n",
> ntohl(vxh->vx_flags), ntohl(vxh->vx_vni));
> goto error;
> }
> -
> +*/
> if (iptunnel_pull_header(skb, VXLAN_HLEN, htons(ETH_P_TEB)))
> goto drop;
>
> @@ -95,7 +182,13 @@ static int vxlan_udp_encap_recv(struct sock *sk,
> struct sk_buff *skb)
> if (!vs)
> goto drop;
>
> - vs->rcv(vs, skb, vxh->vx_vni);
> + if (vxh->vx_flags & VXLAN_FLAG_GSO) {
> + if (unlikely(vxlan_handle_soe(skb, vxh)))
> + goto drop;
> + }
> +
> + vs->rcv(vs, skb, vxh_get_vni(vxh));
> +
> return 0;
>
> drop:
> @@ -153,10 +246,10 @@ static void vxlan_gso(struct sk_buff *skb)
> struct iphdr *iph = ip_hdr(skb);
>
> uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
> - skb->len - udp_offset,
> - IPPROTO_UDP, 0);
> + skb->len - udp_offset,
> + IPPROTO_UDP, 0);
> uh->check = csum_fold(skb_checksum(skb, udp_offset,
> - skb->len - udp_offset, 0));
> + skb->len - udp_offset, 0));
>
> if (uh->check == 0)
> uh->check = CSUM_MANGLED_0;
> @@ -165,10 +258,31 @@ static void vxlan_gso(struct sk_buff *skb)
> skb->ip_summed = CHECKSUM_NONE;
> }
>
> -static int handle_offloads(struct sk_buff *skb)
> +
> +static int handle_offloads(struct sk_buff *skb, struct vxlanhdr* vxh)
> {
> + int err;
> if (skb_is_gso(skb)) {
> - OVS_GSO_CB(skb)->fix_segment = vxlan_gso;
> + /* offload with vxlan-soe if encapsulated packet
> + fits in MAX IP packet size, otherwise fallback to
> + local GSO */
> + if (skb->len + sizeof(struct iphdr) > 65535) {
> + OVS_GSO_CB(skb)->fix_segment = vxlan_gso;
> + } else {
> +
> + vxh->vx_flags |= VXLAN_FLAG_GSO;
> + vxh->vx_mss_hi = (__u8)(skb_shinfo(skb)->gso_size >> 8);
> + vxh->vx_mss_lo = (__u8)skb_shinfo(skb)->gso_size;
> +
> + err = skb_unclone(skb, GFP_ATOMIC);
> + if (unlikely(err))
> + return err;
> +
> + skb_shinfo(skb)->gso_type = 0;
> + skb_shinfo(skb)->gso_size = 0;
> + skb_shinfo(skb)->gso_segs = 0;
> + }
> +
> } else {
> if (skb->ip_summed != CHECKSUM_PARTIAL)
> skb->ip_summed = CHECKSUM_NONE;
> @@ -179,7 +293,7 @@ static int handle_offloads(struct sk_buff *skb)
> int vxlan_xmit_skb(struct vxlan_sock *vs,
> struct rtable *rt, struct sk_buff *skb,
> __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df,
> - __be16 src_port, __be16 dst_port, __be32 vni)
> + __be16 src_port, __be16 dst_port, __u32 vni)
> {
> struct vxlanhdr *vxh;
> struct udphdr *uh;
> @@ -207,8 +321,9 @@ int vxlan_xmit_skb(struct vxlan_sock *vs,
> skb_reset_inner_headers(skb);
>
> vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
> - vxh->vx_flags = htonl(VXLAN_FLAGS);
> - vxh->vx_vni = vni;
> + memset(vxh, 0, sizeof(*vxh));
> + vxh->vx_flags = VXLAN_FLAGS;
> + vxh_set_vni(vxh, vni);
>
> __skb_push(skb, sizeof(*uh));
> skb_reset_transport_header(skb);
> @@ -222,7 +337,7 @@ int vxlan_xmit_skb(struct vxlan_sock *vs,
>
> vxlan_set_owner(vs->sock->sk, skb);
>
> - err = handle_offloads(skb);
> + err = handle_offloads(skb, vxh);
> if (err)
> return err;
>
> diff --git a/datapath/vport-vxlan.c b/datapath/vport-vxlan.c
> index cc9477d..ef0cd06 100644
> --- a/datapath/vport-vxlan.c
> +++ b/datapath/vport-vxlan.c
> @@ -58,16 +58,17 @@ static inline struct vxlan_port *vxlan_vport(const
> struct vport *vport)
> }
>
> /* Called with rcu_read_lock and BH disabled. */
> -static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
> __be32 vx_vni)
> +static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __u32 vx_vni)
> {
> struct ovs_key_ipv4_tunnel tun_key;
> struct vport *vport = vs->data;
> struct iphdr *iph;
> __be64 key;
> -
> +
> +
> /* Save outer tunnel values */
> iph = ip_hdr(skb);
> - key = cpu_to_be64(ntohl(vx_vni) >> 8);
> + key = cpu_to_be64(vx_vni);
> ovs_flow_tun_key_init(&tun_key, iph, key, TUNNEL_KEY);
>
> ovs_vport_receive(vport, skb, &tun_key);
> @@ -181,7 +182,7 @@ static int vxlan_tnl_send(struct vport *vport,
> struct sk_buff *skb)
> OVS_CB(skb)->tun_key->ipv4_tos,
> OVS_CB(skb)->tun_key->ipv4_ttl, df,
> src_port, dst_port,
> - htonl(be64_to_cpu(OVS_CB(skb)->tun_key->tun_id) << 8));
> + (__u32)be64_to_cpu(OVS_CB(skb)->tun_key->tun_id));
> if (err < 0)
> ip_rt_put(rt);
> error:
> --
> 1.9.0
More information about the dev
mailing list