[ovs-dev] [PATCH] [RFC] datapath: Implement vxlan-soe

Han Zhou zhouhan at gmail.com
Wed May 14 01:59:05 UTC 2014


Reposting this since the original was blocked by some spam filters.
@Jesse, could you help review it from a technical point of view? If this
looks good, I can then make it optional and configurable. Thanks a lot!

-Han

On Mon, May 12, 2014 at 4:04 PM, Han Zhou <zhouhan at gmail.com> wrote:
> This patch implements vxlan-soe:
>     http://tools.ietf.org/html/draft-zhou-li-vxlan-soe-01
>
> Tested VXLAN throughput between two hypervisors, and the performance
> gain of vxlan-soe is significant.
> netperf TCP_STREAM test result:
>
> Before the change: 2.62 Gbits/sec
> After the change: 6.68 Gbits/sec
> Throughput improves by a factor of ~2.5 (6.68 / 2.62).
>
> Hope this feature is useful for those who rely on VXLAN.
>
>
> Signed-off-by: Han Zhou <zhouhan at gmail.com>
> ---
>  datapath/linux/compat/include/net/vxlan.h |  28 +++++-
>  datapath/linux/compat/vxlan.c             | 153 ++++++++++++++++++++++++++----
>  datapath/vport-vxlan.c                    |   9 +-
>  3 files changed, 165 insertions(+), 25 deletions(-)
>
> diff --git a/datapath/linux/compat/include/net/vxlan.h
> b/datapath/linux/compat/include/net/vxlan.h
> index 414a497..7ba5291 100644
> --- a/datapath/linux/compat/include/net/vxlan.h
> +++ b/datapath/linux/compat/include/net/vxlan.h
> @@ -10,8 +10,32 @@
>  #include_next <net/vxlan.h>
>  #else
>
> +#define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr))
> +
> +#define VXLAN_FLAG_GSO 0x80 /* VXLAN-SOE */
> +#define VXLAN_FLAGS 0x08 /* struct vxlanhdr.vx_flags required value. */
> +
> +/* VXLAN protocol header */
> +struct vxlanhdr {
> +    __u8     vx_flags;
> +    __u8     vx_mss_hi;
> +    __be16     vx_protocol; /* VXLAN-GPE */
> +    __u8    vx_vni[3];
> +    __u8    vx_mss_lo;
> +};
> +
> +static inline void vxh_set_vni(struct vxlanhdr *vxh, __u32 vni)
> +{
> +    *((__u32*)&vxh->vx_vni) = htonl(vxh->vx_mss_lo | (vni << 8));
> +}
> +
> +static inline __u32 vxh_get_vni(struct vxlanhdr *vxh)
> +{
> +    return ((ntohl(*(__u32*)&vxh->vx_vni) & 0xffffff00) >> 8);
> +}
> +
>  struct vxlan_sock;
> -typedef void (vxlan_rcv_t)(struct vxlan_sock *vs, struct sk_buff
> *skb, __be32 key);
> +typedef void (vxlan_rcv_t)(struct vxlan_sock *vs, struct sk_buff
> *skb, __u32 key);
>
>  /* per UDP socket information */
>  struct vxlan_sock {
> @@ -32,7 +56,7 @@ void vxlan_sock_release(struct vxlan_sock *vs);
>  int vxlan_xmit_skb(struct vxlan_sock *vs,
>             struct rtable *rt, struct sk_buff *skb,
>             __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df,
> -           __be16 src_port, __be16 dst_port, __be32 vni);
> +           __be16 src_port, __be16 dst_port, __u32 vni);
>
>  __be16 vxlan_src_port(__u16 port_min, __u16 port_max, struct sk_buff *skb);
>
> diff --git a/datapath/linux/compat/vxlan.c b/datapath/linux/compat/vxlan.c
> index b8b8fa7..80fa233 100644
> --- a/datapath/linux/compat/vxlan.c
> +++ b/datapath/linux/compat/vxlan.c
> @@ -59,15 +59,102 @@
>  #include "gso.h"
>  #include "vlan.h"
>
> -#define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr))
>
> -#define VXLAN_FLAGS 0x08000000    /* struct vxlanhdr.vx_flags
> required value. */
> +static inline int vxlan_parse_inner_hdr(struct sk_buff *skb, u16
> *l3_type, u8 *l4_type, u16 *l4_offset)
> +{
> +    struct ethhdr *ethh = (struct ethhdr*)skb->data;
> +    unsigned char *p = (unsigned char *)(ethh + 1);
> +    u16 l2_hdr_size, l3_hdr_size;
> +    u16 ethertype;
> +    u8 l4_proto;
> +    struct iphdr *iph;
> +    struct ipv6hdr *ipv6;
> +
> +
> +    ethertype = ntohs(ethh->h_proto);
> +    if (ethertype == ETH_P_8021Q) {
> +        ethertype = ntohs(*(__be16*)(p + 2));
> +        p += 4;
> +    }
> +    l2_hdr_size = p - skb->data;
> +
> +    if (ethertype == ETH_P_IP) {
> +        iph = (struct iphdr *)p;
> +        l3_hdr_size = iph->ihl << 2;
> +        l4_proto = iph->protocol;
> +
> +    } else if (ethertype == ETH_P_IPV6) {
> +        ipv6 = (struct ipv6hdr *)p;
> +        l3_hdr_size = sizeof(struct ipv6hdr);
> +        l4_proto = ipv6->nexthdr;
> +
> +    } else {
> +        return -1;
> +    }
> +    if (l4_proto != IPPROTO_TCP && l4_proto != IPPROTO_UDP) {
> +        return -1;
> +    }
> +
> +    *l3_type = ethertype;
> +    *l4_type = l4_proto;
> +    *l4_offset = l2_hdr_size + l3_hdr_size;
> +    return 0;
> +
> +}
> +
> +static inline int vxlan_handle_soe(struct sk_buff *skb, struct vxlanhdr *vxh)
> +{
> +    u16 ethertype;
> +    u8 ipproto;
> +    u16 csum_offset, l4_offset;
> +    unsigned short gso_type;
> +
> +    if (unlikely(skb_unclone(skb, GFP_ATOMIC))) {
> +        return -1;
> +    }
> +
> +    skb_shinfo(skb)->gso_size = (((__u16)vxh->vx_mss_hi) << 8) +
> +        vxh->vx_mss_lo;
> +    skb_shinfo(skb)->gso_segs = 0;
>
> -/* VXLAN protocol header */
> -struct vxlanhdr {
> -    __be32 vx_flags;
> -    __be32 vx_vni;
> -};
> +
> +    if (unlikely(skb_linearize(skb)))
> +        return -1;
> +
> +    if (unlikely(vxlan_parse_inner_hdr(skb, &ethertype, &ipproto,
> &l4_offset))) {
> +        return -1;
> +    }
> +    if (ethertype == ETH_P_IP) {
> +        if (ipproto == IPPROTO_TCP) {
> +            gso_type = SKB_GSO_TCPV4;
> +            csum_offset = offsetof(struct tcphdr, check);
> +        } else if (ipproto == IPPROTO_UDP) {
> +            gso_type = SKB_GSO_UDP;
> +            csum_offset = offsetof(struct udphdr, check);
> +        } else {
> +            BUG();
> +        }
> +    } else if (ethertype == ETH_P_IPV6) {
> +        if (ipproto == IPPROTO_TCP) {
> +            gso_type = SKB_GSO_TCPV6;
> +            csum_offset = offsetof(struct tcphdr, check);
> +        } else if (ipproto == IPPROTO_UDP) {
> +            gso_type = SKB_GSO_UDP;
> +            csum_offset = offsetof(struct udphdr, check);
> +        } else {
> +            BUG();
> +        }
> +    } else {
> +        BUG();
> +    }
> +
> +    skb_shinfo(skb)->gso_type = gso_type;
> +    skb->ip_summed = CHECKSUM_PARTIAL;
> +    skb->csum_start = skb_headroom(skb) + l4_offset;
> +    skb->csum_offset = offsetof(struct tcphdr, check);
> +
> +    return 0;
> +}
>
>  /* Callback from net/ipv4/udp.c to receive packets */
>  static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
> @@ -81,13 +168,13 @@ static int vxlan_udp_encap_recv(struct sock *sk,
> struct sk_buff *skb)
>
>      /* Return packets with reserved bits set */
>      vxh = (struct vxlanhdr *)(udp_hdr(skb) + 1);
> -    if (vxh->vx_flags != htonl(VXLAN_FLAGS) ||
> +/*    if (vxh->vx_flags != htonl(VXLAN_FLAGS) ||
>          (vxh->vx_vni & htonl(0xff))) {
>          pr_warn("invalid vxlan flags=%#x vni=%#x\n",
>              ntohl(vxh->vx_flags), ntohl(vxh->vx_vni));
>          goto error;
>      }
> -
> +*/
>      if (iptunnel_pull_header(skb, VXLAN_HLEN, htons(ETH_P_TEB)))
>          goto drop;
>
> @@ -95,7 +182,13 @@ static int vxlan_udp_encap_recv(struct sock *sk,
> struct sk_buff *skb)
>      if (!vs)
>          goto drop;
>
> -    vs->rcv(vs, skb, vxh->vx_vni);
> +    if (vxh->vx_flags & VXLAN_FLAG_GSO) {
> +        if (unlikely(vxlan_handle_soe(skb, vxh)))
> +            goto drop;
> +    }
> +
> +    vs->rcv(vs, skb, vxh_get_vni(vxh));
> +
>      return 0;
>
>  drop:
> @@ -153,10 +246,10 @@ static void vxlan_gso(struct sk_buff *skb)
>          struct iphdr *iph = ip_hdr(skb);
>
>          uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
> -                           skb->len - udp_offset,
> -                           IPPROTO_UDP, 0);
> +                    skb->len - udp_offset,
> +                    IPPROTO_UDP, 0);
>          uh->check = csum_fold(skb_checksum(skb, udp_offset,
> -                      skb->len - udp_offset, 0));
> +                    skb->len - udp_offset, 0));
>
>          if (uh->check == 0)
>              uh->check = CSUM_MANGLED_0;
> @@ -165,10 +258,31 @@ static void vxlan_gso(struct sk_buff *skb)
>      skb->ip_summed = CHECKSUM_NONE;
>  }
>
> -static int handle_offloads(struct sk_buff *skb)
> +
> +static int handle_offloads(struct sk_buff *skb, struct vxlanhdr* vxh)
>  {
> +    int err;
>      if (skb_is_gso(skb)) {
> -        OVS_GSO_CB(skb)->fix_segment = vxlan_gso;
> +        /* offload with vxlan-soe if encapsulated packet
> +           fits in MAX IP packet size, otherwise fallback to
> +           local GSO */
> +        if (skb->len + sizeof(struct iphdr) > 65535) {
> +            OVS_GSO_CB(skb)->fix_segment = vxlan_gso;
> +        } else {
> +
> +            vxh->vx_flags |= VXLAN_FLAG_GSO;
> +            vxh->vx_mss_hi = (__u8)(skb_shinfo(skb)->gso_size >> 8);
> +            vxh->vx_mss_lo = (__u8)skb_shinfo(skb)->gso_size;
> +
> +            err = skb_unclone(skb, GFP_ATOMIC);
> +            if (unlikely(err))
> +                return err;
> +
> +            skb_shinfo(skb)->gso_type = 0;
> +            skb_shinfo(skb)->gso_size = 0;
> +            skb_shinfo(skb)->gso_segs = 0;
> +        }
> +
>      } else {
>          if (skb->ip_summed != CHECKSUM_PARTIAL)
>              skb->ip_summed = CHECKSUM_NONE;
> @@ -179,7 +293,7 @@ static int handle_offloads(struct sk_buff *skb)
>  int vxlan_xmit_skb(struct vxlan_sock *vs,
>             struct rtable *rt, struct sk_buff *skb,
>             __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df,
> -           __be16 src_port, __be16 dst_port, __be32 vni)
> +           __be16 src_port, __be16 dst_port, __u32 vni)
>  {
>      struct vxlanhdr *vxh;
>      struct udphdr *uh;
> @@ -207,8 +321,9 @@ int vxlan_xmit_skb(struct vxlan_sock *vs,
>      skb_reset_inner_headers(skb);
>
>      vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
> -    vxh->vx_flags = htonl(VXLAN_FLAGS);
> -    vxh->vx_vni = vni;
> +    memset(vxh, 0, sizeof(*vxh));
> +    vxh->vx_flags = VXLAN_FLAGS;
> +    vxh_set_vni(vxh, vni);
>
>      __skb_push(skb, sizeof(*uh));
>      skb_reset_transport_header(skb);
> @@ -222,7 +337,7 @@ int vxlan_xmit_skb(struct vxlan_sock *vs,
>
>      vxlan_set_owner(vs->sock->sk, skb);
>
> -    err = handle_offloads(skb);
> +    err = handle_offloads(skb, vxh);
>      if (err)
>          return err;
>
> diff --git a/datapath/vport-vxlan.c b/datapath/vport-vxlan.c
> index cc9477d..ef0cd06 100644
> --- a/datapath/vport-vxlan.c
> +++ b/datapath/vport-vxlan.c
> @@ -58,16 +58,17 @@ static inline struct vxlan_port *vxlan_vport(const
> struct vport *vport)
>  }
>
>  /* Called with rcu_read_lock and BH disabled. */
> -static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
> __be32 vx_vni)
> +static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __u32 vx_vni)
>  {
>      struct ovs_key_ipv4_tunnel tun_key;
>      struct vport *vport = vs->data;
>      struct iphdr *iph;
>      __be64 key;
> -
> +
> +
>      /* Save outer tunnel values */
>      iph = ip_hdr(skb);
> -    key = cpu_to_be64(ntohl(vx_vni) >> 8);
> +    key = cpu_to_be64(vx_vni);
>      ovs_flow_tun_key_init(&tun_key, iph, key, TUNNEL_KEY);
>
>      ovs_vport_receive(vport, skb, &tun_key);
> @@ -181,7 +182,7 @@ static int vxlan_tnl_send(struct vport *vport,
> struct sk_buff *skb)
>                   OVS_CB(skb)->tun_key->ipv4_tos,
>                   OVS_CB(skb)->tun_key->ipv4_ttl, df,
>                   src_port, dst_port,
> -                 htonl(be64_to_cpu(OVS_CB(skb)->tun_key->tun_id) << 8));
> +                 (__u32)be64_to_cpu(OVS_CB(skb)->tun_key->tun_id));
>      if (err < 0)
>          ip_rt_put(rt);
>  error:
> --
> 1.9.0



More information about the dev mailing list