[ovs-dev] [PATCH v4] Enable VXLAN TSO for DPDK datapath

Flavio Leitner fbl at sysclose.org
Thu Feb 4 17:38:14 UTC 2021



Hi Yi,

Again, sorry for the delay in reviewing the patch.

The patch is using the outer length fields from DPDK
which seems to be a problem in OVS because most of the
packet transformation functions are not aware of that.

Therefore, after the packet gets encapsulated, most of the
existing functions will stop working.

I think it makes more sense to keep the inner header lengths,
which are unlikely to change after encapsulation, and use
struct dp_packet fields to hold the outer headers. Doing so,
we can re-use all the existing functions.

Another thing is that this patch pushes the user to
set a global tso_segsz, which will not work well when
bridges use different MTU sizes, encapsulations, etc.

I think the tso_segsz comes in some form or another from
each port, and after the encapsulation we need to adjust that
value.

I am working on a PoC with those changes in mind to see
what it looks like, so if you wait a few days, I can follow
up with results.

More comments inline.

On Thu, Nov 26, 2020 at 03:26:17PM +0800, yang_y_yi at 163.com wrote:
> From: Yi Yang <yangyi01 at inspur.com>
> 
> Many NICs can support VXLAN TSO which can help
> improve across-compute-node VM-to-VM performance
> in case that MTU is set to 1500.
> 
> This patch allows dpdkvhostuserclient interface
> and veth/tap interface to leverage NICs' offload
> capability to maximize across-compute-node TCP
> performance, with it applied, OVS DPDK can reach
> line speed for across-compute-node VM-to-VM TCP
> performance.
> 
> Signed-off-by: Yi Yang <yangyi01 at inspur.com>
> ---
> Changelog:
> 
> v3 -> v4:
>   - Split it from v3 as a separate patch.
>   - Add IPv6 support.
>   - Remove GRO and GSO code for simplicity.
>   - Remove dependency on multi-segmented mbuf, VXLAN
>     TSO needn't it if without GRO and GSO.
> 
> ---
>  lib/automake.mk           |   2 +
>  lib/dp-packet.c           |   9 ++
>  lib/dp-packet.h           | 238 +++++++++++++++++++++++++++++++++++++-
>  lib/netdev-dpdk.c         | 284 ++++++++++++++++++++++++++++++++++++++++++++--
>  lib/netdev-linux.c        | 154 +++++++++++++++++++++++--
>  lib/netdev-provider.h     |   1 +
>  lib/netdev.c              | 149 +++++++++++++++++++++++-
>  lib/userspace-tso-segsz.c |  55 +++++++++
>  lib/userspace-tso-segsz.h |  23 ++++
>  vswitchd/bridge.c         |   2 +
>  10 files changed, 892 insertions(+), 25 deletions(-)
>  create mode 100644 lib/userspace-tso-segsz.c
>  create mode 100644 lib/userspace-tso-segsz.h
> 
> diff --git a/lib/automake.mk b/lib/automake.mk
> index 8eeb6c3..7e0b9fc 100644
> --- a/lib/automake.mk
> +++ b/lib/automake.mk
> @@ -345,6 +345,8 @@ lib_libopenvswitch_la_SOURCES = \
>  	lib/unixctl.h \
>  	lib/userspace-tso.c \
>  	lib/userspace-tso.h \
> +	lib/userspace-tso-segsz.c \
> +	lib/userspace-tso-segsz.h \
>  	lib/util.c \
>  	lib/util.h \
>  	lib/uuid.c \
> diff --git a/lib/dp-packet.c b/lib/dp-packet.c
> index 72f6d09..ee0ccee 100644
> --- a/lib/dp-packet.c
> +++ b/lib/dp-packet.c
> @@ -502,7 +502,16 @@ dp_packet_resize_l2_5(struct dp_packet *b, int increment)
>  void *
>  dp_packet_resize_l2(struct dp_packet *b, int increment)
>  {
> +    int outer_l2_len = dp_packet_hwol_get_outer_l2_len(b);
> +
>      dp_packet_resize_l2_5(b, increment);
>      dp_packet_adjust_layer_offset(&b->l2_5_ofs, increment);
> +    if (outer_l2_len) {
> +        dp_packet_hwol_set_outer_l2_len(b, outer_l2_len + increment);
> +    } else {
> +        int l2_len = dp_packet_hwol_get_l2_len(b);
> +
> +        dp_packet_hwol_set_l2_len(b, l2_len + increment);
> +    }
>      return dp_packet_data(b);
>  }
> diff --git a/lib/dp-packet.h b/lib/dp-packet.h
> index 0430cca..f1c07e0 100644
> --- a/lib/dp-packet.h
> +++ b/lib/dp-packet.h
> @@ -81,6 +81,14 @@ enum dp_packet_offload_mask {
>      DEF_OL_FLAG(DP_PACKET_OL_TX_UDP_CKSUM, PKT_TX_UDP_CKSUM, 0x400),
>      /* Offload SCTP checksum. */
>      DEF_OL_FLAG(DP_PACKET_OL_TX_SCTP_CKSUM, PKT_TX_SCTP_CKSUM, 0x800),
> +    /* VXLAN TCP Segmentation Offload. */
> +    DEF_OL_FLAG(DP_PACKET_OL_TX_TUNNEL_VXLAN, PKT_TX_TUNNEL_VXLAN, 0x1000),
> +    /* UDP Segmentation Offload. */
> +    DEF_OL_FLAG(DP_PACKET_OL_TX_UDP_SEG, PKT_TX_UDP_SEG, 0x2000),
> +    /* Outer L3 Type IPV4 For Tunnel Offload. */
> +    DEF_OL_FLAG(DP_PACKET_OL_TX_OUTER_IPV4, PKT_TX_OUTER_IPV4, 0x4000),
> +    /* Outer L3 Type IPV6 For Tunnel Offload. */
> +    DEF_OL_FLAG(DP_PACKET_OL_TX_OUTER_IPV6, PKT_TX_OUTER_IPV6, 0x8000),
>      /* Adding new field requires adding to DP_PACKET_OL_SUPPORTED_MASK. */
>  };
>  
> @@ -95,7 +103,8 @@ enum dp_packet_offload_mask {
>                                       DP_PACKET_OL_TX_IPV6          | \
>                                       DP_PACKET_OL_TX_TCP_CKSUM     | \
>                                       DP_PACKET_OL_TX_UDP_CKSUM     | \
> -                                     DP_PACKET_OL_TX_SCTP_CKSUM)
> +                                     DP_PACKET_OL_TX_SCTP_CKSUM    | \
> +                                     DP_PACKET_OL_TX_UDP_SEG)

Isn't this missing DP_PACKET_OL_TX_OUTER_IPV4 and
DP_PACKET_OL_TX_OUTER_IPV6?


>  #define DP_PACKET_OL_TX_L4_MASK (DP_PACKET_OL_TX_TCP_CKSUM | \
>                                   DP_PACKET_OL_TX_UDP_CKSUM | \
> @@ -954,6 +963,13 @@ dp_packet_hwol_is_tso(const struct dp_packet *b)
>      return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_TCP_SEG);
>  }
>  
> +/* Returns 'true' if packet 'b' is marked for UDP fragmentation offloading. */
> +static inline bool
> +dp_packet_hwol_is_ufo(const struct dp_packet *b)
> +{
> +    return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_UDP_SEG);
> +}
> +
>  /* Returns 'true' if packet 'b' is marked for IPv4 checksum offloading. */
>  static inline bool
>  dp_packet_hwol_is_ipv4(const struct dp_packet *b)
> @@ -992,6 +1008,13 @@ dp_packet_hwol_set_tx_ipv4(struct dp_packet *b)
>      *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_IPV4;
>  }
>  
> +/* Reset packet 'b' for IPv4 checksum offloading. */
> +static inline void
> +dp_packet_hwol_reset_tx_ipv4(struct dp_packet *b)
> +{
> +    *dp_packet_ol_flags_ptr(b) &= ~DP_PACKET_OL_TX_IPV4;
> +}
> +
>  /* Mark packet 'b' for IPv6 checksum offloading. */
>  static inline void
>  dp_packet_hwol_set_tx_ipv6(struct dp_packet *b)
> @@ -999,6 +1022,27 @@ dp_packet_hwol_set_tx_ipv6(struct dp_packet *b)
>      *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_IPV6;
>  }
>  
> +/* Reset packet 'b' for IPv6 checksum offloading. */
> +static inline void
> +dp_packet_hwol_reset_tx_ipv6(struct dp_packet *b)
> +{
> +    *dp_packet_ol_flags_ptr(b) &= ~DP_PACKET_OL_TX_IPV6;
> +}
> +
> +/* Mark packet 'b' for Outer IPv4 checksum offloading. */
> +static inline void
> +dp_packet_hwol_set_tx_outer_ipv4(struct dp_packet *b)
> +{

Although I see you wanted to keep the API complete, this
is not used, so I think it's fine to remove this.

> +    *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_OUTER_IPV4;
> +}
> +
> +/* Mark packet 'b' for Outer IPv6 checksum offloading. */
> +static inline void
> +dp_packet_hwol_set_tx_outer_ipv6(struct dp_packet *b)

Same comment here.


> +{
> +    *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_OUTER_IPV6;
> +}
> +
>  /* Mark packet 'b' for TCP checksum offloading.  It implies that either
>   * the packet 'b' is marked for IPv4 or IPv6 checksum offloading. */
>  static inline void
> @@ -1007,6 +1051,14 @@ dp_packet_hwol_set_csum_tcp(struct dp_packet *b)
>      *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TCP_CKSUM;
>  }
>  
> +/* Reset TCP checksum offloading flag for packet 'b'.
> + */
> +static inline void
> +dp_packet_hwol_reset_csum_tcp(struct dp_packet *b)
> +{
> +    *dp_packet_ol_flags_ptr(b) &= ~DP_PACKET_OL_TX_TCP_CKSUM;
> +}
> +
>  /* Mark packet 'b' for UDP checksum offloading.  It implies that either
>   * the packet 'b' is marked for IPv4 or IPv6 checksum offloading. */
>  static inline void
> @@ -1015,6 +1067,15 @@ dp_packet_hwol_set_csum_udp(struct dp_packet *b)
>      *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_UDP_CKSUM;
>  }
>  
> +/* Reset UDP checksum offloading flag for packet 'b'.
> + */
> +static inline void
> +dp_packet_hwol_reset_csum_udp(struct dp_packet *b)
> +{
> +    *dp_packet_ol_flags_ptr(b) &= ~DP_PACKET_OL_TX_UDP_CKSUM;
> +}
> +
> +
>  /* Mark packet 'b' for SCTP checksum offloading.  It implies that either
>   * the packet 'b' is marked for IPv4 or IPv6 checksum offloading. */
>  static inline void
> @@ -1032,6 +1093,181 @@ dp_packet_hwol_set_tcp_seg(struct dp_packet *b)
>      *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TCP_SEG;
>  }
>  
> +/* Mark packet 'b' for UDP segmentation offloading.  It implies that
> + * either the packet 'b' is marked for IPv4 or IPv6 checksum offloading
> + * and also for UDP checksum offloading. */
> +static inline void
> +dp_packet_hwol_set_udp_seg(struct dp_packet *b)
> +{
> +    *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_UDP_SEG;
> +}
> +
> +#ifdef DPDK_NETDEV
> +/* Set l2_len for the packet 'b' */
> +static inline void
> +dp_packet_hwol_set_l2_len(struct dp_packet *b, int l2_len)
> +{
> +    b->mbuf.l2_len = l2_len;
> +}
> +
> +/* Set l3_len for the packet 'b' */
> +static inline void
> +dp_packet_hwol_set_l3_len(struct dp_packet *b, int l3_len)
> +{
> +    b->mbuf.l3_len = l3_len;
> +}
> +
> +/* Set l4_len for the packet 'b' */
> +static inline void
> +dp_packet_hwol_set_l4_len(struct dp_packet *b, int l4_len)
> +{
> +    b->mbuf.l4_len = l4_len;
> +}
> +
> +/* Set outer_l2_len for the packet 'b' */
> +static inline void
> +dp_packet_hwol_set_outer_l2_len(struct dp_packet *b, int outer_l2_len)
> +{
> +    b->mbuf.outer_l2_len = outer_l2_len;
> +}
> +
> +/* Set outer_l3_len for the packet 'b' */
> +static inline void
> +dp_packet_hwol_set_outer_l3_len(struct dp_packet *b, int outer_l3_len)
> +{
> +    b->mbuf.outer_l3_len = outer_l3_len;
> +}
> +
> +/* Get l2_len for the packet 'b' */
> +static inline int
> +dp_packet_hwol_get_l2_len(struct dp_packet *b)
> +{
> +    return b->mbuf.l2_len;
> +}
> +
> +/* Get l3_len for the packet 'b' */
> +static inline int
> +dp_packet_hwol_get_l3_len(struct dp_packet *b)
> +{
> +    return b->mbuf.l3_len;
> +}
> +
> +/* Get l4_len for the packet 'b' */
> +static inline int
> +dp_packet_hwol_get_l4_len(struct dp_packet *b)
> +{
> +    return b->mbuf.l4_len;
> +}
> +
> +/* Get outer_l2_len for the packet 'b' */
> +static inline int
> +dp_packet_hwol_get_outer_l2_len(struct dp_packet *b)
> +{
> +    return b->mbuf.outer_l2_len;
> +}
> +
> +
> +/* Get outer_l3_len for the packet 'b' */
> +static inline int
> +dp_packet_hwol_get_outer_l3_len(struct dp_packet *b)
> +{
> +    return b->mbuf.outer_l3_len;
> +}
> +
> +#else
> +/* Set l2_len for the packet 'b' */
> +static inline void
> +dp_packet_hwol_set_l2_len(struct dp_packet *b OVS_UNUSED,
> +                          int l2_len OVS_UNUSED)
> +{
> +}
> +
> +/* Set l3_len for the packet 'b' */
> +static inline void
> +dp_packet_hwol_set_l3_len(struct dp_packet *b OVS_UNUSED,
> +                          int l3_len OVS_UNUSED)
> +{
> +}
> +
> +/* Set l4_len for the packet 'b' */
> +static inline void
> +dp_packet_hwol_set_l4_len(struct dp_packet *b OVS_UNUSED,
> +                          int l4_len OVS_UNUSED)
> +{
> +}
> +
> +/* Set outer_l2_len for the packet 'b' */
> +static inline void
> +dp_packet_hwol_set_outer_l2_len(struct dp_packet *b OVS_UNUSED,
> +                                int outer_l2_len OVS_UNUSED)
> +{
> +}
> +
> +/* Set outer_l3_len for the packet 'b' */
> +static inline void
> +dp_packet_hwol_set_outer_l3_len(struct dp_packet *b OVS_UNUSED,
> +                                int outer_l3_len OVS_UNUSED)
> +{
> +}
> +
> +/* Get l2_len for the packet 'b' */
> +static inline int
> +dp_packet_hwol_get_l2_len(struct dp_packet *b)
> +{
> +    return ((char *)dp_packet_l3(b) - (char *)dp_packet_eth(b));
> +}
> +
> +/* Get l3_len for the packet 'b' */
> +static inline int
> +dp_packet_hwol_get_l3_len(struct dp_packet *b)
> +{
> +    return ((char *)dp_packet_l4(b) - (char *)dp_packet_l3(b));
> +}
> +
> +/* Get l4_len for the packet 'b' */
> +static inline int
> +dp_packet_hwol_get_l4_len(struct dp_packet *b OVS_UNUSED)
> +{
> +    return 0;
> +}
> +
> +
> +/* Get outer_l2_len for the packet 'b' */
> +static inline int
> +dp_packet_hwol_get_outer_l2_len(struct dp_packet *b)
> +{
> +    return ((char *)dp_packet_l3(b) - (char *)dp_packet_eth(b));
> +}
> +
> +/* Get outer_l3_len for the packet 'b' */
> +static inline int
> +dp_packet_hwol_get_outer_l3_len(struct dp_packet *b)
> +{
> +    return ((char *)dp_packet_l4(b) - (char *)dp_packet_l3(b));
> +}
> +
> +#endif /* DPDK_NETDEV */
> +
> +/* Mark packet 'b' for VXLAN TCP segmentation offloading. */
> +static inline void
> +dp_packet_hwol_set_vxlan_tcp_seg(struct dp_packet *b)
> +{
> +    *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TUNNEL_VXLAN;
> +    /* Set outer_l2_len and outer_l3_len */
> +    dp_packet_hwol_set_outer_l2_len(b, (char *) dp_packet_l3(b)
> +                                       - (char *) dp_packet_eth(b));
> +    dp_packet_hwol_set_outer_l3_len(b, (char *) dp_packet_l4(b)
> +                                       - (char *) dp_packet_l3(b));
> +}
> +
> +/* Check if it is a VXLAN packet */
> +static inline bool
> +dp_packet_hwol_is_vxlan_tcp_seg(struct dp_packet *b)
> +{
> +    return (*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_TUNNEL_VXLAN);
> +}
> +
> +
>  static inline bool
>  dp_packet_ip_checksum_valid(const struct dp_packet *p)
>  {
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index 75dffef..b2dd008 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -38,6 +38,7 @@
>  #include <rte_errno.h>
>  #include <rte_ethdev.h>
>  #include <rte_flow.h>
> +#include <rte_ip.h>
>  #include <rte_malloc.h>
>  #include <rte_mbuf.h>
>  #include <rte_meter.h>
> @@ -72,6 +73,7 @@
>  #include "unaligned.h"
>  #include "unixctl.h"
>  #include "userspace-tso.h"
> +#include "userspace-tso-segsz.h"
>  #include "util.h"
>  #include "uuid.h"
>  
> @@ -87,6 +89,7 @@ COVERAGE_DEFINE(vhost_notification);
>  
>  #define OVS_CACHE_LINE_SIZE CACHE_LINE_SIZE
>  #define OVS_VPORT_DPDK "ovs_dpdk"
> +#define DPDK_RTE_HDR_OFFSET 1
>  
>  /*
>   * need to reserve tons of extra space in the mbufs so we can align the
> @@ -96,6 +99,8 @@ COVERAGE_DEFINE(vhost_notification);
>   */
>  #define ETHER_HDR_MAX_LEN           (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN \
>                                       + (2 * VLAN_HEADER_LEN))
> +#define ETHER_VLAN_HDR_MAX_LEN      (RTE_ETHER_HDR_LEN + \
> +                                     + (2 * VLAN_HEADER_LEN))
>  #define MTU_TO_FRAME_LEN(mtu)       ((mtu) + RTE_ETHER_HDR_LEN + \
>                                       RTE_ETHER_CRC_LEN)
>  #define MTU_TO_MAX_FRAME_LEN(mtu)   ((mtu) + ETHER_HDR_MAX_LEN)
> @@ -404,6 +409,7 @@ enum dpdk_hw_ol_features {
>      NETDEV_RX_HW_SCATTER = 1 << 2,
>      NETDEV_TX_TSO_OFFLOAD = 1 << 3,
>      NETDEV_TX_SCTP_CHECKSUM_OFFLOAD = 1 << 4,
> +    NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD = 1 << 5,
>  };
>  
>  /*
> @@ -998,6 +1004,11 @@ dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq)
>  
>      if (dev->hw_ol_features & NETDEV_TX_TSO_OFFLOAD) {
>          conf.txmode.offloads |= DPDK_TX_TSO_OFFLOAD_FLAGS;
> +        /* Enable VXLAN TSO support if available */
> +        if (dev->hw_ol_features & NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD) {
> +            conf.txmode.offloads |= DEV_TX_OFFLOAD_VXLAN_TNL_TSO;
> +            conf.txmode.offloads |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM;
> +        }
>          if (dev->hw_ol_features & NETDEV_TX_SCTP_CHECKSUM_OFFLOAD) {
>              conf.txmode.offloads |= DEV_TX_OFFLOAD_SCTP_CKSUM;
>          }
> @@ -1136,6 +1147,10 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
>          if ((info.tx_offload_capa & tx_tso_offload_capa)
>              == tx_tso_offload_capa) {
>              dev->hw_ol_features |= NETDEV_TX_TSO_OFFLOAD;
> +            /* Enable VXLAN TSO support if available */
> +            if (info.tx_offload_capa & DEV_TX_OFFLOAD_VXLAN_TNL_TSO) {
> +                dev->hw_ol_features |= NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD;
> +            }
>              if (info.tx_offload_capa & DEV_TX_OFFLOAD_SCTP_CKSUM) {
>                  dev->hw_ol_features |= NETDEV_TX_SCTP_CHECKSUM_OFFLOAD;
>              } else {
> @@ -2173,37 +2188,267 @@ netdev_dpdk_rxq_dealloc(struct netdev_rxq *rxq)
>      rte_free(rx);
>  }
>  
> +static inline bool
> +is_local_to_local(uint16_t src_port_id, struct netdev_dpdk *dev)
> +{


I don't see how this works. Even if the packet is coming from
another local DPDK port, it would still be required to translate
from OVS generic offloading to, in this case, DPDK device specific.


> +    bool ret = false;
> +    struct netdev_dpdk *src_dev;
> +
> +    if (src_port_id == UINT16_MAX) {
> +        ret = true;
> +    } else {
> +        src_dev = netdev_dpdk_lookup_by_port_id(src_port_id);
> +        if (src_dev && (netdev_dpdk_get_vid(src_dev) >= 0)) {
> +            ret = true;
> +        }
> +    }
> +
> +    if (ret) {
> +        if (netdev_dpdk_get_vid(dev) < 0) {
> +            ret = false;
> +        }
> +    }
> +
> +    return ret;
> +}
> +
> +#define UDP_VXLAN_ETH_HDR_SIZE 30
> +
>  /* Prepare the packet for HWOL.
>   * Return True if the packet is OK to continue. */
>  static bool
>  netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
>  {
>      struct dp_packet *pkt = CONTAINER_OF(mbuf, struct dp_packet, mbuf);
> +    uint16_t l4_proto = 0;
> +    uint8_t *l3_hdr_ptr = NULL;
> +    struct rte_ether_hdr *eth_hdr =
> +        rte_pktmbuf_mtod(mbuf, struct rte_ether_hdr *);
> +    struct rte_ipv4_hdr *ip_hdr;
> +    struct rte_ipv6_hdr *ip6_hdr;
> +    const uint16_t tso_segsz = get_userspace_tso_segsz();
> +
> +    /* Return directly if source and destitation of mbuf are local ports
> +     * because mbuf has already set ol_flags and l*_len correctly.
> +     */
> +    if (is_local_to_local(mbuf->port, dev)) {
> +        if (mbuf->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_UDP_SEG)) {
> +            mbuf->tso_segsz = tso_segsz - mbuf->l3_len - mbuf->l4_len;
> +        }
> +        return true;
> +    }
> +
> +    if (mbuf->ol_flags & PKT_TX_TUNNEL_VXLAN) {
> +        /* Handle VXLAN TSO */
> +        struct rte_udp_hdr *udp_hdr = NULL;
> +
> +        /* Correct l2_len for VxLAN packet */
> +        mbuf->l2_len += sizeof(struct udp_header)
> +                        + sizeof(struct vxlanhdr);
> +
> +        /* small packets whose size is less than or equal to  MTU needn't
> +         * VXLAN TSO. In addtion, if hardware can't support VXLAN TSO, it
> +         * also can't be handled. So PKT_TX_TUNNEL_VXLAN must be cleared
> +         * outer_l2_len and outer_l3_len must be zeroed.
> +         */
> +        if (!(mbuf->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_UDP_SEG))
> +            && (mbuf->pkt_len <= tso_segsz + mbuf->outer_l2_len
> +                                     + mbuf->outer_l3_len + mbuf->l2_len))  {
> +            mbuf->ol_flags &= ~PKT_TX_TUNNEL_VXLAN;
> +            if ((mbuf->ol_flags & PKT_TX_IPV4) &&
> +                (mbuf->outer_l3_len == IPV6_HEADER_LEN)) {
> +                dp_packet_hwol_reset_tx_ipv4(pkt);
> +                dp_packet_hwol_set_tx_ipv6(pkt);
> +            } else if ((mbuf->ol_flags & PKT_TX_IPV6) &&
> +                (mbuf->outer_l3_len == IP_HEADER_LEN)) {
> +                dp_packet_hwol_reset_tx_ipv6(pkt);
> +                dp_packet_hwol_set_tx_ipv4(pkt);
> +            }
> +            mbuf->l2_len = mbuf->outer_l2_len;
> +            mbuf->l3_len = mbuf->outer_l3_len;
> +            mbuf->l4_len = sizeof(struct rte_udp_hdr);
> +            mbuf->outer_l2_len = 0;
> +            mbuf->outer_l3_len = 0;
> +            return true;
> +        }
> +
> +        /* Handle outer packet */
> +        if (mbuf->outer_l3_len == IP_HEADER_LEN) {
> +            ip_hdr = (struct rte_ipv4_hdr *)((char *) eth_hdr
> +                                                  + mbuf->outer_l2_len);
> +            /* outer IP checksum offload */
> +            ip_hdr->hdr_checksum = 0;
> +            mbuf->ol_flags |= PKT_TX_OUTER_IP_CKSUM;
> +            mbuf->ol_flags |= PKT_TX_OUTER_IPV4;
> +
> +            udp_hdr = (struct rte_udp_hdr *)(ip_hdr + DPDK_RTE_HDR_OFFSET);
> +        } else if (mbuf->outer_l3_len == IPV6_HEADER_LEN) {
> +            ip6_hdr = (struct rte_ipv6_hdr *)((char *) eth_hdr
> +                                                  + mbuf->outer_l2_len);
> +            /* no IP checksum for outer IPv6 */
> +            mbuf->ol_flags |= PKT_TX_OUTER_IPV6;
>  
> -    if (mbuf->ol_flags & PKT_TX_L4_MASK) {
> +            udp_hdr = (struct rte_udp_hdr *)(ip6_hdr + DPDK_RTE_HDR_OFFSET);
> +
> +        }
> +
> +        /* Handle inner packet */
> +        if (udp_hdr != NULL) {
> +            if (mbuf->ol_flags & PKT_TX_IPV4) {
> +                ip_hdr = (struct rte_ipv4_hdr *)
> +                    ((uint8_t *)udp_hdr + mbuf->l2_len);
> +                l4_proto = ip_hdr->next_proto_id;
> +                l3_hdr_ptr = (uint8_t *)ip_hdr;
> +
> +                /* inner IP checksum offload */
> +                ip_hdr->hdr_checksum = 0;
> +                mbuf->ol_flags |= PKT_TX_IP_CKSUM;
> +            } else if (mbuf->ol_flags & PKT_TX_IPV6) {
> +                ip6_hdr = (struct rte_ipv6_hdr *)
> +                    ((uint8_t *)udp_hdr + mbuf->l2_len);
> +                l4_proto = ip6_hdr->proto;
> +                l3_hdr_ptr = (uint8_t *)ip6_hdr;
> +            }
> +        }
> +
> +        /* In case of MTU > tso_segsz, PKT_TX_TCP_SEG or PKT_TX_UDP_SEG wasn't
> +         * set by client/server, here is a place we can mark it.
> +         */
> +        if ((mbuf->pkt_len > tso_segsz + mbuf->outer_l2_len
> +                                 + mbuf->outer_l3_len + mbuf->l2_len)
> +            && (!(mbuf->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_UDP_SEG)))) {
> +            if (l4_proto == IPPROTO_UDP) {
> +                mbuf->ol_flags |= PKT_TX_UDP_SEG;
> +            } else if (l4_proto == IPPROTO_TCP) {
> +                mbuf->ol_flags |= PKT_TX_TCP_SEG;
> +            }
> +        }
> +    } else if (mbuf->ol_flags & (PKT_TX_IPV4 | PKT_TX_IPV6)) {
> +        /* Handle VLAN TSO */
>          mbuf->l2_len = (char *)dp_packet_l3(pkt) - (char *)dp_packet_eth(pkt);
>          mbuf->l3_len = (char *)dp_packet_l4(pkt) - (char *)dp_packet_l3(pkt);
>          mbuf->outer_l2_len = 0;
>          mbuf->outer_l3_len = 0;
> +
> +        if (mbuf->ol_flags & PKT_TX_IPV4) {
> +            ip_hdr = (struct rte_ipv4_hdr *)((char *)eth_hdr + mbuf->l2_len);
> +            l4_proto = ip_hdr->next_proto_id;
> +            l3_hdr_ptr = (uint8_t *)ip_hdr;
> +
> +            /* IP checksum offload */
> +            ip_hdr->hdr_checksum = 0;
> +            mbuf->ol_flags |= PKT_TX_IP_CKSUM;
> +        } else if (mbuf->ol_flags & PKT_TX_IPV6) {
> +            ip6_hdr = (struct rte_ipv6_hdr *)((char *)eth_hdr + mbuf->l2_len);
> +            l4_proto = ip6_hdr->proto;
> +            l3_hdr_ptr = (uint8_t *)ip6_hdr;
> +        }
> +
> +        /* In some cases, PKT_TX_TCP_SEG or PKT_TX_UDP_SEG wasn't set, here is
> +         * a place we can mark it.
> +         */
> +        if ((mbuf->pkt_len > (tso_segsz + mbuf->l2_len))
> +            && (!(mbuf->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_UDP_SEG)))) {
> +            if (l4_proto == IPPROTO_UDP) {
> +                mbuf->ol_flags |= PKT_TX_UDP_SEG;
> +            } else if (l4_proto == IPPROTO_TCP) {
> +                mbuf->ol_flags |= PKT_TX_TCP_SEG;
> +            }
> +        }
>      }
>  
> -    if (mbuf->ol_flags & PKT_TX_TCP_SEG) {
> -        struct tcp_header *th = dp_packet_l4(pkt);
> +    /* It is possible that l4_len isn't set for vhostuserclient */
> +    if ((l3_hdr_ptr != NULL) && (l4_proto == IPPROTO_TCP)
> +        && (mbuf->l4_len < 20)) {
> +        struct rte_tcp_hdr *tcp_hdr = (struct rte_tcp_hdr *)
> +            (l3_hdr_ptr + mbuf->l3_len);
>  
> -        if (!th) {
> -            VLOG_WARN_RL(&rl, "%s: TCP Segmentation without L4 header"
> +        mbuf->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
> +    }
> +
> +    if ((l4_proto != IPPROTO_UDP) && (l4_proto != IPPROTO_TCP)) {
> +        return true;
> +    }
> +
> +    if ((mbuf->ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM) {
> +        if (l4_proto != IPPROTO_UDP) {
> +            VLOG_WARN_RL(&rl, "%s: UDP packet without L4 header"
>                           " pkt len: %"PRIu32"", dev->up.name, mbuf->pkt_len);
>              return false;
>          }
> +    } else if (mbuf->ol_flags & PKT_TX_TCP_SEG ||
> +               mbuf->ol_flags & PKT_TX_TCP_CKSUM) {
> +        if (l4_proto != IPPROTO_TCP) {
> +            VLOG_WARN_RL(&rl, "%s: TCP Segmentation without L4 header"
> +                         " pkt len: %"PRIu32" l4_proto = %d",
> +                         dev->up.name, mbuf->pkt_len, l4_proto);
> +            return false;
> +        }
> +
> +        if (mbuf->pkt_len > tso_segsz + mbuf->outer_l2_len + mbuf->outer_l3_len
> +            + mbuf->l2_len) {
> +            dp_packet_hwol_set_tcp_seg(pkt);
> +        }
>  
> -        mbuf->l4_len = TCP_OFFSET(th->tcp_ctl) * 4;
>          mbuf->ol_flags |= PKT_TX_TCP_CKSUM;
> -        mbuf->tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len;
> +        if (mbuf->ol_flags & PKT_TX_TCP_SEG) {
> +            mbuf->tso_segsz = tso_segsz - mbuf->l3_len - mbuf->l4_len;
> +        } else {
> +            mbuf->tso_segsz = 0;
> +        }
>  
> -        if (mbuf->ol_flags & PKT_TX_IPV4) {
> -            mbuf->ol_flags |= PKT_TX_IP_CKSUM;
> +        if (!(dev->up.ol_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) {
> +            /* PKT_TX_TCP_CKSUM must be cleaned because
> +             * tcp checksum only can be caculated by software if NIC
> +             * can not support it.
> +             */
> +            mbuf->ol_flags &= ~PKT_TX_TCP_CKSUM;
>          }
>      }
> +
> +    if (l4_proto == IPPROTO_UDP) {
> +        /* in case of pkt_len < dev->mtu, it still can be handled correctly */
> +        if (mbuf->pkt_len < dev->mtu + ETHER_VLAN_HDR_MAX_LEN) {
> +            mbuf->ol_flags &= ~PKT_TX_UDP_SEG;
> +            if (mbuf->ol_flags & PKT_TX_TUNNEL_VXLAN) {
> +                /* Pretend it as a normal UDP and stop inner cksum offload */
> +                mbuf->ol_flags &= ~PKT_TX_TUNNEL_VXLAN;
> +                mbuf->ol_flags &= ~PKT_TX_OUTER_IP_CKSUM;
> +                if (mbuf->ol_flags & PKT_TX_OUTER_IPV4) {
> +                    mbuf->ol_flags &= ~PKT_TX_OUTER_IPV4;
> +                    if (mbuf->ol_flags & PKT_TX_IPV6) {
> +                        mbuf->ol_flags &= ~PKT_TX_IPV6;
> +                    }
> +                    if ((mbuf->ol_flags & PKT_TX_IPV4) == 0) {
> +                        mbuf->ol_flags |= PKT_TX_IPV4;
> +                    }
> +                    mbuf->ol_flags |= PKT_TX_IP_CKSUM;
> +                } else if (mbuf->ol_flags & PKT_TX_OUTER_IPV6) {
> +                    mbuf->ol_flags &= ~PKT_TX_OUTER_IPV6;
> +                    if (mbuf->ol_flags & PKT_TX_IPV4) {
> +                        mbuf->ol_flags &= ~PKT_TX_IPV4;
> +                        mbuf->ol_flags &= ~PKT_TX_IP_CKSUM;
> +                    }
> +                    if ((mbuf->ol_flags & PKT_TX_IPV6) == 0) {
> +                        mbuf->ol_flags |= PKT_TX_IPV6;
> +                    }
> +                    /* For outer IPv6, outer udp checksum is incorrect */
> +                    mbuf->ol_flags |= PKT_TX_UDP_CKSUM;
> +                }
> +                mbuf->l2_len = mbuf->outer_l2_len;
> +                mbuf->l3_len = mbuf->outer_l3_len;
> +                mbuf->outer_l2_len = 0;
> +                mbuf->outer_l3_len = 0;
> +            }
> +            return true;
> +        }
> +
> +        /* Can't handle bigger UDP packet, so return false */
> +        VLOG_WARN_RL(&rl, "%s: too big UDP packet"
> +                     ", pkt len: %"PRIu32"", dev->up.name, mbuf->pkt_len);
> +        return false;
> +    }
> +
>      return true;
>  }
>  
> @@ -2781,17 +3026,26 @@ dpdk_copy_dp_packet_to_mbuf(struct rte_mempool *mp, struct dp_packet *pkt_orig)
>      mbuf_dest->packet_type = pkt_orig->mbuf.packet_type;
>      mbuf_dest->ol_flags |= (pkt_orig->mbuf.ol_flags &
>                              ~(EXT_ATTACHED_MBUF | IND_ATTACHED_MBUF));
> +    mbuf_dest->l2_len = pkt_orig->mbuf.l2_len;
> +    mbuf_dest->l3_len = pkt_orig->mbuf.l3_len;
> +    mbuf_dest->l4_len = pkt_orig->mbuf.l4_len;
> +    mbuf_dest->outer_l2_len = pkt_orig->mbuf.outer_l2_len;
> +    mbuf_dest->outer_l3_len = pkt_orig->mbuf.outer_l3_len;

Although this is used before actually calling the send function,
which will update tso_segsz, it seems this should copy that too.
In that case, maybe it is worth using memcpy() as below to copy
all fields at once.

>  
>      memcpy(&pkt_dest->l2_pad_size, &pkt_orig->l2_pad_size,
>             sizeof(struct dp_packet) - offsetof(struct dp_packet, l2_pad_size));
>  
> -    if (mbuf_dest->ol_flags & PKT_TX_L4_MASK) {
> +    if ((mbuf_dest->outer_l2_len == 0) &&
> +        (mbuf_dest->ol_flags & PKT_TX_L4_MASK)) {
>          mbuf_dest->l2_len = (char *)dp_packet_l3(pkt_dest)
>                                  - (char *)dp_packet_eth(pkt_dest);
>          mbuf_dest->l3_len = (char *)dp_packet_l4(pkt_dest)
>                                  - (char *) dp_packet_l3(pkt_dest);
>      }
>  
> +    /* Mark it as non-DPDK port */
> +    mbuf_dest->port = UINT16_MAX;
> +
>      return pkt_dest;
>  }
>  
> @@ -2850,6 +3104,11 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
>          if (dev->type == DPDK_DEV_VHOST) {
>              __netdev_dpdk_vhost_send(netdev, qid, pkts, txcnt);
>          } else {
> +            if (userspace_tso_enabled()) {
> +                txcnt = netdev_dpdk_prep_hwol_batch(dev,
> +                                                    (struct rte_mbuf **)pkts,
> +                                                    txcnt);
> +            }
>              tx_failure += netdev_dpdk_eth_tx_burst(dev, qid,
>                                                     (struct rte_mbuf **)pkts,
>                                                     txcnt);
> @@ -2872,7 +3131,6 @@ netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
>                         struct dp_packet_batch *batch,
>                         bool concurrent_txq OVS_UNUSED)
>  {
> -
>      if (OVS_UNLIKELY(batch->packets[0]->source != DPBUF_DPDK)) {
>          dpdk_do_tx_copy(netdev, qid, batch);
>          dp_packet_delete_batch(batch, true);
> @@ -5033,6 +5291,10 @@ netdev_dpdk_reconfigure(struct netdev *netdev)
>          netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_CKSUM;
>          netdev->ol_flags |= NETDEV_TX_OFFLOAD_UDP_CKSUM;
>          netdev->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM;
> +        /* Enable VXLAN TSO support if available */
> +        if (dev->hw_ol_features & NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD) {
> +            netdev->ol_flags |= NETDEV_TX_OFFLOAD_VXLAN_TSO;
> +        }
>          if (dev->hw_ol_features & NETDEV_TX_SCTP_CHECKSUM_OFFLOAD) {
>              netdev->ol_flags |= NETDEV_TX_OFFLOAD_SCTP_CKSUM;
>          }
> diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
> index 6be23db..3965ae5 100644
> --- a/lib/netdev-linux.c
> +++ b/lib/netdev-linux.c
> @@ -50,6 +50,7 @@
>  #include <unistd.h>
>  
>  #include "coverage.h"
> +#include "csum.h"
>  #include "dp-packet.h"
>  #include "dpif-netlink.h"
>  #include "dpif-netdev.h"
> @@ -79,6 +80,7 @@
>  #include "unaligned.h"
>  #include "openvswitch/vlog.h"
>  #include "userspace-tso.h"
> +#include "userspace-tso-segsz.h"
>  #include "util.h"
>  
>  VLOG_DEFINE_THIS_MODULE(netdev_linux);
> @@ -6508,6 +6510,8 @@ netdev_linux_parse_l2(struct dp_packet *b, uint16_t *l4proto)
>      struct eth_header *eth_hdr;
>      ovs_be16 eth_type;
>      int l2_len;
> +    int l3_len = 0;
> +    int l4_len = 0;
>  
>      eth_hdr = dp_packet_at(b, 0, ETH_HEADER_LEN);
>      if (!eth_hdr) {
> @@ -6527,6 +6531,8 @@ netdev_linux_parse_l2(struct dp_packet *b, uint16_t *l4proto)
>          l2_len += VLAN_HEADER_LEN;
>      }
>  
> +    dp_packet_hwol_set_l2_len(b, l2_len);

This is not needed, and neither are the other similar calls below,
because the packet still has to go through the datapath, and there
is a packet parser at the beginning which will set the length
values again.

The purpose of this function is to identify the offloading
features used and translate them into OVS generic flags. For example,
if TSO was used, then we need to flag the packet properly.


> +
>      if (eth_type == htons(ETH_TYPE_IP)) {
>          struct ip_header *ip_hdr = dp_packet_at(b, l2_len, IP_HEADER_LEN);
>  
> @@ -6534,6 +6540,7 @@ netdev_linux_parse_l2(struct dp_packet *b, uint16_t *l4proto)
>              return -EINVAL;
>          }
>  
> +        l3_len = IP_HEADER_LEN;
>          *l4proto = ip_hdr->ip_proto;
>          dp_packet_hwol_set_tx_ipv4(b);
>      } else if (eth_type == htons(ETH_TYPE_IPV6)) {
> @@ -6544,10 +6551,35 @@ netdev_linux_parse_l2(struct dp_packet *b, uint16_t *l4proto)
>              return -EINVAL;
>          }
>  
> +        l3_len = IPV6_HEADER_LEN;
>          *l4proto = nh6->ip6_ctlun.ip6_un1.ip6_un1_nxt;
>          dp_packet_hwol_set_tx_ipv6(b);
>      }
>  
> +    dp_packet_hwol_set_l3_len(b, l3_len);
> +
> +    if (*l4proto == IPPROTO_TCP) {
> +        struct tcp_header *tcp_hdr =  dp_packet_at(b, l2_len + l3_len,
> +                                          sizeof(struct tcp_header));
> +
> +        if (!tcp_hdr) {
> +            return -EINVAL;
> +        }
> +
> +        l4_len = TCP_OFFSET(tcp_hdr->tcp_ctl) * 4;
> +        dp_packet_hwol_set_l4_len(b, l4_len);
> +    } else if (*l4proto == IPPROTO_UDP) {
> +        struct udp_header *udp_hdr =  dp_packet_at(b, l2_len + l3_len,
> +                                          sizeof(struct udp_header));
> +
> +        if (!udp_hdr) {
> +            return -EINVAL;
> +        }
> +
> +        l4_len = sizeof(struct udp_header);
> +        dp_packet_hwol_set_l4_len(b, l4_len);
> +    }
> +
>      return 0;
>  }
>  
> @@ -6561,10 +6593,6 @@ netdev_linux_parse_vnet_hdr(struct dp_packet *b)
>          return -EINVAL;
>      }
>  
> -    if (vnet->flags == 0 && vnet->gso_type == VIRTIO_NET_HDR_GSO_NONE) {
> -        return 0;
> -    }
> -

Why is that required? It seems that if flags are zero and gso_type
is none, it doesn't need to continue.


>      if (netdev_linux_parse_l2(b, &l4proto)) {
>          return -EINVAL;
>      }
> @@ -6595,22 +6623,130 @@ netdev_linux_parse_vnet_hdr(struct dp_packet *b)
>  }
>  
>  static void
> -netdev_linux_prepend_vnet_hdr(struct dp_packet *b, int mtu)
> +netdev_linux_set_ol_flags_and_cksum(struct dp_packet *b, int mtu)
> +{
> +    struct eth_header *eth_hdr;
> +    struct ip_header *ip_hdr = NULL;
> +    struct ovs_16aligned_ip6_hdr *nh6 = NULL;
> +    uint16_t l4proto = 0;
> +    ovs_be16 eth_type;
> +    int l2_len;
> +    int l3_len = 0;
> +    int l4_len = 0;


At this point the packet and struct dp_packet should be enough
to do the translation without requiring another packet parsing.


> +
> +    eth_hdr = dp_packet_at(b, 0, ETH_HEADER_LEN);
> +    if (!eth_hdr) {
> +        return;
> +    }
> +
> +    l2_len = ETH_HEADER_LEN;
> +    eth_type = eth_hdr->eth_type;
> +    if (eth_type_vlan(eth_type)) {
> +        struct vlan_header *vlan = dp_packet_at(b, l2_len, VLAN_HEADER_LEN);
> +
> +        if (!vlan) {
> +            return;
> +        }
> +
> +        eth_type = vlan->vlan_next_type;
> +        l2_len += VLAN_HEADER_LEN;
> +    }
> +
> +    if (eth_type == htons(ETH_TYPE_IP)) {
> +        ip_hdr = dp_packet_at(b, l2_len, IP_HEADER_LEN);
> +
> +        if (!ip_hdr) {
> +            return;
> +        }
> +
> +        dp_packet_set_l3(b, ip_hdr);
> +        ip_hdr->ip_csum = 0;
> +        ip_hdr->ip_csum = csum(ip_hdr, sizeof *ip_hdr);
> +        l4proto = ip_hdr->ip_proto;
> +        dp_packet_hwol_set_tx_ipv4(b);
> +        l3_len = IP_HEADER_LEN;
> +    } else if (eth_type == htons(ETH_TYPE_IPV6)) {
> +        nh6 = dp_packet_at(b, l2_len, IPV6_HEADER_LEN);
> +        if (!nh6) {
> +            return;
> +        }
> +
> +        dp_packet_set_l3(b, nh6);
> +        l4proto = nh6->ip6_ctlun.ip6_un1.ip6_un1_nxt;
> +        dp_packet_hwol_set_tx_ipv6(b);
> +        l3_len = IPV6_HEADER_LEN;
> +    }
> +
> +    if (l4proto == IPPROTO_TCP) {
> +        /* Note: need set tcp pseudo checksum */
> +        struct tcp_header *tcp_hdr =  dp_packet_at(b, l2_len + l3_len,
> +                                          sizeof(struct tcp_header));
> +
> +        if (!tcp_hdr) {
> +            return;
> +        }
> +        l4_len = TCP_OFFSET(tcp_hdr->tcp_ctl) * 4;
> +        dp_packet_hwol_set_l4_len(b, l4_len);
> +        dp_packet_set_l4(b, tcp_hdr);
> +
> +        if (l3_len == IP_HEADER_LEN) {
> +            tcp_hdr->tcp_csum = csum_finish(packet_csum_pseudoheader(ip_hdr));
> +        } else {
> +            tcp_hdr->tcp_csum = csum_finish(packet_csum_pseudoheader6(nh6));
> +        }
> +        if (dp_packet_size(b) > mtu + l2_len) {
> +            dp_packet_hwol_set_tcp_seg(b);
> +        }
> +        dp_packet_hwol_set_csum_tcp(b);
> +    } else if (l4proto == IPPROTO_UDP) {
> +        struct udp_header *udp_hdr =  dp_packet_at(b, l2_len + l3_len,
> +                                          sizeof(struct udp_header));
> +
> +        if (!udp_hdr) {
> +            return;
> +        }
> +        l4_len = sizeof(struct udp_header);
> +        dp_packet_hwol_set_l4_len(b, l4_len);
> +        dp_packet_set_l4(b, udp_hdr);
> +        if (dp_packet_size(b) > mtu + l2_len) {
> +            dp_packet_hwol_set_udp_seg(b);
> +        }
> +        dp_packet_hwol_set_csum_udp(b);
> +    }
> +}
> +
> +static void
> +netdev_linux_prepend_vnet_hdr(struct dp_packet *b, int mtu OVS_UNUSED)
>  {
> -    struct virtio_net_hdr *vnet = dp_packet_push_zeros(b, sizeof *vnet);
> +    struct virtio_net_hdr *vnet;
> +    uint16_t tso_segsz = get_userspace_tso_segsz();
> +    uint16_t l4proto;
> +
> +    netdev_linux_parse_l2(b, &l4proto);
> +
> +    /* ol_flags weren't set correctly for received packets which are from
> +     * physical port, so it has to been set again in order that
> +     * vnet_hdr can be prepended correctly. Note: here tso_segsz but not
> +     * mtu are used because tso_segsz may be less than mtu.
> +     */
> +    if ((dp_packet_size(b) > tso_segsz + dp_packet_hwol_get_l2_len(b))
> +        && !dp_packet_hwol_l4_mask(b)) {
> +        netdev_linux_set_ol_flags_and_cksum(b, tso_segsz);
> +    }
> +
> +    vnet = dp_packet_push_zeros(b, sizeof *vnet);
>  
>      if (dp_packet_hwol_is_tso(b)) {
>          uint16_t hdr_len = ((char *)dp_packet_l4(b) - (char *)dp_packet_eth(b))
> -                            + TCP_HEADER_LEN;
> +                            + dp_packet_hwol_get_l4_len(b);
>  
>          vnet->hdr_len = (OVS_FORCE __virtio16)hdr_len;
> -        vnet->gso_size = (OVS_FORCE __virtio16)(mtu - hdr_len);
> +        vnet->gso_size = (OVS_FORCE __virtio16)(tso_segsz - hdr_len);
>          if (dp_packet_hwol_is_ipv4(b)) {
>              vnet->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
>          } else {
>              vnet->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
>          }
> -
>      } else {
>          vnet->flags = VIRTIO_NET_HDR_GSO_NONE;
>      }
> diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h
> index 73dce2f..d616d79 100644
> --- a/lib/netdev-provider.h
> +++ b/lib/netdev-provider.h
> @@ -43,6 +43,7 @@ enum netdev_ol_flags {
>      NETDEV_TX_OFFLOAD_UDP_CKSUM = 1 << 2,
>      NETDEV_TX_OFFLOAD_SCTP_CKSUM = 1 << 3,
>      NETDEV_TX_OFFLOAD_TCP_TSO = 1 << 4,
> +    NETDEV_TX_OFFLOAD_VXLAN_TSO = 1 << 5,
>  };
>  
>  /* A network device (e.g. an Ethernet device).
> diff --git a/lib/netdev.c b/lib/netdev.c
> index 91e9195..8c881b0 100644
> --- a/lib/netdev.c
> +++ b/lib/netdev.c
> @@ -33,6 +33,7 @@
>  
>  #include "cmap.h"
>  #include "coverage.h"
> +#include "csum.h"
>  #include "dpif.h"
>  #include "dp-packet.h"
>  #include "openvswitch/dynamic-string.h"
> @@ -55,6 +56,7 @@
>  #include "svec.h"
>  #include "openvswitch/vlog.h"
>  #include "flow.h"
> +#include "userspace-tso.h"
>  #include "util.h"
>  #ifdef __linux__
>  #include "tc.h"
> @@ -785,6 +787,64 @@ netdev_get_pt_mode(const struct netdev *netdev)
>              : NETDEV_PT_LEGACY_L2);
>  }
>  
> +static inline void
> +calculate_tcpudp_checksum(struct dp_packet *p)
> +{

Perhaps this could be in packets.c instead?

> +    uint32_t pseudo_hdr_csum = 0;
> +    bool is_ipv6 = false;
> +    struct ovs_16aligned_ip6_hdr *ip6 = NULL;
> +    size_t len_l2 = (char *) dp_packet_l3(p) - (char *) dp_packet_eth(p);
> +    size_t len_l3 = (char *) dp_packet_l4(p) - (char *) dp_packet_l3(p);
> +    size_t l4_len = (char *) dp_packet_tail(p) - (char *) dp_packet_l4(p);
> +    uint16_t l4_proto = 0;
> +
> +    /* It is possible l2_len and l3_len aren't set here, so set them if no */
> +    if (dp_packet_hwol_get_l2_len(p) != len_l2) {
> +        dp_packet_hwol_set_l2_len(p, len_l2);
> +        dp_packet_hwol_set_l3_len(p, len_l3);
> +    }
> +
> +    if (len_l3 == sizeof(struct ovs_16aligned_ip6_hdr)) {
> +        ip6 = dp_packet_l3(p);
> +        l4_proto = ip6->ip6_ctlun.ip6_un1.ip6_un1_nxt;
> +        is_ipv6 = true;
> +    } else {
> +        struct ip_header *ip = dp_packet_l3(p);
> +
> +        l4_proto = ip->ip_proto;
> +        ip->ip_csum = 0;
> +        ip->ip_csum = csum(ip, sizeof *ip);
> +        pseudo_hdr_csum = packet_csum_pseudoheader(ip);
> +    }
> +
> +    if (l4_proto == IPPROTO_TCP) {
> +        struct tcp_header *tcp = dp_packet_l4(p);
> +
> +        tcp->tcp_csum = 0;
> +        if (is_ipv6) {
> +            tcp->tcp_csum = packet_csum_upperlayer6(ip6, tcp, l4_proto,
> +                                                    l4_len);
> +        } else {
> +            tcp->tcp_csum = csum_finish(csum_continue(pseudo_hdr_csum,
> +                                                      tcp, l4_len));
> +        }
> +    } else if (l4_proto == IPPROTO_UDP) {
> +        struct udp_header *udp = dp_packet_l4(p);
> +
> +        udp->udp_csum = 0;
> +        if (is_ipv6) {
> +            udp->udp_csum = packet_csum_upperlayer6(ip6, udp, l4_proto,
> +                                                    l4_len);
> +        } else {
> +            udp->udp_csum = csum_finish(csum_continue(pseudo_hdr_csum,
> +                                                      udp, l4_len));
> +        }
> +        if (!udp->udp_csum) {
> +            udp->udp_csum = htons(0xffff);
> +        }
> +    }
> +}
> +
>  /* Check if a 'packet' is compatible with 'netdev_flags'.
>   * If a packet is incompatible, return 'false' with the 'errormsg'
>   * pointing to a reason. */
> @@ -794,6 +854,14 @@ netdev_send_prepare_packet(const uint64_t netdev_flags,
>  {
>      uint64_t l4_mask;
>  
> +    if (dp_packet_hwol_is_vxlan_tcp_seg(packet)
> +        && (dp_packet_hwol_is_tso(packet) || dp_packet_hwol_l4_mask(packet))
> +        && !(netdev_flags & NETDEV_TX_OFFLOAD_VXLAN_TSO)) {
> +        /* Fall back to GSO in software. */
> +        VLOG_ERR_BUF(errormsg, "No VXLAN TSO support");
> +        return false;
> +    }
> +
>      if (dp_packet_hwol_is_tso(packet)
>          && !(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) {
>              /* Fall back to GSO in software. */
> @@ -803,6 +871,33 @@ netdev_send_prepare_packet(const uint64_t netdev_flags,
>  
>      l4_mask = dp_packet_hwol_l4_mask(packet);
>      if (l4_mask) {
> +        /* Calculate checksum for VLAN TSO case when no hardware offload
> +         * feature is available. Note: for VXLAN TSO case, checksum has
> +         * been calculated before here, so it won't be done here again
> +         * because checksum flags in packet->m.ol_flags have been cleaned.
> +         */
> +        if (dp_packet_hwol_l4_is_tcp(packet)
> +            && !dp_packet_hwol_is_vxlan_tcp_seg(packet)
> +            && !(netdev_flags & NETDEV_TX_OFFLOAD_TCP_CKSUM)) {
> +            dp_packet_hwol_reset_csum_tcp(packet);
> +            /* Only calculate TCP checksum for non-TSO packet.
> +             */
> +            if (!dp_packet_hwol_is_tso(packet)) {
> +                calculate_tcpudp_checksum(packet);
> +            }
> +            return true;
> +        } else if (dp_packet_hwol_l4_is_udp(packet)
> +            && !dp_packet_hwol_is_vxlan_tcp_seg(packet)
> +            && !(netdev_flags & NETDEV_TX_OFFLOAD_UDP_CKSUM)) {
> +            dp_packet_hwol_reset_csum_udp(packet);
> +            /* Only calculate UDP checksum for non-UFO packet.
> +             */
> +            if (!dp_packet_hwol_is_ufo(packet)) {
> +                calculate_tcpudp_checksum(packet);
> +            }
> +            return true;
> +        }
> +
>          if (dp_packet_hwol_l4_is_tcp(packet)) {
>              if (!(netdev_flags & NETDEV_TX_OFFLOAD_TCP_CKSUM)) {
>                  /* Fall back to TCP csum in software. */
> @@ -960,15 +1055,61 @@ netdev_push_header(const struct netdev *netdev,
>      size_t i, size = dp_packet_batch_size(batch);
>  
>      DP_PACKET_BATCH_REFILL_FOR_EACH (i, size, packet, batch) {
> -        if (OVS_UNLIKELY(dp_packet_hwol_is_tso(packet)
> -                         || dp_packet_hwol_l4_mask(packet))) {
> +        if (OVS_UNLIKELY((dp_packet_hwol_is_tso(packet)
> +                          || dp_packet_hwol_l4_mask(packet))
> +                         && (data->tnl_type != OVS_VPORT_TYPE_VXLAN))) {
>              COVERAGE_INC(netdev_push_header_drops);
>              dp_packet_delete(packet);
> -            VLOG_WARN_RL(&rl, "%s: Tunneling packets with HW offload flags is "
> -                         "not supported: packet dropped",
> +            VLOG_WARN_RL(&rl,
> +                         "%s: non-VxLAN Tunneling packets with HW offload "
> +                         "flags is not supported: packet dropped",
>                           netdev_get_name(netdev));
>          } else {
> +            size_t len_l2 = (char *) dp_packet_l3(packet)
> +                                - (char *) dp_packet_eth(packet);
> +            size_t len_l3 = (char *) dp_packet_l4(packet)
> +                                - (char *) dp_packet_l3(packet);
> +            if (data->tnl_type == OVS_VPORT_TYPE_VXLAN) {
> +                /* VXLAN offload can't support udp checksum offload
> +                 * for inner udp packet, so udp checksum must be set
> +                 * before push header in order that outer checksum can
> +                 * be set correctly.
> +                 */
> +                if (dp_packet_hwol_l4_is_udp(packet)) {
> +                    dp_packet_hwol_reset_csum_udp(packet);
> +                    /* Only calculate UDP checksum for non-UFO packet.
> +                     */
> +                    if (!dp_packet_hwol_is_ufo(packet)) {
> +                        calculate_tcpudp_checksum(packet);
> +                    }
> +                } else if (dp_packet_hwol_l4_is_tcp(packet)) {
> +                    dp_packet_hwol_reset_csum_tcp(packet);
> +                    /* Only calculate TCP checksum for non-TSO packet.
> +                     */
> +                    if (!dp_packet_hwol_is_tso(packet)) {
> +                        calculate_tcpudp_checksum(packet);
> +                    }
> +                }
> +            }
> +            /* It is possible l2_len and l3_len aren't set here, so set them
> +             * if no.
> +             */
> +            if (dp_packet_hwol_get_l2_len(packet) != len_l2) {
> +                dp_packet_hwol_set_l2_len(packet, len_l2);
> +                dp_packet_hwol_set_l3_len(packet, len_l3);
> +            }
> +
>              netdev->netdev_class->push_header(netdev, packet, data);
> +            if (userspace_tso_enabled()
> +                && (data->tnl_type == OVS_VPORT_TYPE_VXLAN)) {
> +                /* Just identify it as a vxlan packet, here netdev is
> +                 * vxlan_sys_*, netdev->ol_flags can't indicate if final
> +                 * physical output port can support VXLAN TSO, in
> +                 * netdev_send_prepare_packet will drop it if final
> +                 * physical output port can't support VXLAN TSO.
> +                 */
> +                dp_packet_hwol_set_vxlan_tcp_seg(packet);
> +            }
>              pkt_metadata_init(&packet->md, data->out_port);
>              dp_packet_batch_refill(batch, packet, i);
>          }
> diff --git a/lib/userspace-tso-segsz.c b/lib/userspace-tso-segsz.c
> new file mode 100644
> index 0000000..2d31a5b
> --- /dev/null
> +++ b/lib/userspace-tso-segsz.c
> @@ -0,0 +1,55 @@
> +/*
> + * Copyright (c) 2020 Inspur, Inc.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#include <config.h>
> +
> +#include "smap.h"
> +#include "ovs-thread.h"
> +#include "openvswitch/vlog.h"
> +#include "dpdk.h"
> +#include "userspace-tso-segsz.h"
> +#include "vswitch-idl.h"
> +
> +VLOG_DEFINE_THIS_MODULE(userspace_tso_segsz);
> +
> +#define DEFAULT_TSO_SEGSZ 1500
> +#define MAX_TSO_SEGSZ 9000
> +static uint16_t userspace_tso_segsz = DEFAULT_TSO_SEGSZ;
> +
> +void
> +userspace_tso_segsz_init(const struct smap *ovs_other_config)
> +{
> +    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
> +
> +    if (ovsthread_once_start(&once)) {
> +        int tso_segsz;
> +
> +        tso_segsz = smap_get_int(ovs_other_config, "userspace-tso-segsz",
> +                       DEFAULT_TSO_SEGSZ);
> +        if ((tso_segsz < 0) || (tso_segsz > MAX_TSO_SEGSZ)) {
> +            tso_segsz = DEFAULT_TSO_SEGSZ;
> +        }
> +        userspace_tso_segsz = tso_segsz;
> +        VLOG_INFO("Userspace TSO segsz set to %u", userspace_tso_segsz);
> +        ovsthread_once_done(&once);
> +    }
> +}
> +
> +uint16_t
> +get_userspace_tso_segsz(void)
> +{
> +    return userspace_tso_segsz;
> +}
> diff --git a/lib/userspace-tso-segsz.h b/lib/userspace-tso-segsz.h
> new file mode 100644
> index 0000000..c4e9e46
> --- /dev/null
> +++ b/lib/userspace-tso-segsz.h
> @@ -0,0 +1,23 @@
> +/*
> + * Copyright (c) 2020 Inspur, Inc.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#ifndef USERSPACE_TSO_SEGSZ_H
> +#define USERSPACE_TSO_SEGSZ_H 1
> +
> +void userspace_tso_segsz_init(const struct smap *ovs_other_config);
> +uint16_t get_userspace_tso_segsz(void);
> +
> +#endif /* userspace-tso-segsz.h */
> diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c
> index 5ed7e82..b131e73 100644
> --- a/vswitchd/bridge.c
> +++ b/vswitchd/bridge.c
> @@ -66,6 +66,7 @@
>  #include "timeval.h"
>  #include "tnl-ports.h"
>  #include "userspace-tso.h"
> +#include "userspace-tso-segsz.h"
>  #include "util.h"
>  #include "unixctl.h"
>  #include "lib/vswitch-idl.h"
> @@ -3292,6 +3293,7 @@ bridge_run(void)
>          netdev_set_flow_api_enabled(&cfg->other_config);
>          dpdk_init(&cfg->other_config);
>          userspace_tso_init(&cfg->other_config);
> +        userspace_tso_segsz_init(&cfg->other_config);
>      }
>  
>      /* Initialize the ofproto library.  This only needs to run once, but
> -- 
> 1.8.3.1
> 
> _______________________________________________
> dev mailing list
> dev at openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev

-- 
fbl


More information about the dev mailing list