[ovs-dev] 答复: [PATCH V3 2/4] Add GSO support for DPDK data path

Yi Yang (杨燚)-云服务集团 yangyi01 at inspur.com
Wed Oct 28 00:56:16 UTC 2020


-----邮件原件-----
发件人: dev [mailto:ovs-dev-bounces at openvswitch.org] 代表 Ilya Maximets
发送时间: 2020年10月27日 21:03
收件人: yang_y_yi at 163.com; ovs-dev at openvswitch.org
抄送: fbl at sysclose.org; i.maximets at ovn.org
主题: Re: [ovs-dev] [PATCH V3 2/4] Add GSO support for DPDK data path

On 8/7/20 12:56 PM, yang_y_yi at 163.com wrote:
> From: Yi Yang <yangyi01 at inspur.com>
> 
> GSO(Generic Segment Offload) can segment large UDP  and TCP packet to 
> small packets per MTU of destination , especially for the case that 
> physical NIC can't do hardware offload VXLAN TSO and VXLAN UFO, GSO 
> can make sure userspace TSO can still work but not drop.
> 
> In addition, GSO can help improve UDP performane when UFO is enabled 
> in VM.
> 
> GSO can support TCP, UDP, VXLAN TCP, VXLAN UDP, it is done in Tx 
> function of physical NIC.
> 
> Signed-off-by: Yi Yang <yangyi01 at inspur.com>
> ---
>  lib/dp-packet.h    |  21 +++-
>  lib/netdev-dpdk.c  | 358 
> +++++++++++++++++++++++++++++++++++++++++++++++++----
>  lib/netdev-linux.c |  17 ++-
>  lib/netdev.c       |  67 +++++++---
>  4 files changed, 417 insertions(+), 46 deletions(-)
> 
> diff --git a/lib/dp-packet.h b/lib/dp-packet.h index 79895f2..c33868d 
> 100644
> --- a/lib/dp-packet.h
> +++ b/lib/dp-packet.h
> @@ -83,6 +83,8 @@ enum dp_packet_offload_mask {
>      DEF_OL_FLAG(DP_PACKET_OL_TX_SCTP_CKSUM, PKT_TX_SCTP_CKSUM, 0x800),
>      /* VXLAN TCP Segmentation Offload. */
>      DEF_OL_FLAG(DP_PACKET_OL_TX_TUNNEL_VXLAN, PKT_TX_TUNNEL_VXLAN, 
> 0x1000),
> +    /* UDP Segmentation Offload. */
> +    DEF_OL_FLAG(DP_PACKET_OL_TX_UDP_SEG, PKT_TX_UDP_SEG, 0x2000),
>      /* Adding new field requires adding to 
> DP_PACKET_OL_SUPPORTED_MASK. */  };
>  
> @@ -97,7 +99,8 @@ enum dp_packet_offload_mask {
>                                       DP_PACKET_OL_TX_IPV6          | \
>                                       DP_PACKET_OL_TX_TCP_CKSUM     | \
>                                       DP_PACKET_OL_TX_UDP_CKSUM     | \
> -                                     DP_PACKET_OL_TX_SCTP_CKSUM)
> +                                     DP_PACKET_OL_TX_SCTP_CKSUM    | \
> +                                     DP_PACKET_OL_TX_UDP_SEG)
>  
>  #define DP_PACKET_OL_TX_L4_MASK (DP_PACKET_OL_TX_TCP_CKSUM | \
>                                   DP_PACKET_OL_TX_UDP_CKSUM | \ @@ 
> -956,6 +959,13 @@ dp_packet_hwol_is_tso(const struct dp_packet *b)
>      return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_TCP_SEG);  
> }
>  
> +/* Returns 'true' if packet 'b' is marked for UDP segmentation 
> +offloading. */ static inline bool dp_packet_hwol_is_uso(const struct 
> +dp_packet *b) {
> +    return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_UDP_SEG); 
> +}
> +
>  /* Returns 'true' if packet 'b' is marked for IPv4 checksum 
> offloading. */  static inline bool  dp_packet_hwol_is_ipv4(const 
> struct dp_packet *b) @@ -1034,6 +1044,15 @@ 
> dp_packet_hwol_set_tcp_seg(struct dp_packet *b)
>      *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TCP_SEG;  }
>  
> +/* Mark packet 'b' for UDP segmentation offloading.  It implies that
> + * either the packet 'b' is marked for IPv4 or IPv6 checksum 
> +offloading
> + * and also for UDP checksum offloading. */ static inline void 
> +dp_packet_hwol_set_udp_seg(struct dp_packet *b) {
> +    *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_UDP_SEG; }
> +
>  #ifdef DPDK_NETDEV
>  /* Mark packet 'b' for VXLAN TCP segmentation offloading. */  static 
> inline void diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 
> 30493ed..888a45e 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -38,13 +38,15 @@
>  #include <rte_errno.h>
>  #include <rte_ethdev.h>
>  #include <rte_flow.h>
> +#include <rte_gso.h>
> +#include <rte_ip.h>
>  #include <rte_malloc.h>
>  #include <rte_mbuf.h>
>  #include <rte_meter.h>
>  #include <rte_pci.h>
>  #include <rte_version.h>
>  #include <rte_vhost.h>
> -#include <rte_ip.h>
> +#include <rte_ip_frag.h>
>  
>  #include "cmap.h"
>  #include "coverage.h"
> @@ -162,6 +164,7 @@ typedef uint16_t dpdk_port_t;
>                                     | DEV_TX_OFFLOAD_UDP_CKSUM    \
>                                     | DEV_TX_OFFLOAD_IPV4_CKSUM)
>  
> +#define MAX_GSO_MBUFS 64
>  
>  static const struct rte_eth_conf port_conf = {
>      .rxmode = {
> @@ -2171,6 +2174,16 @@ is_local_to_local(uint16_t src_port_id, struct netdev_dpdk *dev)
>      return ret;
>  }
>  
> +static uint16_t
> +get_udptcp_checksum(void *l3_hdr, void *l4_hdr, uint16_t ethertype) {
> +        if (ethertype == htons(RTE_ETHER_TYPE_IPV4)) {
> +                return rte_ipv4_udptcp_cksum(l3_hdr, l4_hdr);
> +        } else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
> +                return rte_ipv6_udptcp_cksum(l3_hdr, l4_hdr);
> +        }
> +}
> +
>  /* Prepare the packet for HWOL.
>   * Return True if the packet is OK to continue. */  static bool @@ 
> -2203,10 +2216,9 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
>           * also can't be handled. So PKT_TX_TUNNEL_VXLAN must be cleared
>           * outer_l2_len and outer_l3_len must be zeroed.
>           */
> -        if (!(dev->up.ol_flags & NETDEV_TX_OFFLOAD_VXLAN_TSO)
> -            || (!(mbuf->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_UDP_SEG))
> +        if (!(mbuf->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_UDP_SEG))
>              && (mbuf->pkt_len <= 1450 + mbuf->outer_l2_len + mbuf->outer_l3_len
> -                + mbuf->l2_len)))  {
> +                + mbuf->l2_len))  {
>              mbuf->ol_flags &= ~PKT_TX_TUNNEL_VXLAN;
>              mbuf->l2_len -= sizeof(struct udp_header)
>                          + sizeof(struct vxlanhdr); @@ -2249,7 +2261,7 
> @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
>              /* inner IP checksum offload offload */
>              mbuf->ol_flags |= PKT_TX_IP_CKSUM;
>          }
> -    } else if (mbuf->ol_flags & PKT_TX_L4_MASK) {
> +    } else if (mbuf->ol_flags & (PKT_TX_IPV4 | PKT_TX_IPV6)) {
>          /* Handle VLAN TSO */
>              /* no inner IP checksum for IPV6 */
>          if (mbuf->ol_flags & PKT_TX_IPV4) { @@ -2273,6 +2285,18 @@ 
> netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
>          mbuf->l3_len = (char *)dp_packet_l4(pkt) - (char *)dp_packet_l3(pkt);
>          mbuf->outer_l2_len = 0;
>          mbuf->outer_l3_len = 0;
> +
> +        /* In case of GRO, PKT_TX_TCP_SEG or PKT_TX_UDP_SEG wasn't set by GRO
> +         * APIs, here is a place we can mark it.
> +         */
> +        if ((mbuf->pkt_len > 1464)
> +            && (!(mbuf->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_UDP_SEG)))) {
> +            if (l4_proto == IPPROTO_UDP) {
> +                mbuf->ol_flags |= PKT_TX_UDP_SEG;
> +            } else if (l4_proto == IPPROTO_TCP) {
> +                mbuf->ol_flags |= PKT_TX_TCP_SEG;
> +            }
> +        }
>      }
>  
>      /* It is possible that l4_len isn't set for vhostuserclient */ @@ 
> -2284,6 +2308,10 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
>          mbuf->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
>      }
>  
> +    if ((l4_proto != IPPROTO_UDP) && (l4_proto != IPPROTO_TCP)) {
> +        return true;
> +    }
> +
>      if ((mbuf->ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM) {
>          if (l4_proto != IPPROTO_UDP) {
>              VLOG_WARN_RL(&rl, "%s: UDP packet without L4 header"
> @@ -2294,11 +2322,13 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
>                 mbuf->ol_flags & PKT_TX_TCP_CKSUM) {
>          if (l4_proto != IPPROTO_TCP) {
>              VLOG_WARN_RL(&rl, "%s: TCP Segmentation without L4 header"
> -                         " pkt len: %"PRIu32"", dev->up.name, mbuf->pkt_len);
> +                         " pkt len: %"PRIu32" l4_proto = %d",
> +                         dev->up.name, mbuf->pkt_len, l4_proto);
>              return false;
>          }
>  
> -        if (mbuf->pkt_len - mbuf->l2_len > 1450) {
> +        if (mbuf->pkt_len > 1450 + mbuf->outer_l2_len + mbuf->outer_l3_len
> +            + mbuf->l2_len) {
>              dp_packet_hwol_set_tcp_seg(pkt);
>          }
>  
> @@ -2308,7 +2338,66 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
>          } else {
>              mbuf->tso_segsz = 0;
>          }
> +
> +        if (!(dev->up.ol_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) {
> +            /* PKT_TX_TCP_CKSUM must be cleaned for GSO because
> +             * tcp checksum only can be caculated by software for
> +             * GSO case.
> +             */
> +            mbuf->ol_flags &= ~PKT_TX_TCP_CKSUM;
> +        }
>      }
> +
> +    /* UDP GSO if necessary */
> +    if (l4_proto == IPPROTO_UDP) {
> +        /* VXLAN GSO can be done here */
> +        if ((mbuf->ol_flags & PKT_TX_UDP_SEG) ||
> +            (mbuf->pkt_len > (1450 + mbuf->outer_l2_len + mbuf->outer_l3_len +
> +                             mbuf->l2_len))) {
> +            dp_packet_hwol_set_udp_seg(pkt);
> +
> +            /* For UDP GSO, udp checksum must be calculated by software */
> +            if ((mbuf->ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM) {
> +                void *l3_hdr, *l4_hdr;
> +                struct rte_udp_hdr *udp_hdr;
> +
> +                /* PKT_TX_UDP_CKSUM must be cleaned for GSO because
> +                 * udp checksum only can be caculated by software for
> +                 * GSO case.
> +                 */
> +                mbuf->ol_flags &= ~PKT_TX_UDP_CKSUM;
> +
> +                eth_hdr = (struct rte_ether_hdr *)
> +                    ((uint8_t *)eth_hdr + mbuf->outer_l2_len +
> +                               mbuf->outer_l3_len +
> +                               sizeof(struct udp_header) +
> +                               sizeof(struct vxlanhdr));
> +                l3_hdr = (uint8_t *)eth_hdr + mbuf->l2_len -
> +                         sizeof(struct udp_header) -
> +                         sizeof(struct vxlanhdr);
> +                l4_hdr = (uint8_t *)l3_hdr + mbuf->l3_len;
> +                ip_hdr = (struct rte_ipv4_hdr *)l3_hdr;
> +                ip_hdr->hdr_checksum = 0;
> +                ip_hdr->hdr_checksum = rte_ipv4_cksum(ip_hdr);
> +                /* Don't touch UDP checksum if it is ip fragment */
> +                if (!rte_ipv4_frag_pkt_is_fragmented(ip_hdr)) {
> +                    udp_hdr = (struct rte_udp_hdr *)l4_hdr;
> +                    udp_hdr->dgram_cksum = 0;
> +                    udp_hdr->dgram_cksum =
> +                        get_udptcp_checksum(l3_hdr, l4_hdr,
> +                                            eth_hdr->ether_type);
> +                }
> +            }
> +
> +            /* FOR GSO, gso_size includes l2_len + l3_len */
> +            mbuf->tso_segsz = 1450 + mbuf->outer_l2_len + mbuf->outer_l3_len +
> +                              mbuf->l2_len;
> +            if (mbuf->tso_segsz > dev->mtu) {
> +                mbuf->tso_segsz = dev->mtu;
> +            }
> +        }
> +    }
> +
>      return true;
>  }
>  
> @@ -2339,24 +2428,19 @@ netdev_dpdk_prep_hwol_batch(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
>      return cnt;
>  }
>  
> -/* Tries to transmit 'pkts' to txq 'qid' of device 'dev'.  Takes 
> ownership of
> - * 'pkts', even in case of failure.
> - *
> - * Returns the number of packets that weren't transmitted. */  static 
> inline int -netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
> -                         struct rte_mbuf **pkts, int cnt)
> +__netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
> +                           struct rte_mbuf **pkts, int cnt)
>  {
>      uint32_t nb_tx = 0;
> -    uint16_t nb_tx_prep = cnt;
> +    uint32_t nb_tx_prep;
>  
> -    if (userspace_tso_enabled()) {
> -        nb_tx_prep = rte_eth_tx_prepare(dev->port_id, qid, pkts, cnt);
> -        if (nb_tx_prep != cnt) {
> -            VLOG_WARN_RL(&rl, "%s: Output batch contains invalid packets. "
> -                         "Only %u/%u are valid: %s", dev->up.name, nb_tx_prep,
> -                         cnt, rte_strerror(rte_errno));
> -        }
> +    nb_tx_prep = rte_eth_tx_prepare(dev->port_id, qid, pkts, cnt);
> +    if (nb_tx_prep != cnt) {
> +        VLOG_WARN_RL(&rl, "%s: Output batch contains invalid packets. "
> +                          "Only %u/%u are valid: %s",
> +                     dev->up.name, nb_tx_prep,
> +                     cnt, rte_strerror(rte_errno));
>      }
>  
>      while (nb_tx != nb_tx_prep) {
> @@ -2384,6 +2468,200 @@ netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
>      return cnt - nb_tx;
>  }
>  
> +static inline void
> +set_multiseg_udptcp_cksum(struct rte_mbuf *mbuf)

I didn't review the patch, only had a quick glance, but this part bothers me.  OVS doesn't support multi-segment mbufs, so it should not be possible for such mbufs being transmitted by OVS.  So, I do not understand why this function needs to work with such mbufs.

[Yi Yang] Only DPDK driver/Tx function will use it, not OVS, set_multiseg_udptcp_cksum is called in GSO part, it is last step before Tx function, it is a big external mbuf before rte_gso_segment, that isn't a multi-segmented mbuf.




More information about the dev mailing list