[ovs-dev] 答复: [PATCH V3 2/4] Add GSO support for DPDK data path
Yi Yang (杨燚)-云服务集团
yangyi01 at inspur.com
Wed Oct 28 00:56:16 UTC 2020
-----邮件原件-----
发件人: dev [mailto:ovs-dev-bounces at openvswitch.org] 代表 Ilya Maximets
发送时间: 2020年10月27日 21:03
收件人: yang_y_yi at 163.com; ovs-dev at openvswitch.org
抄送: fbl at sysclose.org; i.maximets at ovn.org
主题: Re: [ovs-dev] [PATCH V3 2/4] Add GSO support for DPDK data path
On 8/7/20 12:56 PM, yang_y_yi at 163.com wrote:
> From: Yi Yang <yangyi01 at inspur.com>
>
> GSO (Generic Segmentation Offload) can segment large UDP and TCP
> packets into smaller packets per the MTU of the destination; especially
> for the case that the physical NIC can't do hardware-offloaded VXLAN TSO
> and VXLAN UFO, GSO can make sure userspace TSO still works instead of
> dropping packets.
>
> In addition, GSO can help improve UDP performance when UFO is enabled
> in the VM.
>
> GSO supports TCP, UDP, VXLAN TCP, and VXLAN UDP; it is done in the Tx
> function of the physical NIC.
>
> Signed-off-by: Yi Yang <yangyi01 at inspur.com>
> ---
> lib/dp-packet.h | 21 +++-
> lib/netdev-dpdk.c | 358
> +++++++++++++++++++++++++++++++++++++++++++++++++----
> lib/netdev-linux.c | 17 ++-
> lib/netdev.c | 67 +++++++---
> 4 files changed, 417 insertions(+), 46 deletions(-)
>
> diff --git a/lib/dp-packet.h b/lib/dp-packet.h index 79895f2..c33868d
> 100644
> --- a/lib/dp-packet.h
> +++ b/lib/dp-packet.h
> @@ -83,6 +83,8 @@ enum dp_packet_offload_mask {
> DEF_OL_FLAG(DP_PACKET_OL_TX_SCTP_CKSUM, PKT_TX_SCTP_CKSUM, 0x800),
> /* VXLAN TCP Segmentation Offload. */
> DEF_OL_FLAG(DP_PACKET_OL_TX_TUNNEL_VXLAN, PKT_TX_TUNNEL_VXLAN,
> 0x1000),
> + /* UDP Segmentation Offload. */
> + DEF_OL_FLAG(DP_PACKET_OL_TX_UDP_SEG, PKT_TX_UDP_SEG, 0x2000),
> /* Adding new field requires adding to
> DP_PACKET_OL_SUPPORTED_MASK. */ };
>
> @@ -97,7 +99,8 @@ enum dp_packet_offload_mask {
> DP_PACKET_OL_TX_IPV6 | \
> DP_PACKET_OL_TX_TCP_CKSUM | \
> DP_PACKET_OL_TX_UDP_CKSUM | \
> - DP_PACKET_OL_TX_SCTP_CKSUM)
> + DP_PACKET_OL_TX_SCTP_CKSUM | \
> + DP_PACKET_OL_TX_UDP_SEG)
>
> #define DP_PACKET_OL_TX_L4_MASK (DP_PACKET_OL_TX_TCP_CKSUM | \
> DP_PACKET_OL_TX_UDP_CKSUM | \ @@
> -956,6 +959,13 @@ dp_packet_hwol_is_tso(const struct dp_packet *b)
> return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_TCP_SEG);
> }
>
> +/* Returns 'true' if packet 'b' is marked for UDP segmentation
> +offloading. */ static inline bool dp_packet_hwol_is_uso(const struct
> +dp_packet *b) {
> + return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_UDP_SEG);
> +}
> +
> /* Returns 'true' if packet 'b' is marked for IPv4 checksum
> offloading. */ static inline bool dp_packet_hwol_is_ipv4(const
> struct dp_packet *b) @@ -1034,6 +1044,15 @@
> dp_packet_hwol_set_tcp_seg(struct dp_packet *b)
> *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TCP_SEG; }
>
> +/* Mark packet 'b' for UDP segmentation offloading. It implies that
> + * either the packet 'b' is marked for IPv4 or IPv6 checksum
> +offloading
> + * and also for UDP checksum offloading. */ static inline void
> +dp_packet_hwol_set_udp_seg(struct dp_packet *b) {
> + *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_UDP_SEG; }
> +
> #ifdef DPDK_NETDEV
> /* Mark packet 'b' for VXLAN TCP segmentation offloading. */ static
> inline void diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index
> 30493ed..888a45e 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -38,13 +38,15 @@
> #include <rte_errno.h>
> #include <rte_ethdev.h>
> #include <rte_flow.h>
> +#include <rte_gso.h>
> +#include <rte_ip.h>
> #include <rte_malloc.h>
> #include <rte_mbuf.h>
> #include <rte_meter.h>
> #include <rte_pci.h>
> #include <rte_version.h>
> #include <rte_vhost.h>
> -#include <rte_ip.h>
> +#include <rte_ip_frag.h>
>
> #include "cmap.h"
> #include "coverage.h"
> @@ -162,6 +164,7 @@ typedef uint16_t dpdk_port_t;
> | DEV_TX_OFFLOAD_UDP_CKSUM \
> | DEV_TX_OFFLOAD_IPV4_CKSUM)
>
> +#define MAX_GSO_MBUFS 64
>
> static const struct rte_eth_conf port_conf = {
> .rxmode = {
> @@ -2171,6 +2174,16 @@ is_local_to_local(uint16_t src_port_id, struct netdev_dpdk *dev)
> return ret;
> }
>
> +static uint16_t
> +get_udptcp_checksum(void *l3_hdr, void *l4_hdr, uint16_t ethertype) {
> + if (ethertype == htons(RTE_ETHER_TYPE_IPV4)) {
> + return rte_ipv4_udptcp_cksum(l3_hdr, l4_hdr);
> + } else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
> + return rte_ipv6_udptcp_cksum(l3_hdr, l4_hdr);
> + }
> +}
> +
> /* Prepare the packet for HWOL.
> * Return True if the packet is OK to continue. */ static bool @@
> -2203,10 +2216,9 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
> * also can't be handled. So PKT_TX_TUNNEL_VXLAN must be cleared
> * outer_l2_len and outer_l3_len must be zeroed.
> */
> - if (!(dev->up.ol_flags & NETDEV_TX_OFFLOAD_VXLAN_TSO)
> - || (!(mbuf->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_UDP_SEG))
> + if (!(mbuf->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_UDP_SEG))
> && (mbuf->pkt_len <= 1450 + mbuf->outer_l2_len + mbuf->outer_l3_len
> - + mbuf->l2_len))) {
> + + mbuf->l2_len)) {
> mbuf->ol_flags &= ~PKT_TX_TUNNEL_VXLAN;
> mbuf->l2_len -= sizeof(struct udp_header)
> + sizeof(struct vxlanhdr); @@ -2249,7 +2261,7
> @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
> /* inner IP checksum offload offload */
> mbuf->ol_flags |= PKT_TX_IP_CKSUM;
> }
> - } else if (mbuf->ol_flags & PKT_TX_L4_MASK) {
> + } else if (mbuf->ol_flags & (PKT_TX_IPV4 | PKT_TX_IPV6)) {
> /* Handle VLAN TSO */
> /* no inner IP checksum for IPV6 */
> if (mbuf->ol_flags & PKT_TX_IPV4) { @@ -2273,6 +2285,18 @@
> netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
> mbuf->l3_len = (char *)dp_packet_l4(pkt) - (char *)dp_packet_l3(pkt);
> mbuf->outer_l2_len = 0;
> mbuf->outer_l3_len = 0;
> +
> + /* In case of GRO, PKT_TX_TCP_SEG or PKT_TX_UDP_SEG wasn't set by GRO
> + * APIs, here is a place we can mark it.
> + */
> + if ((mbuf->pkt_len > 1464)
> + && (!(mbuf->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_UDP_SEG)))) {
> + if (l4_proto == IPPROTO_UDP) {
> + mbuf->ol_flags |= PKT_TX_UDP_SEG;
> + } else if (l4_proto == IPPROTO_TCP) {
> + mbuf->ol_flags |= PKT_TX_TCP_SEG;
> + }
> + }
> }
>
> /* It is possible that l4_len isn't set for vhostuserclient */ @@
> -2284,6 +2308,10 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
> mbuf->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
> }
>
> + if ((l4_proto != IPPROTO_UDP) && (l4_proto != IPPROTO_TCP)) {
> + return true;
> + }
> +
> if ((mbuf->ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM) {
> if (l4_proto != IPPROTO_UDP) {
> VLOG_WARN_RL(&rl, "%s: UDP packet without L4 header"
> @@ -2294,11 +2322,13 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
> mbuf->ol_flags & PKT_TX_TCP_CKSUM) {
> if (l4_proto != IPPROTO_TCP) {
> VLOG_WARN_RL(&rl, "%s: TCP Segmentation without L4 header"
> - " pkt len: %"PRIu32"", dev->up.name, mbuf->pkt_len);
> + " pkt len: %"PRIu32" l4_proto = %d",
> + dev->up.name, mbuf->pkt_len, l4_proto);
> return false;
> }
>
> - if (mbuf->pkt_len - mbuf->l2_len > 1450) {
> + if (mbuf->pkt_len > 1450 + mbuf->outer_l2_len + mbuf->outer_l3_len
> + + mbuf->l2_len) {
> dp_packet_hwol_set_tcp_seg(pkt);
> }
>
> @@ -2308,7 +2338,66 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
> } else {
> mbuf->tso_segsz = 0;
> }
> +
> + if (!(dev->up.ol_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) {
> + /* PKT_TX_TCP_CKSUM must be cleared for GSO because the
> + * TCP checksum can only be calculated by software in the
> + * GSO case.
> + */
> + mbuf->ol_flags &= ~PKT_TX_TCP_CKSUM;
> + }
> }
> +
> + /* UDP GSO if necessary */
> + if (l4_proto == IPPROTO_UDP) {
> + /* VXLAN GSO can be done here */
> + if ((mbuf->ol_flags & PKT_TX_UDP_SEG) ||
> + (mbuf->pkt_len > (1450 + mbuf->outer_l2_len + mbuf->outer_l3_len +
> + mbuf->l2_len))) {
> + dp_packet_hwol_set_udp_seg(pkt);
> +
> + /* For UDP GSO, udp checksum must be calculated by software */
> + if ((mbuf->ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM) {
> + void *l3_hdr, *l4_hdr;
> + struct rte_udp_hdr *udp_hdr;
> +
> + /* PKT_TX_UDP_CKSUM must be cleared for GSO because the
> + * UDP checksum can only be calculated by software in the
> + * GSO case.
> + */
> + mbuf->ol_flags &= ~PKT_TX_UDP_CKSUM;
> +
> + eth_hdr = (struct rte_ether_hdr *)
> + ((uint8_t *)eth_hdr + mbuf->outer_l2_len +
> + mbuf->outer_l3_len +
> + sizeof(struct udp_header) +
> + sizeof(struct vxlanhdr));
> + l3_hdr = (uint8_t *)eth_hdr + mbuf->l2_len -
> + sizeof(struct udp_header) -
> + sizeof(struct vxlanhdr);
> + l4_hdr = (uint8_t *)l3_hdr + mbuf->l3_len;
> + ip_hdr = (struct rte_ipv4_hdr *)l3_hdr;
> + ip_hdr->hdr_checksum = 0;
> + ip_hdr->hdr_checksum = rte_ipv4_cksum(ip_hdr);
> + /* Don't touch UDP checksum if it is ip fragment */
> + if (!rte_ipv4_frag_pkt_is_fragmented(ip_hdr)) {
> + udp_hdr = (struct rte_udp_hdr *)l4_hdr;
> + udp_hdr->dgram_cksum = 0;
> + udp_hdr->dgram_cksum =
> + get_udptcp_checksum(l3_hdr, l4_hdr,
> + eth_hdr->ether_type);
> + }
> + }
> +
> + /* FOR GSO, gso_size includes l2_len + l3_len */
> + mbuf->tso_segsz = 1450 + mbuf->outer_l2_len + mbuf->outer_l3_len +
> + mbuf->l2_len;
> + if (mbuf->tso_segsz > dev->mtu) {
> + mbuf->tso_segsz = dev->mtu;
> + }
> + }
> + }
> +
> return true;
> }
>
> @@ -2339,24 +2428,19 @@ netdev_dpdk_prep_hwol_batch(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
> return cnt;
> }
>
> -/* Tries to transmit 'pkts' to txq 'qid' of device 'dev'. Takes
> ownership of
> - * 'pkts', even in case of failure.
> - *
> - * Returns the number of packets that weren't transmitted. */ static
> inline int -netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
> - struct rte_mbuf **pkts, int cnt)
> +__netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
> + struct rte_mbuf **pkts, int cnt)
> {
> uint32_t nb_tx = 0;
> - uint16_t nb_tx_prep = cnt;
> + uint32_t nb_tx_prep;
>
> - if (userspace_tso_enabled()) {
> - nb_tx_prep = rte_eth_tx_prepare(dev->port_id, qid, pkts, cnt);
> - if (nb_tx_prep != cnt) {
> - VLOG_WARN_RL(&rl, "%s: Output batch contains invalid packets. "
> - "Only %u/%u are valid: %s", dev->up.name, nb_tx_prep,
> - cnt, rte_strerror(rte_errno));
> - }
> + nb_tx_prep = rte_eth_tx_prepare(dev->port_id, qid, pkts, cnt);
> + if (nb_tx_prep != cnt) {
> + VLOG_WARN_RL(&rl, "%s: Output batch contains invalid packets. "
> + "Only %u/%u are valid: %s",
> + dev->up.name, nb_tx_prep,
> + cnt, rte_strerror(rte_errno));
> }
>
> while (nb_tx != nb_tx_prep) {
> @@ -2384,6 +2468,200 @@ netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
> return cnt - nb_tx;
> }
>
> +static inline void
> +set_multiseg_udptcp_cksum(struct rte_mbuf *mbuf)
I didn't review the patch, only had a quick glance, but this part bothers me. OVS doesn't support multi-segment mbufs, so it should not be possible for such mbufs being transmitted by OVS. So, I do not understand why this function needs to work with such mbufs.
[Yi Yang] Only the DPDK driver's Tx function will use it, not OVS itself; set_multiseg_udptcp_cksum is called in the GSO part, which is the last step before the Tx function. Before rte_gso_segment the packet is one big external mbuf, which isn't a multi-segmented mbuf.
More information about the dev
mailing list