[ovs-dev] 答复: [PATCH v1] Enable VXLAN TSO for DPDK datapath
Yi Yang (杨燚)-云服务集团
yangyi01 at inspur.com
Thu Jun 4 00:29:43 UTC 2020
Ian, thank you so much for the comments. I will run Travis to find all the similar issues and fix them in v2. For the setup, I have sent you an email that included the DPDK bug link; please use that to set it up, and let me know if you run into any issues.
-----邮件原件-----
发件人: Stokes, Ian [mailto:ian.stokes at intel.com]
发送时间: 2020年6月3日 20:43
收件人: yang_y_yi at 163.com; ovs-dev at openvswitch.org
抄送: blp at ovn.org; u9012063 at gmail.com; Yi Yang (杨燚)-云服务集团 <yangyi01 at inspur.com>
主题: Re: [PATCH v1] Enable VXLAN TSO for DPDK datapath
On 6/1/2020 11:15 AM, yang_y_yi at 163.com wrote:
> From: Yi Yang <yangyi01 at inspur.com>
>
> Many NICs can support VXLAN TSO which can help improve
> across-compute-node VM-to-VM performance in case that MTU is set to
> 1500.
>
> This patch allows dpdkvhostuserclient interface and veth/tap interface
> to leverage NICs' offload capability to maximize across-compute-node
> TCP performance, with it applied, OVS DPDK can reach linespeed for
> across-compute-node VM-to-VM TCP performance.
>
> GSO (for UDP and software VXLAN TSO ) and GRO will be added in the
> near future.
Hi Yi, thanks for working on this. I have not tested yet, as there were multiple compilation issues to be addressed; I've flagged these below.
If you can address these and meanwhile I'll look to deploy a setup in
our lab to enable testing/review in greater detail.
>
> Signed-off-by: Yi Yang <yangyi01 at inspur.com>
> ---
> lib/dp-packet.h | 58 ++++++++++++++++
> lib/netdev-dpdk.c | 193 ++++++++++++++++++++++++++++++++++++++++++++++++++----
> lib/netdev.c | 16 ++---
> 3 files changed, 243 insertions(+), 24 deletions(-)
>
> diff --git a/lib/dp-packet.h b/lib/dp-packet.h
> index 0430cca..7faa675 100644
> --- a/lib/dp-packet.h
> +++ b/lib/dp-packet.h
> @@ -1032,6 +1032,64 @@ dp_packet_hwol_set_tcp_seg(struct dp_packet *b)
> *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TCP_SEG;
> }
>
> +#ifdef DPDK_NETDEV
> +/* Mark packet 'b' for VXLAN TCP segmentation offloading. */
> +static inline void
> +dp_packet_hwol_set_vxlan_tcp_seg(struct dp_packet *b)
> +{
> + b->mbuf.ol_flags |= PKT_TX_TUNNEL_VXLAN;
> + b->mbuf.l2_len += sizeof(struct udp_header) +
> + sizeof(struct vxlanhdr);
> + b->mbuf.outer_l2_len = ETH_HEADER_LEN;
> + b->mbuf.outer_l3_len = IP_HEADER_LEN;
> +}
> +
> +/* Set l2_len for the packet 'b' */
> +static inline void
> +dp_packet_hwol_set_l2_len(struct dp_packet *b, int l2_len)
> +{
> + b->mbuf.l2_len = l2_len;
> +}
> +
> +/* Set l3_len for the packet 'b' */
> +static inline void
> +dp_packet_hwol_set_l3_len(struct dp_packet *b, int l3_len)
> +{
> + b->mbuf.l3_len = l3_len;
> +}
> +
> +/* Set l4_len for the packet 'b' */
> +static inline void
> +dp_packet_hwol_set_l4_len(struct dp_packet *b, int l4_len)
> +{
> + b->mbuf.l4_len = l4_len;
> +}
> +#else
For the non-DPDK case, you need to add OVS_UNUSED after each function parameter
below; otherwise the parameter will be treated as unused and cause a
compilation failure for the non-DPDK build.
> +/* Mark packet 'b' for VXLAN TCP segmentation offloading. */
> +static inline void
> +dp_packet_hwol_set_vxlan_tcp_seg(struct dp_packet *b)
> +{
> +}
> +
> +/* Set l2_len for the packet 'b' */
> +static inline void
> +dp_packet_hwol_set_l2_len(struct dp_packet *b, int l2_len)
> +{
> +}
> +
> +/* Set l3_len for the packet 'b' */
> +static inline void
> +dp_packet_hwol_set_l3_len(struct dp_packet *b, int l3_len)
> +{
> +}
> +
> +/* Set l4_len for the packet 'b' */
> +static inline void
> +dp_packet_hwol_set_l4_len(struct dp_packet *b, int l4_len)
> +{
> +}
> +#endif /* DPDK_NETDEV */
> +
> static inline bool
> dp_packet_ip_checksum_valid(const struct dp_packet *p)
> {
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index 44ebf96..c2424f7 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -44,6 +44,7 @@
> #include <rte_pci.h>
> #include <rte_version.h>
> #include <rte_vhost.h>
> +#include <rte_ip.h>
>
> #include "cmap.h"
> #include "coverage.h"
> @@ -405,6 +406,7 @@ enum dpdk_hw_ol_features {
> NETDEV_RX_HW_SCATTER = 1 << 2,
> NETDEV_TX_TSO_OFFLOAD = 1 << 3,
> NETDEV_TX_SCTP_CHECKSUM_OFFLOAD = 1 << 4,
> + NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD = 1 << 5,
> };
>
> /*
> @@ -988,6 +990,12 @@ dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq)
>
> if (dev->hw_ol_features & NETDEV_TX_TSO_OFFLOAD) {
> conf.txmode.offloads |= DPDK_TX_TSO_OFFLOAD_FLAGS;
> + /* Enable VXLAN TSO support if available */
> + if (dev->hw_ol_features & NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD) {
> + conf.txmode.offloads |= DEV_TX_OFFLOAD_VXLAN_TNL_TSO;
> + conf.txmode.offloads |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM;
> + conf.txmode.offloads |= DEV_TX_OFFLOAD_MULTI_SEGS;
> + }
> if (dev->hw_ol_features & NETDEV_TX_SCTP_CHECKSUM_OFFLOAD) {
> conf.txmode.offloads |= DEV_TX_OFFLOAD_SCTP_CKSUM;
> }
> @@ -1126,6 +1134,10 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
> if ((info.tx_offload_capa & tx_tso_offload_capa)
> == tx_tso_offload_capa) {
> dev->hw_ol_features |= NETDEV_TX_TSO_OFFLOAD;
> + /* Enable VXLAN TSO support if available */
> + if (info.tx_offload_capa & DEV_TX_OFFLOAD_VXLAN_TNL_TSO) {
> + dev->hw_ol_features |= NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD;
> + }
> if (info.tx_offload_capa & DEV_TX_OFFLOAD_SCTP_CKSUM) {
> dev->hw_ol_features |= NETDEV_TX_SCTP_CHECKSUM_OFFLOAD;
> } else {
> @@ -2137,29 +2149,97 @@ static bool
> netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
> {
> struct dp_packet *pkt = CONTAINER_OF(mbuf, struct dp_packet, mbuf);
> + uint16_t l4_proto = 0;
> + struct rte_ether_hdr *eth_hdr =
> + rte_pktmbuf_mtod(mbuf, struct rte_ether_hdr *);
> + struct rte_ipv4_hdr *ip_hdr;
> + struct rte_ipv6_hdr *ip6_hdr;
> +
> + if (mbuf->ol_flags & PKT_TX_TUNNEL_VXLAN) {
> + /* Handle VXLAN TSO */
> + struct rte_udp_hdr *udp_hdr;
> +
> + if (mbuf->ol_flags & PKT_TX_IPV4) {
We try to avoid the use of magic numbers where possible; it might be worth
defining the "+ 1" offset separately for where it's used below and elsewhere.
> + ip_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
> + udp_hdr = (struct rte_udp_hdr *)(ip_hdr + 1);
> +
> + /* outer IP checksum offload */
> + ip_hdr->hdr_checksum = 0;
> + mbuf->ol_flags |= PKT_TX_OUTER_IP_CKSUM;
> + mbuf->ol_flags |= PKT_TX_OUTER_IPV4;
> +
> + ip_hdr = (struct rte_ipv4_hdr *)
> + ((uint8_t *)udp_hdr + mbuf->l2_len);
> + l4_proto = ip_hdr->next_proto_id;
> +
> + /* inner IP checksum offload */
> + ip_hdr->hdr_checksum = 0;
> + mbuf->ol_flags |= PKT_TX_IP_CKSUM;
> + } else if (mbuf->ol_flags & PKT_TX_IPV6) {
> + ip_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
> + udp_hdr = (struct rte_udp_hdr *)(ip_hdr + 1);
> +
> + /* outer IP checksum offload */
> + ip_hdr->hdr_checksum = 0;
> + mbuf->ol_flags |= PKT_TX_OUTER_IP_CKSUM;
> + mbuf->ol_flags |= PKT_TX_OUTER_IPV4;
> +
> + ip6_hdr = (struct rte_ipv6_hdr *)
> + ((uint8_t *)udp_hdr + mbuf->l2_len);
> + l4_proto = ip6_hdr->proto;
> +
> + /* inner IP checksum offload offload */
> + mbuf->ol_flags |= PKT_TX_IP_CKSUM;
> + }
> + } else if (mbuf->ol_flags & PKT_TX_L4_MASK) {
> + /* Handle VLAN TSO */
> + /* no inner IP checksum for IPV6 */
> + if (mbuf->ol_flags & PKT_TX_IPV4) {
> + ip_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
> + l4_proto = ip_hdr->next_proto_id;
> +
> + /* IP checksum offload */
> + ip_hdr->hdr_checksum = 0;
> + mbuf->ol_flags |= PKT_TX_IP_CKSUM;
> + } else if (mbuf->ol_flags & PKT_TX_IPV6) {
> + ip6_hdr = (struct rte_ipv6_hdr *)(eth_hdr + 1);
> + l4_proto = ip6_hdr->proto;
> +
> + /* IP checksum offload */
> + mbuf->ol_flags |= PKT_TX_IP_CKSUM;
> + }
>
> - if (mbuf->ol_flags & PKT_TX_L4_MASK) {
> mbuf->l2_len = (char *)dp_packet_l3(pkt) - (char *)dp_packet_eth(pkt);
> mbuf->l3_len = (char *)dp_packet_l4(pkt) - (char *)dp_packet_l3(pkt);
> mbuf->outer_l2_len = 0;
> mbuf->outer_l3_len = 0;
> }
>
> - if (mbuf->ol_flags & PKT_TX_TCP_SEG) {
> - struct tcp_header *th = dp_packet_l4(pkt);
> -
> - if (!th) {
> + if ((mbuf->ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM) { //UDP
> + if (l4_proto != IPPROTO_UDP) {
> + VLOG_WARN_RL(&rl, "%s: UDP packet without L4 header"
> + " pkt len: %"PRIu32"", dev->up.name, mbuf->pkt_len);
> + return false;
> + }
> + /* VXLAN GSO can be done here */
> + } else if (mbuf->ol_flags & PKT_TX_TCP_SEG ||
> + mbuf->ol_flags & PKT_TX_TCP_CKSUM) {
> + if (l4_proto != IPPROTO_TCP) {
> VLOG_WARN_RL(&rl, "%s: TCP Segmentation without L4 header"
> " pkt len: %"PRIu32"", dev->up.name, mbuf->pkt_len);
> return false;
> }
>
> - mbuf->l4_len = TCP_OFFSET(th->tcp_ctl) * 4;
> - mbuf->ol_flags |= PKT_TX_TCP_CKSUM;
> - mbuf->tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len;
> + if (mbuf->pkt_len - mbuf->l2_len > 1450) {
> + dp_packet_hwol_set_tcp_seg(pkt);
> + }
>
> - if (mbuf->ol_flags & PKT_TX_IPV4) {
> - mbuf->ol_flags |= PKT_TX_IP_CKSUM;
> + mbuf->ol_flags |= PKT_TX_TCP_CKSUM;
> + if (mbuf->ol_flags & PKT_TX_TCP_SEG) {
> + //mbuf->tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len;
> + mbuf->tso_segsz = 1450 - mbuf->l3_len - mbuf->l4_len;
> + } else {
> + mbuf->tso_segsz = 0;
> }
> }
> return true;
> @@ -2365,6 +2445,71 @@ netdev_dpdk_vhost_update_rx_counters(struct netdev_dpdk *dev,
> }
> }
>
> +static void
> +netdev_linux_parse_l2(struct dp_packet *pkt, uint16_t *l4_proto)
> +{
> + struct rte_mbuf *mbuf = (struct rte_mbuf *)pkt;
> + struct rte_ether_hdr *eth_hdr =
> + rte_pktmbuf_mtod(mbuf, struct rte_ether_hdr *);
> + ovs_be16 eth_type;
> + int l2_len;
> + int l3_len = 0;
> + int l4_len = 0;
> +
> + l2_len = ETH_HEADER_LEN;
This will cause a compilation issue below due to mixing base types: the
assignment expects a restricted ovs_be16 but is given an unsigned short.
> + eth_type = eth_hdr->ether_type;
> + if (eth_type_vlan(eth_type)) {
> + struct rte_vlan_hdr *vlan_hdr =
> + (struct rte_vlan_hdr *)(eth_hdr + 1);
> +
Same issue as flagged above: unexpected type in the assignment.
> + eth_type = vlan_hdr->eth_proto;
> + l2_len += VLAN_HEADER_LEN;
> + }
> +
> + dp_packet_hwol_set_l2_len(pkt, l2_len);
> +
> + if (eth_type == htons(ETH_TYPE_IP)) {
> + struct rte_ipv4_hdr *ipv4_hdr = (struct rte_ipv4_hdr *)
> + ((char *)eth_hdr + l2_len);
> +
> + l3_len = IP_HEADER_LEN;
> + dp_packet_hwol_set_tx_ipv4(pkt);
> + *l4_proto = ipv4_hdr->next_proto_id;
> + } else if (eth_type == htons(RTE_ETHER_TYPE_IPV6)) {
> + struct rte_ipv6_hdr *ipv6_hdr = (struct rte_ipv6_hdr *)
> + ((char *)eth_hdr + l2_len);
> + l3_len = IPV6_HEADER_LEN;
> + dp_packet_hwol_set_tx_ipv6(pkt);
> + *l4_proto = ipv6_hdr->proto;
> + }
> +
> + dp_packet_hwol_set_l3_len(pkt, l3_len);
> +
> + if (*l4_proto == IPPROTO_TCP) {
> + struct rte_tcp_hdr *tcp_hdr = (struct rte_tcp_hdr *)
> + ((char *)eth_hdr + l2_len + l3_len);
> +
> + l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
> + dp_packet_hwol_set_l4_len(pkt, l4_len);
> + }
> +}
> +
> +static void
> +netdev_dpdk_parse_hdr(struct dp_packet *b)
> +{
> + uint16_t l4_proto = 0;
> +
> + netdev_linux_parse_l2(b, &l4_proto);
> +
> + if (l4_proto == IPPROTO_TCP) {
> + dp_packet_hwol_set_csum_tcp(b);
> + } else if (l4_proto == IPPROTO_UDP) {
> + dp_packet_hwol_set_csum_udp(b);
> + } else if (l4_proto == IPPROTO_SCTP) {
> + dp_packet_hwol_set_csum_sctp(b);
> + }
> +}
> +
> /*
> * The receive path for the vhost port is the TX path out from guest.
> */
> @@ -2378,6 +2523,7 @@ netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
> uint16_t qos_drops = 0;
> int qid = rxq->queue_id * VIRTIO_QNUM + VIRTIO_TXQ;
> int vid = netdev_dpdk_get_vid(dev);
> + struct dp_packet *packet;
>
> if (OVS_UNLIKELY(vid < 0 || !dev->vhost_reconfigured
> || !(dev->flags & NETDEV_UP))) {
> @@ -2417,6 +2563,14 @@ netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
> batch->count = nb_rx;
> dp_packet_batch_init_packet_fields(batch);
>
> + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> + struct rte_mbuf *mbuf = (struct rte_mbuf *)packet;
> +
> + /* Clear ol_flags and set it by parsing header */
> + mbuf->ol_flags = 0;
> + netdev_dpdk_parse_hdr(packet);
> + }
> +
> return 0;
> }
>
> @@ -2737,13 +2891,18 @@ dpdk_copy_dp_packet_to_mbuf(struct rte_mempool *mp, struct dp_packet *pkt_orig)
>
> mbuf_dest->tx_offload = pkt_orig->mbuf.tx_offload;
> mbuf_dest->packet_type = pkt_orig->mbuf.packet_type;
> - mbuf_dest->ol_flags |= (pkt_orig->mbuf.ol_flags &
> - ~(EXT_ATTACHED_MBUF | IND_ATTACHED_MBUF));
> + mbuf_dest->ol_flags |= pkt_orig->mbuf.ol_flags;
> + mbuf_dest->l2_len = pkt_orig->mbuf.l2_len;
> + mbuf_dest->l3_len = pkt_orig->mbuf.l3_len;
> + mbuf_dest->l4_len = pkt_orig->mbuf.l4_len;
> + mbuf_dest->outer_l2_len = pkt_orig->mbuf.outer_l2_len;
> + mbuf_dest->outer_l3_len = pkt_orig->mbuf.outer_l3_len;
>
> memcpy(&pkt_dest->l2_pad_size, &pkt_orig->l2_pad_size,
> sizeof(struct dp_packet) - offsetof(struct dp_packet, l2_pad_size));
>
> - if (mbuf_dest->ol_flags & PKT_TX_L4_MASK) {
> + if ((mbuf_dest->outer_l2_len == 0) &&
> + (mbuf_dest->ol_flags & PKT_TX_L4_MASK)) {
> mbuf_dest->l2_len = (char *)dp_packet_l3(pkt_dest)
> - (char *)dp_packet_eth(pkt_dest);
> mbuf_dest->l3_len = (char *)dp_packet_l4(pkt_dest)
> @@ -2773,6 +2932,7 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
> uint32_t tx_failure = 0;
> uint32_t mtu_drops = 0;
> uint32_t qos_drops = 0;
> + struct rte_mbuf *mbuf;
>
> if (dev->type != DPDK_DEV_VHOST) {
> /* Check if QoS has been configured for this netdev. */
> @@ -2801,6 +2961,9 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
> break;
> }
>
> + mbuf = (struct rte_mbuf *)pkts[txcnt];
> + netdev_dpdk_prep_hwol_packet(dev, mbuf);
> +
> txcnt++;
> }
>
> @@ -4949,6 +5112,10 @@ netdev_dpdk_reconfigure(struct netdev *netdev)
> netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_CKSUM;
> netdev->ol_flags |= NETDEV_TX_OFFLOAD_UDP_CKSUM;
> netdev->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM;
> + /* Enable VXLAN TSO support if available */
> + if (dev->hw_ol_features & NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD) {
> + netdev->ol_flags |= NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD;
> + }
> if (dev->hw_ol_features & NETDEV_TX_SCTP_CHECKSUM_OFFLOAD) {
> netdev->ol_flags |= NETDEV_TX_OFFLOAD_SCTP_CKSUM;
> }
> diff --git a/lib/netdev.c b/lib/netdev.c
> index 90962ee..378f04f 100644
> --- a/lib/netdev.c
> +++ b/lib/netdev.c
> @@ -960,18 +960,12 @@ netdev_push_header(const struct netdev *netdev,
> size_t i, size = dp_packet_batch_size(batch);
>
> DP_PACKET_BATCH_REFILL_FOR_EACH (i, size, packet, batch) {
> - if (OVS_UNLIKELY(dp_packet_hwol_is_tso(packet)
> - || dp_packet_hwol_l4_mask(packet))) {
Removing the line below will break coverage tracking and cause a compilation
failure. If the coverage counter for this function is now unused, you need to
either remove the existing COVERAGE_DEFINE(netdev_push_header_drops); or
define a new counter for use later in the function.
> - COVERAGE_INC(netdev_push_header_drops);
> - dp_packet_delete(packet);
> - VLOG_WARN_RL(&rl, "%s: Tunneling packets with HW offload flags is "
> - "not supported: packet dropped",
> - netdev_get_name(netdev));
> - } else {
> - netdev->netdev_class->push_header(netdev, packet, data);
> - pkt_metadata_init(&packet->md, data->out_port);
> - dp_packet_batch_refill(batch, packet, i);
> + netdev->netdev_class->push_header(netdev, packet, data);
> + if (data->tnl_type == OVS_VPORT_TYPE_VXLAN) {
> + dp_packet_hwol_set_vxlan_tcp_seg(packet);
> }
> + pkt_metadata_init(&packet->md, data->out_port);
> + dp_packet_batch_refill(batch, packet, i);
> }
>
> return 0;
>
It would be worth updating the documentation at some stage as part of this
patch, since the limitations section directly calls out VXLAN as not being
supported:
http://docs.openvswitch.org/en/latest/topics/userspace-tso/#limitations
BR
Ian
More information about the dev
mailing list