[ovs-dev] [PATCH V3 1/4] Enable VXLAN TSO for DPDK datapath
yang_y_yi
yang_y_yi at 163.com
Mon Nov 2 03:16:49 UTC 2020
Thanks a lot, Flavio, please check inline comments for more discussion.
At 2020-10-31 01:55:57, "Flavio Leitner" <fbl at sysclose.org> wrote:
>
>Hi Yi,
>
>Thanks for the patch and sorry the delay to review it.
>See my comments in line.
>
>Thanks,
>fbl
>
>
>On Fri, Aug 07, 2020 at 06:56:45PM +0800, yang_y_yi at 163.com wrote:
>> From: Yi Yang <yangyi01 at inspur.com>
>>
>> Many NICs can support VXLAN TSO which can help
>> improve across-compute-node VM-to-VM performance
>> in case that MTU is set to 1500.
>>
>> This patch allows dpdkvhostuserclient interface
>> and veth/tap interface to leverage NICs' offload
>> capability to maximize across-compute-node TCP
>> performance, with it applied, OVS DPDK can reach
>> linespeed for across-compute-node VM-to-VM TCP
>> performance.
>>
>> Signed-off-by: Yi Yang <yangyi01 at inspur.com>
>> ---
>> lib/dp-packet.h | 76 ++++++++++++++++++++
>> lib/netdev-dpdk.c | 188 ++++++++++++++++++++++++++++++++++++++++++++++----
>> lib/netdev-linux.c | 20 ++++++
>> lib/netdev-provider.h | 1 +
>> lib/netdev.c | 69 ++++++++++++++++--
>> 5 files changed, 338 insertions(+), 16 deletions(-)
>>
>> diff --git a/lib/dp-packet.h b/lib/dp-packet.h
>> index 0430cca..79895f2 100644
>> --- a/lib/dp-packet.h
>> +++ b/lib/dp-packet.h
>> @@ -81,6 +81,8 @@ enum dp_packet_offload_mask {
>> DEF_OL_FLAG(DP_PACKET_OL_TX_UDP_CKSUM, PKT_TX_UDP_CKSUM, 0x400),
>> /* Offload SCTP checksum. */
>> DEF_OL_FLAG(DP_PACKET_OL_TX_SCTP_CKSUM, PKT_TX_SCTP_CKSUM, 0x800),
>> + /* VXLAN TCP Segmentation Offload. */
>> + DEF_OL_FLAG(DP_PACKET_OL_TX_TUNNEL_VXLAN, PKT_TX_TUNNEL_VXLAN, 0x1000),
>> /* Adding new field requires adding to DP_PACKET_OL_SUPPORTED_MASK. */
>> };
>>
>> @@ -1032,6 +1034,80 @@ dp_packet_hwol_set_tcp_seg(struct dp_packet *b)
>> *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TCP_SEG;
>> }
>>
>> +#ifdef DPDK_NETDEV
>> +/* Mark packet 'b' for VXLAN TCP segmentation offloading. */
>> +static inline void
>> +dp_packet_hwol_set_vxlan_tcp_seg(struct dp_packet *b)
>> +{
>> + b->mbuf.ol_flags |= DP_PACKET_OL_TX_TUNNEL_VXLAN;
>> + b->mbuf.l2_len += sizeof(struct udp_header) +
>> + sizeof(struct vxlanhdr);
>
>
>What about L3 length?
For tunnel offload, l2_len must be original l2_len plus vxlan and udp header size, l3_len is still be inner l3_len.
>
>> + b->mbuf.outer_l2_len = ETH_HEADER_LEN;
>> + b->mbuf.outer_l3_len = IP_HEADER_LEN;
>
>What about IPv6?
Good catch, we need to care outer ipv6 case. I'll split it to a single function dp_packet_hwol_set_outer_l2_len & dp_packet_hwol_set_l3_len to handle this.
>
>> +}
>> +
>> +/* Check if it is a VXLAN packet */
>> +static inline bool
>> +dp_packet_hwol_is_vxlan_tcp_seg(struct dp_packet *b)
>> +{
>> + return (b->mbuf.ol_flags & DP_PACKET_OL_TX_TUNNEL_VXLAN);
>
>
>Please use dp_packet_ol_flags_ptr()
Ok, will use use dp_packet_ol_flags_ptr() in new version.
>
>> +}
>> +
>> +/* Set l2_len for the packet 'b' */
>> +static inline void
>> +dp_packet_hwol_set_l2_len(struct dp_packet *b, int l2_len)
>> +{
>> + b->mbuf.l2_len = l2_len;
>> +}
>
>This function is only called by Linux in the ingress
>path before the data processing, so it shouldn't set
>any value other than the ones related to the iface
>offloading at this point. Also that the data path can
>change the packet and there is nothing to update
>those lengths.
Does "Linux" mean "system interfaces"?, we need to use l2_len, but I saw l2_len isn't set in some cases, so added this function.
>
>In the egress path it calls netdev_dpdk_prep_hwol_packet()
>to update those fields.
If output port is system interfaces (veth or tap), netdev_dpdk_prep_hwol_packet() won't be called.
>
>
>> +
>> +/* Set l3_len for the packet 'b' */
>> +static inline void
>> +dp_packet_hwol_set_l3_len(struct dp_packet *b, int l3_len)
>> +{
>> + b->mbuf.l3_len = l3_len;
>> +}
>
>The same comment above is valid here.
In some cases, we do need to set l3_len if it isn't set correctly.
>
>
>> +
>> +/* Set l4_len for the packet 'b' */
>> +static inline void
>> +dp_packet_hwol_set_l4_len(struct dp_packet *b, int l4_len)
>> +{
>> + b->mbuf.l4_len = l4_len;
>> +}
>
>
>And here.
DPDK offload API will use l4_len to handle something but it isn't set in vhost/virtio, so bring it in to set l4_len correctly.
>
>
>> +#else
>> +/* Mark packet 'b' for VXLAN TCP segmentation offloading. */
>> +static inline void
>> +dp_packet_hwol_set_vxlan_tcp_seg(struct dp_packet *b OVS_UNUSED)
>> +{
>> +}
>> +
>> +/* Check if it is a VXLAN packet */
>> +static inline bool
>> +dp_packet_hwol_is_vxlan_tcp_seg(struct dp_packet *b OVS_UNUSED)
>> +{
>> +}
>> +
>> +/* Set l2_len for the packet 'b' */
>> +static inline void
>> +dp_packet_hwol_set_l2_len(struct dp_packet *b OVS_UNUSED,
>> + int l2_len OVS_UNUSED)
>> +{
>> +}
>> +
>> +/* Set l3_len for the packet 'b' */
>> +static inline void
>> +dp_packet_hwol_set_l3_len(struct dp_packet *b OVS_UNUSED,
>> + int l3_len OVS_UNUSED)
>> +{
>> +}
>> +
>> +/* Set l4_len for the packet 'b' */
>> +static inline void
>> +dp_packet_hwol_set_l4_len(struct dp_packet *b OVS_UNUSED,
>> + int l4_len OVS_UNUSED)
>> +{
>> +}
>> +#endif /* DPDK_NETDEV */
>> +
>> static inline bool
>> dp_packet_ip_checksum_valid(const struct dp_packet *p)
>> {
>> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
>> index 44ebf96..30493ed 100644
>> --- a/lib/netdev-dpdk.c
>> +++ b/lib/netdev-dpdk.c
>> @@ -44,6 +44,7 @@
>> #include <rte_pci.h>
>> #include <rte_version.h>
>> #include <rte_vhost.h>
>> +#include <rte_ip.h>
>
>I think all the headers you need are defined in OVS
>and that's the current preference. Please use those
>instead.
But in lib/netdev-dpdk.c, we used many DPDK APIs which need DPDK structures, I think lib/netdev-dpdk.c is an exceptional case.
>
>
>> #include "cmap.h"
>> #include "coverage.h"
>> @@ -87,6 +88,7 @@ COVERAGE_DEFINE(vhost_notification);
>>
>> #define OVS_CACHE_LINE_SIZE CACHE_LINE_SIZE
>> #define OVS_VPORT_DPDK "ovs_dpdk"
>> +#define DPDK_RTE_HDR_OFFSET 1
>
>This is not specific to DPDK, neither to RTE. Perhaps this
>could be added to packet header is a more generic way?
Good suggestion, but we only used it here, other places don't use it, maybe a generic name is better.
>
>
>>
>> /*
>> * need to reserve tons of extra space in the mbufs so we can align the
>> @@ -405,6 +407,7 @@ enum dpdk_hw_ol_features {
>> NETDEV_RX_HW_SCATTER = 1 << 2,
>> NETDEV_TX_TSO_OFFLOAD = 1 << 3,
>> NETDEV_TX_SCTP_CHECKSUM_OFFLOAD = 1 << 4,
>> + NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD = 1 << 5,
>> };
>>
>> /*
>> @@ -986,8 +989,17 @@ dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq)
>> conf.rxmode.offloads |= DEV_RX_OFFLOAD_KEEP_CRC;
>> }
>>
>> + if (info.rx_offload_capa & DEV_TX_OFFLOAD_MULTI_SEGS) {
>> + conf.txmode.offloads |= DEV_TX_OFFLOAD_MULTI_SEGS;
>> + }
>
>Why enabling multi seg is necessary?
it is mandatory requirement of GRO and GSO becuase they use multi-segment mbuf to avoid copy. GRO is very helpful to TCP perfoemance, GSO is necessary for UDP or TSO offoad isn't available on hardware NIC.
>
>
>> +
>> if (dev->hw_ol_features & NETDEV_TX_TSO_OFFLOAD) {
>> conf.txmode.offloads |= DPDK_TX_TSO_OFFLOAD_FLAGS;
>> + /* Enable VXLAN TSO support if available */
>> + if (dev->hw_ol_features & NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD) {
>> + conf.txmode.offloads |= DEV_TX_OFFLOAD_VXLAN_TNL_TSO;
>> + conf.txmode.offloads |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM;
>> + }
>> if (dev->hw_ol_features & NETDEV_TX_SCTP_CHECKSUM_OFFLOAD) {
>> conf.txmode.offloads |= DEV_TX_OFFLOAD_SCTP_CKSUM;
>> }
>> @@ -1126,6 +1138,10 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
>> if ((info.tx_offload_capa & tx_tso_offload_capa)
>> == tx_tso_offload_capa) {
>> dev->hw_ol_features |= NETDEV_TX_TSO_OFFLOAD;
>> + /* Enable VXLAN TSO support if available */
>> + if (info.tx_offload_capa & DEV_TX_OFFLOAD_VXLAN_TNL_TSO) {
>> + dev->hw_ol_features |= NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD;
>> + }
>> if (info.tx_offload_capa & DEV_TX_OFFLOAD_SCTP_CKSUM) {
>> dev->hw_ol_features |= NETDEV_TX_SCTP_CHECKSUM_OFFLOAD;
>> } else {
>> @@ -2131,35 +2147,166 @@ netdev_dpdk_rxq_dealloc(struct netdev_rxq *rxq)
>> rte_free(rx);
>> }
>>
>> +static inline bool
>> +is_local_to_local(uint16_t src_port_id, struct netdev_dpdk *dev)
>> +{
>> + bool ret = false;
>> + struct netdev_dpdk *src_dev;
>> +
>> + if (src_port_id == UINT16_MAX) {
>> + ret = true;
>> + } else {
>> + src_dev = netdev_dpdk_lookup_by_port_id(src_port_id);
>> + if (src_dev && (netdev_dpdk_get_vid(src_dev) >= 0)) {
>> + ret = true;
>> + }
>> + }
>> +
>> + if (ret) {
>> + if (netdev_dpdk_get_vid(dev) < 0) {
>> + ret = false;
>> + }
>> + }
>> +
>> + return ret;
>> +}
>> +
>> /* Prepare the packet for HWOL.
>> * Return True if the packet is OK to continue. */
>> static bool
>> netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
>> {
>> struct dp_packet *pkt = CONTAINER_OF(mbuf, struct dp_packet, mbuf);
>> + uint16_t l4_proto = 0;
>> + uint8_t *l3_hdr_ptr = NULL;
>> + struct rte_ether_hdr *eth_hdr =
>> + rte_pktmbuf_mtod(mbuf, struct rte_ether_hdr *);
>> + struct rte_ipv4_hdr *ip_hdr;
>> + struct rte_ipv6_hdr *ip6_hdr;
>> +
>> + /* Return directly if source and destitation of mbuf are local ports
>> + * because mbuf has already set ol_flags and l*_len correctly.
>> + */
>> + if (is_local_to_local(mbuf->port, dev)) {
>> + if (mbuf->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_UDP_SEG)) {
>> + mbuf->tso_segsz = 1450 - mbuf->l3_len - mbuf->l4_len;
>> + }
>> + return true;
>> + }
>
>This assumption here doesn't seem correct. The packet could
>have come from a NIC and then had been modified by data path,
>so it doesn't mean the packet lengths and other fields are
>correct in the mbuf. That's why we prepare here.
For local to local case, it is special, maybe l3_len and l4_len are incorrect, here we set tso_segsz for vhostuser interfaces and veth, I didn't see the case you mentioned, can you give a specific case? I can verify if it is ok. In the worst case, we can re-parse the packet before it to get correct l3_len and l4_len.
>
>
>> +
>> + if (mbuf->ol_flags & PKT_TX_TUNNEL_VXLAN) {
>> + /* Handle VXLAN TSO */
>> + struct rte_udp_hdr *udp_hdr;
>> +
>> + /* small packets whose size is less than or equal to MTU needn't
>> + * VXLAN TSO. In addtion, if hardware can't support VXLAN TSO, it
>> + * also can't be handled. So PKT_TX_TUNNEL_VXLAN must be cleared
>> + * outer_l2_len and outer_l3_len must be zeroed.
>> + */
>> + if (!(dev->up.ol_flags & NETDEV_TX_OFFLOAD_VXLAN_TSO)
>> + || (!(mbuf->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_UDP_SEG))
>> + && (mbuf->pkt_len <= 1450 + mbuf->outer_l2_len + mbuf->outer_l3_len
>> + + mbuf->l2_len))) {
>> + mbuf->ol_flags &= ~PKT_TX_TUNNEL_VXLAN;
>> + mbuf->l2_len -= sizeof(struct udp_header)
>> + + sizeof(struct vxlanhdr);
>> + mbuf->outer_l2_len = 0;
>> + mbuf->outer_l3_len = 0;
>> + return true;
>> + }
>
>The above doesn't look right to me. The l2 and l3 lengths are
>not updated and it assumes they are ok, which they are not.
There are two cases entering netdev_dpdk_prep_hwol_packet()
#1. To be sent to dpdk interfacses, the packet is from system inertfaces or vhostuser.
#2. To be sent to vhosuser tinterfaces, the packet is from dpdk interface or system interfacses,
In most case they are set correctly, if not, we can re-parse the packet to get correct value, but I'm not sure how we can decide this kind of scarce case.
Note: l2_len and l3_len are always inner l2 nd l3 length for tunnel packets. I don't know in what cases they will be wrong.
>
>> +
>> + if (mbuf->ol_flags & PKT_TX_IPV4) {
>> + ip_hdr = (struct rte_ipv4_hdr *)(eth_hdr + DPDK_RTE_HDR_OFFSET);
>> + udp_hdr = (struct rte_udp_hdr *)(ip_hdr + DPDK_RTE_HDR_OFFSET);
>> +
>> + /* outer IP checksum offload */
>> + ip_hdr->hdr_checksum = 0;
>> + mbuf->ol_flags |= PKT_TX_OUTER_IP_CKSUM;
>> + mbuf->ol_flags |= PKT_TX_OUTER_IPV4;
>> +
>> + ip_hdr = (struct rte_ipv4_hdr *)
>> + ((uint8_t *)udp_hdr + mbuf->l2_len);
>> + l4_proto = ip_hdr->next_proto_id;
>> + l3_hdr_ptr = (uint8_t *)ip_hdr;
>> +
>> + /* inner IP checksum offload */
>> + ip_hdr->hdr_checksum = 0;
>> + mbuf->ol_flags |= PKT_TX_IP_CKSUM;
>> + } else if (mbuf->ol_flags & PKT_TX_IPV6) {
>> + ip_hdr = (struct rte_ipv4_hdr *)(eth_hdr + DPDK_RTE_HDR_OFFSET);
>> + udp_hdr = (struct rte_udp_hdr *)(ip_hdr + DPDK_RTE_HDR_OFFSET);
>
>This branch is about IPv6 but it uses IPv4 header.
IPv6 means inner IPv6, outer is still IPv4 in most cases unless we use IPv6 for underlay, I think that is scarce. I'll handle outer IPv6 case, but this also needs IPv6 GRO and GSO, DPDK doesn't support them, I need to impelement them by myself.
>
>> +
>> + /* outer IP checksum offload */
>> + ip_hdr->hdr_checksum = 0;
>> + mbuf->ol_flags |= PKT_TX_OUTER_IP_CKSUM;
>> + mbuf->ol_flags |= PKT_TX_OUTER_IPV4;
>
>And sets the flag for the wrong protocol.
This is outer IPv4 case (inner is IPv6).
>
>> +
>> + ip6_hdr = (struct rte_ipv6_hdr *)
>> + ((uint8_t *)udp_hdr + mbuf->l2_len);
>> + l4_proto = ip6_hdr->proto;
>> + l3_hdr_ptr = (uint8_t *)ip6_hdr;
>> +
>> + /* inner IP checksum offload offload */
>> + mbuf->ol_flags |= PKT_TX_IP_CKSUM;
>> + }
>> + } else if (mbuf->ol_flags & PKT_TX_L4_MASK) {
>> + /* Handle VLAN TSO */
>> + /* no inner IP checksum for IPV6 */
>> + if (mbuf->ol_flags & PKT_TX_IPV4) {
>> + ip_hdr = (struct rte_ipv4_hdr *)(eth_hdr + DPDK_RTE_HDR_OFFSET);
>> + l4_proto = ip_hdr->next_proto_id;
>> + l3_hdr_ptr = (uint8_t *)ip_hdr;
>> +
>> + /* IP checksum offload */
>> + ip_hdr->hdr_checksum = 0;
>> + mbuf->ol_flags |= PKT_TX_IP_CKSUM;
>> + } else if (mbuf->ol_flags & PKT_TX_IPV6) {
>> + ip6_hdr = (struct rte_ipv6_hdr *)(eth_hdr + DPDK_RTE_HDR_OFFSET);
>> + l4_proto = ip6_hdr->proto;
>> + l3_hdr_ptr = (uint8_t *)ip6_hdr;
>> +
>> + /* IP checksum offload */
>> + mbuf->ol_flags |= PKT_TX_IP_CKSUM;
>> + }
>>
>> - if (mbuf->ol_flags & PKT_TX_L4_MASK) {
>> mbuf->l2_len = (char *)dp_packet_l3(pkt) - (char *)dp_packet_eth(pkt);
>> mbuf->l3_len = (char *)dp_packet_l4(pkt) - (char *)dp_packet_l3(pkt);
>> mbuf->outer_l2_len = 0;
>> mbuf->outer_l3_len = 0;
>> }
>>
>> - if (mbuf->ol_flags & PKT_TX_TCP_SEG) {
>> - struct tcp_header *th = dp_packet_l4(pkt);
>> + /* It is possible that l4_len isn't set for vhostuserclient */
>> + if ((l3_hdr_ptr != NULL) && (l4_proto == IPPROTO_TCP)
>> + && (mbuf->l4_len < 20)) {
>> + struct rte_tcp_hdr *tcp_hdr = (struct rte_tcp_hdr *)
>> + (l3_hdr_ptr + mbuf->l3_len);
>>
>> - if (!th) {
>> + mbuf->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
>> + }
>> +
>> + if ((mbuf->ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM) {
>> + if (l4_proto != IPPROTO_UDP) {
>> + VLOG_WARN_RL(&rl, "%s: UDP packet without L4 header"
>> + " pkt len: %"PRIu32"", dev->up.name, mbuf->pkt_len);
>> + return false;
>> + }
>> + } else if (mbuf->ol_flags & PKT_TX_TCP_SEG ||
>> + mbuf->ol_flags & PKT_TX_TCP_CKSUM) {
>> + if (l4_proto != IPPROTO_TCP) {
>> VLOG_WARN_RL(&rl, "%s: TCP Segmentation without L4 header"
>> " pkt len: %"PRIu32"", dev->up.name, mbuf->pkt_len);
>> return false;
>> }
>>
>> - mbuf->l4_len = TCP_OFFSET(th->tcp_ctl) * 4;
>> - mbuf->ol_flags |= PKT_TX_TCP_CKSUM;
>> - mbuf->tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len;
>> + if (mbuf->pkt_len - mbuf->l2_len > 1450) {
>> + dp_packet_hwol_set_tcp_seg(pkt);
>> + }
>>
>> - if (mbuf->ol_flags & PKT_TX_IPV4) {
>> - mbuf->ol_flags |= PKT_TX_IP_CKSUM;
>> + mbuf->ol_flags |= PKT_TX_TCP_CKSUM;
>> + if (mbuf->ol_flags & PKT_TX_TCP_SEG) {
>> + mbuf->tso_segsz = 1450 - mbuf->l3_len - mbuf->l4_len;
>> + } else {
>> + mbuf->tso_segsz = 0;
>> }
>
>The L2, L3 and L4 headers are available in dp_packet, so
>most of the parsing above is not needed.
In some cases, they aren't set. As you said before, maybe we can move these code to head of netdev_dpdk_prep_hwol_packet, in this way, it handle all the possible cases. But it can be duplicate overhead if they have been set correctly.
>
>
>> }
>> return true;
>> @@ -2737,19 +2884,27 @@ dpdk_copy_dp_packet_to_mbuf(struct rte_mempool *mp, struct dp_packet *pkt_orig)
>>
>> mbuf_dest->tx_offload = pkt_orig->mbuf.tx_offload;
>> mbuf_dest->packet_type = pkt_orig->mbuf.packet_type;
>> - mbuf_dest->ol_flags |= (pkt_orig->mbuf.ol_flags &
>> - ~(EXT_ATTACHED_MBUF | IND_ATTACHED_MBUF));
>> + mbuf_dest->ol_flags |= pkt_orig->mbuf.ol_flags;
>> + mbuf_dest->l2_len = pkt_orig->mbuf.l2_len;
>> + mbuf_dest->l3_len = pkt_orig->mbuf.l3_len;
>> + mbuf_dest->l4_len = pkt_orig->mbuf.l4_len;
>> + mbuf_dest->outer_l2_len = pkt_orig->mbuf.outer_l2_len;
>> + mbuf_dest->outer_l3_len = pkt_orig->mbuf.outer_l3_len;
>
>fbl: Packets not from DPDK don't have those values
>except offloading flags, which can't be straight copied
>like that.
It will arrive here only if it is built with DPDK enabled, so these fields should be there, do you mean they aren't set? netdev_linux_parse_vnet_hdr in lib/netdev-linux.c has set them.
>
>
>>
>> memcpy(&pkt_dest->l2_pad_size, &pkt_orig->l2_pad_size,
>> sizeof(struct dp_packet) - offsetof(struct dp_packet, l2_pad_size));
>>
>> - if (mbuf_dest->ol_flags & PKT_TX_L4_MASK) {
>> + if ((mbuf_dest->outer_l2_len == 0) &&
>> + (mbuf_dest->ol_flags & PKT_TX_L4_MASK)) {
>> mbuf_dest->l2_len = (char *)dp_packet_l3(pkt_dest)
>> - (char *)dp_packet_eth(pkt_dest);
>> mbuf_dest->l3_len = (char *)dp_packet_l4(pkt_dest)
>> - (char *) dp_packet_l3(pkt_dest);
>> }
>>
>> + /* Mark it as non-DPDK port */
>> + mbuf_dest->port = UINT16_MAX;
>
>This is a work around for the is_local_to_local() to work,
>but as explained already, the assumption doesn't seem ok.
Can you help list excptional cases? I don't know which non-DPDK port isn't this case.
>
>
>> +
>> return pkt_dest;
>> }
>>
>> @@ -2808,6 +2963,11 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
>> if (dev->type == DPDK_DEV_VHOST) {
>> __netdev_dpdk_vhost_send(netdev, qid, pkts, txcnt);
>> } else {
>> + if (userspace_tso_enabled()) {
>> + txcnt = netdev_dpdk_prep_hwol_batch(dev,
>> + (struct rte_mbuf **)pkts,
>> + txcnt);
>> + }
>> tx_failure += netdev_dpdk_eth_tx_burst(dev, qid,
>> (struct rte_mbuf **)pkts,
>> txcnt);
>> @@ -4949,6 +5109,10 @@ netdev_dpdk_reconfigure(struct netdev *netdev)
>> netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_CKSUM;
>> netdev->ol_flags |= NETDEV_TX_OFFLOAD_UDP_CKSUM;
>> netdev->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM;
>> + /* Enable VXLAN TSO support if available */
>> + if (dev->hw_ol_features & NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD) {
>> + netdev->ol_flags |= NETDEV_TX_OFFLOAD_VXLAN_TSO;
>> + }
>> if (dev->hw_ol_features & NETDEV_TX_SCTP_CHECKSUM_OFFLOAD) {
>> netdev->ol_flags |= NETDEV_TX_OFFLOAD_SCTP_CKSUM;
>> }
>> diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
>> index fe7fb9b..9f830b4 100644
>> --- a/lib/netdev-linux.c
>> +++ b/lib/netdev-linux.c
>> @@ -6508,6 +6508,8 @@ netdev_linux_parse_l2(struct dp_packet *b, uint16_t *l4proto)
>> struct eth_header *eth_hdr;
>> ovs_be16 eth_type;
>> int l2_len;
>> + int l3_len = 0;
>> + int l4_len = 0;
>>
>> eth_hdr = dp_packet_at(b, 0, ETH_HEADER_LEN);
>> if (!eth_hdr) {
>> @@ -6527,6 +6529,8 @@ netdev_linux_parse_l2(struct dp_packet *b, uint16_t *l4proto)
>> l2_len += VLAN_HEADER_LEN;
>> }
>>
>> + dp_packet_hwol_set_l2_len(b, l2_len);
>> +
>> if (eth_type == htons(ETH_TYPE_IP)) {
>> struct ip_header *ip_hdr = dp_packet_at(b, l2_len, IP_HEADER_LEN);
>>
>> @@ -6534,6 +6538,7 @@ netdev_linux_parse_l2(struct dp_packet *b, uint16_t *l4proto)
>> return -EINVAL;
>> }
>>
>> + l3_len = IP_HEADER_LEN;
>> *l4proto = ip_hdr->ip_proto;
>> dp_packet_hwol_set_tx_ipv4(b);
>> } else if (eth_type == htons(ETH_TYPE_IPV6)) {
>> @@ -6544,10 +6549,25 @@ netdev_linux_parse_l2(struct dp_packet *b, uint16_t *l4proto)
>> return -EINVAL;
>> }
>>
>> + l3_len = IPV6_HEADER_LEN;
>> *l4proto = nh6->ip6_ctlun.ip6_un1.ip6_un1_nxt;
>> dp_packet_hwol_set_tx_ipv6(b);
>> }
>>
>> + dp_packet_hwol_set_l3_len(b, l3_len);
>> +
>> + if (*l4proto == IPPROTO_TCP) {
>> + struct tcp_header *tcp_hdr = dp_packet_at(b, l2_len + l3_len,
>> + sizeof(struct tcp_header));
>> +
>> + if (!tcp_hdr) {
>> + return -EINVAL;
>> + }
>> +
>> + l4_len = TCP_OFFSET(tcp_hdr->tcp_ctl) * 4;
>> + dp_packet_hwol_set_l4_len(b, l4_len);
>> + }
>
>
>As explained already, all the lengths set here can change
>during data path processing.
If so we can re-parse it at the head of netdev_dpdk_prep_hwol_packet, it will be better if we can re-parse it only if they are changed, but I don't know in what cases they will be changed, can you explian how we can detect such changes?
>
>Thanks
>fbl
>
>> +
>> return 0;
>> }
>>
>> diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h
>> index 73dce2f..d616d79 100644
>> --- a/lib/netdev-provider.h
>> +++ b/lib/netdev-provider.h
>> @@ -43,6 +43,7 @@ enum netdev_ol_flags {
>> NETDEV_TX_OFFLOAD_UDP_CKSUM = 1 << 2,
>> NETDEV_TX_OFFLOAD_SCTP_CKSUM = 1 << 3,
>> NETDEV_TX_OFFLOAD_TCP_TSO = 1 << 4,
>> + NETDEV_TX_OFFLOAD_VXLAN_TSO = 1 << 5,
>> };
>>
>> /* A network device (e.g. an Ethernet device).
>> diff --git a/lib/netdev.c b/lib/netdev.c
>> index 91e9195..64583d1 100644
>> --- a/lib/netdev.c
>> +++ b/lib/netdev.c
>> @@ -33,6 +33,7 @@
>>
>> #include "cmap.h"
>> #include "coverage.h"
>> +#include "csum.h"
>> #include "dpif.h"
>> #include "dp-packet.h"
>> #include "openvswitch/dynamic-string.h"
>> @@ -785,6 +786,36 @@ netdev_get_pt_mode(const struct netdev *netdev)
>> : NETDEV_PT_LEGACY_L2);
>> }
>>
>> +static inline void
>> +calculate_tcpudp_checksum(struct dp_packet *p)
>> +{
>> + uint32_t pseudo_hdr_csum;
>> + struct ip_header *ip = dp_packet_l3(p);
>> + size_t l4_len = (char *) dp_packet_tail(p) - (char *) dp_packet_l4(p);
>> + uint16_t l4_proto = 0;
>> +
>> + l4_proto = ip->ip_proto;
>> + ip->ip_csum = 0;
>> + ip->ip_csum = csum(ip, sizeof *ip);
>> + pseudo_hdr_csum = packet_csum_pseudoheader(ip);
>> + if (l4_proto == IPPROTO_TCP) {
>> + struct tcp_header *tcp = dp_packet_l4(p);
>> +
>> + tcp->tcp_csum = 0;
>> + tcp->tcp_csum = csum_finish(csum_continue(pseudo_hdr_csum,
>> + tcp, l4_len));
>> + } else if (l4_proto == IPPROTO_UDP) {
>> + struct udp_header *udp = dp_packet_l4(p);
>> +
>> + udp->udp_csum = 0;
>> + udp->udp_csum = csum_finish(csum_continue(pseudo_hdr_csum,
>> + udp, l4_len));
>> + if (!udp->udp_csum) {
>> + udp->udp_csum = htons(0xffff);
>> + }
>> + }
>> +}
>> +
>> /* Check if a 'packet' is compatible with 'netdev_flags'.
>> * If a packet is incompatible, return 'false' with the 'errormsg'
>> * pointing to a reason. */
>> @@ -794,6 +825,14 @@ netdev_send_prepare_packet(const uint64_t netdev_flags,
>> {
>> uint64_t l4_mask;
>>
>> + if (dp_packet_hwol_is_vxlan_tcp_seg(packet)
>> + && (dp_packet_hwol_is_tso(packet) || dp_packet_hwol_l4_mask(packet))
>> + && !(netdev_flags & NETDEV_TX_OFFLOAD_VXLAN_TSO)) {
>> + /* Fall back to GSO in software. */
>> + VLOG_ERR_BUF(errormsg, "No VXLAN TSO support");
>> + return false;
>> + }
>> +
>> if (dp_packet_hwol_is_tso(packet)
>> && !(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) {
>> /* Fall back to GSO in software. */
>> @@ -960,15 +999,37 @@ netdev_push_header(const struct netdev *netdev,
>> size_t i, size = dp_packet_batch_size(batch);
>>
>> DP_PACKET_BATCH_REFILL_FOR_EACH (i, size, packet, batch) {
>> - if (OVS_UNLIKELY(dp_packet_hwol_is_tso(packet)
>> - || dp_packet_hwol_l4_mask(packet))) {
>> + if (OVS_UNLIKELY((dp_packet_hwol_is_tso(packet)
>> + || dp_packet_hwol_l4_mask(packet))
>> + && (data->tnl_type != OVS_VPORT_TYPE_VXLAN))) {
>> COVERAGE_INC(netdev_push_header_drops);
>> dp_packet_delete(packet);
>> - VLOG_WARN_RL(&rl, "%s: Tunneling packets with HW offload flags is "
>> - "not supported: packet dropped",
>> + VLOG_WARN_RL(&rl,
>> + "%s: non-VxLAN Tunneling packets with HW offload "
>> + "flags is not supported: packet dropped",
>> netdev_get_name(netdev));
>> } else {
>> + if (data->tnl_type == OVS_VPORT_TYPE_VXLAN) {
>> + /* VXLAN offload can't support udp checksum offload
>> + * for inner udp packet, so udp checksum must be set
>> + * before push header in order that outer checksum can
>> + * be set correctly.
>> + */
>> + if (dp_packet_hwol_l4_is_udp(packet)) {
>> + packet->mbuf.ol_flags &= ~DP_PACKET_OL_TX_UDP_CKSUM;
>> + calculate_tcpudp_checksum(packet);
>> + }
>> + }
>> netdev->netdev_class->push_header(netdev, packet, data);
>> + if (data->tnl_type == OVS_VPORT_TYPE_VXLAN) {
>> + /* Just identify it as a vxlan packet, here netdev is
>> + * vxlan_sys_*, netdev->ol_flags can't indicate if final
>> + * physical output port can support VXLAN TSO, in
>> + * netdev_send_prepare_packet will drop it if final
>> + * physical output port can't support VXLAN TSO.
>> + */
>> + dp_packet_hwol_set_vxlan_tcp_seg(packet);
>> + }
>> pkt_metadata_init(&packet->md, data->out_port);
>> dp_packet_batch_refill(batch, packet, i);
>> }
>> --
>> 2.7.4
>>
>> _______________________________________________
>> dev mailing list
>> dev at openvswitch.org
>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>
>--
>fbl
More information about the dev
mailing list