[ovs-dev] [RFC 2/2] netdev-dpdk: Enable TSO when using multi-seg mbufs

Tiago Lam tiago.lam at intel.com
Wed Aug 8 15:39:28 UTC 2018


TCP Segmentation Offload (TSO) is a feature which enables the TCP/IP
network stack to delegate segmentation of a TCP segment to the hardware
NIC, thus saving compute resources. This may improve performance
significantly for TCP workloads in virtualized environments.

While a previous commit already added the necessary logic to netdev-dpdk
to deal with packets marked for TSO, this set of changes enables TSO by
default when using multi-segment mbufs.

Thus, to enable TSO on the physical DPDK interfaces, only the following
command needs to be issued before starting OvS:
    ovs-vsctl set Open_vSwitch . other_config:dpdk-multi-seg-mbufs=true

Co-authored-by: Mark Kavanagh <mark.b.kavanagh at intel.com>

Signed-off-by: Mark Kavanagh <mark.b.kavanagh at intel.com>
Signed-off-by: Tiago Lam <tiago.lam at intel.com>
---
 Documentation/topics/dpdk/phy.rst | 64 +++++++++++++++++++++++++++++++++++++++
 lib/netdev-dpdk.c                 | 52 ++++++++++++++++++++++++++-----
 2 files changed, 108 insertions(+), 8 deletions(-)

diff --git a/Documentation/topics/dpdk/phy.rst b/Documentation/topics/dpdk/phy.rst
index 1470623..980a629 100644
--- a/Documentation/topics/dpdk/phy.rst
+++ b/Documentation/topics/dpdk/phy.rst
@@ -248,3 +248,67 @@ Command to set interrupt mode for a specific interface::
 
 Command to set polling mode for a specific interface::
     $ ovs-vsctl set interface <iface_name> options:dpdk-lsc-interrupt=false
+
+TCP Segmentation Offload (TSO)
+------------------------------
+
+Overview
+~~~~~~~~
+
+TCP Segmentation Offload (TSO) enables a network stack to delegate
+segmentation of an oversized TCP segment to the underlying physical NIC.
+Offload of frame segmentation achieves computational savings in the core,
+freeing up CPU cycles for more useful work.
+
+DPDK v16.07 added support for `TSO` in the vHost user backend; as such, a
+guest's virtual network interfaces may avail of `TSO`. In such a setup, the
+aforementioned computational savings are made in the core acting as the VM's
+virtual CPU, typically resulting in improved TCP throughput.
+
+To enable TSO in a guest, the underlying NIC must first support `TSO` -
+consult your controller's datasheet for compatibility. Secondly, the NIC
+must have an associated DPDK Poll Mode Driver (PMD) which supports `TSO`.
+
+Enabling TSO
+~~~~~~~~~~~~
+
+TSO may be enabled in one of two ways, as follows:
+
+  1. QEMU Command Line Parameter:
+
+      ```
+      sudo $QEMU_DIR/x86_64-softmmu/qemu-system-x86_64 \
+      ...
+      -device virtio-net-pci,mac=00:00:00:00:00:01,netdev=mynet1,\
+      mrg_rxbuf=on,csum=on,gso=on,guest_csum=on,guest_tso4=on,\
+      guest_tso6=on,guest_ecn=on \
+      ...
+      ```
+
+  2. ethtool
+
+`TSO` is enabled in OvS by the DPDK vHost User backend; when a new guest
+connection is established, `TSO` is advertised to the guest as an available
+feature. Assuming that the guest's OS also supports `TSO`, ethtool can be used
+to enable it:
+
+      ```
+      ethtool -K eth0 sg on     # scatter-gather is a prerequisite for TSO
+      ethtool -K eth0 tso on
+      ethtool -k eth0           # verify that TSO is reported as 'on'
+      ```
+
+      **Note:** In both methods, `mergeable buffers` are required:
+      ```
+      sudo $QEMU_DIR/x86_64-softmmu/qemu-system-x86_64 \
+      ...
+      mrg_rxbuf=on,\
+      ...
+      ```
+
+Limitations
+~~~~~~~~~~~
+
+The current OvS `TSO` implementation supports flat and VLAN networks only
+(i.e. no support for `TSO` over tunneled connections [VxLAN, GRE, IPinIP,
+etc.]).
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 5da5996..20d4fd5 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -379,6 +379,7 @@ struct ingress_policer {
 enum dpdk_hw_ol_features {
     NETDEV_RX_CHECKSUM_OFFLOAD = 1 << 0,
     NETDEV_RX_HW_CRC_STRIP = 1 << 1,
+    NETDEV_TX_TSO_OFFLOAD = 1 << 2,
 };
 
 /*
@@ -1003,6 +1004,8 @@ dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq)
     struct rte_eth_dev_info info;
     uint16_t conf_mtu;
 
+    rte_eth_dev_info_get(dev->port_id, &info);
+
     /* As of DPDK 17.11.1 a few PMDs require to explicitly enable
      * scatter to support jumbo RX. Checking the offload capabilities
      * is not an option as PMDs are not required yet to report
@@ -1010,7 +1013,6 @@ dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq)
      * (testing or code review). Listing all such PMDs feels harder
      * than highlighting the one known not to need scatter */
     if (dev->mtu > ETHER_MTU) {
-        rte_eth_dev_info_get(dev->port_id, &info);
         if (strncmp(info.driver_name, "net_nfp", 7)) {
             conf.rxmode.enable_scatter = 1;
         }
@@ -1018,14 +1020,28 @@ dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq)
 
     /* Multi-segment-mbuf-specific setup. */
     if (dpdk_multi_segment_mbufs) {
-        /* DPDK PMDs typically attempt to use simple or vectorized
-         * transmit functions, neither of which are compatible with
-         * multi-segment mbufs. Ensure that these are disabled when
-         * multi-segment mbufs are enabled.
-         */
-        rte_eth_dev_info_get(dev->port_id, &info);
+        if (info.tx_offload_capa & DEV_TX_OFFLOAD_MULTI_SEGS) {
+            /* DPDK PMDs typically attempt to use simple or vectorized
+             * transmit functions, neither of which are compatible with
+             * multi-segment mbufs. Ensure that these are disabled when
+             * multi-segment mbufs are enabled.
+             */
+            conf.txmode.offloads |= DEV_TX_OFFLOAD_MULTI_SEGS;
+        }
+
+        if (dev->hw_ol_features & NETDEV_TX_TSO_OFFLOAD) {
+            conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_TSO;
+            conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_CKSUM;
+            conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
+        }
+
         txconf = info.default_txconf;
-        txconf.txq_flags &= ~ETH_TXQ_FLAGS_NOMULTSEGS;
+        txconf.txq_flags = ETH_TXQ_FLAGS_IGNORE;
+        txconf.offloads = conf.txmode.offloads;
+    } else if (dev->hw_ol_features & NETDEV_TX_TSO_OFFLOAD) {
+        dev->hw_ol_features &= ~NETDEV_TX_TSO_OFFLOAD;
+        VLOG_WARN("Failed to set Tx TSO offload in %s. Requires option "
+                  "`dpdk-multi-seg-mbufs` to be enabled.", dev->up.name);
     }
 
     conf.intr_conf.lsc = dev->lsc_interrupt_mode;
@@ -1135,6 +1151,9 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
     uint32_t rx_chksm_offload_capa = DEV_RX_OFFLOAD_UDP_CKSUM |
                                      DEV_RX_OFFLOAD_TCP_CKSUM |
                                      DEV_RX_OFFLOAD_IPV4_CKSUM;
+    uint32_t tx_tso_offload_capa = DEV_TX_OFFLOAD_TCP_TSO |
+                                   DEV_TX_OFFLOAD_TCP_CKSUM |
+                                   DEV_TX_OFFLOAD_IPV4_CKSUM;
 
     rte_eth_dev_info_get(dev->port_id, &info);
 
@@ -1154,6 +1173,18 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
         dev->hw_ol_features |= NETDEV_RX_CHECKSUM_OFFLOAD;
     }
 
+    if (dpdk_multi_segment_mbufs) {
+        if (info.tx_offload_capa & tx_tso_offload_capa) {
+            dev->hw_ol_features |= NETDEV_TX_TSO_OFFLOAD;
+        } else {
+            dev->hw_ol_features &= ~NETDEV_TX_TSO_OFFLOAD;
+            VLOG_WARN("Tx TSO offload is not supported on port "
+                      DPDK_PORT_ID_FMT, dev->port_id);
+        }
+    } else {
+        dev->hw_ol_features &= ~NETDEV_TX_TSO_OFFLOAD;
+    }
+
     n_rxq = MIN(info.max_rx_queues, dev->up.n_rxq);
     n_txq = MIN(info.max_tx_queues, dev->up.n_txq);
 
@@ -1673,6 +1704,11 @@ netdev_dpdk_get_config(const struct netdev *netdev, struct smap *args)
         } else {
             smap_add(args, "rx_csum_offload", "false");
         }
+        if (dev->hw_ol_features & NETDEV_TX_TSO_OFFLOAD) {
+            smap_add(args, "tx_tso_offload", "true");
+        } else {
+            smap_add(args, "tx_tso_offload", "false");
+        }
         smap_add(args, "lsc_interrupt_mode",
                  dev->lsc_interrupt_mode ? "true" : "false");
     }
-- 
2.7.4



More information about the dev mailing list