[ovs-dev] [PATCH RFC] netdev-dpdk: Rx checksum offloading feature on DPDK physical ports.

Sugesh Chandran sugesh.chandran at intel.com
Mon Aug 8 10:03:38 UTC 2016


Add Rx checksum offloading feature support on DPDK physical ports. By default,
the Rx checksum is calculated by the software in OVS-DPDK datapath. However,
the checksum offloading can be enabled either while adding a new DPDK physical
port to OVS or at runtime.

To enable checksum offloading at rx side while adding a port, add the
'rx-checksum-offload' option to the 'ovs-vsctl add-port' command-line as below,

'ovs-vsctl add-port br0 dpdk0 -- \
set Interface dpdk0 type=dpdk options:rx-checksum-offload=true'

To turn ON rx checksum offloading at run time,
'ovs-vsctl set Interface dpdk0 options:rx-checksum-offload=true'

The Rx checksum offloading can be turned off by setting the parameter to
'false'. For example, to turn off the Rx checksum offloading when adding a port,

 'ovs-vsctl add-port br0 dpdk0 -- \
  set Interface dpdk0 type=dpdk options:rx-checksum-offload=false'

OR (to disable it at run time, after the port has been added to OVS)

'ovs-vsctl set Interface dpdk0 options:rx-checksum-offload=false'

This is an RFC patch, as the new checksum offload flags 'PKT_RX_L4_CKSUM_GOOD'
and 'PKT_RX_IP_CKSUM_GOOD' will be added only in the DPDK 16.11 release. OVS
must be compiled against DPDK 16.11 to make use of the checksum offloading
feature. The documentation ('vswitch.xml' and 'INSTALL.DPDK-ADVANCED.md') for
the checksum offloading feature will be updated in the next version of the
patch once the proposal is finalized.

The Tx checksum offloading support is not implemented for the following
reasons.

1) Checksum offloading and vectorization are mutually exclusive in DPDK poll
mode driver. Vector packet processing is turned OFF when checksum offloading
is enabled which causes significant performance drop at Tx side.

2) Normally, OVS generates checksum for tunnel packets. It is more efficient to
calculate the checksum at 'tunnel_push' operation itself as all the relevant
fields are already present in the cache and also no additional validation
overhead is involved as in checksum offloading.

Rx checksum offloading still offers 8-9% of improvement on VxLAN tunneling
decapsulation even though the SSE vector Rx function is disabled in DPDK poll
mode driver.

Signed-off-by: Sugesh Chandran <sugesh.chandran at intel.com>
---
 lib/dp-packet.h         |   8 ++++
 lib/dpif-netdev.c       |  33 ++++++++++++----
 lib/netdev-dpdk.c       | 103 ++++++++++++++++++++++++++++++++++++------------
 lib/netdev-dpdk.h       |   6 +++
 lib/netdev-native-tnl.c |  10 ++++-
 lib/netdev-provider.h   |   5 +++
 lib/netdev.c            |   1 +
 lib/netdev.h            |   5 +++
 lib/packets.h           |   2 +
 9 files changed, 138 insertions(+), 35 deletions(-)

diff --git a/lib/dp-packet.h b/lib/dp-packet.h
index 7c1e637..f2e9d4f 100644
--- a/lib/dp-packet.h
+++ b/lib/dp-packet.h
@@ -38,6 +38,14 @@ enum OVS_PACKED_ENUM dp_packet_source {
 
 #define DP_PACKET_CONTEXT_SIZE 64
 
+/* Rx checksum validation results, mirrored from the DPDK mbuf flags. */
+enum hw_pkt_ol_flags {
+    OL_RX_IP_CKSUM_GOOD = (1 << 0),  /* IP checksum verified good. */
+    OL_RX_L4_CKSUM_GOOD = (1 << 1),  /* L4 checksum verified good. */
+    OL_RX_IP_CKSUM_BAD = (1 << 2),   /* IP checksum verified bad. */
+    OL_RX_L4_CKSUM_BAD = (1 << 3),   /* L4 checksum verified bad. */
+};
+
 /* Buffer for holding packet data.  A dp_packet is automatically reallocated
  * as necessary if it grows too large for the available memory.
  */
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index e39362e..6168b3a 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -46,6 +46,7 @@
 #include "hmapx.h"
 #include "latch.h"
 #include "netdev.h"
+#include "netdev-provider.h"
 #include "netdev-dpdk.h"
 #include "netdev-vport.h"
 #include "netlink.h"
@@ -532,7 +533,8 @@ static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
                                       size_t actions_len,
                                       long long now);
 static void dp_netdev_input(struct dp_netdev_pmd_thread *,
-                            struct dp_packet_batch *, odp_port_t port_no);
+                            struct dp_packet_batch *,
+                            struct dp_netdev_port *port);
 static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
                                   struct dp_packet_batch *);
 
@@ -2784,7 +2786,7 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
         *recirc_depth_get() = 0;
 
         cycles_count_start(pmd);
-        dp_netdev_input(pmd, &batch, port->port_no);
+        dp_netdev_input(pmd, &batch, port);
         cycles_count_end(pmd, PMD_CYCLES_PROCESSING);
     } else if (error != EAGAIN && error != EOPNOTSUPP) {
         static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
@@ -3898,13 +3900,22 @@ static inline size_t
 emc_processing(struct dp_netdev_pmd_thread *pmd, struct dp_packet_batch *packets_,
                struct netdev_flow_key *keys,
                struct packet_batch_per_flow batches[], size_t *n_batches,
-               bool md_is_valid, odp_port_t port_no)
+               bool md_is_valid, struct dp_netdev_port *port)
 {
     struct emc_cache *flow_cache = &pmd->flow_cache;
     struct netdev_flow_key *key = &keys[0];
     size_t i, n_missed = 0, n_dropped = 0;
     struct dp_packet **packets = packets_->packets;
     int cnt = packets_->count;
+    odp_port_t port_no = 0;
+    bool rx_csum_ol_flag = 0;
+
+    if (port) {
+        port_no = port->port_no;
+        rx_csum_ol_flag = (port->netdev->hw_features &
+                           NETDEV_RX_CHECKSUM_OFFLOAD);
+    }
+
 
     for (i = 0; i < cnt; i++) {
         struct dp_netdev_flow *flow;
@@ -3918,6 +3929,7 @@ emc_processing(struct dp_netdev_pmd_thread *pmd, struct dp_packet_batch *packets
 
         if (i != cnt - 1) {
             /* Prefetch next packet data and metadata. */
+            OVS_PREFETCH(&packets[i+1]->md.ol_flags);
             OVS_PREFETCH(dp_packet_data(packets[i+1]));
             pkt_metadata_prefetch_init(&packets[i+1]->md);
         }
@@ -3925,6 +3937,11 @@ emc_processing(struct dp_netdev_pmd_thread *pmd, struct dp_packet_batch *packets
         if (!md_is_valid) {
             pkt_metadata_init(&packet->md, port_no);
         }
+
+        if(rx_csum_ol_flag) {
+            dpdk_hw_checksum_validate(packet);
+        }
+
         miniflow_extract(packet, &key->mf);
         key->len = 0; /* Not computed yet. */
         key->hash = dpif_netdev_packet_get_rss_hash(packet, &key->mf);
@@ -4114,7 +4131,7 @@ fast_path_processing(struct dp_netdev_pmd_thread *pmd,
 static void
 dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
                   struct dp_packet_batch *packets,
-                  bool md_is_valid, odp_port_t port_no)
+                  bool md_is_valid, struct dp_netdev_port *port)
 {
     int cnt = packets->count;
 #if !defined(__CHECKER__) && !defined(_WIN32)
@@ -4130,7 +4147,7 @@ dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
 
     n_batches = 0;
     newcnt = emc_processing(pmd, packets, keys, batches, &n_batches,
-                            md_is_valid, port_no);
+                            md_is_valid, port);
     if (OVS_UNLIKELY(newcnt)) {
         packets->count = newcnt;
         fast_path_processing(pmd, packets, keys, batches, &n_batches, now);
@@ -4148,16 +4165,16 @@ dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
 static void
 dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
                 struct dp_packet_batch *packets,
-                odp_port_t port_no)
+                struct dp_netdev_port *port)
 {
-     dp_netdev_input__(pmd, packets, false, port_no);
+     dp_netdev_input__(pmd, packets, false, port);
 }
 
 static void
 dp_netdev_recirculate(struct dp_netdev_pmd_thread *pmd,
                       struct dp_packet_batch *packets)
 {
-     dp_netdev_input__(pmd, packets, true, 0);
+     dp_netdev_input__(pmd, packets, true, NULL);
 }
 
 struct dp_netdev_execute_aux {
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index a0d541a..c9ebdcb 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -416,6 +416,12 @@ dpdk_buf_size(int mtu)
                      NETDEV_DPDK_MBUF_ALIGN);
 }
 
+static struct netdev_dpdk *
+netdev_dpdk_cast(const struct netdev *netdev)
+{
+    return CONTAINER_OF(netdev, struct netdev_dpdk, up);
+}
+
 /* XXX: use dpdk malloc for entire OVS. in fact huge page should be used
  * for all other segments data, bss and text. */
 
@@ -573,6 +579,7 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int n_txq)
 {
     int diag = 0;
     int i;
+    struct rte_eth_conf ol_port_conf = port_conf;
 
     /* A device may report more queues than it makes available (this has
      * been observed for Intel xl710, which reserves some of them for
@@ -584,7 +591,10 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int n_txq)
             VLOG_INFO("Retrying setup with (rxq:%d txq:%d)", n_rxq, n_txq);
         }
 
-        diag = rte_eth_dev_configure(dev->port_id, n_rxq, n_txq, &port_conf);
+        ol_port_conf.rxmode.hw_ip_checksum = (dev->up.hw_features &
+                                              NETDEV_RX_CHECKSUM_OFFLOAD) != 0;
+        diag = rte_eth_dev_configure(dev->port_id, n_rxq, n_txq,
+                                     &ol_port_conf);
         if (diag) {
             break;
         }
@@ -639,6 +649,29 @@ dpdk_eth_flow_ctrl_setup(struct netdev_dpdk *dev) OVS_REQUIRES(dev->mutex)
     }
 }
 
+/* Checks the requested Rx checksum offload state against the NIC's
+ * capabilities; clears the flag when unsupported.  Caller holds dev mutex. */
+static void
+dpdk_eth_checksum_offload_configure(struct netdev *netdev)
+    OVS_REQUIRES(netdev_dpdk_cast(netdev)->mutex)
+{
+    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
+    struct rte_eth_dev_info info;
+    uint32_t rx_chksm_offload_capa = DEV_RX_OFFLOAD_UDP_CKSUM |
+                                     DEV_RX_OFFLOAD_TCP_CKSUM |
+                                     DEV_RX_OFFLOAD_IPV4_CKSUM;
+
+    rte_eth_dev_info_get(dev->port_id, &info);
+    if ((netdev->hw_features & NETDEV_RX_CHECKSUM_OFFLOAD)
+        && (info.rx_offload_capa & rx_chksm_offload_capa)
+           != rx_chksm_offload_capa) {
+        VLOG_WARN("Failed to enable Rx checksum offload on device %d",
+                  dev->port_id);
+        netdev->hw_features &= ~NETDEV_RX_CHECKSUM_OFFLOAD;
+    }
+    netdev_request_reconfigure(netdev);
+}
+
 static int
 dpdk_eth_dev_init(struct netdev_dpdk *dev) OVS_REQUIRES(dpdk_mutex)
 {
@@ -697,12 +730,6 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev) OVS_REQUIRES(dpdk_mutex)
     return 0;
 }
 
-static struct netdev_dpdk *
-netdev_dpdk_cast(const struct netdev *netdev)
-{
-    return CONTAINER_OF(netdev, struct netdev_dpdk, up);
-}
-
 static struct netdev *
 netdev_dpdk_alloc(void)
 {
@@ -1004,20 +1031,35 @@ netdev_dpdk_set_config(struct netdev *netdev, const struct smap *args)
         netdev_request_reconfigure(netdev);
     }
 
-    /* Flow control configuration for DPDK Ethernet ports. */
     if (dev->type == DPDK_DEV_ETH) {
-        bool rx_fc_en = false;
-        bool tx_fc_en = false;
-        enum rte_eth_fc_mode fc_mode_set[2][2] =
+        {
+            /* Flow control configuration for DPDK Ethernet ports. */
+            bool rx_fc_en = false;
+            bool tx_fc_en = false;
+            enum rte_eth_fc_mode fc_mode_set[2][2] =
                                            {{RTE_FC_NONE, RTE_FC_TX_PAUSE},
                                             {RTE_FC_RX_PAUSE, RTE_FC_FULL}
                                            };
-        rx_fc_en = smap_get_bool(args, "rx-flow-ctrl", false);
-        tx_fc_en = smap_get_bool(args, "tx-flow-ctrl", false);
-        dev->fc_conf.autoneg = smap_get_bool(args, "flow-ctrl-autoneg", false);
-        dev->fc_conf.mode = fc_mode_set[tx_fc_en][rx_fc_en];
-
-        dpdk_eth_flow_ctrl_setup(dev);
+            rx_fc_en = smap_get_bool(args, "rx-flow-ctrl", false);
+            tx_fc_en = smap_get_bool(args, "tx-flow-ctrl", false);
+            dev->fc_conf.autoneg = smap_get_bool(args, "flow-ctrl-autoneg",
+                                                 false);
+            dev->fc_conf.mode = fc_mode_set[tx_fc_en][rx_fc_en];
+            dpdk_eth_flow_ctrl_setup(dev);
+        }
+        {
+            /* Checksum offload configuration for DPDK Ethernet ports. */
+            bool rx_chksm_ofld = false;
+            bool temp_ol_flag = false;
+
+            rx_chksm_ofld = smap_get_bool(args, "rx-checksum-offload", false);
+            temp_ol_flag = (netdev->hw_features & NETDEV_RX_CHECKSUM_OFFLOAD)
+                            != 0;
+            if (temp_ol_flag != rx_chksm_ofld) {
+                netdev->hw_features ^= NETDEV_RX_CHECKSUM_OFFLOAD;
+                dpdk_eth_checksum_offload_configure(netdev);
+            }
+        }
     }
     ovs_mutex_unlock(&dev->mutex);
 
@@ -2837,13 +2879,6 @@ netdev_dpdk_reconfigure(struct netdev *netdev)
     ovs_mutex_lock(&dpdk_mutex);
     ovs_mutex_lock(&dev->mutex);
 
-    if (netdev->n_txq == dev->requested_n_txq
-        && netdev->n_rxq == dev->requested_n_rxq) {
-        /* Reconfiguration is unnecessary */
-
-        goto out;
-    }
-
     rte_eth_dev_stop(dev->port_id);
 
     netdev->n_txq = dev->requested_n_txq;
@@ -2853,7 +2888,6 @@ netdev_dpdk_reconfigure(struct netdev *netdev)
     err = dpdk_eth_dev_init(dev);
     netdev_dpdk_alloc_txq(dev, netdev->n_txq);
 
-out:
 
     ovs_mutex_unlock(&dev->mutex);
     ovs_mutex_unlock(&dpdk_mutex);
@@ -3469,6 +3503,25 @@ dpdk_set_lcore_id(unsigned cpu)
     RTE_PER_LCORE(_lcore_id) = cpu;
 }
 
+/* Translates the Rx checksum validation results reported by the DPDK
+ * PMD in 'packet''s mbuf ('PKT_RX_*_CKSUM_*') into the corresponding
+ * 'OL_RX_*' bits of the OVS packet metadata 'ol_flags'. */
+void
+dpdk_hw_checksum_validate(struct dp_packet *packet)
+{
+    if (packet->mbuf.ol_flags & PKT_RX_L4_CKSUM_GOOD) {
+        packet->md.ol_flags |= OL_RX_L4_CKSUM_GOOD;
+    } else if (packet->mbuf.ol_flags & PKT_RX_L4_CKSUM_BAD) {
+        packet->md.ol_flags |= OL_RX_L4_CKSUM_BAD;
+    }
+
+    if (packet->mbuf.ol_flags & PKT_RX_IP_CKSUM_GOOD) {
+        packet->md.ol_flags |= OL_RX_IP_CKSUM_GOOD;
+    } else if (packet->mbuf.ol_flags & PKT_RX_IP_CKSUM_BAD) {
+        packet->md.ol_flags |= OL_RX_IP_CKSUM_BAD;
+    }
+}
+
 static bool
 dpdk_thread_is_pmd(void)
 {
diff --git a/lib/netdev-dpdk.h b/lib/netdev-dpdk.h
index 80bb834..a723097 100644
--- a/lib/netdev-dpdk.h
+++ b/lib/netdev-dpdk.h
@@ -26,6 +26,7 @@ struct smap;
 void netdev_dpdk_register(void);
 void free_dpdk_buf(struct dp_packet *);
 void dpdk_set_lcore_id(unsigned cpu);
+void dpdk_hw_checksum_validate(struct dp_packet *packet);
 
 #else
 
@@ -51,6 +52,11 @@ dpdk_set_lcore_id(unsigned cpu OVS_UNUSED)
     /* Nothing */
 }
 
+static inline void
+dpdk_hw_checksum_validate(struct dp_packet *packet OVS_UNUSED) {
+    /* Nothing: Rx checksum offload results exist only with DPDK. */
+}
+
 #endif /* DPDK_NETDEV */
 
 void dpdk_init(const struct smap *ovs_other_config);
diff --git a/lib/netdev-native-tnl.c b/lib/netdev-native-tnl.c
index ce2582f..bad2c55 100644
--- a/lib/netdev-native-tnl.c
+++ b/lib/netdev-native-tnl.c
@@ -85,7 +85,8 @@ netdev_tnl_ip_extract_tnl_md(struct dp_packet *packet, struct flow_tnl *tnl,
 
         ovs_be32 ip_src, ip_dst;
 
-        if (csum(ip, IP_IHL(ip->ip_ihl_ver) * 4)) {
+        if (!(packet->md.ol_flags & OL_RX_IP_CKSUM_GOOD) &&
+            csum(ip, IP_IHL(ip->ip_ihl_ver) * 4)) {
             VLOG_WARN_RL(&err_rl, "ip packet has invalid checksum");
             return NULL;
         }
@@ -178,7 +179,10 @@ udp_extract_tnl_md(struct dp_packet *packet, struct flow_tnl *tnl,
         return NULL;
     }
 
-    if (udp->udp_csum) {
+    if (packet->md.ol_flags & OL_RX_L4_CKSUM_GOOD) {
+        tnl->flags |= FLOW_TNL_F_CSUM;
+    }
+    else if (udp->udp_csum) {
         uint32_t csum;
         if (netdev_tnl_is_header_ipv6(dp_packet_data(packet))) {
             csum = packet_csum_pseudoheader6(dp_packet_l3(packet));
@@ -195,6 +199,8 @@ udp_extract_tnl_md(struct dp_packet *packet, struct flow_tnl *tnl,
         tnl->flags |= FLOW_TNL_F_CSUM;
     }
 
+    /* reset the offload flags */
+    packet->md.ol_flags = 0;
     tnl->tp_src = udp->udp_src;
     tnl->tp_dst = udp->udp_dst;
 
diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h
index ae390cb..548ffe9 100644
--- a/lib/netdev-provider.h
+++ b/lib/netdev-provider.h
@@ -69,6 +69,11 @@ struct netdev {
     int n_txq;
     int n_rxq;
     int ref_cnt;                        /* Times this devices was opened. */
+
+    /* offload features supported by netdev, defined by set of
+     * netdev_hw_features enums. */
+    uint32_t hw_features;
+
     struct shash_node *node;            /* Pointer to element in global map. */
     struct ovs_list saved_flags_list; /* Contains "struct netdev_saved_flags". */
 };
diff --git a/lib/netdev.c b/lib/netdev.c
index 75bf1cb..6ba326e 100644
--- a/lib/netdev.c
+++ b/lib/netdev.c
@@ -352,6 +352,7 @@ netdev_open(const char *name, const char *type, struct netdev **netdevp)
                 netdev->last_reconfigure_seq =
                     seq_read(netdev->reconfigure_seq);
                 netdev->node = shash_add(&netdev_shash, name, netdev);
+                netdev->hw_features = 0x0;
 
                 /* By default enable one tx and rx queue per netdev. */
                 netdev->n_txq = netdev->netdev_class->send ? 1 : 0;
diff --git a/lib/netdev.h b/lib/netdev.h
index dc7ede8..43b1eb8 100644
--- a/lib/netdev.h
+++ b/lib/netdev.h
@@ -193,6 +193,11 @@ enum netdev_flags {
     NETDEV_LOOPBACK = 0x0004    /* This is a loopback device. */
 };
 
+/*Flags for supported netdev offload features */
+enum netdev_hw_features {
+    NETDEV_RX_CHECKSUM_OFFLOAD = 1 << 0,
+};
+
 int netdev_get_flags(const struct netdev *, enum netdev_flags *);
 int netdev_set_flags(struct netdev *, enum netdev_flags,
                      struct netdev_saved_flags **);
diff --git a/lib/packets.h b/lib/packets.h
index 9b98b29..fd94162 100644
--- a/lib/packets.h
+++ b/lib/packets.h
@@ -107,6 +107,7 @@ struct pkt_metadata {
     struct flow_tnl tunnel;     /* Encapsulating tunnel parameters. Note that
                                  * if 'ip_dst' == 0, the rest of the fields may
                                  * be uninitialized. */
+    uint32_t ol_flags;          /* offload flags */
 };
 
 static inline void
@@ -129,6 +130,7 @@ pkt_metadata_init(struct pkt_metadata *md, odp_port_t port)
     md->tunnel.ipv6_dst = in6addr_any;
 
     md->in_port.odp_port = port;
+    md->ol_flags = 0;
 }
 
 /* This function prefetches the cachelines touched by pkt_metadata_init()
-- 
2.5.0




More information about the dev mailing list