[ovs-dev] [PATCH v1] netdev-dpdk: Implement TCP/UDP TX cksum in ovs-dpdk side
Zhenyu Gao
sysugaozhenyu at gmail.com
Tue Aug 1 14:26:31 UTC 2017
Currently, the dpdk-vhost side in ovs doesn't support tcp/udp tx cksum.
So L4 packets's cksum were calculated in VM side but performance is not
good.
Implementing tcp/udp tx cksum in ovs-dpdk side improves throughput and
makes virtio-net frontend-driver support NETIF_F_SG as well
Signed-off-by: Zhenyu Gao <sysugaozhenyu at gmail.com>
---
Here is some performance number:
Setup:
qperf client
+---------+
| VM |
+---------+
|
| qperf server
+--------------+ +------------+
| vswitch+dpdk | | bare-metal |
+--------------+ +------------+
| |
| |
pNic---------PhysicalSwitch----
do cksum in ovs-dpdk: Applied this patch and execute 'ethtool -K eth0 tx on' in VM side.
It offload cksum job to ovs-dpdk side.
do cksum in VM: Applied this patch and execute 'ethtool -K eth0 tx off' in VM side.
VM calculate cksum for tcp/udp packets.
We can see huge improvment in TCP throughput if we leverage ovs-dpdk cksum.
[root at localhost ~]# qperf -t 10 -oo msg_size:1:64K:*2 host-qperf-server01 tcp_bw tcp_lat udp_bw udp_lat
do cksum in ovs-dpdk do cksum in VM without this patch
tcp_bw:
bw = 2.05 MB/sec bw = 1.92 MB/sec bw = 1.95 MB/sec
tcp_bw:
bw = 3.9 MB/sec bw = 3.99 MB/sec bw = 3.98 MB/sec
tcp_bw:
bw = 8.09 MB/sec bw = 7.82 MB/sec bw = 8.19 MB/sec
tcp_bw:
bw = 14.9 MB/sec bw = 14.8 MB/sec bw = 15.7 MB/sec
tcp_bw:
bw = 27.7 MB/sec bw = 28 MB/sec bw = 29.7 MB/sec
tcp_bw:
bw = 51.2 MB/sec bw = 50.9 MB/sec bw = 54.9 MB/sec
tcp_bw:
bw = 86.7 MB/sec bw = 86.8 MB/sec bw = 95.1 MB/sec
tcp_bw:
bw = 149 MB/sec bw = 160 MB/sec bw = 149 MB/sec
tcp_bw:
bw = 211 MB/sec bw = 205 MB/sec bw = 216 MB/sec
tcp_bw:
bw = 271 MB/sec bw = 254 MB/sec bw = 275 MB/sec
tcp_bw:
bw = 326 MB/sec bw = 303 MB/sec bw = 321 MB/sec
tcp_bw:
bw = 407 MB/sec bw = 359 MB/sec bw = 361 MB/sec
tcp_bw:
bw = 816 MB/sec bw = 512 MB/sec bw = 419 MB/sec
tcp_bw:
bw = 840 MB/sec bw = 756 MB/sec bw = 457 MB/sec
tcp_bw:
bw = 1.07 GB/sec bw = 880 MB/sec bw = 480 MB/sec
tcp_bw:
bw = 1.17 GB/sec bw = 1.01 GB/sec bw = 488 MB/sec
tcp_bw:
bw = 1.17 GB/sec bw = 1.11 GB/sec bw = 483 MB/sec
tcp_lat:
latency = 29 us latency = 29.2 us latency = 29.6 us
tcp_lat:
latency = 28.9 us latency = 29.3 us latency = 29.5 us
tcp_lat:
latency = 29 us latency = 29.3 us latency = 29.6 us
tcp_lat:
latency = 29 us latency = 29.4 us latency = 29.5 us
tcp_lat:
latency = 29 us latency = 29.2 us latency = 29.6 us
tcp_lat:
latency = 29.1 us latency = 29.3 us latency = 29.7 us
tcp_lat:
latency = 29.4 us latency = 29.6 us latency = 30 us
tcp_lat:
latency = 29.8 us latency = 30.1 us latency = 30.2 us
tcp_lat:
latency = 30.9 us latency = 30.9 us latency = 31 us
tcp_lat:
latency = 46.9 us latency = 46.2 us latency = 32.2 us
tcp_lat:
latency = 51.5 us latency = 52.6 us latency = 34.5 us
tcp_lat:
latency = 43.9 us latency = 43.8 us latency = 43.6 us
tcp_lat:
latency = 47.6 us latency = 48 us latency = 48.1 us
tcp_lat:
latency = 77.7 us latency = 78.8 us latency = 78.8 us
tcp_lat:
latency = 82.8 us latency = 82.3 us latency = 116 us
tcp_lat:
latency = 94.8 us latency = 94.2 us latency = 134 us
tcp_lat:
latency = 167 us latency = 197 us latency = 172 us
udp_bw:
send_bw = 418 KB/sec send_bw = 413 KB/sec send_bw = 403 KB/sec
recv_bw = 410 KB/sec recv_bw = 412 KB/sec recv_bw = 400 KB/sec
udp_bw:
send_bw = 831 KB/sec send_bw = 825 KB/sec send_bw = 810 KB/sec
recv_bw = 828 KB/sec recv_bw = 816 KB/sec recv_bw = 807 KB/sec
udp_bw:
send_bw = 1.67 MB/sec send_bw = 1.65 MB/sec send_bw = 1.63 MB/sec
recv_bw = 1.64 MB/sec recv_bw = 1.62 MB/sec recv_bw = 1.63 MB/sec
udp_bw:
send_bw = 3.36 MB/sec send_bw = 3.29 MB/sec send_bw = 3.26 MB/sec
recv_bw = 3.29 MB/sec recv_bw = 3.25 MB/sec recv_bw = 2.82 MB/sec
udp_bw:
send_bw = 6.72 MB/sec send_bw = 6.61 MB/sec send_bw = 6.45 MB/sec
recv_bw = 6.54 MB/sec recv_bw = 6.59 MB/sec recv_bw = 6.45 MB/sec
udp_bw:
send_bw = 13.4 MB/sec send_bw = 13.2 MB/sec send_bw = 13 MB/sec
recv_bw = 13.1 MB/sec recv_bw = 13.1 MB/sec recv_bw = 13 MB/sec
udp_bw:
send_bw = 26.8 MB/sec send_bw = 26.4 MB/sec send_bw = 25.9 MB/sec
recv_bw = 26.4 MB/sec recv_bw = 26.2 MB/sec recv_bw = 25.7 MB/sec
udp_bw:
send_bw = 53.4 MB/sec send_bw = 52.5 MB/sec send_bw = 52 MB/sec
recv_bw = 48.4 MB/sec recv_bw = 51.8 MB/sec recv_bw = 51.2 MB/sec
udp_bw:
send_bw = 106 MB/sec send_bw = 104 MB/sec send_bw = 103 MB/sec
recv_bw = 98.9 MB/sec recv_bw = 93.2 MB/sec recv_bw = 100 MB/sec
udp_bw:
send_bw = 213 MB/sec send_bw = 206 MB/sec send_bw = 205 MB/sec
recv_bw = 197 MB/sec recv_bw = 196 MB/sec recv_bw = 202 MB/sec
udp_bw:
send_bw = 417 MB/sec send_bw = 405 MB/sec send_bw = 401 MB/sec
recv_bw = 400 MB/sec recv_bw = 333 MB/sec recv_bw = 358 MB/sec
udp_bw:
send_bw = 556 MB/sec send_bw = 552 MB/sec send_bw = 557 MB/sec
recv_bw = 361 MB/sec recv_bw = 365 MB/sec recv_bw = 362 MB/sec
udp_bw:
send_bw = 865 MB/sec send_bw = 866 MB/sec send_bw = 863 MB/sec
recv_bw = 564 MB/sec recv_bw = 573 MB/sec recv_bw = 584 MB/sec
udp_bw:
send_bw = 1.05 GB/sec send_bw = 1.09 GB/sec send_bw = 1.08 GB/sec
recv_bw = 789 MB/sec recv_bw = 732 MB/sec recv_bw = 793 MB/sec
udp_bw:
send_bw = 1.18 GB/sec send_bw = 1.23 GB/sec send_bw = 1.19 GB/sec
recv_bw = 658 MB/sec recv_bw = 788 MB/sec recv_bw = 673 MB/sec
udp_bw:
send_bw = 1.3 GB/sec send_bw = 1.3 GB/sec send_bw = 1.3 GB/sec
recv_bw = 659 MB/sec recv_bw = 763 MB/sec recv_bw = 762 MB/sec
udp_bw:
send_bw = 0 bytes/sec send_bw = 0 bytes/sec send_bw = 0 bytes/sec
recv_bw = 0 bytes/sec recv_bw = 0 bytes/sec recv_bw = 0 bytes/sec
udp_lat:
latency = 26.7 us latency = 26.5 us latency = 26.4 us
udp_lat:
latency = 26.7 us latency = 26.5 us latency = 26.3 us
udp_lat:
latency = 26.7 us latency = 26.7 us latency = 26.3 us
udp_lat:
latency = 26.7 us latency = 26.6 us latency = 26.3 us
udp_lat:
latency = 26.7 us latency = 26.7 us latency = 26.7 us
udp_lat:
latency = 27 us latency = 26.7 us latency = 26.6 us
udp_lat:
latency = 27 us latency = 26.9 us latency = 26.7 us
udp_lat:
latency = 27.6 us latency = 27.4 us latency = 27.3 us
udp_lat:
latency = 28.1 us latency = 28 us latency = 28 us
udp_lat:
latency = 29.4 us latency = 29.2 us latency = 29.2 us
udp_lat:
latency = 31 us latency = 31 us latency = 30.8 us
udp_lat:
latency = 41.4 us latency = 41.4 us latency = 41.3 us
udp_lat:
latency = 41.6 us latency = 41.5 us latency = 41.5 us
udp_lat:
latency = 64.9 us latency = 65 us latency = 65 us
udp_lat:
latency = 72.3 us latency = 72 us latency = 72 us
udp_lat:
latency = 121 us latency = 122 us latency = 122 us
udp_lat:
latency = 0 ns latency = 0 ns latency = 0 ns
lib/netdev-dpdk.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 82 insertions(+), 2 deletions(-)
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index ea17b97..d27d615 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -28,6 +28,7 @@
#include <rte_errno.h>
#include <rte_eth_ring.h>
#include <rte_ethdev.h>
+#include <rte_ip.h>
#include <rte_malloc.h>
#include <rte_mbuf.h>
#include <rte_meter.h>
@@ -1392,6 +1393,84 @@ netdev_dpdk_rxq_dealloc(struct netdev_rxq *rxq)
rte_free(rx);
}
+static inline void
+netdev_refill_l4_cksum(const char *data, struct dp_packet *pkt,
+ uint8_t l4_proto, bool is_ipv4)
+{
+ void *l3hdr = (void *)(data + pkt->l3_ofs);
+
+ if (l4_proto == IPPROTO_TCP) {
+ struct tcp_header *tcp_hdr = (struct tcp_header *)(data + pkt->l4_ofs);
+
+ pkt->mbuf.l2_len = pkt->l3_ofs;
+ pkt->mbuf.l3_len = pkt->l4_ofs - pkt->l3_ofs;
+ tcp_hdr->tcp_csum = 0;
+ if (is_ipv4) {
+ tcp_hdr->tcp_csum = rte_ipv4_udptcp_cksum(l3hdr, tcp_hdr);
+ pkt->mbuf.ol_flags ^= PKT_TX_TCP_CKSUM | PKT_TX_IPV4;
+ } else {
+ pkt->mbuf.ol_flags ^= PKT_TX_TCP_CKSUM | PKT_TX_IPV6;
+ tcp_hdr->tcp_csum = rte_ipv6_udptcp_cksum(l3hdr, tcp_hdr);
+ }
+ } else if (l4_proto == IPPROTO_UDP) {
+ struct udp_header *udp_hdr = (struct udp_header *)(data + pkt->l4_ofs);
+ /* do not recalculate udp cksum if it was 0 */
+ if (udp_hdr->udp_csum != 0) {
+ pkt->mbuf.l2_len = pkt->l3_ofs;
+ pkt->mbuf.l3_len = pkt->l4_ofs - pkt->l3_ofs;
+ udp_hdr->udp_csum = 0;
+ if (is_ipv4) {
+ /*do not calculate udp cksum if it was a fragment IP*/
+ if (IP_IS_FRAGMENT(((struct ipv4_hdr *)l3hdr)->
+ fragment_offset)) {
+ return;
+ }
+
+ pkt->mbuf.ol_flags ^= PKT_TX_UDP_CKSUM | PKT_TX_IPV4;
+ udp_hdr->udp_csum = rte_ipv4_udptcp_cksum(l3hdr, udp_hdr);
+ } else {
+ pkt->mbuf.ol_flags ^= PKT_TX_UDP_CKSUM | PKT_TX_IPV6;
+ udp_hdr->udp_csum = rte_ipv6_udptcp_cksum(l3hdr, udp_hdr);
+ }
+ }
+ }
+}
+
+static inline void
+netdev_prepare_tx_csum(struct dp_packet **pkts, int pkt_cnt)
+{
+ int i;
+
+ for (i = 0; i < pkt_cnt; i++) {
+ ovs_be16 dl_type;
+ struct dp_packet *pkt = (struct dp_packet *)pkts[i];
+ const char *data = dp_packet_data(pkt);
+ void *l3hdr = (char *)(data + pkt->l3_ofs);
+
+ if (pkt->l4_ofs == UINT16_MAX || pkt->l3_ofs == UINT16_MAX) {
+ continue;
+ }
+ /* This take a assumption that it should be a vhost packet if this
+ * packet was allocated by DPDK pool and try sending to pNic. */
+ if (pkt->source == DPBUF_DPDK &&
+ !(pkt->mbuf.ol_flags & PKT_TX_L4_MASK)) {
+ // DPDK vhost-user tags PKT_TX_L4_MASK if a L4 packet need cksum
+ continue;
+ }
+
+ dl_type = *(ovs_be16 *)(data + pkt->l3_ofs - 2);
+ if (dl_type == htons(ETH_TYPE_IP)) {
+ netdev_refill_l4_cksum(data, pkt,
+ ((struct ipv4_hdr *)l3hdr)->next_proto_id,
+ true);
+ } else if (dl_type == htons(ETH_TYPE_IPV6)) {
+ netdev_refill_l4_cksum(data, pkt,
+ ((struct ipv6_hdr *)l3hdr)->proto,
+ false);
+ }
+ }
+}
+
/* Tries to transmit 'pkts' to txq 'qid' of device 'dev'. Takes ownership of
* 'pkts', even in case of failure.
*
@@ -1833,6 +1912,8 @@ netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
return;
}
+ netdev_prepare_tx_csum(batch->packets, batch->count);
+
if (OVS_UNLIKELY(concurrent_txq)) {
qid = qid % dev->up.n_txq;
rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
@@ -2741,8 +2822,7 @@ netdev_dpdk_vhost_class_init(void)
if (ovsthread_once_start(&once)) {
rte_vhost_driver_callback_register(&virtio_net_device_ops);
rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4
- | 1ULL << VIRTIO_NET_F_HOST_TSO6
- | 1ULL << VIRTIO_NET_F_CSUM);
+ | 1ULL << VIRTIO_NET_F_HOST_TSO6);
ovs_thread_create("vhost_thread", start_vhost_loop, NULL);
ovsthread_once_done(&once);
--
1.8.3.1
More information about the dev
mailing list