[ovs-dev] [PATCH v7] Use TPACKET_V3 to accelerate veth for userspace datapath
yang_y_yi at 163.com
yang_y_yi at 163.com
Wed Mar 18 09:02:40 UTC 2020
From: Yi Yang <yangyi01 at inspur.com>
We can avoid high system call overhead by using TPACKET_V3
and using DPDK-like poll to receive and send packets (Note: send
still needs to call sendto to trigger final packet transmission).
From Linux kernel 3.10 onward, TPACKET_V3 has been supported,
so all the Linux kernels that current OVS supports can run
TPACKET_V3 without any problem.
I can see about a 50% performance improvement for veth compared to
the previous recvmmsg optimization when using TPACKET_V3: throughput
is about 2.21 Gbps, up from 1.47 Gbps before.
After is_pmd is set to true, performance improves much further:
about a 180% performance improvement.
TPACKET_V3 can support TSO, but its TSO performance isn't good because
of a TPACKET_V3 kernel implementation issue, so this patch falls back
to recvmmsg when userspace-tso-enable is set to true. TPACKET_V3
performs better than recvmmsg when userspace-tso-enable is set to
false, so TPACKET_V3 is used in that case.
Note: how much performance improves depends on your platform;
some platforms see a huge improvement, while on others it is less
noticeable. However, if is_pmd is set to true, you can see a big
performance improvement, provided that the tested veth
interfaces are attached to different PMD threads.
Signed-off-by: Yi Yang <yangyi01 at inspur.com>
Co-authored-by: William Tu <u9012063 at gmail.com>
Signed-off-by: William Tu <u9012063 at gmail.com>
---
acinclude.m4 | 12 ++
configure.ac | 1 +
include/sparse/linux/if_packet.h | 111 +++++++++++
lib/dp-packet.c | 18 ++
lib/dp-packet.h | 9 +
lib/netdev-linux-private.h | 26 +++
lib/netdev-linux.c | 419 +++++++++++++++++++++++++++++++++++++--
7 files changed, 579 insertions(+), 17 deletions(-)
Changelog:
- v6->v7
* is_pmd is set to true for system interfaces
* Use zero copy for tpacket_v3 receiving
* Fix comments by William
* Remove include/linux/if_packet.h
- v5->v6
* Fall back to recvmmsg in case userspace-tso-enable is true
because of TPACKET_V3 kernel implementation issue for tso
support
- v4->v5
* Fix travis build issues
* Fix comments issues (capitalize the first letter)
* Verify TSO on Ubuntu 18.04 3.5.0-40-generic
- v3->v4
* Fix sparse check errors
- v2->v3
* Fix build issues in case HAVE_TPACKET_V3 is not defined
* Add tso-related support code
* make sure it can work normally in case userspace-tso-enable is true
- v1->v2
* Remove TPACKET_V1 and TPACKET_V2, which are obsolete
* Add include/linux/if_packet.h
* Change include/sparse/linux/if_packet.h
diff --git a/acinclude.m4 b/acinclude.m4
index 02efea6..1488ded 100644
--- a/acinclude.m4
+++ b/acinclude.m4
@@ -1082,6 +1082,18 @@ AC_DEFUN([OVS_CHECK_IF_DL],
AC_SEARCH_LIBS([pcap_open_live], [pcap])
fi])
+dnl OVS_CHECK_LINUX_TPACKET
+dnl
+dnl Configure Linux TPACKET.
+AC_DEFUN([OVS_CHECK_LINUX_TPACKET], [
+ AC_COMPILE_IFELSE([
+ AC_LANG_PROGRAM([#include <linux/if_packet.h>], [
+ struct tpacket3_hdr x = { 0 };
+ ])],
+ [AC_DEFINE([HAVE_TPACKET_V3], [1],
+ [Define to 1 if struct tpacket3_hdr is available.])])
+])
+
dnl Checks for buggy strtok_r.
dnl
dnl Some versions of glibc 2.7 has a bug in strtok_r when compiling
diff --git a/configure.ac b/configure.ac
index 1877aae..b61a1f4 100644
--- a/configure.ac
+++ b/configure.ac
@@ -89,6 +89,7 @@ OVS_CHECK_VISUAL_STUDIO_DDK
OVS_CHECK_COVERAGE
OVS_CHECK_NDEBUG
OVS_CHECK_NETLINK
+OVS_CHECK_LINUX_TPACKET
OVS_CHECK_OPENSSL
OVS_CHECK_LIBCAPNG
OVS_CHECK_LOGDIR
diff --git a/include/sparse/linux/if_packet.h b/include/sparse/linux/if_packet.h
index 5ff6d47..0ac3fce 100644
--- a/include/sparse/linux/if_packet.h
+++ b/include/sparse/linux/if_packet.h
@@ -5,6 +5,7 @@
#error "Use this header only with sparse. It is not a correct implementation."
#endif
+#include <openvswitch/types.h>
#include_next <linux/if_packet.h>
/* Fix endianness of 'spkt_protocol' and 'sll_protocol' members. */
@@ -27,4 +28,114 @@ struct sockaddr_ll {
unsigned char sll_addr[8];
};
+/* Packet types */
+#define PACKET_HOST 0 /* To us */
+#define PACKET_OTHERHOST 3 /* To someone else */
+#define PACKET_LOOPBACK 5 /* MC/BRD frame looped back */
+
+/* Packet socket options */
+#define PACKET_RX_RING 5
+#define PACKET_VERSION 10
+#define PACKET_TX_RING 13
+#define PACKET_VNET_HDR 15
+
+/* Rx ring - header status */
+#define TP_STATUS_KERNEL 0
+#define TP_STATUS_USER (1 << 0)
+#define TP_STATUS_VLAN_VALID (1 << 4) /* auxdata has valid tp_vlan_tci */
+#define TP_STATUS_VLAN_TPID_VALID (1 << 6) /* auxdata has valid tp_vlan_tpid */
+
+/* Tx ring - header status */
+#define TP_STATUS_SEND_REQUEST (1 << 0)
+#define TP_STATUS_SENDING (1 << 1)
+
+#define tpacket_hdr rpl_tpacket_hdr
+struct tpacket_hdr {
+ unsigned long tp_status;
+ unsigned int tp_len;
+ unsigned int tp_snaplen;
+ unsigned short tp_mac;
+ unsigned short tp_net;
+ unsigned int tp_sec;
+ unsigned int tp_usec;
+};
+
+#define TPACKET_ALIGNMENT 16
+#define TPACKET_ALIGN(x) (((x)+TPACKET_ALIGNMENT-1)&~(TPACKET_ALIGNMENT-1))
+
+#define tpacket_hdr_variant1 rpl_tpacket_hdr_variant1
+struct tpacket_hdr_variant1 {
+ uint32_t tp_rxhash;
+ uint32_t tp_vlan_tci;
+ uint16_t tp_vlan_tpid;
+ uint16_t tp_padding;
+};
+
+#define tpacket3_hdr rpl_tpacket3_hdr
+struct tpacket3_hdr {
+ uint32_t tp_next_offset;
+ uint32_t tp_sec;
+ uint32_t tp_nsec;
+ uint32_t tp_snaplen;
+ uint32_t tp_len;
+ uint32_t tp_status;
+ uint16_t tp_mac;
+ uint16_t tp_net;
+ /* pkt_hdr variants */
+ union {
+ struct tpacket_hdr_variant1 hv1;
+ };
+ uint8_t tp_padding[8];
+};
+
+#define tpacket_bd_ts rpl_tpacket_bd_ts
+struct tpacket_bd_ts {
+ unsigned int ts_sec;
+ union {
+ unsigned int ts_usec;
+ unsigned int ts_nsec;
+ };
+};
+
+#define tpacket_hdr_v1 rpl_tpacket_hdr_v1
+struct tpacket_hdr_v1 {
+ uint32_t block_status;
+ uint32_t num_pkts;
+ uint32_t offset_to_first_pkt;
+ uint32_t blk_len;
+ uint64_t __attribute__((aligned(8))) seq_num;
+ struct tpacket_bd_ts ts_first_pkt, ts_last_pkt;
+};
+
+#define tpacket_bd_header_u rpl_tpacket_bd_header_u
+union tpacket_bd_header_u {
+ struct tpacket_hdr_v1 bh1;
+};
+
+#define tpacket_block_desc rpl_tpacket_block_desc
+struct tpacket_block_desc {
+ uint32_t version;
+ uint32_t offset_to_priv;
+ union tpacket_bd_header_u hdr;
+};
+
+#define TPACKET3_HDRLEN \
+ (TPACKET_ALIGN(sizeof(struct tpacket3_hdr)) + sizeof(struct sockaddr_ll))
+
+enum rpl_tpacket_versions {
+ TPACKET_V1,
+ TPACKET_V2,
+ TPACKET_V3
+};
+
+#define tpacket_req3 rpl_tpacket_req3
+struct tpacket_req3 {
+ unsigned int tp_block_size; /* Minimal size of contiguous block */
+ unsigned int tp_block_nr; /* Number of blocks */
+ unsigned int tp_frame_size; /* Size of frame */
+ unsigned int tp_frame_nr; /* Total number of frames */
+ unsigned int tp_retire_blk_tov; /* Timeout in msecs */
+ unsigned int tp_sizeof_priv; /* Offset to private data area */
+ unsigned int tp_feature_req_word;
+};
#endif
diff --git a/lib/dp-packet.c b/lib/dp-packet.c
index cd26235..82f4934 100644
--- a/lib/dp-packet.c
+++ b/lib/dp-packet.c
@@ -76,6 +76,21 @@ dp_packet_use_afxdp(struct dp_packet *b, void *data, size_t allocated,
}
#endif
+#if HAVE_TPACKET_V3
+/* Initialize 'b' as an dp_packet that contains tpacket data.
+ */
+void
+dp_packet_use_tpacket(struct dp_packet *b, void *data, size_t allocated,
+ size_t headroom)
+{
+ dp_packet_set_base(b, (char *)data - headroom);
+ dp_packet_set_data(b, data);
+ dp_packet_set_size(b, 0);
+
+ dp_packet_init__(b, allocated, DPBUF_TPACKET_V3);
+}
+#endif
+
/* Initializes 'b' as an empty dp_packet that contains the 'allocated' bytes of
* memory starting at 'base'. 'base' should point to a buffer on the stack.
* (Nothing actually relies on 'base' being allocated on the stack. It could
@@ -271,6 +286,9 @@ dp_packet_resize(struct dp_packet *b, size_t new_headroom, size_t new_tailroom)
case DPBUF_AFXDP:
OVS_NOT_REACHED();
+ case DPBUF_TPACKET_V3:
+ OVS_NOT_REACHED();
+
case DPBUF_STUB:
b->source = DPBUF_MALLOC;
new_base = xmalloc(new_allocated);
diff --git a/lib/dp-packet.h b/lib/dp-packet.h
index 9f8991f..955c6f8 100644
--- a/lib/dp-packet.h
+++ b/lib/dp-packet.h
@@ -44,6 +44,7 @@ enum OVS_PACKED_ENUM dp_packet_source {
* ref to dp_packet_init_dpdk() in dp-packet.c.
*/
DPBUF_AFXDP, /* Buffer data from XDP frame. */
+ DPBUF_TPACKET_V3 /* Buffer data from TPACKET_V3 rx ring */
};
#define DP_PACKET_CONTEXT_SIZE 64
@@ -139,6 +140,9 @@ void dp_packet_use_const(struct dp_packet *, const void *, size_t);
#if HAVE_AF_XDP
void dp_packet_use_afxdp(struct dp_packet *, void *, size_t, size_t);
#endif
+#if HAVE_TPACKET_V3
+void dp_packet_use_tpacket(struct dp_packet *, void *, size_t, size_t);
+#endif
void dp_packet_init_dpdk(struct dp_packet *);
void dp_packet_init(struct dp_packet *, size_t);
@@ -207,6 +211,11 @@ dp_packet_delete(struct dp_packet *b)
return;
}
+ if (b->source == DPBUF_TPACKET_V3) {
+ /* TPACKET_V3 buffer needn't free, it is recycled. */
+ return;
+ }
+
dp_packet_uninit(b);
free(b);
}
diff --git a/lib/netdev-linux-private.h b/lib/netdev-linux-private.h
index c7c515f..296f085 100644
--- a/lib/netdev-linux-private.h
+++ b/lib/netdev-linux-private.h
@@ -20,6 +20,7 @@
#include <linux/filter.h>
#include <linux/gen_stats.h>
#include <linux/if_ether.h>
+#include <linux/if_packet.h>
#include <linux/if_tun.h>
#include <linux/types.h>
#include <linux/ethtool.h>
@@ -41,6 +42,26 @@ struct netdev;
/* The maximum packet length is 16 bits */
#define LINUX_RXQ_TSO_MAX_LEN 65535
+#ifdef HAVE_TPACKET_V3
+#define TPACKET_MAX_FRAME_NUM 64
+struct tpacket_ring {
+ int sockfd; /* Raw socket fd */
+ struct iovec *rd; /* Ring buffer descriptors */
+ uint8_t *mm_space; /* Mmap base address */
+ size_t mm_len; /* Total mmap length */
+ size_t rd_len; /* Total ring buffer descriptors length */
+ int type; /* Ring type: rx or tx */
+ int rd_num; /* Number of ring buffer descriptor */
+ int flen; /* Block size */
+ struct tpacket_req3 req; /* TPACKET_V3 req */
+ uint32_t block_num; /* Current block number */
+ uint32_t frame_num; /* Current frame number */
+ uint32_t frame_num_in_block; /* Frame number in current block */
+ void * ppd; /* Packet pointer in current block */
+ struct dp_packet *pkts; /* Preallocated dp_packet pool */
+};
+#endif /* HAVE_TPACKET_V3 */
+
struct netdev_rxq_linux {
struct netdev_rxq up;
bool is_tap;
@@ -105,6 +126,11 @@ struct netdev_linux {
int numa_id; /* NUMA node id. */
+#ifdef HAVE_TPACKET_V3
+ struct tpacket_ring *tp_rx_ring;
+ struct tpacket_ring *tp_tx_ring;
+#endif
+
#ifdef HAVE_AF_XDP
/* AF_XDP information. */
struct xsk_socket_info **xsks;
diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
index c6e46f1..963bb06 100644
--- a/lib/netdev-linux.c
+++ b/lib/netdev-linux.c
@@ -38,6 +38,9 @@
#include <linux/sockios.h>
#include <linux/virtio_net.h>
#include <sys/ioctl.h>
+#ifdef HAVE_TPACKET_V3
+#include <sys/mman.h>
+#endif
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/utsname.h>
@@ -970,6 +973,7 @@ netdev_linux_construct_tap(struct netdev *netdev_)
static const char tap_dev[] = "/dev/net/tun";
const char *name = netdev_->name;
struct ifreq ifr;
+ bool tso = userspace_tso_enabled();
int error = netdev_linux_common_construct(netdev_);
if (error) {
@@ -987,7 +991,7 @@ netdev_linux_construct_tap(struct netdev *netdev_)
/* Create tap device. */
get_flags(&netdev->up, &netdev->ifi_flags);
ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
- if (userspace_tso_enabled()) {
+ if (tso) {
ifr.ifr_flags |= IFF_VNET_HDR;
}
@@ -1012,7 +1016,7 @@ netdev_linux_construct_tap(struct netdev *netdev_)
goto error_close;
}
- if (userspace_tso_enabled()) {
+ if (tso) {
/* Old kernels don't support TUNSETOFFLOAD. If TUNSETOFFLOAD is
* available, it will return EINVAL when a flag is unknown.
* Therefore, try enabling offload with no flags to check
@@ -1074,6 +1078,116 @@ netdev_linux_rxq_alloc(void)
return &rx->up;
}
+#ifdef HAVE_TPACKET_V3
+static inline struct tpacket3_hdr *
+tpacket_get_next_frame(struct tpacket_ring *ring, uint32_t frame_num)
+{
+ uint8_t *f0 = ring->rd[0].iov_base;
+
+ return ALIGNED_CAST(struct tpacket3_hdr *,
+ f0 + (frame_num * ring->req.tp_frame_size));
+}
+
+static inline void
+tpacket_fill_ring(struct tpacket_ring *ring, unsigned int blocks, int type)
+{
+ if (type == PACKET_RX_RING) {
+ ring->req.tp_retire_blk_tov = 0;
+ ring->req.tp_sizeof_priv = 0;
+ ring->req.tp_feature_req_word = 0;
+ }
+
+ if (userspace_tso_enabled()) {
+ /* For TX ring, the whole packet must be in one frame
+ * so tp_frame_size must big enough to accommodate
+ * 64K packet, tpacket3_hdr will occupy some bytes,
+ * the final frame size is 64K + 4K = 68K.
+ */
+ ring->req.tp_frame_size = (getpagesize() << 4) + getpagesize();
+ ring->req.tp_block_size = ring->req.tp_frame_size;
+ } else {
+ ring->req.tp_block_size = getpagesize() << 2;
+ ring->req.tp_frame_size = TPACKET_ALIGNMENT << 7;
+ }
+
+ ring->req.tp_block_nr = blocks;
+
+ ring->req.tp_frame_nr = ring->req.tp_block_size /
+ ring->req.tp_frame_size *
+ ring->req.tp_block_nr;
+
+ ring->mm_len = ring->req.tp_block_size * ring->req.tp_block_nr;
+ ring->rd_num = ring->req.tp_block_nr;
+ ring->flen = ring->req.tp_block_size;
+}
+
+static int
+tpacket_setup_ring(int sock, struct tpacket_ring *ring, int type)
+{
+ int ret = 0;
+ unsigned int blocks;
+
+ if (userspace_tso_enabled()) {
+ blocks = 128;
+ } else {
+ blocks = 256;
+ }
+ ring->type = type;
+ tpacket_fill_ring(ring, blocks, type);
+ ret = setsockopt(sock, SOL_PACKET, type, &ring->req,
+ sizeof(ring->req));
+
+ if (ret == -1) {
+ return -1;
+ }
+
+ ring->rd_len = ring->rd_num * sizeof(*ring->rd);
+ ring->rd = xmalloc(ring->rd_len);
+ if (ring->rd == NULL) {
+ return -1;
+ }
+
+ /* Preallocated dp_packet pool */
+ if (type == PACKET_RX_RING) {
+ ring->pkts = xmalloc(sizeof(struct dp_packet) * TPACKET_MAX_FRAME_NUM);
+ if (ring->pkts == NULL) {
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+static inline int
+tpacket_mmap_rx_tx_ring(int sock, struct tpacket_ring *rx_ring,
+ struct tpacket_ring *tx_ring)
+{
+ int i;
+
+ rx_ring->mm_space = mmap(NULL, rx_ring->mm_len + tx_ring->mm_len,
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_LOCKED | MAP_POPULATE, sock, 0);
+ if (rx_ring->mm_space == MAP_FAILED) {
+ return -1;
+ }
+
+ memset(rx_ring->rd, 0, rx_ring->rd_len);
+ for (i = 0; i < rx_ring->rd_num; ++i) {
+ rx_ring->rd[i].iov_base = rx_ring->mm_space + (i * rx_ring->flen);
+ rx_ring->rd[i].iov_len = rx_ring->flen;
+ }
+
+ tx_ring->mm_space = rx_ring->mm_space + rx_ring->mm_len;
+ memset(tx_ring->rd, 0, tx_ring->rd_len);
+ for (i = 0; i < tx_ring->rd_num; ++i) {
+ tx_ring->rd[i].iov_base = tx_ring->mm_space + (i * tx_ring->flen);
+ tx_ring->rd[i].iov_len = tx_ring->flen;
+ }
+
+ return 0;
+}
+#endif
+
static int
netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
{
@@ -1081,6 +1195,7 @@ netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
struct netdev *netdev_ = rx->up.netdev;
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
int error;
+ bool tso = userspace_tso_enabled();
ovs_mutex_lock(&netdev->mutex);
rx->is_tap = is_tap_netdev(netdev_);
@@ -1089,6 +1204,7 @@ netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
} else {
struct sockaddr_ll sll;
int ifindex, val;
+
/* Result of tcpdump -dd inbound */
static const struct sock_filter filt[] = {
{ 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
@@ -1101,7 +1217,7 @@ netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
};
/* Create file descriptor. */
- rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
+ rx->fd = socket(PF_PACKET, SOCK_RAW, (OVS_FORCE int) htons(ETH_P_ALL));
if (rx->fd < 0) {
error = errno;
VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
@@ -1116,7 +1232,7 @@ netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
goto error;
}
- if (userspace_tso_enabled()
+ if (tso
&& setsockopt(rx->fd, SOL_PACKET, PACKET_VNET_HDR, &val,
sizeof val)) {
error = errno;
@@ -1125,6 +1241,53 @@ netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
goto error;
}
+#ifdef HAVE_TPACKET_V3
+ if (!tso) {
+ static int ver = TPACKET_V3;
+
+ /* TPACKET_V3 ring setup must be after setsockopt
+ * PACKET_VNET_HDR because PACKET_VNET_HDR will return error
+ * (EBUSY) if ring is set up
+ */
+ error = setsockopt(rx->fd, SOL_PACKET, PACKET_VERSION, &ver,
+ sizeof(ver));
+ if (error != 0) {
+ error = errno;
+ VLOG_ERR("%s: failed to set tpacket version (%s)",
+ netdev_get_name(netdev_), ovs_strerror(error));
+ goto error;
+ }
+ netdev->tp_rx_ring = xzalloc(sizeof(struct tpacket_ring));
+ netdev->tp_tx_ring = xzalloc(sizeof(struct tpacket_ring));
+ netdev->tp_rx_ring->sockfd = rx->fd;
+ netdev->tp_tx_ring->sockfd = rx->fd;
+ error = tpacket_setup_ring(rx->fd, netdev->tp_rx_ring,
+ PACKET_RX_RING);
+ if (error != 0) {
+ error = errno;
+ VLOG_ERR("%s: failed to set tpacket rx ring (%s)",
+ netdev_get_name(netdev_), ovs_strerror(error));
+ goto error;
+ }
+ error = tpacket_setup_ring(rx->fd, netdev->tp_tx_ring,
+ PACKET_TX_RING);
+ if (error != 0) {
+ error = errno;
+ VLOG_ERR("%s: failed to set tpacket tx ring (%s)",
+ netdev_get_name(netdev_), ovs_strerror(error));
+ goto error;
+ }
+ error = tpacket_mmap_rx_tx_ring(rx->fd, netdev->tp_rx_ring,
+ netdev->tp_tx_ring);
+ if (error != 0) {
+ error = errno;
+ VLOG_ERR("%s: failed to mmap tpacket rx & tx ring (%s)",
+ netdev_get_name(netdev_), ovs_strerror(error));
+ goto error;
+ }
+ }
+#endif
+
/* Set non-blocking mode. */
error = set_nonblocking(rx->fd);
if (error) {
@@ -1139,9 +1302,16 @@ netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
/* Bind to specific ethernet device. */
memset(&sll, 0, sizeof sll);
- sll.sll_family = AF_PACKET;
+ sll.sll_family = PF_PACKET;
+#ifdef HAVE_TPACKET_V3
+ if (!tso) {
+ sll.sll_hatype = 0;
+ sll.sll_pkttype = 0;
+ sll.sll_halen = 0;
+ }
+#endif
sll.sll_ifindex = ifindex;
- sll.sll_protocol = htons(ETH_P_ALL);
+ sll.sll_protocol = (OVS_FORCE ovs_be16) htons(ETH_P_ALL);
if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
error = errno;
VLOG_ERR("%s: failed to bind raw socket (%s)",
@@ -1178,6 +1348,19 @@ netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
int i;
if (!rx->is_tap) {
+#ifdef HAVE_TPACKET_V3
+ if (!userspace_tso_enabled()) {
+ struct netdev_linux *netdev = netdev_linux_cast(rx->up.netdev);
+
+ if (netdev->tp_rx_ring) {
+ munmap(netdev->tp_rx_ring->mm_space,
+ 2 * netdev->tp_rx_ring->mm_len);
+ free(netdev->tp_rx_ring->rd);
+ free(netdev->tp_tx_ring->rd);
+ }
+ }
+#endif
+
close(rx->fd);
}
@@ -1220,8 +1403,8 @@ auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
* It also used recvmmsg to reduce multiple syscalls overhead;
*/
static int
-netdev_linux_batch_rxq_recv_sock(struct netdev_rxq_linux *rx, int mtu,
- struct dp_packet_batch *batch)
+netdev_linux_batch_rxq_recv_sock(struct netdev_rxq_linux *rx, bool tso,
+ int mtu, struct dp_packet_batch *batch)
{
int iovlen;
size_t std_len;
@@ -1237,7 +1420,7 @@ netdev_linux_batch_rxq_recv_sock(struct netdev_rxq_linux *rx, int mtu,
struct dp_packet *buffers[NETDEV_MAX_BURST];
int i;
- if (userspace_tso_enabled()) {
+ if (tso) {
/* Use the buffer from the allocated packet below to receive MTU
* sized packets and an aux_buf for extra TSO data. */
iovlen = IOV_TSO_SIZE;
@@ -1368,7 +1551,7 @@ netdev_linux_batch_rxq_recv_sock(struct netdev_rxq_linux *rx, int mtu,
* packets are added into *batch. The return value is 0 or errno.
*/
static int
-netdev_linux_batch_rxq_recv_tap(struct netdev_rxq_linux *rx, int mtu,
+netdev_linux_batch_rxq_recv_tap(struct netdev_rxq_linux *rx, bool tso, int mtu,
struct dp_packet_batch *batch)
{
int virtio_net_hdr_size;
@@ -1377,7 +1560,7 @@ netdev_linux_batch_rxq_recv_tap(struct netdev_rxq_linux *rx, int mtu,
int iovlen;
int i;
- if (userspace_tso_enabled()) {
+ if (tso) {
/* Use the buffer from the allocated packet below to receive MTU
* sized packets and an aux_buf for extra TSO data. */
iovlen = IOV_TSO_SIZE;
@@ -1454,6 +1637,110 @@ netdev_linux_batch_rxq_recv_tap(struct netdev_rxq_linux *rx, int mtu,
return 0;
}
+#ifdef HAVE_TPACKET_V3
+static int
+netdev_linux_batch_recv_tpacket(struct netdev_rxq_linux *rx, bool tso,
+ int mtu OVS_UNUSED,
+ struct dp_packet_batch *batch)
+{
+ struct netdev *netdev_ = netdev_rxq_get_netdev(&rx->up);
+ struct netdev_linux *netdev = netdev_linux_cast(netdev_);
+ struct dp_packet *buffer;
+ int i = 0;
+ unsigned int block_num;
+ unsigned int frame_num;
+ unsigned int fn_in_block;
+ struct tpacket_block_desc *pbd;
+ struct tpacket3_hdr *ppd;
+ int virtio_net_hdr_size;
+
+ if (tso) {
+ virtio_net_hdr_size = sizeof(struct virtio_net_hdr);
+ } else {
+ virtio_net_hdr_size = 0;
+ }
+
+ ppd = ALIGNED_CAST(struct tpacket3_hdr *, netdev->tp_rx_ring->ppd);
+ block_num = netdev->tp_rx_ring->block_num;
+ frame_num = netdev->tp_rx_ring->frame_num;
+ fn_in_block = netdev->tp_rx_ring->frame_num_in_block;
+ pbd = ALIGNED_CAST(struct tpacket_block_desc *,
+ netdev->tp_rx_ring->rd[block_num].iov_base);
+
+ while (i < NETDEV_MAX_BURST) {
+ if ((pbd->hdr.bh1.block_status & TP_STATUS_USER) == 0) {
+ break;
+ }
+ if (fn_in_block == 0) {
+ ppd = ALIGNED_CAST(struct tpacket3_hdr *, (uint8_t *) pbd +
+ pbd->hdr.bh1.offset_to_first_pkt);
+ }
+
+ /* Use preallocated dp_packet and tpacket_v3 rx ring buffer
+ * to avoid memory allocating and packet copy.
+ */
+ buffer = &netdev->tp_rx_ring->pkts[frame_num];
+ dp_packet_use_tpacket(buffer, (uint8_t *)ppd + ppd->tp_mac
+ - virtio_net_hdr_size,
+ ppd->tp_snaplen + virtio_net_hdr_size
+ + VLAN_ETH_HEADER_LEN,
+ DP_NETDEV_HEADROOM);
+ dp_packet_set_size(buffer, ppd->tp_snaplen + virtio_net_hdr_size);
+
+ if (virtio_net_hdr_size && netdev_linux_parse_vnet_hdr(buffer)) {
+ /* Unexpected error situation: the virtio header is not present
+ * or corrupted. Drop the packet but continue in case next ones
+ * are correct. */
+ dp_packet_delete(buffer);
+ netdev->rx_dropped += 1;
+ VLOG_WARN_RL(&rl, "%s: Dropped packet: Invalid virtio net header",
+ netdev_get_name(netdev_));
+ } else {
+ if (ppd->tp_status & TP_STATUS_VLAN_VALID) {
+ struct eth_header *eth;
+ bool double_tagged;
+ ovs_be16 vlan_tpid;
+
+ eth = dp_packet_data(buffer);
+ double_tagged = eth->eth_type == htons(ETH_TYPE_VLAN_8021Q);
+ if (ppd->tp_status & TP_STATUS_VLAN_TPID_VALID) {
+ vlan_tpid = htons(ppd->hv1.tp_vlan_tpid);
+ } else if (double_tagged) {
+ vlan_tpid = htons(ETH_TYPE_VLAN_8021AD);
+ } else {
+ vlan_tpid = htons(ETH_TYPE_VLAN_8021Q);
+ }
+ eth_push_vlan(buffer, vlan_tpid, htons(ppd->hv1.tp_vlan_tci));
+ }
+ dp_packet_batch_add(batch, buffer);
+ frame_num = (frame_num + 1) % TPACKET_MAX_FRAME_NUM;
+ }
+
+ fn_in_block++;
+ if (fn_in_block >= pbd->hdr.bh1.num_pkts) {
+ pbd->hdr.bh1.block_status = TP_STATUS_KERNEL;
+ block_num = (block_num + 1) %
+ netdev->tp_rx_ring->req.tp_block_nr;
+ pbd = (struct tpacket_block_desc *)
+ netdev->tp_rx_ring->rd[block_num].iov_base;
+ fn_in_block = 0;
+ ppd = NULL;
+ } else {
+ ppd = ALIGNED_CAST(struct tpacket3_hdr *,
+ (uint8_t *) ppd + ppd->tp_next_offset);
+ }
+ i++;
+ }
+
+ netdev->tp_rx_ring->block_num = block_num;
+ netdev->tp_rx_ring->frame_num = frame_num;
+ netdev->tp_rx_ring->frame_num_in_block = fn_in_block;
+ netdev->tp_rx_ring->ppd = ppd;
+
+ return 0;
+}
+#endif /* HAVE_TPACKET_V3 */
+
static int
netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch,
int *qfill)
@@ -1462,12 +1749,13 @@ netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch,
struct netdev *netdev = rx->up.netdev;
ssize_t retval;
int mtu;
+ bool tso = userspace_tso_enabled();
if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
mtu = ETH_PAYLOAD_MAX;
}
- if (userspace_tso_enabled()) {
+ if (tso) {
/* Allocate TSO packets. The packet has enough headroom to store
* a full non-TSO packet. When a TSO packet is received, the data
* from non-TSO buffer (std_len) is prepended to the TSO packet
@@ -1485,9 +1773,19 @@ netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch,
}
dp_packet_batch_init(batch);
- retval = (rx->is_tap
- ? netdev_linux_batch_rxq_recv_tap(rx, mtu, batch)
- : netdev_linux_batch_rxq_recv_sock(rx, mtu, batch));
+ if (rx->is_tap) {
+ retval = netdev_linux_batch_rxq_recv_tap(rx, tso, mtu, batch);
+ } else {
+ if (tso) {
+ retval = netdev_linux_batch_rxq_recv_sock(rx, tso, mtu, batch);
+ } else {
+#ifndef HAVE_TPACKET_V3
+ retval = netdev_linux_batch_rxq_recv_sock(rx, tso, mtu, batch);
+#else
+ retval = netdev_linux_batch_recv_tpacket(rx, tso, mtu, batch);
+#endif
+ }
+ }
if (retval) {
if (retval != EAGAIN && retval != EMSGSIZE) {
@@ -1692,6 +1990,83 @@ netdev_linux_get_numa_id(const struct netdev *netdev_)
return numa_id;
}
+#ifdef HAVE_TPACKET_V3
+static inline int
+tpacket_tx_is_ready(void * next_frame)
+{
+ struct tpacket3_hdr *hdr = ALIGNED_CAST(struct tpacket3_hdr *, next_frame);
+
+ return !(hdr->tp_status & (TP_STATUS_SEND_REQUEST | TP_STATUS_SENDING));
+}
+
+static int
+netdev_linux_tpacket_batch_send(struct netdev *netdev_, bool tso, int mtu,
+ struct dp_packet_batch *batch)
+{
+ struct netdev_linux *netdev = netdev_linux_cast(netdev_);
+ struct dp_packet *packet;
+ int sockfd;
+ ssize_t bytes_sent;
+ int total_pkts = 0;
+
+ unsigned int frame_nr = netdev->tp_tx_ring->req.tp_frame_nr;
+ unsigned int frame_num = netdev->tp_tx_ring->frame_num;
+
+ /* The Linux tap driver returns EIO if the device is not up,
+ * so if the device is not up, don't waste time sending it.
+ * However, if the device is in another network namespace
+ * then OVS can't retrieve the state. In that case, send the
+ * packets anyway. */
+ if (netdev->present && !(netdev->ifi_flags & IFF_UP)) {
+ netdev->tx_dropped += dp_packet_batch_size(batch);
+ return 0;
+ }
+
+ DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
+ size_t size;
+ struct tpacket3_hdr *ppd;
+
+ if (tso) {
+ netdev_linux_prepend_vnet_hdr(packet, mtu);
+ }
+
+ size = dp_packet_size(packet);
+ ppd = tpacket_get_next_frame(netdev->tp_tx_ring, frame_num);
+
+ if (!tpacket_tx_is_ready(ppd)) {
+ break;
+ }
+ ppd->tp_snaplen = size;
+ ppd->tp_len = size;
+ ppd->tp_next_offset = 0;
+
+ memcpy((uint8_t *)ppd + TPACKET3_HDRLEN - sizeof(struct sockaddr_ll),
+ dp_packet_data(packet),
+ size);
+ ppd->tp_status = TP_STATUS_SEND_REQUEST;
+ frame_num = (frame_num + 1) % frame_nr;
+ total_pkts++;
+ }
+ netdev->tp_tx_ring->frame_num = frame_num;
+
+ /* Kick-off transmits */
+ if (total_pkts != 0) {
+ sockfd = netdev->tp_tx_ring->sockfd;
+ bytes_sent = sendto(sockfd, NULL, 0, MSG_DONTWAIT, NULL, 0);
+ if (bytes_sent == -1 &&
+ errno != ENOBUFS && errno != EAGAIN) {
+ /*
+ * In case of an ENOBUFS/EAGAIN error all of the enqueued
+ * packets will be considered successful even though only some
+ * are sent.
+ */
+ netdev->tx_dropped += dp_packet_batch_size(batch);
+ }
+ }
+ return 0;
+}
+#endif
+
/* Sends 'batch' on 'netdev'. Returns 0 if successful, otherwise a positive
* errno value. Returns EAGAIN without blocking if the packet cannot be queued
* immediately. Returns EMSGSIZE if a partial packet was transmitted or if
@@ -1731,7 +2106,17 @@ netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
goto free_batch;
}
- error = netdev_linux_sock_batch_send(sock, ifindex, tso, mtu, batch);
+ if (tso) {
+ error = netdev_linux_sock_batch_send(sock, ifindex, tso, mtu,
+ batch);
+ } else {
+#ifndef HAVE_TPACKET_V3
+ error = netdev_linux_sock_batch_send(sock, ifindex, tso, mtu,
+ batch);
+#else
+ error = netdev_linux_tpacket_batch_send(netdev_, tso, mtu, batch);
+#endif
+ }
} else {
error = netdev_linux_tap_batch_send(netdev_, tso, mtu, batch);
}
@@ -3562,7 +3947,7 @@ exit:
const struct netdev_class netdev_linux_class = {
NETDEV_LINUX_CLASS_COMMON,
.type = "system",
- .is_pmd = false,
+ .is_pmd = true,
.construct = netdev_linux_construct,
.destruct = netdev_linux_destruct,
.get_stats = netdev_linux_get_stats,
--
1.8.3.1
More information about the dev
mailing list