[ovs-dev] [PATCH v7] Use TPACKET_V3 to accelerate veth for userspace datapath

yang_y_yi at 163.com yang_y_yi at 163.com
Wed Mar 18 09:02:40 UTC 2020


From: Yi Yang <yangyi01 at inspur.com>

We can avoid high system call overhead by using TPACKET_V3
and using DPDK-like poll to receive and send packets (Note: send
still needs to call sendto to trigger final packet transmission).

>From Linux kernel 3.10 on, TPACKET_V3 has been supported,
so all the Linux kernels current OVS supports can run
TPACKET_V3 without any problem.

I can see about 50% performance improvement for veth compared to
last recvmmsg optimization if I use TPACKET_V3, it is about 2.21
Gbps, but it was 1.47 Gbps before.

After is_pmd is set to true, performance can be improved much
more, it is about 180% performance improvement.

TPACKET_V3 can support TSO, but its performance isn't good because
of TPACKET_V3 kernel implementation issue, so it falls back to
recvmmsg in case userspace-tso-enable is set to true, but its
performance is better than recvmmsg in case userspace-tso-enable is
set to false, so just use TPACKET_V3 in that case.

Note: how much performance improvement is up to your platform,
some platforms can see huge improvement, some ones aren't so
noticeable, but if is_pmd is set to true, you can see big
performance improvement, the prerequisite is your tested veth
interfaces should be attached to different pmd threads.

Signed-off-by: Yi Yang <yangyi01 at inspur.com>
Co-authored-by: William Tu <u9012063 at gmail.com>
Signed-off-by: William Tu <u9012063 at gmail.com>
---
 acinclude.m4                     |  12 ++
 configure.ac                     |   1 +
 include/sparse/linux/if_packet.h | 111 +++++++++++
 lib/dp-packet.c                  |  18 ++
 lib/dp-packet.h                  |   9 +
 lib/netdev-linux-private.h       |  26 +++
 lib/netdev-linux.c               | 419 +++++++++++++++++++++++++++++++++++++--
 7 files changed, 579 insertions(+), 17 deletions(-)

Changelog:
- v6->v7
 * is_pmd is set to true for system interfaces
 * Use zero copy for tpacket_v3 receiving
 * Fix comments by William
 * Remove include/linux/if_packet.h

- v5->v6
 * Fall back to recvmmsg in case userspace-tso-enable is true
   because of TPACKET_V3 kernel implementation issue for tso
   support

- v4->v5
 * Fix travis build issues
 * Fix comments issues (capitalize the first letter)
 * Verify TSO on Ubuntu 18.04 3.5.0-40-generic

- v3->v4
 * Fix sparse check errors

- v2->v3
 * Fix build issues in case HAVE_TPACKET_V3 is not defined
 * Add tso-related support code
 * make sure it can work normally in case userspace-tso-enable is true

- v1->v2
 * Remove TPACKET_V1 and TPACKET_V2 which is obsolete
 * Add include/linux/if_packet.h
 * Change include/sparse/linux/if_packet.h



diff --git a/acinclude.m4 b/acinclude.m4
index 02efea6..1488ded 100644
--- a/acinclude.m4
+++ b/acinclude.m4
@@ -1082,6 +1082,18 @@ AC_DEFUN([OVS_CHECK_IF_DL],
       AC_SEARCH_LIBS([pcap_open_live], [pcap])
    fi])
 
+dnl OVS_CHECK_LINUX_TPACKET
+dnl
+dnl Configure Linux TPACKET.
+AC_DEFUN([OVS_CHECK_LINUX_TPACKET], [
+  AC_COMPILE_IFELSE([
+    AC_LANG_PROGRAM([#include <linux/if_packet.h>], [
+        struct tpacket3_hdr x =  { 0 };
+    ])],
+    [AC_DEFINE([HAVE_TPACKET_V3], [1],
+    [Define to 1 if struct tpacket3_hdr is available.])])
+])
+
 dnl Checks for buggy strtok_r.
 dnl
 dnl Some versions of glibc 2.7 has a bug in strtok_r when compiling
diff --git a/configure.ac b/configure.ac
index 1877aae..b61a1f4 100644
--- a/configure.ac
+++ b/configure.ac
@@ -89,6 +89,7 @@ OVS_CHECK_VISUAL_STUDIO_DDK
 OVS_CHECK_COVERAGE
 OVS_CHECK_NDEBUG
 OVS_CHECK_NETLINK
+OVS_CHECK_LINUX_TPACKET
 OVS_CHECK_OPENSSL
 OVS_CHECK_LIBCAPNG
 OVS_CHECK_LOGDIR
diff --git a/include/sparse/linux/if_packet.h b/include/sparse/linux/if_packet.h
index 5ff6d47..0ac3fce 100644
--- a/include/sparse/linux/if_packet.h
+++ b/include/sparse/linux/if_packet.h
@@ -5,6 +5,7 @@
 #error "Use this header only with sparse.  It is not a correct implementation."
 #endif
 
+#include <openvswitch/types.h>
 #include_next <linux/if_packet.h>
 
 /* Fix endianness of 'spkt_protocol' and 'sll_protocol' members. */
@@ -27,4 +28,114 @@ struct sockaddr_ll {
         unsigned char   sll_addr[8];
 };
 
+/* Packet types */
+#define PACKET_HOST                     0 /* To us                */
+#define PACKET_OTHERHOST                3 /* To someone else 	*/
+#define PACKET_LOOPBACK                 5 /* MC/BRD frame looped back */
+
+/* Packet socket options */
+#define PACKET_RX_RING                  5
+#define PACKET_VERSION                 10
+#define PACKET_TX_RING                 13
+#define PACKET_VNET_HDR                15
+
+/* Rx ring - header status */
+#define TP_STATUS_KERNEL                0
+#define TP_STATUS_USER            (1 << 0)
+#define TP_STATUS_VLAN_VALID      (1 << 4) /* auxdata has valid tp_vlan_tci */
+#define TP_STATUS_VLAN_TPID_VALID (1 << 6) /* auxdata has valid tp_vlan_tpid */
+
+/* Tx ring - header status */
+#define TP_STATUS_SEND_REQUEST    (1 << 0)
+#define TP_STATUS_SENDING         (1 << 1)
+
+#define tpacket_hdr rpl_tpacket_hdr
+struct tpacket_hdr {
+    unsigned long tp_status;
+    unsigned int tp_len;
+    unsigned int tp_snaplen;
+    unsigned short tp_mac;
+    unsigned short tp_net;
+    unsigned int tp_sec;
+    unsigned int tp_usec;
+};
+
+#define TPACKET_ALIGNMENT 16
+#define TPACKET_ALIGN(x) (((x)+TPACKET_ALIGNMENT-1)&~(TPACKET_ALIGNMENT-1))
+
+#define tpacket_hdr_variant1 rpl_tpacket_hdr_variant1
+struct tpacket_hdr_variant1 {
+    uint32_t tp_rxhash;
+    uint32_t tp_vlan_tci;
+    uint16_t tp_vlan_tpid;
+    uint16_t tp_padding;
+};
+
+#define tpacket3_hdr rpl_tpacket3_hdr
+struct tpacket3_hdr {
+    uint32_t  tp_next_offset;
+    uint32_t  tp_sec;
+    uint32_t  tp_nsec;
+    uint32_t  tp_snaplen;
+    uint32_t  tp_len;
+    uint32_t  tp_status;
+    uint16_t  tp_mac;
+    uint16_t  tp_net;
+    /* pkt_hdr variants */
+    union {
+        struct tpacket_hdr_variant1 hv1;
+    };
+    uint8_t  tp_padding[8];
+};
+
+#define tpacket_bd_ts rpl_tpacket_bd_ts
+struct tpacket_bd_ts {
+    unsigned int ts_sec;
+    union {
+        unsigned int ts_usec;
+        unsigned int ts_nsec;
+    };
+};
+
+#define tpacket_hdr_v1 rpl_tpacket_hdr_v1
+struct tpacket_hdr_v1 {
+    uint32_t block_status;
+    uint32_t num_pkts;
+    uint32_t offset_to_first_pkt;
+    uint32_t blk_len;
+    uint64_t __attribute__((aligned(8))) seq_num;
+    struct tpacket_bd_ts ts_first_pkt, ts_last_pkt;
+};
+
+#define tpacket_bd_header_u rpl_tpacket_bd_header_u
+union tpacket_bd_header_u {
+    struct tpacket_hdr_v1 bh1;
+};
+
+#define tpacket_block_desc rpl_tpacket_block_desc
+struct tpacket_block_desc {
+    uint32_t version;
+    uint32_t offset_to_priv;
+    union tpacket_bd_header_u hdr;
+};
+
+#define TPACKET3_HDRLEN \
+    (TPACKET_ALIGN(sizeof(struct tpacket3_hdr)) + sizeof(struct sockaddr_ll))
+
+enum rpl_tpacket_versions {
+    TPACKET_V1,
+    TPACKET_V2,
+    TPACKET_V3
+};
+
+#define tpacket_req3 rpl_tpacket_req3
+struct tpacket_req3 {
+    unsigned int tp_block_size; /* Minimal size of contiguous block */
+    unsigned int tp_block_nr; /* Number of blocks */
+    unsigned int tp_frame_size; /* Size of frame */
+    unsigned int tp_frame_nr; /* Total number of frames */
+    unsigned int tp_retire_blk_tov; /* Timeout in msecs */
+    unsigned int tp_sizeof_priv; /* Offset to private data area */
+    unsigned int tp_feature_req_word;
+};
 #endif
diff --git a/lib/dp-packet.c b/lib/dp-packet.c
index cd26235..82f4934 100644
--- a/lib/dp-packet.c
+++ b/lib/dp-packet.c
@@ -76,6 +76,21 @@ dp_packet_use_afxdp(struct dp_packet *b, void *data, size_t allocated,
 }
 #endif
 
+#if HAVE_TPACKET_V3
+/* Initialize 'b' as an dp_packet that contains tpacket data.
+ */
+void
+dp_packet_use_tpacket(struct dp_packet *b, void *data, size_t allocated,
+                      size_t headroom)
+{
+    dp_packet_set_base(b, (char *)data - headroom);
+    dp_packet_set_data(b, data);
+    dp_packet_set_size(b, 0);
+
+    dp_packet_init__(b, allocated, DPBUF_TPACKET_V3);
+}
+#endif
+
 /* Initializes 'b' as an empty dp_packet that contains the 'allocated' bytes of
  * memory starting at 'base'.  'base' should point to a buffer on the stack.
  * (Nothing actually relies on 'base' being allocated on the stack.  It could
@@ -271,6 +286,9 @@ dp_packet_resize(struct dp_packet *b, size_t new_headroom, size_t new_tailroom)
     case DPBUF_AFXDP:
         OVS_NOT_REACHED();
 
+    case DPBUF_TPACKET_V3:
+        OVS_NOT_REACHED();
+
     case DPBUF_STUB:
         b->source = DPBUF_MALLOC;
         new_base = xmalloc(new_allocated);
diff --git a/lib/dp-packet.h b/lib/dp-packet.h
index 9f8991f..955c6f8 100644
--- a/lib/dp-packet.h
+++ b/lib/dp-packet.h
@@ -44,6 +44,7 @@ enum OVS_PACKED_ENUM dp_packet_source {
                                 * ref to dp_packet_init_dpdk() in dp-packet.c.
                                 */
     DPBUF_AFXDP,               /* Buffer data from XDP frame. */
+    DPBUF_TPACKET_V3           /* Buffer data from TPACKET_V3 rx ring */
 };
 
 #define DP_PACKET_CONTEXT_SIZE 64
@@ -139,6 +140,9 @@ void dp_packet_use_const(struct dp_packet *, const void *, size_t);
 #if HAVE_AF_XDP
 void dp_packet_use_afxdp(struct dp_packet *, void *, size_t, size_t);
 #endif
+#if HAVE_TPACKET_V3
+void dp_packet_use_tpacket(struct dp_packet *, void *, size_t, size_t);
+#endif
 void dp_packet_init_dpdk(struct dp_packet *);
 
 void dp_packet_init(struct dp_packet *, size_t);
@@ -207,6 +211,11 @@ dp_packet_delete(struct dp_packet *b)
             return;
         }
 
+        if (b->source == DPBUF_TPACKET_V3) {
+            /* TPACKET_V3 buffer needn't free, it is recycled. */
+            return;
+        }
+
         dp_packet_uninit(b);
         free(b);
     }
diff --git a/lib/netdev-linux-private.h b/lib/netdev-linux-private.h
index c7c515f..296f085 100644
--- a/lib/netdev-linux-private.h
+++ b/lib/netdev-linux-private.h
@@ -20,6 +20,7 @@
 #include <linux/filter.h>
 #include <linux/gen_stats.h>
 #include <linux/if_ether.h>
+#include <linux/if_packet.h>
 #include <linux/if_tun.h>
 #include <linux/types.h>
 #include <linux/ethtool.h>
@@ -41,6 +42,26 @@ struct netdev;
 /* The maximum packet length is 16 bits */
 #define LINUX_RXQ_TSO_MAX_LEN 65535
 
+#ifdef HAVE_TPACKET_V3
+#define TPACKET_MAX_FRAME_NUM 64
+struct tpacket_ring {
+    int sockfd;                  /* Raw socket fd */
+    struct iovec *rd;            /* Ring buffer descriptors */
+    uint8_t *mm_space;           /* Mmap base address */
+    size_t mm_len;               /* Total mmap length */
+    size_t rd_len;               /* Total ring buffer descriptors length */
+    int type;                    /* Ring type: rx or tx */
+    int rd_num;                  /* Number of ring buffer descriptor */
+    int flen;                    /* Block size */
+    struct tpacket_req3 req;     /* TPACKET_V3 req */
+    uint32_t block_num;          /* Current block number */
+    uint32_t frame_num;          /* Current frame number */
+    uint32_t frame_num_in_block; /* Frame number in current block */
+    void * ppd;                  /* Packet pointer in current block */
+    struct dp_packet *pkts;      /* Preallocated dp_packet pool */
+};
+#endif /* HAVE_TPACKET_V3 */
+
 struct netdev_rxq_linux {
     struct netdev_rxq up;
     bool is_tap;
@@ -105,6 +126,11 @@ struct netdev_linux {
 
     int numa_id;                /* NUMA node id. */
 
+#ifdef HAVE_TPACKET_V3
+    struct tpacket_ring *tp_rx_ring;
+    struct tpacket_ring *tp_tx_ring;
+#endif
+
 #ifdef HAVE_AF_XDP
     /* AF_XDP information. */
     struct xsk_socket_info **xsks;
diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
index c6e46f1..963bb06 100644
--- a/lib/netdev-linux.c
+++ b/lib/netdev-linux.c
@@ -38,6 +38,9 @@
 #include <linux/sockios.h>
 #include <linux/virtio_net.h>
 #include <sys/ioctl.h>
+#ifdef HAVE_TPACKET_V3
+#include <sys/mman.h>
+#endif
 #include <sys/socket.h>
 #include <sys/uio.h>
 #include <sys/utsname.h>
@@ -970,6 +973,7 @@ netdev_linux_construct_tap(struct netdev *netdev_)
     static const char tap_dev[] = "/dev/net/tun";
     const char *name = netdev_->name;
     struct ifreq ifr;
+    bool tso = userspace_tso_enabled();
 
     int error = netdev_linux_common_construct(netdev_);
     if (error) {
@@ -987,7 +991,7 @@ netdev_linux_construct_tap(struct netdev *netdev_)
     /* Create tap device. */
     get_flags(&netdev->up, &netdev->ifi_flags);
     ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
-    if (userspace_tso_enabled()) {
+    if (tso) {
         ifr.ifr_flags |= IFF_VNET_HDR;
     }
 
@@ -1012,7 +1016,7 @@ netdev_linux_construct_tap(struct netdev *netdev_)
         goto error_close;
     }
 
-    if (userspace_tso_enabled()) {
+    if (tso) {
         /* Old kernels don't support TUNSETOFFLOAD. If TUNSETOFFLOAD is
          * available, it will return EINVAL when a flag is unknown.
          * Therefore, try enabling offload with no flags to check
@@ -1074,6 +1078,116 @@ netdev_linux_rxq_alloc(void)
     return &rx->up;
 }
 
+#ifdef HAVE_TPACKET_V3
+static inline struct tpacket3_hdr *
+tpacket_get_next_frame(struct tpacket_ring *ring, uint32_t frame_num)
+{
+    uint8_t *f0 = ring->rd[0].iov_base;
+
+    return ALIGNED_CAST(struct tpacket3_hdr *,
+               f0 + (frame_num * ring->req.tp_frame_size));
+}
+
+static inline void
+tpacket_fill_ring(struct tpacket_ring *ring, unsigned int blocks, int type)
+{
+    if (type == PACKET_RX_RING) {
+        ring->req.tp_retire_blk_tov = 0;
+        ring->req.tp_sizeof_priv = 0;
+        ring->req.tp_feature_req_word = 0;
+    }
+
+    if (userspace_tso_enabled()) {
+        /* For TX ring, the whole packet must be in one frame
+         * so tp_frame_size must big enough to accommodate
+         * 64K packet, tpacket3_hdr will occupy some bytes,
+         * the final frame size is 64K + 4K = 68K.
+         */
+        ring->req.tp_frame_size = (getpagesize() << 4) + getpagesize();
+        ring->req.tp_block_size = ring->req.tp_frame_size;
+    } else {
+        ring->req.tp_block_size = getpagesize() << 2;
+        ring->req.tp_frame_size = TPACKET_ALIGNMENT << 7;
+    }
+
+    ring->req.tp_block_nr = blocks;
+
+    ring->req.tp_frame_nr = ring->req.tp_block_size /
+                             ring->req.tp_frame_size *
+                             ring->req.tp_block_nr;
+
+    ring->mm_len = ring->req.tp_block_size * ring->req.tp_block_nr;
+    ring->rd_num = ring->req.tp_block_nr;
+    ring->flen = ring->req.tp_block_size;
+}
+
+static int
+tpacket_setup_ring(int sock, struct tpacket_ring *ring, int type)
+{
+    int ret = 0;
+    unsigned int blocks;
+
+    if (userspace_tso_enabled()) {
+        blocks = 128;
+    } else {
+        blocks = 256;
+    }
+    ring->type = type;
+    tpacket_fill_ring(ring, blocks, type);
+    ret = setsockopt(sock, SOL_PACKET, type, &ring->req,
+                     sizeof(ring->req));
+
+    if (ret == -1) {
+        return -1;
+    }
+
+    ring->rd_len = ring->rd_num * sizeof(*ring->rd);
+    ring->rd = xmalloc(ring->rd_len);
+    if (ring->rd == NULL) {
+        return -1;
+    }
+
+    /* Preallocated dp_packet pool */
+    if (type == PACKET_RX_RING) {
+        ring->pkts = xmalloc(sizeof(struct dp_packet) * TPACKET_MAX_FRAME_NUM);
+        if (ring->pkts == NULL) {
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+static inline int
+tpacket_mmap_rx_tx_ring(int sock, struct tpacket_ring *rx_ring,
+                struct tpacket_ring *tx_ring)
+{
+    int i;
+
+    rx_ring->mm_space = mmap(NULL, rx_ring->mm_len + tx_ring->mm_len,
+                          PROT_READ | PROT_WRITE,
+                          MAP_SHARED | MAP_LOCKED | MAP_POPULATE, sock, 0);
+    if (rx_ring->mm_space == MAP_FAILED) {
+        return -1;
+    }
+
+    memset(rx_ring->rd, 0, rx_ring->rd_len);
+    for (i = 0; i < rx_ring->rd_num; ++i) {
+        rx_ring->rd[i].iov_base = rx_ring->mm_space + (i * rx_ring->flen);
+        rx_ring->rd[i].iov_len = rx_ring->flen;
+    }
+
+    tx_ring->mm_space = rx_ring->mm_space + rx_ring->mm_len;
+    memset(tx_ring->rd, 0, tx_ring->rd_len);
+    for (i = 0; i < tx_ring->rd_num; ++i) {
+        tx_ring->rd[i].iov_base = tx_ring->mm_space + (i * tx_ring->flen);
+        tx_ring->rd[i].iov_len = tx_ring->flen;
+    }
+
+    return 0;
+}
+#endif
+
 static int
 netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
 {
@@ -1081,6 +1195,7 @@ netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
     struct netdev *netdev_ = rx->up.netdev;
     struct netdev_linux *netdev = netdev_linux_cast(netdev_);
     int error;
+    bool tso = userspace_tso_enabled();
 
     ovs_mutex_lock(&netdev->mutex);
     rx->is_tap = is_tap_netdev(netdev_);
@@ -1089,6 +1204,7 @@ netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
     } else {
         struct sockaddr_ll sll;
         int ifindex, val;
+
         /* Result of tcpdump -dd inbound */
         static const struct sock_filter filt[] = {
             { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
@@ -1101,7 +1217,7 @@ netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
         };
 
         /* Create file descriptor. */
-        rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
+        rx->fd = socket(PF_PACKET, SOCK_RAW, (OVS_FORCE int) htons(ETH_P_ALL));
         if (rx->fd < 0) {
             error = errno;
             VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
@@ -1116,7 +1232,7 @@ netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
             goto error;
         }
 
-        if (userspace_tso_enabled()
+        if (tso
             && setsockopt(rx->fd, SOL_PACKET, PACKET_VNET_HDR, &val,
                           sizeof val)) {
             error = errno;
@@ -1125,6 +1241,53 @@ netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
             goto error;
         }
 
+#ifdef HAVE_TPACKET_V3
+        if (!tso) {
+            static int ver = TPACKET_V3;
+
+            /* TPACKET_V3 ring setup must be after setsockopt
+             * PACKET_VNET_HDR because PACKET_VNET_HDR will return error
+             * (EBUSY) if ring is set up
+             */
+            error = setsockopt(rx->fd, SOL_PACKET, PACKET_VERSION, &ver,
+                               sizeof(ver));
+            if (error != 0) {
+                error = errno;
+                VLOG_ERR("%s: failed to set tpacket version (%s)",
+                         netdev_get_name(netdev_), ovs_strerror(error));
+                goto error;
+            }
+            netdev->tp_rx_ring = xzalloc(sizeof(struct tpacket_ring));
+            netdev->tp_tx_ring = xzalloc(sizeof(struct tpacket_ring));
+            netdev->tp_rx_ring->sockfd = rx->fd;
+            netdev->tp_tx_ring->sockfd = rx->fd;
+            error = tpacket_setup_ring(rx->fd, netdev->tp_rx_ring,
+                                       PACKET_RX_RING);
+            if (error != 0) {
+                error = errno;
+                VLOG_ERR("%s: failed to set tpacket rx ring (%s)",
+                         netdev_get_name(netdev_), ovs_strerror(error));
+                goto error;
+            }
+            error = tpacket_setup_ring(rx->fd, netdev->tp_tx_ring,
+                                       PACKET_TX_RING);
+            if (error != 0) {
+                error = errno;
+                VLOG_ERR("%s: failed to set tpacket tx ring (%s)",
+                         netdev_get_name(netdev_), ovs_strerror(error));
+                goto error;
+            }
+            error = tpacket_mmap_rx_tx_ring(rx->fd, netdev->tp_rx_ring,
+                                           netdev->tp_tx_ring);
+            if (error != 0) {
+                error = errno;
+                VLOG_ERR("%s: failed to mmap tpacket rx & tx ring (%s)",
+                         netdev_get_name(netdev_), ovs_strerror(error));
+                goto error;
+            }
+        }
+#endif
+
         /* Set non-blocking mode. */
         error = set_nonblocking(rx->fd);
         if (error) {
@@ -1139,9 +1302,16 @@ netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
 
         /* Bind to specific ethernet device. */
         memset(&sll, 0, sizeof sll);
-        sll.sll_family = AF_PACKET;
+        sll.sll_family = PF_PACKET;
+#ifdef HAVE_TPACKET_V3
+        if (!tso) {
+            sll.sll_hatype = 0;
+            sll.sll_pkttype = 0;
+            sll.sll_halen = 0;
+        }
+#endif
         sll.sll_ifindex = ifindex;
-        sll.sll_protocol = htons(ETH_P_ALL);
+        sll.sll_protocol = (OVS_FORCE ovs_be16) htons(ETH_P_ALL);
         if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
             error = errno;
             VLOG_ERR("%s: failed to bind raw socket (%s)",
@@ -1178,6 +1348,19 @@ netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
     int i;
 
     if (!rx->is_tap) {
+#ifdef HAVE_TPACKET_V3
+        if (!userspace_tso_enabled()) {
+            struct netdev_linux *netdev = netdev_linux_cast(rx->up.netdev);
+
+            if (netdev->tp_rx_ring) {
+                munmap(netdev->tp_rx_ring->mm_space,
+                       2 * netdev->tp_rx_ring->mm_len);
+                free(netdev->tp_rx_ring->rd);
+                free(netdev->tp_tx_ring->rd);
+            }
+        }
+#endif
+
         close(rx->fd);
     }
 
@@ -1220,8 +1403,8 @@ auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
  * It also used recvmmsg to reduce multiple syscalls overhead;
  */
 static int
-netdev_linux_batch_rxq_recv_sock(struct netdev_rxq_linux *rx, int mtu,
-                                 struct dp_packet_batch *batch)
+netdev_linux_batch_rxq_recv_sock(struct netdev_rxq_linux *rx, bool tso,
+                                 int mtu, struct dp_packet_batch *batch)
 {
     int iovlen;
     size_t std_len;
@@ -1237,7 +1420,7 @@ netdev_linux_batch_rxq_recv_sock(struct netdev_rxq_linux *rx, int mtu,
     struct dp_packet *buffers[NETDEV_MAX_BURST];
     int i;
 
-    if (userspace_tso_enabled()) {
+    if (tso) {
         /* Use the buffer from the allocated packet below to receive MTU
          * sized packets and an aux_buf for extra TSO data. */
         iovlen = IOV_TSO_SIZE;
@@ -1368,7 +1551,7 @@ netdev_linux_batch_rxq_recv_sock(struct netdev_rxq_linux *rx, int mtu,
  * packets are added into *batch. The return value is 0 or errno.
  */
 static int
-netdev_linux_batch_rxq_recv_tap(struct netdev_rxq_linux *rx, int mtu,
+netdev_linux_batch_rxq_recv_tap(struct netdev_rxq_linux *rx, bool tso, int mtu,
                                 struct dp_packet_batch *batch)
 {
     int virtio_net_hdr_size;
@@ -1377,7 +1560,7 @@ netdev_linux_batch_rxq_recv_tap(struct netdev_rxq_linux *rx, int mtu,
     int iovlen;
     int i;
 
-    if (userspace_tso_enabled()) {
+    if (tso) {
         /* Use the buffer from the allocated packet below to receive MTU
          * sized packets and an aux_buf for extra TSO data. */
         iovlen = IOV_TSO_SIZE;
@@ -1454,6 +1637,110 @@ netdev_linux_batch_rxq_recv_tap(struct netdev_rxq_linux *rx, int mtu,
     return 0;
 }
 
+#ifdef HAVE_TPACKET_V3
+static int
+netdev_linux_batch_recv_tpacket(struct netdev_rxq_linux *rx, bool tso,
+                                int mtu OVS_UNUSED,
+                                struct dp_packet_batch *batch)
+{
+    struct netdev *netdev_ = netdev_rxq_get_netdev(&rx->up);
+    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
+    struct dp_packet *buffer;
+    int i = 0;
+    unsigned int block_num;
+    unsigned int frame_num;
+    unsigned int fn_in_block;
+    struct tpacket_block_desc *pbd;
+    struct tpacket3_hdr *ppd;
+    int virtio_net_hdr_size;
+
+    if (tso) {
+        virtio_net_hdr_size = sizeof(struct virtio_net_hdr);
+    } else {
+        virtio_net_hdr_size = 0;
+    }
+
+    ppd = ALIGNED_CAST(struct tpacket3_hdr *, netdev->tp_rx_ring->ppd);
+    block_num = netdev->tp_rx_ring->block_num;
+    frame_num = netdev->tp_rx_ring->frame_num;
+    fn_in_block = netdev->tp_rx_ring->frame_num_in_block;
+    pbd = ALIGNED_CAST(struct tpacket_block_desc *,
+              netdev->tp_rx_ring->rd[block_num].iov_base);
+
+    while (i < NETDEV_MAX_BURST) {
+        if ((pbd->hdr.bh1.block_status & TP_STATUS_USER) == 0) {
+            break;
+        }
+        if (fn_in_block == 0) {
+            ppd = ALIGNED_CAST(struct tpacket3_hdr *, (uint8_t *) pbd +
+                                   pbd->hdr.bh1.offset_to_first_pkt);
+        }
+
+        /* Use preallocated dp_packet and tpacket_v3 rx ring buffer
+         * to avoid memory allocating and packet copy.
+         */
+        buffer = &netdev->tp_rx_ring->pkts[frame_num];
+        dp_packet_use_tpacket(buffer, (uint8_t *)ppd + ppd->tp_mac
+                                           - virtio_net_hdr_size,
+                              ppd->tp_snaplen + virtio_net_hdr_size
+                                  + VLAN_ETH_HEADER_LEN,
+                              DP_NETDEV_HEADROOM);
+        dp_packet_set_size(buffer, ppd->tp_snaplen + virtio_net_hdr_size);
+
+        if (virtio_net_hdr_size && netdev_linux_parse_vnet_hdr(buffer)) {
+            /* Unexpected error situation: the virtio header is not present
+             * or corrupted. Drop the packet but continue in case next ones
+             * are correct. */
+            dp_packet_delete(buffer);
+            netdev->rx_dropped += 1;
+            VLOG_WARN_RL(&rl, "%s: Dropped packet: Invalid virtio net header",
+                         netdev_get_name(netdev_));
+        } else {
+            if (ppd->tp_status & TP_STATUS_VLAN_VALID) {
+                struct eth_header *eth;
+                bool double_tagged;
+                ovs_be16 vlan_tpid;
+
+                eth = dp_packet_data(buffer);
+                double_tagged = eth->eth_type == htons(ETH_TYPE_VLAN_8021Q);
+                if (ppd->tp_status & TP_STATUS_VLAN_TPID_VALID) {
+                    vlan_tpid = htons(ppd->hv1.tp_vlan_tpid);
+                } else if (double_tagged) {
+                    vlan_tpid = htons(ETH_TYPE_VLAN_8021AD);
+                } else {
+                    vlan_tpid = htons(ETH_TYPE_VLAN_8021Q);
+                }
+                eth_push_vlan(buffer, vlan_tpid, htons(ppd->hv1.tp_vlan_tci));
+            }
+            dp_packet_batch_add(batch, buffer);
+            frame_num = (frame_num + 1) % TPACKET_MAX_FRAME_NUM;
+        }
+
+        fn_in_block++;
+        if (fn_in_block >= pbd->hdr.bh1.num_pkts) {
+            pbd->hdr.bh1.block_status = TP_STATUS_KERNEL;
+            block_num = (block_num + 1) %
+                            netdev->tp_rx_ring->req.tp_block_nr;
+            pbd = (struct tpacket_block_desc *)
+                     netdev->tp_rx_ring->rd[block_num].iov_base;
+            fn_in_block = 0;
+            ppd = NULL;
+        } else {
+            ppd = ALIGNED_CAST(struct tpacket3_hdr *,
+                   (uint8_t *) ppd + ppd->tp_next_offset);
+        }
+        i++;
+    }
+
+    netdev->tp_rx_ring->block_num = block_num;
+    netdev->tp_rx_ring->frame_num = frame_num;
+    netdev->tp_rx_ring->frame_num_in_block = fn_in_block;
+    netdev->tp_rx_ring->ppd = ppd;
+
+    return 0;
+}
+#endif /* HAVE_TPACKET_V3 */
+
 static int
 netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch,
                       int *qfill)
@@ -1462,12 +1749,13 @@ netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch,
     struct netdev *netdev = rx->up.netdev;
     ssize_t retval;
     int mtu;
+    bool tso = userspace_tso_enabled();
 
     if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
         mtu = ETH_PAYLOAD_MAX;
     }
 
-    if (userspace_tso_enabled()) {
+    if (tso) {
         /* Allocate TSO packets. The packet has enough headroom to store
          * a full non-TSO packet. When a TSO packet is received, the data
          * from non-TSO buffer (std_len) is prepended to the TSO packet
@@ -1485,9 +1773,19 @@ netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch,
     }
 
     dp_packet_batch_init(batch);
-    retval = (rx->is_tap
-              ? netdev_linux_batch_rxq_recv_tap(rx, mtu, batch)
-              : netdev_linux_batch_rxq_recv_sock(rx, mtu, batch));
+    if (rx->is_tap) {
+        retval = netdev_linux_batch_rxq_recv_tap(rx, tso, mtu, batch);
+    } else {
+        if (tso) {
+            retval = netdev_linux_batch_rxq_recv_sock(rx, tso, mtu, batch);
+        } else {
+#ifndef HAVE_TPACKET_V3
+            retval = netdev_linux_batch_rxq_recv_sock(rx, tso, mtu, batch);
+#else
+            retval = netdev_linux_batch_recv_tpacket(rx, tso, mtu, batch);
+#endif
+        }
+    }
 
     if (retval) {
         if (retval != EAGAIN && retval != EMSGSIZE) {
@@ -1692,6 +1990,83 @@ netdev_linux_get_numa_id(const struct netdev *netdev_)
     return numa_id;
 }
 
+#ifdef HAVE_TPACKET_V3
+static inline int
+tpacket_tx_is_ready(void * next_frame)
+{
+    struct tpacket3_hdr *hdr = ALIGNED_CAST(struct tpacket3_hdr *, next_frame);
+
+    return !(hdr->tp_status & (TP_STATUS_SEND_REQUEST | TP_STATUS_SENDING));
+}
+
+static int
+netdev_linux_tpacket_batch_send(struct netdev *netdev_, bool tso, int mtu,
+                            struct dp_packet_batch *batch)
+{
+    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
+    struct dp_packet *packet;
+    int sockfd;
+    ssize_t bytes_sent;
+    int total_pkts = 0;
+
+    unsigned int frame_nr = netdev->tp_tx_ring->req.tp_frame_nr;
+    unsigned int frame_num = netdev->tp_tx_ring->frame_num;
+
+    /* The Linux tap driver returns EIO if the device is not up,
+     * so if the device is not up, don't waste time sending it.
+     * However, if the device is in another network namespace
+     * then OVS can't retrieve the state. In that case, send the
+     * packets anyway. */
+    if (netdev->present && !(netdev->ifi_flags & IFF_UP)) {
+        netdev->tx_dropped += dp_packet_batch_size(batch);
+        return 0;
+    }
+
+    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
+        size_t size;
+        struct tpacket3_hdr *ppd;
+
+        if (tso) {
+            netdev_linux_prepend_vnet_hdr(packet, mtu);
+        }
+
+        size = dp_packet_size(packet);
+        ppd = tpacket_get_next_frame(netdev->tp_tx_ring, frame_num);
+
+        if (!tpacket_tx_is_ready(ppd)) {
+            break;
+        }
+        ppd->tp_snaplen = size;
+        ppd->tp_len = size;
+        ppd->tp_next_offset = 0;
+
+        memcpy((uint8_t *)ppd + TPACKET3_HDRLEN - sizeof(struct sockaddr_ll),
+               dp_packet_data(packet),
+               size);
+        ppd->tp_status = TP_STATUS_SEND_REQUEST;
+        frame_num = (frame_num + 1) % frame_nr;
+        total_pkts++;
+    }
+    netdev->tp_tx_ring->frame_num = frame_num;
+
+    /* Kick-off transmits */
+    if (total_pkts != 0) {
+        sockfd = netdev->tp_tx_ring->sockfd;
+        bytes_sent = sendto(sockfd, NULL, 0, MSG_DONTWAIT, NULL, 0);
+        if (bytes_sent == -1 &&
+                errno != ENOBUFS && errno != EAGAIN) {
+            /*
+             * In case of an ENOBUFS/EAGAIN error all of the enqueued
+             * packets will be considered successful even though only some
+             * are sent.
+             */
+            netdev->tx_dropped += dp_packet_batch_size(batch);
+        }
+    }
+    return 0;
+}
+#endif
+
 /* Sends 'batch' on 'netdev'.  Returns 0 if successful, otherwise a positive
  * errno value.  Returns EAGAIN without blocking if the packet cannot be queued
  * immediately.  Returns EMSGSIZE if a partial packet was transmitted or if
@@ -1731,7 +2106,17 @@ netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
             goto free_batch;
         }
 
-        error = netdev_linux_sock_batch_send(sock, ifindex, tso, mtu, batch);
+        if (tso) {
+            error = netdev_linux_sock_batch_send(sock, ifindex, tso, mtu,
+                                                 batch);
+        } else {
+#ifndef HAVE_TPACKET_V3
+            error = netdev_linux_sock_batch_send(sock, ifindex, tso, mtu,
+                                                 batch);
+#else
+            error = netdev_linux_tpacket_batch_send(netdev_, tso, mtu, batch);
+#endif
+        }
     } else {
         error = netdev_linux_tap_batch_send(netdev_, tso, mtu, batch);
     }
@@ -3562,7 +3947,7 @@ exit:
 const struct netdev_class netdev_linux_class = {
     NETDEV_LINUX_CLASS_COMMON,
     .type = "system",
-    .is_pmd = false,
+    .is_pmd = true,
     .construct = netdev_linux_construct,
     .destruct = netdev_linux_destruct,
     .get_stats = netdev_linux_get_stats,
-- 
1.8.3.1



More information about the dev mailing list