[ovs-dev] [PATCH v6] Use TPACKET_V3 to accelerate veth for userspace datapath

William Tu u9012063 at gmail.com
Tue Mar 10 19:43:15 UTC 2020


On Fri, Mar 6, 2020 at 6:35 AM <yang_y_yi at 163.com> wrote:
>
> From: Yi Yang <yangyi01 at inspur.com>
>
> We can avoid high system call overhead by using TPACKET_V3
> and using DPDK-like poll to receive and send packets (Note: send
> still needs to call sendto to trigger final packet transmission).
>
> From Linux kernel 3.10 on, TPACKET_V3 has been supported,
> so all the Linux kernels current OVS supports can run
> TPACKET_V3 without any problem.
>
> I can see about 30% performance improvement for veth compared to
> last recvmmsg optimization if I use TPACKET_V3, it is about 1.98
> Gbps, but it was 1.47 Gbps before.

On my testbed, I didn't see any performance gain.
For a 100-second TCP iperf3 run, I see the same 1.70 Gbps with and without
tpacket. Do you think the performance might be better if we set .is_pmd=true,
since tpacket is ring-based?

>
> TPACKET_V3 can support TSO, but its performance isn't good because
> of TPACKET_V3 kernel implementation issue, so it falls back to

What's the implementation issue? If we use the latest kernel, does the issue
still exist?

> recvmmsg in case userspace-tso-enable is set to true, but its
> performance is better than recvmmsg in case userspace-tso-enable is
> set to false, so just use TPACKET_V3 in that case.
>
> Signed-off-by: Yi Yang <yangyi01 at inspur.com>
> Co-authored-by: William Tu <u9012063 at gmail.com>
> Signed-off-by: William Tu <u9012063 at gmail.com>
> ---
>  acinclude.m4                     |  12 ++
>  configure.ac                     |   1 +
>  include/linux/automake.mk        |   1 +
>  include/linux/if_packet.h        | 128 ++++++++++++
>  include/sparse/linux/if_packet.h | 111 +++++++++++
>  lib/netdev-linux-private.h       |  22 +++
>  lib/netdev-linux.c               | 411 +++++++++++++++++++++++++++++++++++++--
>  7 files changed, 670 insertions(+), 16 deletions(-)
>  create mode 100644 include/linux/if_packet.h
>
> Changelog:
> - v5->v6
>  * Fall back to recvmmsg in case userspace-tso-enable is true
>    because of TPACKET_V3 kernel implementation issue for tso
>    support
>
> - v4->v5
>  * Fix travis build issues
>  * Fix comments issues (capitalize the first letter)
>  * Verify TSO on Ubuntu 18.04 3.5.0-40-generic
>
> - v3->v4
>  * Fix sparse check errors
>
> - v2->v3
>  * Fix build issues in case HAVE_TPACKET_V3 is not defined
>  * Add tso-related support code
>  * make sure it can work normally in case userspace-tso-enable is true
>
> - v1->v2
>  * Remove TPACKET_V1 and TPACKET_V2 which is obsolete
>  * Add include/linux/if_packet.h
>  * Change include/sparse/linux/if_packet.h
>
> diff --git a/acinclude.m4 b/acinclude.m4
> index 1212a46..b39bbb9 100644
> --- a/acinclude.m4
> +++ b/acinclude.m4
> @@ -1093,6 +1093,18 @@ AC_DEFUN([OVS_CHECK_IF_DL],
>        AC_SEARCH_LIBS([pcap_open_live], [pcap])
>     fi])
>
> +dnl OVS_CHECK_LINUX_TPACKET
> +dnl
> +dnl Configure Linux TPACKET.
> +AC_DEFUN([OVS_CHECK_LINUX_TPACKET], [
> +  AC_COMPILE_IFELSE([
> +    AC_LANG_PROGRAM([#include <linux/if_packet.h>], [
> +        struct tpacket3_hdr x =  { 0 };
> +    ])],
> +    [AC_DEFINE([HAVE_TPACKET_V3], [1],
> +    [Define to 1 if struct tpacket3_hdr is available.])])
> +])
> +
>  dnl Checks for buggy strtok_r.
>  dnl
>  dnl Some versions of glibc 2.7 has a bug in strtok_r when compiling
> diff --git a/configure.ac b/configure.ac
> index 1877aae..b61a1f4 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -89,6 +89,7 @@ OVS_CHECK_VISUAL_STUDIO_DDK
>  OVS_CHECK_COVERAGE
>  OVS_CHECK_NDEBUG
>  OVS_CHECK_NETLINK
> +OVS_CHECK_LINUX_TPACKET
>  OVS_CHECK_OPENSSL
>  OVS_CHECK_LIBCAPNG
>  OVS_CHECK_LOGDIR
> diff --git a/include/linux/automake.mk b/include/linux/automake.mk
> index 8f063f4..a659e65 100644
> --- a/include/linux/automake.mk
> +++ b/include/linux/automake.mk
> @@ -1,4 +1,5 @@
>  noinst_HEADERS += \
> +       include/linux/if_packet.h \
>         include/linux/netlink.h \
>         include/linux/netfilter/nf_conntrack_sctp.h \
>         include/linux/pkt_cls.h \
> diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h
> new file mode 100644
> index 0000000..e20aacc
> --- /dev/null
> +++ b/include/linux/if_packet.h

If OVS_CHECK_LINUX_TPACKET fails (i.e. HAVE_TPACKET_V3 is not defined),
can we simply fall back to recvmmsg?
Then this header would not be needed, right?

> @@ -0,0 +1,128 @@
> +#ifndef __LINUX_IF_PACKET_WRAPPER_H
> +#define __LINUX_IF_PACKET_WRAPPER_H 1
> +
> +#ifdef HAVE_TPACKET_V3
> +#include_next <linux/if_packet.h>
> +#else
> +#define HAVE_TPACKET_V3 1
> +
> +struct sockaddr_pkt {
> +        unsigned short  spkt_family;
> +        unsigned char   spkt_device[14];
> +        uint16_t        spkt_protocol;
> +};
> +
> +struct sockaddr_ll {
> +        unsigned short  sll_family;
> +        uint16_t        sll_protocol;
> +        int             sll_ifindex;
> +        unsigned short  sll_hatype;
> +        unsigned char   sll_pkttype;
> +        unsigned char   sll_halen;
> +        unsigned char   sll_addr[8];
> +};
> +
> +/* Packet types */
> +#define PACKET_HOST                     0 /* To us                */
> +#define PACKET_OTHERHOST                3 /* To someone else    */
> +#define PACKET_LOOPBACK                 5 /* MC/BRD frame looped back */
> +
> +/* Packet socket options */
> +#define PACKET_RX_RING                  5
> +#define PACKET_VERSION                 10
> +#define PACKET_TX_RING                 13
> +#define PACKET_VNET_HDR                15
> +
> +/* Rx ring - header status */
> +#define TP_STATUS_KERNEL                0
> +#define TP_STATUS_USER            (1 << 0)
> +#define TP_STATUS_VLAN_VALID      (1 << 4) /* auxdata has valid tp_vlan_tci */
> +#define TP_STATUS_VLAN_TPID_VALID (1 << 6) /* auxdata has valid tp_vlan_tpid */
> +
> +/* Tx ring - header status */
> +#define TP_STATUS_SEND_REQUEST    (1 << 0)
> +#define TP_STATUS_SENDING         (1 << 1)
> +
> +struct tpacket_hdr {
> +    unsigned long tp_status;
> +    unsigned int tp_len;
> +    unsigned int tp_snaplen;
> +    unsigned short tp_mac;
> +    unsigned short tp_net;
> +    unsigned int tp_sec;
> +    unsigned int tp_usec;
> +};
> +
> +#define TPACKET_ALIGNMENT 16
> +#define TPACKET_ALIGN(x) (((x)+TPACKET_ALIGNMENT-1)&~(TPACKET_ALIGNMENT-1))
> +
> +struct tpacket_hdr_variant1 {
> +    uint32_t tp_rxhash;
> +    uint32_t tp_vlan_tci;
> +    uint16_t tp_vlan_tpid;
> +    uint16_t tp_padding;
> +};
> +
> +struct tpacket3_hdr {
> +    uint32_t  tp_next_offset;
> +    uint32_t  tp_sec;
> +    uint32_t  tp_nsec;
> +    uint32_t  tp_snaplen;
> +    uint32_t  tp_len;
> +    uint32_t  tp_status;
> +    uint16_t  tp_mac;
> +    uint16_t  tp_net;
> +    /* pkt_hdr variants */
> +    union {
> +        struct tpacket_hdr_variant1 hv1;
> +    };
> +    uint8_t  tp_padding[8];
> +};
> +
> +struct tpacket_bd_ts {
> +    unsigned int ts_sec;
> +    union {
> +        unsigned int ts_usec;
> +        unsigned int ts_nsec;
> +    };
> +};
> +
> +struct tpacket_hdr_v1 {
> +    uint32_t block_status;
> +    uint32_t num_pkts;
> +    uint32_t offset_to_first_pkt;
> +    uint32_t blk_len;
> +    uint64_t __attribute__((aligned(8))) seq_num;
> +    struct tpacket_bd_ts ts_first_pkt, ts_last_pkt;
> +};
> +
> +union tpacket_bd_header_u {
> +    struct tpacket_hdr_v1 bh1;
> +};
> +
> +struct tpacket_block_desc {
> +    uint32_t version;
> +    uint32_t offset_to_priv;
> +    union tpacket_bd_header_u hdr;
> +};
> +
> +#define TPACKET3_HDRLEN \
> +    (TPACKET_ALIGN(sizeof(struct tpacket3_hdr)) + sizeof(struct sockaddr_ll))
> +
> +enum tpacket_versions {
> +    TPACKET_V1,
> +    TPACKET_V2,
> +    TPACKET_V3
> +};
> +
> +struct tpacket_req3 {
> +    unsigned int tp_block_size; /* Minimal size of contiguous block */
> +    unsigned int tp_block_nr; /* Number of blocks */
> +    unsigned int tp_frame_size; /* Size of frame */
> +    unsigned int tp_frame_nr; /* Total number of frames */
> +    unsigned int tp_retire_blk_tov; /* Timeout in msecs */
> +    unsigned int tp_sizeof_priv; /* Offset to private data area */
> +    unsigned int tp_feature_req_word;
> +};
> +#endif /* HAVE_TPACKET_V3 */
> +#endif /* __LINUX_IF_PACKET_WRAPPER_H */
> diff --git a/include/sparse/linux/if_packet.h b/include/sparse/linux/if_packet.h
> index 5ff6d47..0ac3fce 100644
> --- a/include/sparse/linux/if_packet.h
> +++ b/include/sparse/linux/if_packet.h

Similarly here: how about just using recvmmsg?

> @@ -5,6 +5,7 @@
>  #error "Use this header only with sparse.  It is not a correct implementation."
>  #endif
>
> +#include <openvswitch/types.h>
>  #include_next <linux/if_packet.h>
>
>  /* Fix endianness of 'spkt_protocol' and 'sll_protocol' members. */
> @@ -27,4 +28,114 @@ struct sockaddr_ll {
>          unsigned char   sll_addr[8];
>  };
>
> +/* Packet types */
> +#define PACKET_HOST                     0 /* To us                */
> +#define PACKET_OTHERHOST                3 /* To someone else   */
> +#define PACKET_LOOPBACK                 5 /* MC/BRD frame looped back */
> +
> +/* Packet socket options */
> +#define PACKET_RX_RING                  5
> +#define PACKET_VERSION                 10
> +#define PACKET_TX_RING                 13
> +#define PACKET_VNET_HDR                15
> +
> +/* Rx ring - header status */
> +#define TP_STATUS_KERNEL                0
> +#define TP_STATUS_USER            (1 << 0)
> +#define TP_STATUS_VLAN_VALID      (1 << 4) /* auxdata has valid tp_vlan_tci */
> +#define TP_STATUS_VLAN_TPID_VALID (1 << 6) /* auxdata has valid tp_vlan_tpid */
> +
> +/* Tx ring - header status */
> +#define TP_STATUS_SEND_REQUEST    (1 << 0)
> +#define TP_STATUS_SENDING         (1 << 1)
> +
> +#define tpacket_hdr rpl_tpacket_hdr
> +struct tpacket_hdr {
> +    unsigned long tp_status;
> +    unsigned int tp_len;
> +    unsigned int tp_snaplen;
> +    unsigned short tp_mac;
> +    unsigned short tp_net;
> +    unsigned int tp_sec;
> +    unsigned int tp_usec;
> +};
> +
> +#define TPACKET_ALIGNMENT 16
> +#define TPACKET_ALIGN(x) (((x)+TPACKET_ALIGNMENT-1)&~(TPACKET_ALIGNMENT-1))
> +
> +#define tpacket_hdr_variant1 rpl_tpacket_hdr_variant1
> +struct tpacket_hdr_variant1 {
> +    uint32_t tp_rxhash;
> +    uint32_t tp_vlan_tci;
> +    uint16_t tp_vlan_tpid;
> +    uint16_t tp_padding;
> +};
> +
> +#define tpacket3_hdr rpl_tpacket3_hdr
> +struct tpacket3_hdr {
> +    uint32_t  tp_next_offset;
> +    uint32_t  tp_sec;
> +    uint32_t  tp_nsec;
> +    uint32_t  tp_snaplen;
> +    uint32_t  tp_len;
> +    uint32_t  tp_status;
> +    uint16_t  tp_mac;
> +    uint16_t  tp_net;
> +    /* pkt_hdr variants */
> +    union {
> +        struct tpacket_hdr_variant1 hv1;
> +    };
> +    uint8_t  tp_padding[8];
> +};
> +
> +#define tpacket_bd_ts rpl_tpacket_bd_ts
> +struct tpacket_bd_ts {
> +    unsigned int ts_sec;
> +    union {
> +        unsigned int ts_usec;
> +        unsigned int ts_nsec;
> +    };
> +};
> +
> +#define tpacket_hdr_v1 rpl_tpacket_hdr_v1
> +struct tpacket_hdr_v1 {
> +    uint32_t block_status;
> +    uint32_t num_pkts;
> +    uint32_t offset_to_first_pkt;
> +    uint32_t blk_len;
> +    uint64_t __attribute__((aligned(8))) seq_num;
> +    struct tpacket_bd_ts ts_first_pkt, ts_last_pkt;
> +};
> +
> +#define tpacket_bd_header_u rpl_tpacket_bd_header_u
> +union tpacket_bd_header_u {
> +    struct tpacket_hdr_v1 bh1;
> +};
> +
> +#define tpacket_block_desc rpl_tpacket_block_desc
> +struct tpacket_block_desc {
> +    uint32_t version;
> +    uint32_t offset_to_priv;
> +    union tpacket_bd_header_u hdr;
> +};
> +
> +#define TPACKET3_HDRLEN \
> +    (TPACKET_ALIGN(sizeof(struct tpacket3_hdr)) + sizeof(struct sockaddr_ll))
> +
> +enum rpl_tpacket_versions {
> +    TPACKET_V1,
> +    TPACKET_V2,
> +    TPACKET_V3
> +};
> +
> +#define tpacket_req3 rpl_tpacket_req3
> +struct tpacket_req3 {
> +    unsigned int tp_block_size; /* Minimal size of contiguous block */
> +    unsigned int tp_block_nr; /* Number of blocks */
> +    unsigned int tp_frame_size; /* Size of frame */
> +    unsigned int tp_frame_nr; /* Total number of frames */
> +    unsigned int tp_retire_blk_tov; /* Timeout in msecs */
> +    unsigned int tp_sizeof_priv; /* Offset to private data area */
> +    unsigned int tp_feature_req_word;
> +};
>  #endif
> diff --git a/lib/netdev-linux-private.h b/lib/netdev-linux-private.h
> index c7c515f..ccd58f4 100644
> --- a/lib/netdev-linux-private.h
> +++ b/lib/netdev-linux-private.h
> @@ -26,6 +26,7 @@
>  #include <linux/mii.h>
>  #include <stdint.h>
>  #include <stdbool.h>
> +#include <linux/if_packet.h>
This include needs to be placed in order with the other headers.

>
>  #include "dp-packet.h"
>  #include "netdev-afxdp.h"
> @@ -41,6 +42,22 @@ struct netdev;
>  /* The maximum packet length is 16 bits */
>  #define LINUX_RXQ_TSO_MAX_LEN 65535
>
> +#ifdef HAVE_TPACKET_V3
> +struct tpacket_ring {
> +    int sockfd;
> +    struct iovec *rd;
> +    uint8_t *mm_space;
> +    size_t mm_len, rd_len;
> +    struct sockaddr_ll ll;
> +    int type, rd_num, flen;
> +    struct tpacket_req3 req;
> +    uint32_t block_num;
> +    uint32_t frame_num;
> +    uint32_t frame_num_in_block;
> +    void * ppd;
> +};
> +#endif /* HAVE_TPACKET_V3 */
> +
>  struct netdev_rxq_linux {
>      struct netdev_rxq up;
>      bool is_tap;
> @@ -105,6 +122,11 @@ struct netdev_linux {
>
>      int numa_id;                /* NUMA node id. */
>
> +#ifdef HAVE_TPACKET_V3
> +    struct tpacket_ring *tp_rx_ring;
> +    struct tpacket_ring *tp_tx_ring;
> +#endif
> +
>  #ifdef HAVE_AF_XDP
>      /* AF_XDP information. */
>      struct xsk_socket_info **xsks;
> diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
> index c6e46f1..f734086 100644
> --- a/lib/netdev-linux.c
> +++ b/lib/netdev-linux.c
> @@ -48,6 +48,9 @@
>  #include <stdlib.h>
>  #include <string.h>
>  #include <unistd.h>
> +#ifdef HAVE_TPACKET_V3
> +#include <sys/mman.h>
> +#endif
>
>  #include "coverage.h"
>  #include "dp-packet.h"
> @@ -970,6 +973,7 @@ netdev_linux_construct_tap(struct netdev *netdev_)
>      static const char tap_dev[] = "/dev/net/tun";
>      const char *name = netdev_->name;
>      struct ifreq ifr;
> +    bool tso = userspace_tso_enabled();
>
>      int error = netdev_linux_common_construct(netdev_);
>      if (error) {
> @@ -987,7 +991,7 @@ netdev_linux_construct_tap(struct netdev *netdev_)
>      /* Create tap device. */
>      get_flags(&netdev->up, &netdev->ifi_flags);
>      ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
> -    if (userspace_tso_enabled()) {
> +    if (tso) {
>          ifr.ifr_flags |= IFF_VNET_HDR;
>      }
>
> @@ -1012,7 +1016,7 @@ netdev_linux_construct_tap(struct netdev *netdev_)
>          goto error_close;
>      }
>
> -    if (userspace_tso_enabled()) {
> +    if (tso) {
>          /* Old kernels don't support TUNSETOFFLOAD. If TUNSETOFFLOAD is
>           * available, it will return EINVAL when a flag is unknown.
>           * Therefore, try enabling offload with no flags to check
> @@ -1074,6 +1078,111 @@ netdev_linux_rxq_alloc(void)
>      return &rx->up;
>  }
>
> +#ifdef HAVE_TPACKET_V3
> +static inline struct tpacket3_hdr *
> +tpacket_get_next_frame(struct tpacket_ring *ring, uint32_t frame_num)
> +{
> +    uint8_t *f0 = ring->rd[0].iov_base;
> +
> +    return ALIGNED_CAST(struct tpacket3_hdr *,
> +               f0 + (frame_num * ring->req.tp_frame_size));
> +}
> +
> +/*
> + * ring->rd_num is tp_block_nr, ring->flen is tp_block_size
Maybe add more explanation to this comment.

> + */
> +static inline void
> +tpacket_fill_ring(struct tpacket_ring *ring, unsigned int blocks, int type)
> +{
> +    if (type == PACKET_RX_RING) {
> +        ring->req.tp_retire_blk_tov = 0;
> +        ring->req.tp_sizeof_priv = 0;
> +        ring->req.tp_feature_req_word = 0;
> +    }
> +
> +    if (userspace_tso_enabled()) {
> +        /* For TX ring, the whole packet must be in one frame
> +         * so tp_frame_size must big enough to accommodate
> +         * 64K packet, tpacket3_hdr will occupy some bytes,
> +         * the final frame size is 64K + 4K = 68K.
> +         */
> +        ring->req.tp_frame_size = (getpagesize() << 4) + getpagesize();
> +        ring->req.tp_block_size = ring->req.tp_frame_size;
> +    } else {
> +        ring->req.tp_block_size = getpagesize() << 2;
> +        ring->req.tp_frame_size = TPACKET_ALIGNMENT << 7;
> +    }
> +
> +    ring->req.tp_block_nr = blocks;
> +
> +    ring->req.tp_frame_nr = ring->req.tp_block_size /
> +                             ring->req.tp_frame_size *
> +                             ring->req.tp_block_nr;
> +
> +    ring->mm_len = ring->req.tp_block_size * ring->req.tp_block_nr;
> +    ring->rd_num = ring->req.tp_block_nr;
> +    ring->flen = ring->req.tp_block_size;
> +}
> +
> +static int
> +tpacket_setup_ring(int sock, struct tpacket_ring *ring, int type)
> +{
> +    int ret = 0;
> +    unsigned int blocks;
> +
> +    if (userspace_tso_enabled()) {
> +        blocks = 128;
> +    } else {
> +        blocks = 256;
> +    }
> +    ring->type = type;
> +    tpacket_fill_ring(ring, blocks, type);
> +    ret = setsockopt(sock, SOL_PACKET, type, &ring->req,
> +                     sizeof(ring->req));
> +
> +    if (ret == -1) {
> +        return -1;
> +    }
> +
> +    ring->rd_len = ring->rd_num * sizeof(*ring->rd);
> +    ring->rd = xmalloc(ring->rd_len);
> +    if (ring->rd == NULL) {
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
> +static inline int
> +tpacket_mmap_rx_tx_ring(int sock, struct tpacket_ring *rx_ring,
> +                struct tpacket_ring *tx_ring)
> +{
> +    int i;
> +
> +    rx_ring->mm_space = mmap(NULL, rx_ring->mm_len + tx_ring->mm_len,
> +                          PROT_READ | PROT_WRITE,
> +                          MAP_SHARED | MAP_LOCKED | MAP_POPULATE, sock, 0);
> +    if (rx_ring->mm_space == MAP_FAILED) {
> +        return -1;
> +    }
> +
> +    memset(rx_ring->rd, 0, rx_ring->rd_len);
> +    for (i = 0; i < rx_ring->rd_num; ++i) {
> +        rx_ring->rd[i].iov_base = rx_ring->mm_space + (i * rx_ring->flen);
> +        rx_ring->rd[i].iov_len = rx_ring->flen;
> +    }
> +
> +    tx_ring->mm_space = rx_ring->mm_space + rx_ring->mm_len;
> +    memset(tx_ring->rd, 0, tx_ring->rd_len);
> +    for (i = 0; i < tx_ring->rd_num; ++i) {
> +        tx_ring->rd[i].iov_base = tx_ring->mm_space + (i * tx_ring->flen);
> +        tx_ring->rd[i].iov_len = tx_ring->flen;
> +    }
> +
> +    return 0;
> +}
> +#endif
> +
>  static int
>  netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
>  {
> @@ -1081,6 +1190,7 @@ netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
>      struct netdev *netdev_ = rx->up.netdev;
>      struct netdev_linux *netdev = netdev_linux_cast(netdev_);
>      int error;
> +    bool tso = userspace_tso_enabled();
>
>      ovs_mutex_lock(&netdev->mutex);
>      rx->is_tap = is_tap_netdev(netdev_);
> @@ -1089,6 +1199,7 @@ netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
>      } else {
>          struct sockaddr_ll sll;
>          int ifindex, val;
> +
>          /* Result of tcpdump -dd inbound */
>          static const struct sock_filter filt[] = {
>              { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
> @@ -1101,7 +1212,7 @@ netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
>          };
>
>          /* Create file descriptor. */
> -        rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
> +        rx->fd = socket(PF_PACKET, SOCK_RAW, (OVS_FORCE int) htons(ETH_P_ALL));
>          if (rx->fd < 0) {
>              error = errno;
>              VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
> @@ -1116,7 +1227,7 @@ netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
>              goto error;
>          }
>
> -        if (userspace_tso_enabled()
> +        if (tso
>              && setsockopt(rx->fd, SOL_PACKET, PACKET_VNET_HDR, &val,
>                            sizeof val)) {
>              error = errno;
> @@ -1125,6 +1236,53 @@ netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
>              goto error;
>          }
>
> +#ifdef HAVE_TPACKET_V3
> +        if (!tso) {
> +            static int ver = TPACKET_V3;
> +
> +            /* TPACKET_V3 ring setup must be after setsockopt
> +             * PACKET_VNET_HDR because PACKET_VNET_HDR will return error
> +             * (EBUSY) if ring is set up
> +             */
> +            error = setsockopt(rx->fd, SOL_PACKET, PACKET_VERSION, &ver,
> +                               sizeof(ver));
> +            if (error != 0) {
> +                error = errno;
> +                VLOG_ERR("%s: failed to set tpacket version (%s)",
> +                         netdev_get_name(netdev_), ovs_strerror(error));
> +                goto error;
> +            }
> +            netdev->tp_rx_ring = xzalloc(sizeof(struct tpacket_ring));
> +            netdev->tp_tx_ring = xzalloc(sizeof(struct tpacket_ring));
> +            netdev->tp_rx_ring->sockfd = rx->fd;
> +            netdev->tp_tx_ring->sockfd = rx->fd;
> +            error = tpacket_setup_ring(rx->fd, netdev->tp_rx_ring,
> +                                       PACKET_RX_RING);
> +            if (error != 0) {
> +                error = errno;
> +                VLOG_ERR("%s: failed to set tpacket rx ring (%s)",
> +                         netdev_get_name(netdev_), ovs_strerror(error));
> +                goto error;
> +            }
> +            error = tpacket_setup_ring(rx->fd, netdev->tp_tx_ring,
> +                                       PACKET_TX_RING);
> +            if (error != 0) {
> +                error = errno;
> +                VLOG_ERR("%s: failed to set tpacket tx ring (%s)",
> +                         netdev_get_name(netdev_), ovs_strerror(error));
> +                goto error;
> +            }
> +            error = tpacket_mmap_rx_tx_ring(rx->fd, netdev->tp_rx_ring,
> +                                           netdev->tp_tx_ring);
> +            if (error != 0) {
> +                error = errno;
> +                VLOG_ERR("%s: failed to mmap tpacket rx & tx ring (%s)",
> +                         netdev_get_name(netdev_), ovs_strerror(error));
> +                goto error;
> +            }
> +        }
> +#endif
> +
>          /* Set non-blocking mode. */
>          error = set_nonblocking(rx->fd);
>          if (error) {
> @@ -1139,9 +1297,16 @@ netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
>
>          /* Bind to specific ethernet device. */
>          memset(&sll, 0, sizeof sll);
> -        sll.sll_family = AF_PACKET;
> +        sll.sll_family = PF_PACKET;
What's the difference here?
Does using AF_PACKET not work?

> +#ifdef HAVE_TPACKET_V3
> +        if (!tso) {
> +            sll.sll_hatype = 0;
> +            sll.sll_pkttype = 0;
> +            sll.sll_halen = 0;
> +        }
> +#endif
>          sll.sll_ifindex = ifindex;
> -        sll.sll_protocol = htons(ETH_P_ALL);
> +        sll.sll_protocol = (OVS_FORCE ovs_be16) htons(ETH_P_ALL);
>          if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
>              error = errno;
>              VLOG_ERR("%s: failed to bind raw socket (%s)",
> @@ -1178,6 +1343,19 @@ netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
>      int i;
>
>      if (!rx->is_tap) {
> +#ifdef HAVE_TPACKET_V3
> +        if (!userspace_tso_enabled()) {
> +            struct netdev_linux *netdev = netdev_linux_cast(rx->up.netdev);
> +
> +            if (netdev->tp_rx_ring) {
> +                munmap(netdev->tp_rx_ring->mm_space,
> +                       2 * netdev->tp_rx_ring->mm_len);
> +                free(netdev->tp_rx_ring->rd);
> +                free(netdev->tp_tx_ring->rd);
> +            }
> +        }
> +#endif
> +
>          close(rx->fd);
>      }
>
> @@ -1220,8 +1398,8 @@ auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
>   * It also used recvmmsg to reduce multiple syscalls overhead;
>   */
>  static int
> -netdev_linux_batch_rxq_recv_sock(struct netdev_rxq_linux *rx, int mtu,
> -                                 struct dp_packet_batch *batch)
> +netdev_linux_batch_rxq_recv_sock(struct netdev_rxq_linux *rx, bool tso,
> +                                 int mtu, struct dp_packet_batch *batch)
>  {

I think this is an unrelated change. We can call userspace_tso_enabled()
in the function instead of passing an extra argument.

>      int iovlen;
>      size_t std_len;
> @@ -1237,7 +1415,7 @@ netdev_linux_batch_rxq_recv_sock(struct netdev_rxq_linux *rx, int mtu,
>      struct dp_packet *buffers[NETDEV_MAX_BURST];
>      int i;
>
> -    if (userspace_tso_enabled()) {
> +    if (tso) {
I think this is an unrelated change.

>          /* Use the buffer from the allocated packet below to receive MTU
>           * sized packets and an aux_buf for extra TSO data. */
>          iovlen = IOV_TSO_SIZE;
> @@ -1368,7 +1546,7 @@ netdev_linux_batch_rxq_recv_sock(struct netdev_rxq_linux *rx, int mtu,
>   * packets are added into *batch. The return value is 0 or errno.
>   */
>  static int
> -netdev_linux_batch_rxq_recv_tap(struct netdev_rxq_linux *rx, int mtu,
> +netdev_linux_batch_rxq_recv_tap(struct netdev_rxq_linux *rx, bool tso, int mtu,
>                                  struct dp_packet_batch *batch)

I think this is an unrelated change. We can call userspace_tso_enabled()
in the function instead of passing it as an argument.

>  {
>      int virtio_net_hdr_size;
> @@ -1377,7 +1555,7 @@ netdev_linux_batch_rxq_recv_tap(struct netdev_rxq_linux *rx, int mtu,
>      int iovlen;
>      int i;
>
> -    if (userspace_tso_enabled()) {
> +    if (tso) {
>          /* Use the buffer from the allocated packet below to receive MTU
>           * sized packets and an aux_buf for extra TSO data. */
>          iovlen = IOV_TSO_SIZE;
> @@ -1454,6 +1632,109 @@ netdev_linux_batch_rxq_recv_tap(struct netdev_rxq_linux *rx, int mtu,
>      return 0;
>  }
>
> +#ifdef HAVE_TPACKET_V3
> +static int
> +netdev_linux_batch_recv_tpacket(struct netdev_rxq_linux *rx, bool tso, int mtu,
> +                                struct dp_packet_batch *batch)
> +{
> +    struct netdev *netdev_ = netdev_rxq_get_netdev(&rx->up);
> +    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
> +    struct dp_packet *buffer;
> +    int i = 0;
> +    unsigned int block_num;
> +    unsigned int fn_in_block;
> +    struct tpacket_block_desc *pbd;
> +    struct tpacket3_hdr *ppd;
> +    int virtio_net_hdr_size;
> +    size_t buffer_len;
> +
> +    if (tso) {
> +        virtio_net_hdr_size = sizeof(struct virtio_net_hdr);
> +    } else {
> +        virtio_net_hdr_size = 0;
> +    }
> +    buffer_len = virtio_net_hdr_size + VLAN_ETH_HEADER_LEN + mtu;
> +
> +    ppd = ALIGNED_CAST(struct tpacket3_hdr *, netdev->tp_rx_ring->ppd);
> +    block_num = netdev->tp_rx_ring->block_num;
> +    fn_in_block = netdev->tp_rx_ring->frame_num_in_block;
> +    pbd = ALIGNED_CAST(struct tpacket_block_desc *,
> +              netdev->tp_rx_ring->rd[block_num].iov_base);
> +
> +    while (i < NETDEV_MAX_BURST) {
> +        if ((pbd->hdr.bh1.block_status & TP_STATUS_USER) == 0) {
> +            break;
> +        }
> +        if (fn_in_block == 0) {
> +            ppd = ALIGNED_CAST(struct tpacket3_hdr *, (uint8_t *) pbd +
> +                                   pbd->hdr.bh1.offset_to_first_pkt);
> +        }
> +
> +        if (ppd->tp_snaplen > (mtu + VLAN_ETH_HEADER_LEN)) {
> +            buffer_len = virtio_net_hdr_size + VLAN_ETH_HEADER_LEN
> +                         + ppd->tp_snaplen;
> +        }
> +
> +        buffer = dp_packet_new_with_headroom(buffer_len, DP_NETDEV_HEADROOM);
> +        memcpy(dp_packet_data(buffer),
> +               (uint8_t *) ppd + ppd->tp_mac - virtio_net_hdr_size,
> +               ppd->tp_snaplen + virtio_net_hdr_size);
> +        dp_packet_set_size(buffer,
> +                           dp_packet_size(buffer) + ppd->tp_snaplen
> +                               + virtio_net_hdr_size);
> +
> +        if (virtio_net_hdr_size && netdev_linux_parse_vnet_hdr(buffer)) {
> +            /* Unexpected error situation: the virtio header is not present
> +             * or corrupted. Drop the packet but continue in case next ones
> +             * are correct. */
> +            dp_packet_delete(buffer);
> +            netdev->rx_dropped += 1;
> +            VLOG_WARN_RL(&rl, "%s: Dropped packet: Invalid virtio net header",
> +                         netdev_get_name(netdev_));
> +        } else {
> +            if (ppd->tp_status & TP_STATUS_VLAN_VALID) {
> +                struct eth_header *eth;
> +                bool double_tagged;
> +                ovs_be16 vlan_tpid;
> +
> +                eth = dp_packet_data(buffer);
> +                double_tagged = eth->eth_type == htons(ETH_TYPE_VLAN_8021Q);
> +                if (ppd->tp_status & TP_STATUS_VLAN_TPID_VALID) {
> +                    vlan_tpid = htons(ppd->hv1.tp_vlan_tpid);
> +                } else if (double_tagged) {
> +                    vlan_tpid = htons(ETH_TYPE_VLAN_8021AD);
> +                } else {
> +                    vlan_tpid = htons(ETH_TYPE_VLAN_8021Q);
> +                }
> +                eth_push_vlan(buffer, vlan_tpid, htons(ppd->hv1.tp_vlan_tci));
> +            }
> +            dp_packet_batch_add(batch, buffer);
> +        }
> +
> +        fn_in_block++;
> +        if (fn_in_block >= pbd->hdr.bh1.num_pkts) {
> +            pbd->hdr.bh1.block_status = TP_STATUS_KERNEL;
> +            block_num = (block_num + 1) %
> +                            netdev->tp_rx_ring->req.tp_block_nr;
> +            pbd = (struct tpacket_block_desc *)
> +                     netdev->tp_rx_ring->rd[block_num].iov_base;
> +            fn_in_block = 0;
> +            ppd = NULL;
> +        } else {
> +            ppd = ALIGNED_CAST(struct tpacket3_hdr *,
> +                   (uint8_t *) ppd + ppd->tp_next_offset);
> +        }
> +        i++;
> +    }
> +
> +    netdev->tp_rx_ring->block_num = block_num;
> +    netdev->tp_rx_ring->frame_num_in_block = fn_in_block;
> +    netdev->tp_rx_ring->ppd = ppd;
> +
> +    return 0;
> +}
> +#endif /* HAVE_TPACKET_V3 */
> +
>  static int
>  netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch,
>                        int *qfill)
> @@ -1462,12 +1743,13 @@ netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch,
>      struct netdev *netdev = rx->up.netdev;
>      ssize_t retval;
>      int mtu;
> +    bool tso = userspace_tso_enabled();
>
>      if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
>          mtu = ETH_PAYLOAD_MAX;
>      }
>
> -    if (userspace_tso_enabled()) {
> +    if (tso) {
>          /* Allocate TSO packets. The packet has enough headroom to store
>           * a full non-TSO packet. When a TSO packet is received, the data
>           * from non-TSO buffer (std_len) is prepended to the TSO packet
> @@ -1485,9 +1767,19 @@ netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch,
>      }
>
>      dp_packet_batch_init(batch);
> -    retval = (rx->is_tap
> -              ? netdev_linux_batch_rxq_recv_tap(rx, mtu, batch)
> -              : netdev_linux_batch_rxq_recv_sock(rx, mtu, batch));
> +    if (rx->is_tap) {
> +        retval = netdev_linux_batch_rxq_recv_tap(rx, tso, mtu, batch);
> +    } else {
> +        if (tso) {
> +            retval = netdev_linux_batch_rxq_recv_sock(rx, tso, mtu, batch);
> +        } else {
> +#ifndef HAVE_TPACKET_V3
> +            retval = netdev_linux_batch_rxq_recv_sock(rx, tso, mtu, batch);
> +#else
> +            retval = netdev_linux_batch_recv_tpacket(rx, tso, mtu, batch);
> +#endif
> +        }
> +    }
>
>      if (retval) {
>          if (retval != EAGAIN && retval != EMSGSIZE) {
> @@ -1692,6 +1984,83 @@ netdev_linux_get_numa_id(const struct netdev *netdev_)
>      return numa_id;
>  }
>
> +#ifdef HAVE_TPACKET_V3
> +static inline int
> +tpacket_tx_is_ready(void * next_frame)
> +{
> +    struct tpacket3_hdr *hdr = ALIGNED_CAST(struct tpacket3_hdr *, next_frame);
> +
> +    return !(hdr->tp_status & (TP_STATUS_SEND_REQUEST | TP_STATUS_SENDING));
> +}
> +
> +static int
> +netdev_linux_tpacket_batch_send(struct netdev *netdev_, bool tso, int mtu,
> +                            struct dp_packet_batch *batch)
> +{
> +    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
> +    struct dp_packet *packet;
> +    int sockfd;
> +    ssize_t bytes_sent;
> +    int total_pkts = 0;
> +
> +    unsigned int frame_nr = netdev->tp_tx_ring->req.tp_frame_nr;
> +    unsigned int frame_num = netdev->tp_tx_ring->frame_num;
> +
> +    /* The Linux tap driver returns EIO if the device is not up,
> +     * so if the device is not up, don't waste time sending it.
> +     * However, if the device is in another network namespace
> +     * then OVS can't retrieve the state. In that case, send the
> +     * packets anyway. */
> +    if (netdev->present && !(netdev->ifi_flags & IFF_UP)) {
> +        netdev->tx_dropped += dp_packet_batch_size(batch);
> +        return 0;
> +    }
> +
> +    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> +        size_t size;
> +        struct tpacket3_hdr *ppd;
> +
> +        if (tso) {
> +            netdev_linux_prepend_vnet_hdr(packet, mtu);
> +        }
> +
> +        size = dp_packet_size(packet);
> +        ppd = tpacket_get_next_frame(netdev->tp_tx_ring, frame_num);
> +
> +        if (!tpacket_tx_is_ready(ppd)) {
> +            break;
> +        }
> +        ppd->tp_snaplen = size;
> +        ppd->tp_len = size;
> +        ppd->tp_next_offset = 0;
> +
> +        memcpy((uint8_t *)ppd + TPACKET3_HDRLEN - sizeof(struct sockaddr_ll),
> +               dp_packet_data(packet),
> +               size);
> +        ppd->tp_status = TP_STATUS_SEND_REQUEST;
> +        frame_num = (frame_num + 1) % frame_nr;
> +        total_pkts++;
> +    }
> +    netdev->tp_tx_ring->frame_num = frame_num;
> +
> +    /* Kick-off transmits */
> +    if (total_pkts != 0) {
> +        sockfd = netdev->tp_tx_ring->sockfd;
> +        bytes_sent = sendto(sockfd, NULL, 0, MSG_DONTWAIT, NULL, 0);
> +        if (bytes_sent == -1 &&
> +                errno != ENOBUFS && errno != EAGAIN) {
> +            /*
> +             * In case of an ENOBUFS/EAGAIN error all of the enqueued
> +             * packets will be considered successful even though only some
> +             * are sent.
> +             */
> +            netdev->tx_dropped += dp_packet_batch_size(batch);
> +        }
> +    }
> +    return 0;
> +}
> +#endif
> +
>  /* Sends 'batch' on 'netdev'.  Returns 0 if successful, otherwise a positive
>   * errno value.  Returns EAGAIN without blocking if the packet cannot be queued
>   * immediately.  Returns EMSGSIZE if a partial packet was transmitted or if
> @@ -1731,7 +2100,17 @@ netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
>              goto free_batch;
>          }
>
> -        error = netdev_linux_sock_batch_send(sock, ifindex, tso, mtu, batch);
> +        if (tso) {
> +            error = netdev_linux_sock_batch_send(sock, ifindex, tso, mtu,
> +                                                 batch);
> +        } else {
> +#ifndef HAVE_TPACKET_V3
> +            error = netdev_linux_sock_batch_send(sock, ifindex, tso, mtu,
> +                                                 batch);
> +#else
> +            error = netdev_linux_tpacket_batch_send(netdev_, tso, mtu, batch);
> +#endif
> +        }
>      } else {
>          error = netdev_linux_tap_batch_send(netdev_, tso, mtu, batch);
>      }
> --
> 1.8.3.1
>
> _______________________________________________
> dev mailing list
> dev at openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev


More information about the dev mailing list