[ovs-dev] 答复: [PATCH v2] netdev-linux: Prepend the std packet in the TSO packet

Flavio Leitner fbl at sysclose.org
Tue Feb 4 15:00:19 UTC 2020


On Tue, Feb 04, 2020 at 12:51:24AM +0000, Yi Yang (杨�D)-云服务集团 wrote:
> Hi, Flavio
> 
> With this one patch and previous several merged TSO-related patches, can
> veth work with "ethtool -K vethX tx on"? I always can't figure out why veth
> can work in dpdk data path when tx offload features are on, it looks like
> you're fixing this big issue, right?

If you have tso enabled with dpdk, then veth works with vethX tx on
(which is the default setting for veth). Otherwise TSO is not
enabled and then you need to turn tx offloading off.

You said "can work in dpdk data path when tx ... are on", so I think
it's okay? Not sure though.

> For tap interface, it can't support TSO, do you Redhat guys have plan to
> enable it on kernel side.

Yeah, currently it only works in one direction (OvS -> kernel). I am
looking into this now.
See below my iperf3 results as a reference:

Traffic Direction      TSO disabled         TSO enabled  
VM->tap                2.98 Gbits/sec       22.7 Gbits/sec
VM->veth               2.96 Gbits/sec       22.6 Gbits/sec
veth->VM               2.30 Gbits/sec       9.58 Gbits/sec
tap->VM                2.29 Gbits/sec       2.19 Gbits/sec

fbl
 
> -----邮件原件-----
> 发件人: Flavio Leitner [mailto:fbl at sysclose.org] 
> 发送时间: 2020年2月4日 5:46
> 收件人: dev at openvswitch.org
> 抄送: Stokes Ian <ian.stokes at intel.com>; Loftus Ciara
> <ciara.loftus at intel.com>; Ilya Maximets <i.maximets at ovn.org>; Yi Yang (杨
> ?D)-云服务集团 <yangyi01 at inspur.com>; txfh2007 <txfh2007 at aliyun.com>; Ben
> Pfaff <blp at ovn.org>; Flavio Leitner <fbl at sysclose.org>
> 主题: [PATCH v2] netdev-linux: Prepend the std packet in the TSO packet
> 
> Usually TSO packets are close to 50k, 60k bytes long, so to to copy less
> bytes when receiving a packet from the kernel change the approach. Instead
> of extending the MTU sized packet received and append with remaining TSO
> data from the TSO buffer, allocate a TSO packet with enough headroom to
> prepend the std packet data.
> 
> Fixes: 29cf9c1b3b9c ("userspace: Add TCP Segmentation Offload support")
> Suggested-by: Ben Pfaff <blp at ovn.org>
> Signed-off-by: Flavio Leitner <fbl at sysclose.org>
> ---
>  lib/dp-packet.c            |   8 +--
>  lib/dp-packet.h            |   2 +
>  lib/netdev-linux-private.h |   3 +-
>  lib/netdev-linux.c         | 117 ++++++++++++++++++++++---------------
>  4 files changed, 78 insertions(+), 52 deletions(-)
> 
> V2:
>   - tso packets tailroom depends on headroom in netdev_linux_rxq_recv()
>   - iov_len uses packet's tailroom.
> 
>   This patch depends on a previous posted patch to work:
>   Subject: netdev-linux-private: fix max length to be 16 bits
>   https://mail.openvswitch.org/pipermail/ovs-dev/2020-February/367469.html
> 
>   With both patches applied, I can run iperf3 and scp on both directions
>   with good performance and no issues.
> 
> diff --git a/lib/dp-packet.c b/lib/dp-packet.c index 8dfedcb7c..cd2623500
> 100644
> --- a/lib/dp-packet.c
> +++ b/lib/dp-packet.c
> @@ -243,8 +243,8 @@ dp_packet_copy__(struct dp_packet *b, uint8_t *new_base,
>  
>  /* Reallocates 'b' so that it has exactly 'new_headroom' and 'new_tailroom'
>   * bytes of headroom and tailroom, respectively. */ -static void
> -dp_packet_resize__(struct dp_packet *b, size_t new_headroom, size_t
> new_tailroom)
> +void
> +dp_packet_resize(struct dp_packet *b, size_t new_headroom, size_t 
> +new_tailroom)
>  {
>      void *new_base, *new_data;
>      size_t new_allocated;
> @@ -297,7 +297,7 @@ void
>  dp_packet_prealloc_tailroom(struct dp_packet *b, size_t size)  {
>      if (size > dp_packet_tailroom(b)) {
> -        dp_packet_resize__(b, dp_packet_headroom(b), MAX(size, 64));
> +        dp_packet_resize(b, dp_packet_headroom(b), MAX(size, 64));
>      }
>  }
>  
> @@ -308,7 +308,7 @@ void
>  dp_packet_prealloc_headroom(struct dp_packet *b, size_t size)  {
>      if (size > dp_packet_headroom(b)) {
> -        dp_packet_resize__(b, MAX(size, 64), dp_packet_tailroom(b));
> +        dp_packet_resize(b, MAX(size, 64), dp_packet_tailroom(b));
>      }
>  }
>  
> diff --git a/lib/dp-packet.h b/lib/dp-packet.h index 69ae5dfac..9a9d35183
> 100644
> --- a/lib/dp-packet.h
> +++ b/lib/dp-packet.h
> @@ -152,6 +152,8 @@ struct dp_packet *dp_packet_clone_with_headroom(const
> struct dp_packet *,  struct dp_packet *dp_packet_clone_data(const void *,
> size_t);  struct dp_packet *dp_packet_clone_data_with_headroom(const void *,
> size_t,
>                                                       size_t headroom);
> +void dp_packet_resize(struct dp_packet *b, size_t new_headroom,
> +                      size_t new_tailroom);
>  static inline void dp_packet_delete(struct dp_packet *);
>  
>  static inline void *dp_packet_at(const struct dp_packet *, size_t offset,
> diff --git a/lib/netdev-linux-private.h b/lib/netdev-linux-private.h index
> be2d7b10b..c7c515f70 100644
> --- a/lib/netdev-linux-private.h
> +++ b/lib/netdev-linux-private.h
> @@ -45,7 +45,8 @@ struct netdev_rxq_linux {
>      struct netdev_rxq up;
>      bool is_tap;
>      int fd;
> -    char *aux_bufs[NETDEV_MAX_BURST]; /* Batch of preallocated TSO buffers.
> */
> +    struct dp_packet *aux_bufs[NETDEV_MAX_BURST]; /* Preallocated TSO
> +                                                     packets. */
>  };
>  
>  int netdev_linux_construct(struct netdev *); diff --git a/lib/netdev-linux.
> c b/lib/netdev-linux.c index 6add3e2fc..c6f3d2740 100644
> --- a/lib/netdev-linux.c
> +++ b/lib/netdev-linux.c
> @@ -1052,15 +1052,6 @@ static struct netdev_rxq *
>  netdev_linux_rxq_alloc(void)
>  {
>      struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
> -    if (userspace_tso_enabled()) {
> -        int i;
> -
> -        /* Allocate auxiliay buffers to receive TSO packets. */
> -        for (i = 0; i < NETDEV_MAX_BURST; i++) {
> -            rx->aux_bufs[i] = xmalloc(LINUX_RXQ_TSO_MAX_LEN);
> -        }
> -    }
> -
>      return &rx->up;
>  }
>  
> @@ -1172,7 +1163,7 @@ netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
>      }
>  
>      for (i = 0; i < NETDEV_MAX_BURST; i++) {
> -        free(rx->aux_bufs[i]);
> +        dp_packet_delete(rx->aux_bufs[i]);
>      }
>  }
>  
> @@ -1238,13 +1229,18 @@ netdev_linux_batch_rxq_recv_sock(struct
> netdev_rxq_linux *rx, int mtu,
>          virtio_net_hdr_size = 0;
>      }
>  
> -    std_len = VLAN_ETH_HEADER_LEN + mtu + virtio_net_hdr_size;
> +    /* The length here needs to be accounted in the same way when the
> +     * aux_buf is allocated so that it can be prepended to TSO buffer. */
> +    std_len = virtio_net_hdr_size + VLAN_ETH_HEADER_LEN + mtu;
>      for (i = 0; i < NETDEV_MAX_BURST; i++) {
>           buffers[i] = dp_packet_new_with_headroom(std_len,
> DP_NETDEV_HEADROOM);
>           iovs[i][IOV_PACKET].iov_base = dp_packet_data(buffers[i]);
>           iovs[i][IOV_PACKET].iov_len = std_len;
> -         iovs[i][IOV_AUXBUF].iov_base = rx->aux_bufs[i];
> -         iovs[i][IOV_AUXBUF].iov_len = LINUX_RXQ_TSO_MAX_LEN;
> +         if (iovlen == IOV_TSO_SIZE) {
> +             iovs[i][IOV_AUXBUF].iov_base =
> dp_packet_data(rx->aux_bufs[i]);
> +             iovs[i][IOV_AUXBUF].iov_len =
> dp_packet_tailroom(rx->aux_bufs[i]);
> +         }
> +
>           mmsgs[i].msg_hdr.msg_name = NULL;
>           mmsgs[i].msg_hdr.msg_namelen = 0;
>           mmsgs[i].msg_hdr.msg_iov = iovs[i]; @@ -1268,6 +1264,8 @@
> netdev_linux_batch_rxq_recv_sock(struct netdev_rxq_linux *rx, int mtu,
>      }
>  
>      for (i = 0; i < retval; i++) {
> +        struct dp_packet *pkt;
> +
>          if (mmsgs[i].msg_len < ETH_HEADER_LEN) {
>              struct netdev *netdev_ = netdev_rxq_get_netdev(&rx->up);
>              struct netdev_linux *netdev = netdev_linux_cast(netdev_); @@
> -1280,29 +1278,29 @@ netdev_linux_batch_rxq_recv_sock(struct
> netdev_rxq_linux *rx, int mtu,
>          }
>  
>          if (mmsgs[i].msg_len > std_len) {
> -            /* Build a single linear TSO packet by expanding the current
> packet
> -             * to append the data received in the aux_buf. */
> -            size_t extra_len = mmsgs[i].msg_len - std_len;
> -
> -            dp_packet_set_size(buffers[i], dp_packet_size(buffers[i])
> -                               + std_len);
> -            dp_packet_prealloc_tailroom(buffers[i], extra_len);
> -            memcpy(dp_packet_tail(buffers[i]), rx->aux_bufs[i], extra_len);
> -            dp_packet_set_size(buffers[i], dp_packet_size(buffers[i])
> -                               + extra_len);
> -        } else {
> -            dp_packet_set_size(buffers[i], dp_packet_size(buffers[i])
> -                               + mmsgs[i].msg_len);
> -        }
> +            /* Build a single linear TSO packet by prepending the data from
> +             * std_len buffer to the aux_buf. */
> +            pkt = rx->aux_bufs[i];
> +            dp_packet_set_size(pkt, mmsgs[i].msg_len - std_len);
> +            dp_packet_push(pkt, dp_packet_data(buffers[i]), std_len);
> +            /* The headroom should be the same in buffers[i], pkt and
> +             * DP_NETDEV_HEADROOM. */
> +            dp_packet_resize(pkt, DP_NETDEV_HEADROOM, 0);
> +            dp_packet_delete(buffers[i]);
> +            rx->aux_bufs[i] = NULL;
> +         } else {
> +            dp_packet_set_size(buffers[i], mmsgs[i].msg_len);
> +            pkt = buffers[i];
> +         }
>  
> -        if (virtio_net_hdr_size && netdev_linux_parse_vnet_hdr(buffers[i]))
> {
> +        if (virtio_net_hdr_size && netdev_linux_parse_vnet_hdr(pkt)) {
>              struct netdev *netdev_ = netdev_rxq_get_netdev(&rx->up);
>              struct netdev_linux *netdev = netdev_linux_cast(netdev_);
>  
>              /* Unexpected error situation: the virtio header is not present
>               * or corrupted. Drop the packet but continue in case next ones
>               * are correct. */
> -            dp_packet_delete(buffers[i]);
> +            dp_packet_delete(pkt);
>              netdev->rx_dropped += 1;
>              VLOG_WARN_RL(&rl, "%s: Dropped packet: Invalid virtio net
> header",
>                           netdev_get_name(netdev_)); @@ -1325,16 +1323,16 @@
> netdev_linux_batch_rxq_recv_sock(struct netdev_rxq_linux *rx, int mtu,
>                  struct eth_header *eth;
>                  bool double_tagged;
>  
> -                eth = dp_packet_data(buffers[i]);
> +                eth = dp_packet_data(pkt);
>                  double_tagged = eth->eth_type ==
> htons(ETH_TYPE_VLAN_8021Q);
>  
> -                eth_push_vlan(buffers[i],
> +                eth_push_vlan(pkt,
>                                auxdata_to_vlan_tpid(aux, double_tagged),
>                                htons(aux->tp_vlan_tci));
>                  break;
>              }
>          }
> -        dp_packet_batch_add(batch, buffers[i]);
> +        dp_packet_batch_add(batch, pkt);
>      }
>  
>      /* Delete unused buffers. */
> @@ -1354,7 +1352,6 @@ static int
>  netdev_linux_batch_rxq_recv_tap(struct netdev_rxq_linux *rx, int mtu,
>                                  struct dp_packet_batch *batch)  {
> -    struct dp_packet *buffer;
>      int virtio_net_hdr_size;
>      ssize_t retval;
>      size_t std_len;
> @@ -1372,16 +1369,22 @@ netdev_linux_batch_rxq_recv_tap(struct
> netdev_rxq_linux *rx, int mtu,
>          virtio_net_hdr_size = 0;
>      }
>  
> -    std_len = VLAN_ETH_HEADER_LEN + mtu + virtio_net_hdr_size;
> +    /* The length here needs to be accounted in the same way when the
> +     * aux_buf is allocated so that it can be prepended to TSO buffer. */
> +    std_len = virtio_net_hdr_size + VLAN_ETH_HEADER_LEN + mtu;
>      for (i = 0; i < NETDEV_MAX_BURST; i++) {
> +        struct dp_packet *buffer;
> +        struct dp_packet *pkt;
>          struct iovec iov[IOV_TSO_SIZE];
>  
>          /* Assume Ethernet port. No need to set packet_type. */
>          buffer = dp_packet_new_with_headroom(std_len, DP_NETDEV_HEADROOM);
>          iov[IOV_PACKET].iov_base = dp_packet_data(buffer);
>          iov[IOV_PACKET].iov_len = std_len;
> -        iov[IOV_AUXBUF].iov_base = rx->aux_bufs[i];
> -        iov[IOV_AUXBUF].iov_len = LINUX_RXQ_TSO_MAX_LEN;
> +        if (iovlen == IOV_TSO_SIZE) {
> +            iov[IOV_AUXBUF].iov_base = dp_packet_data(rx->aux_bufs[i]);
> +            iov[IOV_AUXBUF].iov_len = dp_packet_tailroom(rx->aux_bufs[i]);
> +        }
>  
>          do {
>              retval = readv(rx->fd, iov, iovlen); @@ -1393,33 +1396,36 @@
> netdev_linux_batch_rxq_recv_tap(struct netdev_rxq_linux *rx, int mtu,
>          }
>  
>          if (retval > std_len) {
> -            /* Build a single linear TSO packet by expanding the current
> packet
> -             * to append the data received in the aux_buf. */
> -            size_t extra_len = retval - std_len;
> -
> -            dp_packet_set_size(buffer, dp_packet_size(buffer) + std_len);
> -            dp_packet_prealloc_tailroom(buffer, extra_len);
> -            memcpy(dp_packet_tail(buffer), rx->aux_bufs[i], extra_len);
> -            dp_packet_set_size(buffer, dp_packet_size(buffer) + extra_len);
> +            /* Build a single linear TSO packet by prepending the data from
> +             * std_len buffer to the aux_buf. */
> +            pkt = rx->aux_bufs[i];
> +            dp_packet_set_size(pkt, retval - std_len);
> +            dp_packet_push(pkt, dp_packet_data(buffer), std_len);
> +            /* The headroom should be the same in buffers[i], pkt and
> +             * DP_NETDEV_HEADROOM. */
> +            dp_packet_resize(pkt, DP_NETDEV_HEADROOM, 0);
> +            dp_packet_delete(buffer);
> +            rx->aux_bufs[i] = NULL;
>          } else {
>              dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
> +            pkt = buffer;
>          }
>  
> -        if (virtio_net_hdr_size && netdev_linux_parse_vnet_hdr(buffer)) {
> +        if (virtio_net_hdr_size && netdev_linux_parse_vnet_hdr(pkt)) {
>              struct netdev *netdev_ = netdev_rxq_get_netdev(&rx->up);
>              struct netdev_linux *netdev = netdev_linux_cast(netdev_);
>  
>              /* Unexpected error situation: the virtio header is not present
>               * or corrupted. Drop the packet but continue in case next ones
>               * are correct. */
> -            dp_packet_delete(buffer);
> +            dp_packet_delete(pkt);
>              netdev->rx_dropped += 1;
>              VLOG_WARN_RL(&rl, "%s: Dropped packet: Invalid virtio net
> header",
>                           netdev_get_name(netdev_));
>              continue;
>          }
>  
> -        dp_packet_batch_add(batch, buffer);
> +        dp_packet_batch_add(batch, pkt);
>      }
>  
>      if ((i == 0) && (retval < 0)) {
> @@ -1442,6 +1448,23 @@ netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct
> dp_packet_batch *batch,
>          mtu = ETH_PAYLOAD_MAX;
>      }
>  
> +    if (userspace_tso_enabled()) {
> +        /* Allocate TSO packets. The packet has enough headroom to store
> +         * a full non-TSO packet. When a TSO packet is received, the data
> +         * from non-TSO buffer (std_len) is prepended to the TSO packet
> +         * (aux_buf). */
> +        size_t std_len = sizeof(struct virtio_net_hdr) +
> VLAN_ETH_HEADER_LEN
> +                         + DP_NETDEV_HEADROOM + mtu;
> +        size_t data_len = LINUX_RXQ_TSO_MAX_LEN - std_len;
> +        for (int i = 0; i < NETDEV_MAX_BURST; i++) {
> +            if (rx->aux_bufs[i]) {
> +                continue;
> +            }
> +
> +            rx->aux_bufs[i] = dp_packet_new_with_headroom(data_len,
> std_len);
> +        }
> +    }
> +
>      dp_packet_batch_init(batch);
>      retval = (rx->is_tap
>                ? netdev_linux_batch_rxq_recv_tap(rx, mtu, batch)
> --
> 2.24.1
> 



-- 
fbl


More information about the dev mailing list