[ovs-dev] [PATCHv2] netdev-afxdp: Add need_wakeup supprt.

Eelco Chaudron echaudro at redhat.com
Tue Sep 10 14:24:48 UTC 2019


Hi William,

One comment below, but I’m also wondering if we should warn/log a 
message if OVS is compiled without this feature and we explicitly 
configure it? Same for the reverse what if the kernel does not support 
this option, will it fail?


//Eelco


On 5 Sep 2019, at 22:51, William Tu wrote:

> The patch adds support for using need_wakeup flag in AF_XDP rings.
> A new option, use_need_wakeup, is added. When this option is used,
> it means that OVS has to explicitly wake up the kernel RX, using 
> poll()
> syscall and wake up TX, using sendto() syscall. This feature improves
> the performance by avoiding unnecessary sendto syscalls for TX.
> For RX, instead of kernel always busy-spinning on fille queue, OVS 
> wakes
> up the kernel RX processing when fill queue is replenished.
>
> The need_wakeup feature is merged into Linux kernel 5.3.0-rc1 and OVS
> enables it by default. Running the feature before this version causes
> xsk bind fails, please use options:use_need_wakeup=false to disable 
> it.
>
> For virtual interface, it's better set use_need_wakeup=false, since
> the virtual device's AF_XDP xmit is synchronous: the sendto syscall
> enters kernel and process the TX packet on tx queue directly.
>
> I tested on kernel 5.3.0-rc3 using its libbpf.  On Intel Xeon E5-2620
> v3 2.4GHz system, performance of physical port to physical port 
> improves
> from 6.1Mpps to 7.3Mpps. Testing on 5.2.0-rc6 using libbpf from 
> 5.3.0-rc3
> does not work due to libbpf API change. Users have to use the older
> libbpf for older kernel.
>
> Suggested-by: Ilya Maximets <i.maximets at samsung.com>
> Signed-off-by: William Tu <u9012063 at gmail.com>
> ---
> v2:
> - address feedbacks from Ilya and Eelco
> - add options:use_need_wakeup, default to true
> - remove poll timeout=1sec, make poll() return immediately
> - naming change: rename to xsk_rx_wakeup_if_needing
> - fix indents and return value for errno
> ---
>  Documentation/intro/install/afxdp.rst | 14 ++++--
>  acinclude.m4                          |  5 ++
>  lib/netdev-afxdp.c                    | 94 
> +++++++++++++++++++++++++++++++----
>  lib/netdev-linux-private.h            |  2 +
>  vswitchd/vswitch.xml                  | 13 +++++
>  5 files changed, 114 insertions(+), 14 deletions(-)
>
> diff --git a/Documentation/intro/install/afxdp.rst 
> b/Documentation/intro/install/afxdp.rst
> index 820e9d993d8f..b7a3d0b7f57b 100644
> --- a/Documentation/intro/install/afxdp.rst
> +++ b/Documentation/intro/install/afxdp.rst
> @@ -176,9 +176,17 @@ in :doc:`general`::
>    ovs-vswitchd ...
>    ovs-vsctl -- add-br br0 -- set Bridge br0 datapath_type=netdev
>
> -Make sure your device driver support AF_XDP, and to use 1 PMD (on 
> core 4)
> -on 1 queue (queue 0) device, configure these options: **pmd-cpu-mask,
> -pmd-rxq-affinity, and n_rxq**. The **xdpmode** can be "drv" or 
> "skb"::
> +Make sure your device driver support AF_XDP, netdev-afxdp supports
> +the following additional options (see man ovs-vswitchd.conf.db for
> +more details):
> +
> + * **xdpmode**: use "drv" for driver mode, or "skb" for skb mode.
> +
> + * **use_need_wakeup**: enable by setting to "true", otherwise 
> "false"

Bit confusing here, as the default is true?

> +For example, to use 1 PMD (on core 4) on 1 queue (queue 0) device,
> +configure these options: **pmd-cpu-mask, pmd-rxq-affinity, and 
> n_rxq**.
> +The **xdpmode** can be "drv" or "skb"::
>
>    ethtool -L enp2s0 combined 1
>    ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=0x10
> diff --git a/acinclude.m4 b/acinclude.m4
> index f0e38898b17a..3f24475c96b5 100644
> --- a/acinclude.m4
> +++ b/acinclude.m4
> @@ -276,6 +276,11 @@ AC_DEFUN([OVS_CHECK_LINUX_AF_XDP], [
>                [Define to 1 if AF_XDP support is available and 
> enabled.])
>      LIBBPF_LDADD=" -lbpf -lelf"
>      AC_SUBST([LIBBPF_LDADD])
> +
> +    AC_CHECK_DECL([xsk_ring_prod__needs_wakeup], [
> +      AC_DEFINE([HAVE_XDP_NEED_WAKEUP], [1],
> +        [XDP need wakeup support detected in xsk.h.])
> +    ], [], [#include <bpf/xsk.h>])
>    fi
>    AM_CONDITIONAL([HAVE_AF_XDP], test "$AF_XDP_ENABLE" = true)
>  ])
> diff --git a/lib/netdev-afxdp.c b/lib/netdev-afxdp.c
> index e5b058d08a09..5169ed3ff801 100644
> --- a/lib/netdev-afxdp.c
> +++ b/lib/netdev-afxdp.c
> @@ -26,6 +26,7 @@
>  #include <linux/rtnetlink.h>
>  #include <linux/if_xdp.h>
>  #include <net/if.h>
> +#include <poll.h>
>  #include <stdlib.h>
>  #include <sys/resource.h>
>  #include <sys/socket.h>
> @@ -82,7 +83,7 @@ BUILD_ASSERT_DECL(PROD_NUM_DESCS == CONS_NUM_DESCS);
>  #define UMEM2DESC(elem, base) ((uint64_t)((char *)elem - (char 
> *)base))
>
>  static struct xsk_socket_info *xsk_configure(int ifindex, int 
> xdp_queue_id,
> -                                             int mode);
> +                                             int mode, bool 
> use_need_wakeup);
>  static void xsk_remove_xdp_program(uint32_t ifindex, int xdpmode);
>  static void xsk_destroy(struct xsk_socket_info *xsk);
>  static int xsk_configure_all(struct netdev *netdev);
> @@ -117,6 +118,49 @@ struct xsk_socket_info {
>      atomic_uint64_t tx_dropped;
>  };
>
> +#ifdef HAVE_XDP_NEED_WAKEUP
> +static inline void
> +xsk_rx_wakeup_if_needed(struct xsk_umem_info *umem,
> +                        struct netdev *netdev, int fd)
> +{
> +    struct pollfd pfd;
> +    int ret;
> +
> +    if (xsk_ring_prod__needs_wakeup(&umem->fq)) {
> +        pfd.fd = fd;
> +        pfd.events = POLLIN;
> +
> +        ret = poll(&pfd, 1, 0);
> +        if (OVS_UNLIKELY(ret < 0)) {
> +            VLOG_WARN_RL(&rl, "%s: error polling rx fd: %s.",
> +                         netdev_get_name(netdev),
> +                         ovs_strerror(errno));
> +        }
> +    }
> +}
> +
> +static inline bool
> +xsk_tx_need_wakeup(struct xsk_socket_info *xsk_info)
> +{
> +    return xsk_ring_prod__needs_wakeup(&xsk_info->tx);
> +}
> +
> +#else /* !HAVE_XDP_NEED_WAKEUP */
> +static inline void
> +xsk_rx_wakeup_if_needed(struct xsk_umem_info *umem OVS_UNUSED,
> +                        struct netdev *netdev OVS_UNUSED,
> +                        int fd OVS_UNUSED)
> +{
> +    /* Nothing. */
> +}
> +
> +static inline bool
> +xsk_tx_need_wakeup(struct xsk_socket_info *xsk_info OVS_UNUSED)
> +{
> +    return true;
> +}
> +#endif /* HAVE_XDP_NEED_WAKEUP */
> +
>  static void
>  netdev_afxdp_cleanup_unused_pool(struct unused_pool *pool)
>  {
> @@ -234,7 +278,7 @@ xsk_configure_umem(void *buffer, uint64_t size, 
> int xdpmode)
>
>  static struct xsk_socket_info *
>  xsk_configure_socket(struct xsk_umem_info *umem, uint32_t ifindex,
> -                     uint32_t queue_id, int xdpmode)
> +                     uint32_t queue_id, int xdpmode, bool 
> use_need_wakeup)
>  {
>      struct xsk_socket_config cfg;
>      struct xsk_socket_info *xsk;
> @@ -257,6 +301,12 @@ xsk_configure_socket(struct xsk_umem_info *umem, 
> uint32_t ifindex,
>          cfg.xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST | 
> XDP_FLAGS_SKB_MODE;
>      }
>
> +    if (use_need_wakeup) {
> +#ifdef HAVE_XDP_NEED_WAKEUP
> +        cfg.bind_flags |= XDP_USE_NEED_WAKEUP;
> +#endif
> +    }
> +
>      if (if_indextoname(ifindex, devname) == NULL) {
>          VLOG_ERR("ifindex %d to devname failed (%s)",
>                   ifindex, ovs_strerror(errno));
> @@ -311,7 +361,8 @@ xsk_configure_socket(struct xsk_umem_info *umem, 
> uint32_t ifindex,
>  }
>
>  static struct xsk_socket_info *
> -xsk_configure(int ifindex, int xdp_queue_id, int xdpmode)
> +xsk_configure(int ifindex, int xdp_queue_id, int xdpmode,
> +              bool use_need_wakeup)
>  {
>      struct xsk_socket_info *xsk;
>      struct xsk_umem_info *umem;
> @@ -334,7 +385,8 @@ xsk_configure(int ifindex, int xdp_queue_id, int 
> xdpmode)
>
>      VLOG_DBG("Allocated umem pool at 0x%"PRIxPTR, (uintptr_t) umem);
>
> -    xsk = xsk_configure_socket(umem, ifindex, xdp_queue_id, xdpmode);
> +    xsk = xsk_configure_socket(umem, ifindex, xdp_queue_id, xdpmode,
> +                               use_need_wakeup);
>      if (!xsk) {
>          /* Clean up umem and xpacket pool. */
>          if (xsk_umem__delete(umem->umem)) {
> @@ -365,9 +417,12 @@ xsk_configure_all(struct netdev *netdev)
>
>      /* Configure each queue. */
>      for (i = 0; i < n_rxq; i++) {
> -        VLOG_INFO("%s: configure queue %d mode %s", __func__, i,
> -                  dev->xdpmode == XDP_COPY ? "SKB" : "DRV");
> -        xsk_info = xsk_configure(ifindex, i, dev->xdpmode);
> +        VLOG_INFO("%s: configure queue %d mode %s use_need_wakeup 
> %s.",
> +                  __func__, i,
> +                  dev->xdpmode == XDP_COPY ? "SKB" : "DRV",
> +                  dev->use_need_wakeup ? "true" : "false");
> +        xsk_info = xsk_configure(ifindex, i, dev->xdpmode,
> +                                 dev->use_need_wakeup);
>          if (!xsk_info) {
>              VLOG_ERR("Failed to create AF_XDP socket on queue %d.", 
> i);
>              dev->xsks[i] = NULL;
> @@ -459,6 +514,7 @@ netdev_afxdp_set_config(struct netdev *netdev, 
> const struct smap *args,
>      struct netdev_linux *dev = netdev_linux_cast(netdev);
>      const char *str_xdpmode;
>      int xdpmode, new_n_rxq;
> +    bool need_wakeup;
>
>      ovs_mutex_lock(&dev->mutex);
>      new_n_rxq = MAX(smap_get_int(args, "n_rxq", NR_QUEUE), 1);
> @@ -481,10 +537,14 @@ netdev_afxdp_set_config(struct netdev *netdev, 
> const struct smap *args,
>          return EINVAL;
>      }
>
> +    need_wakeup = smap_get_bool(args, "use_need_wakeup", true);
> +
>      if (dev->requested_n_rxq != new_n_rxq
> -        || dev->requested_xdpmode != xdpmode) {
> +        || dev->requested_xdpmode != xdpmode
> +        || dev->requested_need_wakeup != need_wakeup) {
>          dev->requested_n_rxq = new_n_rxq;
>          dev->requested_xdpmode = xdpmode;
> +        dev->requested_need_wakeup = need_wakeup;
>          netdev_request_reconfigure(netdev);
>      }
>      ovs_mutex_unlock(&dev->mutex);
> @@ -500,6 +560,8 @@ netdev_afxdp_get_config(const struct netdev 
> *netdev, struct smap *args)
>      smap_add_format(args, "n_rxq", "%d", netdev->n_rxq);
>      smap_add_format(args, "xdpmode", "%s",
>          dev->xdpmode == XDP_ZEROCOPY ? "drv" : "skb");
> +    smap_add_format(args, "use_need_wakeup", "%s",
> +        dev->use_need_wakeup ? "true" : "false");
>      ovs_mutex_unlock(&dev->mutex);
>      return 0;
>  }
> @@ -515,6 +577,7 @@ netdev_afxdp_reconfigure(struct netdev *netdev)
>
>      if (netdev->n_rxq == dev->requested_n_rxq
>          && dev->xdpmode == dev->requested_xdpmode
> +        && dev->use_need_wakeup == dev->requested_need_wakeup
>          && dev->xsks) {
>          goto out;
>      }
> @@ -538,6 +601,7 @@ netdev_afxdp_reconfigure(struct netdev *netdev)
>           * when no device is in DRV mode.
>           */
>      }
> +    dev->use_need_wakeup = dev->requested_need_wakeup;
>
>      err = xsk_configure_all(netdev);
>      if (err) {
> @@ -660,6 +724,9 @@ netdev_afxdp_rxq_recv(struct netdev_rxq *rxq_, 
> struct dp_packet_batch *batch,
>
>      rcvd = xsk_ring_cons__peek(&xsk_info->rx, BATCH_SIZE, &idx_rx);
>      if (!rcvd) {
> +        if (dev->use_need_wakeup) {
> +            xsk_rx_wakeup_if_needed(umem, netdev, rx->fd);
> +        }
>          return EAGAIN;
>      }
>
> @@ -704,11 +771,15 @@ netdev_afxdp_rxq_recv(struct netdev_rxq *rxq_, 
> struct dp_packet_batch *batch,
>  }
>
>  static inline int
> -kick_tx(struct xsk_socket_info *xsk_info, int xdpmode)
> +kick_tx(struct xsk_socket_info *xsk_info, int xdpmode, bool 
> use_need_wakeup)
>  {
>      int ret, retries;
>      static const int KERNEL_TX_BATCH_SIZE = 16;
>
> +    if (use_need_wakeup && !xsk_tx_need_wakeup(xsk_info)) {
> +        return 0;
> +    }
> +
>      /* In SKB_MODE packet transmission is synchronous, and the kernel 
> xmits
>       * only TX_BATCH_SIZE(16) packets for a single sendmsg syscall.
>       * So, we have to kick the kernel (n_packets / 16) times to be 
> sure that
> @@ -876,7 +947,7 @@ __netdev_afxdp_batch_send(struct netdev *netdev, 
> int qid,
>          atomic_add_relaxed(&xsk_info->tx_dropped, batch->count, 
> &orig);
>          COVERAGE_INC(afxdp_tx_full);
>          afxdp_complete_tx(xsk_info);
> -        kick_tx(xsk_info, dev->xdpmode);
> +        kick_tx(xsk_info, dev->xdpmode, dev->use_need_wakeup);
>          error = ENOMEM;
>          goto out;
>      }
> @@ -900,7 +971,7 @@ __netdev_afxdp_batch_send(struct netdev *netdev, 
> int qid,
>      xsk_ring_prod__submit(&xsk_info->tx, batch->count);
>      xsk_info->outstanding_tx += batch->count;
>
> -    ret = kick_tx(xsk_info, dev->xdpmode);
> +    ret = kick_tx(xsk_info, dev->xdpmode, dev->use_need_wakeup);
>      if (OVS_UNLIKELY(ret)) {
>          VLOG_WARN_RL(&rl, "%s: error sending AF_XDP packet: %s.",
>                       netdev_get_name(netdev), ovs_strerror(ret));
> @@ -970,6 +1041,7 @@ netdev_afxdp_construct(struct netdev *netdev)
>
>      dev->requested_n_rxq = NR_QUEUE;
>      dev->requested_xdpmode = XDP_COPY;
> +    dev->requested_need_wakeup = true;
>
>      dev->xsks = NULL;
>      dev->tx_locks = NULL;
> diff --git a/lib/netdev-linux-private.h b/lib/netdev-linux-private.h
> index a350be151147..c14f2fb81bb0 100644
> --- a/lib/netdev-linux-private.h
> +++ b/lib/netdev-linux-private.h
> @@ -102,6 +102,8 @@ struct netdev_linux {
>      int requested_n_rxq;
>      int xdpmode;                /* AF_XDP running mode: driver or 
> skb. */
>      int requested_xdpmode;
> +    bool use_need_wakeup;
> +    bool requested_need_wakeup;
>      struct ovs_spin *tx_locks;  /* spin lock array for TX queues. */
>  #endif
>  };
> diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
> index 9a743c05b4bf..e8b428d323c7 100644
> --- a/vswitchd/vswitch.xml
> +++ b/vswitchd/vswitch.xml
> @@ -3147,6 +3147,19 @@ ovs-vsctl add-port br0 p0 -- set Interface p0 
> type=patch options:peer=p1 \
>          </p>
>        </column>
>
> +      <column name="options" key="use_need_wakeup"
> +              type='{"type": "boolean"}'>
> +        <p>
> +          Specifies whether to use need_wakeup feature in afxdp 
> netdev.
> +          (required kernel version later than 5.3.0-rc1)
> +          If enabled, OVS explicitly wakes up the kernel RX, using 
> poll()
> +          syscall and wakes up TX, using sendto() syscall. For 
> physical
> +          devices, this feature improves the performance by avoiding
> +          unnecessary sendto syscalls.
> +          Defaults is true.
> +        </p>
> +      </column>
> +
>        <column name="options" key="vhost-server-path"
>                type='{"type": "string"}'>
>          <p>
> -- 
> 2.7.4


More information about the dev mailing list