[ovs-dev] [PATCHv2] netdev-afxdp: Enable loading XDP program.

Eelco Chaudron echaudro at redhat.com
Tue Oct 29 15:28:50 UTC 2019


See comments inline…

On 5 Oct 2019, at 2:12, William Tu wrote:

> Now netdev-afxdp always forwards all packets to userspace because
> it is using libbpf's default XDP program, see 'xsk_load_xdp_prog'.
> There are some cases when users want to keep packets in kernel instead
> of sending to userspace, for example, management traffic such as SSH
> should be processed in kernel.
>
> The patch enables loading the user-provide XDP program by doing
>   $ovs-vsctl -- set int afxdp-p0 options:xdpobj=<path/to/xdp/obj>

Should we maybe change the name to xdp-obj?

> So users can implement their filtering logic or traffic steering idea
> in their XDP program, and rest of the traffic passes to AF_XDP socket
> handled by OVS.
>
> Signed-off-by: William Tu <u9012063 at gmail.com>
> ---
> v2:
>     A couple fixes and remove RFC
> ---
>  Documentation/intro/install/afxdp.rst |  46 ++++++++++++
>  lib/netdev-afxdp.c                    | 137 
> ++++++++++++++++++++++++++++++----
>  lib/netdev-linux-private.h            |   2 +
>  3 files changed, 169 insertions(+), 16 deletions(-)
>
> diff --git a/Documentation/intro/install/afxdp.rst 
> b/Documentation/intro/install/afxdp.rst
> index 820e9d993d8f..f484bdb8249b 100644
> --- a/Documentation/intro/install/afxdp.rst
> +++ b/Documentation/intro/install/afxdp.rst
> @@ -265,6 +265,52 @@ Or, use OVS pmd tool::
>    ovs-appctl dpif-netdev/pmd-stats-show
>
>
> +Loading Custom XDP Program
> +--------------------------
> +By defailt, netdev-afxdp always forwards all packets to userspace 
> because
> +it is using libbpf's default XDP program. There are some cases when 
> users
> +want to keep packets in kernel instead of sending to userspace, for 
> example,
> +management traffic such as SSH should be processed in kernel. This 
> can be
> +done by loading the user-provided XDP program::
> +
> +  ovs-vsctl -- set int afxdp-p0 options:xdpobj=<path/to/xdp/obj>
> +
> +So users can implement their filtering logic or traffic steering idea
> +in their XDP program, and rest of the traffic passes to AF_XDP socket
> +handled by OVS. Below is a sample C program compiled under kernel's
> +samples/bpf/.
> +
> +.. code-block:: c
> +
> +  #include <uapi/linux/bpf.h>
> +  #include "bpf_helpers.h"
> +
> +  /* OVS will associate map 'xsks_map' to xsk socket. */
> +  struct bpf_map_def SEC("maps") xsks_map = {
> +      .type = BPF_MAP_TYPE_XSKMAP,
> +      .key_size = sizeof(int),
> +      .value_size = sizeof(int),
> +      .max_entries = 32,
> +  };
> +

Maybe something like this could be added to make sure it compiles with 
older kernels:

#if LINUX_VERSION_CODE < KERNEL_VERSION(5,3,0)

/* Kernel version before 5.3 needed an additional map */
struct bpf_map_def SEC("maps") qidconf_map = {
	.type = BPF_MAP_TYPE_ARRAY,
	.key_size = sizeof(int),
	.value_size = sizeof(int),
	.max_entries = 64,
};
#endi

See also here:
https://github.com/chaudron/xdp-tutorial/blob/master/advanced03-AF_XDP/af_xdp_kern.c

> +  SEC("xdp_sock")
> +  int xdp_sock_prog(struct xdp_md *ctx)
> +  {
> +      int index = ctx->rx_queue_index;
> +
> +      /* Customized by user.
> +       * For example
> +       * 1) filter out all SSH traffic and return XDP_PASS
> +       *    for kernel to process.
> +       * 2) Drop unwanted packet by returning XDP_DROP.
> +       */
> +
> +      /* Rest of packets goes to AF_XDP. */
> +      return bpf_redirect_map(&xsks_map, index, 0);
> +  }
> +  char _license[] SEC("license") = "GPL";
> +
> +
>  Example Script
>  --------------
>
> diff --git a/lib/netdev-afxdp.c b/lib/netdev-afxdp.c
> index 6e01803272aa..231d6473fb67 100644
> --- a/lib/netdev-afxdp.c
> +++ b/lib/netdev-afxdp.c
> @@ -21,6 +21,7 @@
>  #include "netdev-afxdp.h"
>  #include "netdev-afxdp-pool.h"
>
> +#include <bpf/bpf.h>
>  #include <errno.h>
>  #include <inttypes.h>
>  #include <linux/rtnetlink.h>
> @@ -29,6 +30,7 @@
>  #include <stdlib.h>
>  #include <sys/resource.h>
>  #include <sys/socket.h>
> +#include <sys/stat.h>
>  #include <sys/types.h>
>  #include <unistd.h>
>
> @@ -82,7 +84,7 @@ BUILD_ASSERT_DECL(PROD_NUM_DESCS == CONS_NUM_DESCS);
>  #define UMEM2DESC(elem, base) ((uint64_t)((char *)elem - (char 
> *)base))
>
>  static struct xsk_socket_info *xsk_configure(int ifindex, int 
> xdp_queue_id,
> -                                             int mode);
> +                                             int mode, bool 
> use_default_xdp);
>  static void xsk_remove_xdp_program(uint32_t ifindex, int xdpmode);
>  static void xsk_destroy(struct xsk_socket_info *xsk);
>  static int xsk_configure_all(struct netdev *netdev);
> @@ -158,6 +160,51 @@ netdev_afxdp_sweep_unused_pools(void *aux 
> OVS_UNUSED)
>      ovs_mutex_unlock(&unused_pools_mutex);
>  }
>
> +static int
> +xsk_load_prog(const char *path, struct bpf_object **obj,
> +              int *prog_fd)
> +{
> +    struct bpf_prog_load_attr attr = {
> +        .prog_type = BPF_PROG_TYPE_XDP,
> +        .file = path,
> +    };
> +
> +    if (bpf_prog_load_xattr(&attr, obj, prog_fd)) {
> +        VLOG_WARN("Can't load XDP program at '%s'", path);

I think it should be a VLOG_ERR()

> +        return EINVAL;
> +    }
> +
> +    VLOG_INFO("Loaded XDP program at '%s'", path);
> +    return 0;
> +}
> +
> +static int
> +xsk_configure_map(struct bpf_object *obj, struct xsk_socket_info 
> *xsk,
> +                  int queue_id)
> +{
> +    struct bpf_map *map;
> +    int xsks_map_fd;
> +    int xsk_fd;
> +    int ret;
> +
> +    map = bpf_object__find_map_by_name(obj, "xsks_map");
> +    if (!map) {
> +        VLOG_WARN("XDP map 'xsks_map' not found");
> +        return EINVAL;
> +    }
> +
> +    xsks_map_fd = bpf_map__fd(map);
> +    xsk_fd = xsk_socket__fd(xsk->xsk);
> +
> +    ret = bpf_map_update_elem(xsks_map_fd, &queue_id, &xsk_fd, 0);

Don’t think this is needed, as 
xsk_socket__create()->xsk_setup_xdp_prog()->xsk_lookup_bpf_maps() will 
take care of it.

> +    if (ret) {
> +        VLOG_WARN("Can't set XSK map, map fd: %d", xsks_map_fd);
> +        return EINVAL;
> +    }
> +
> +    return 0;
> +}
> +
>  static struct xsk_umem_info *
>  xsk_configure_umem(void *buffer, uint64_t size, int xdpmode)
>  {
> @@ -234,7 +281,7 @@ xsk_configure_umem(void *buffer, uint64_t size, 
> int xdpmode)
>
>  static struct xsk_socket_info *
>  xsk_configure_socket(struct xsk_umem_info *umem, uint32_t ifindex,
> -                     uint32_t queue_id, int xdpmode)
> +                     uint32_t queue_id, int xdpmode, bool 
> use_default_xdp)
>  {
>      struct xsk_socket_config cfg;
>      struct xsk_socket_info *xsk;
> @@ -257,6 +304,10 @@ xsk_configure_socket(struct xsk_umem_info *umem, 
> uint32_t ifindex,
>          cfg.xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST | 
> XDP_FLAGS_SKB_MODE;
>      }
>
> +    if (!use_default_xdp) {
> +        cfg.libbpf_flags |= XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD;
> +    }
> +
>      if (if_indextoname(ifindex, devname) == NULL) {
>          VLOG_ERR("ifindex %d to devname failed (%s)",
>                   ifindex, ovs_strerror(errno));
> @@ -275,17 +326,19 @@ xsk_configure_socket(struct xsk_umem_info *umem, 
> uint32_t ifindex,
>          return NULL;
>      }
>
> -    /* Make sure the built-in AF_XDP program is loaded. */
> -    ret = bpf_get_link_xdp_id(ifindex, &prog_id, cfg.xdp_flags);
> -    if (ret || !prog_id) {
> -        if (ret) {
> -            VLOG_ERR("Get XDP prog ID failed (%s)", 
> ovs_strerror(errno));
> -        } else {
> -            VLOG_ERR("No XDP program is loaded at ifindex %d", 
> ifindex);
> +    if (use_default_xdp) {
> +        /* Make sure the built-in AF_XDP program is loaded. */
> +        ret = bpf_get_link_xdp_id(ifindex, &prog_id, cfg.xdp_flags);
> +        if (ret || !prog_id) {
> +            if (ret) {
> +                VLOG_ERR("Get XDP prog ID failed (%s)", 
> ovs_strerror(errno));
> +            } else {
> +                VLOG_ERR("No XDP program is loaded at ifindex %d", 
> ifindex);
> +            }
> +            xsk_socket__delete(xsk->xsk);
> +            free(xsk);
> +            return NULL;
>          }

I think a WARNING message should be logged when an external program is 
loaded so operators know when debugging something might be out of the 
ordinary. I would suggest to include the interface name and program 
path/name loaded.

> -        xsk_socket__delete(xsk->xsk);
> -        free(xsk);
> -        return NULL;
>      }
>
>      while (!xsk_ring_prod__reserve(&xsk->umem->fq,
> @@ -311,7 +364,8 @@ xsk_configure_socket(struct xsk_umem_info *umem, 
> uint32_t ifindex,
>  }
>
>  static struct xsk_socket_info *
> -xsk_configure(int ifindex, int xdp_queue_id, int xdpmode)
> +xsk_configure(int ifindex, int xdp_queue_id, int xdpmode,
> +              bool use_default_xdp)
>  {
>      struct xsk_socket_info *xsk;
>      struct xsk_umem_info *umem;
> @@ -334,7 +388,8 @@ xsk_configure(int ifindex, int xdp_queue_id, int 
> xdpmode)
>
>      VLOG_DBG("Allocated umem pool at 0x%"PRIxPTR, (uintptr_t) umem);
>
> -    xsk = xsk_configure_socket(umem, ifindex, xdp_queue_id, xdpmode);
> +    xsk = xsk_configure_socket(umem, ifindex, xdp_queue_id, xdpmode,
> +                               use_default_xdp);
>      if (!xsk) {
>          /* Clean up umem and xpacket pool. */
>          if (xsk_umem__delete(umem->umem)) {
> @@ -354,6 +409,10 @@ xsk_configure_all(struct netdev *netdev)
>      struct netdev_linux *dev = netdev_linux_cast(netdev);
>      struct xsk_socket_info *xsk_info;
>      int i, ifindex, n_rxq, n_txq;
> +    bool use_default_xdp;
> +    struct bpf_object *obj;
> +    int prog_fd = 0;
> +    int ret;
>
>      ifindex = linux_get_ifindex(netdev_get_name(netdev));
>
> @@ -362,12 +421,13 @@ xsk_configure_all(struct netdev *netdev)
>
>      n_rxq = netdev_n_rxq(netdev);
>      dev->xsks = xcalloc(n_rxq, sizeof *dev->xsks);
> +    use_default_xdp = dev->xdpobj ? false : true;
>
>      /* Configure each queue. */
>      for (i = 0; i < n_rxq; i++) {
>          VLOG_INFO("%s: configure queue %d mode %s", __func__, i,
>                    dev->xdpmode == XDP_COPY ? "SKB" : "DRV");
> -        xsk_info = xsk_configure(ifindex, i, dev->xdpmode);
> +        xsk_info = xsk_configure(ifindex, i, dev->xdpmode, 
> use_default_xdp);
>          if (!xsk_info) {
>              VLOG_ERR("Failed to create AF_XDP socket on queue %d.", 
> i);
>              dev->xsks[i] = NULL;
> @@ -377,6 +437,27 @@ xsk_configure_all(struct netdev *netdev)
>          atomic_init(&xsk_info->tx_dropped, 0);
>          xsk_info->outstanding_tx = 0;
>          xsk_info->available_rx = PROD_NUM_DESCS;
> +
> +        if (dev->xdpobj) {
> +            if (prog_fd == 0) {
> +                /* XDP program is per-netdev, so all queues share
> +                   the same XDP program.
> +                 */
Format comment to be inline with other multiline comments

> +                ret = xsk_load_prog(dev->xdpobj, &obj, &prog_fd);
> +                if (ret) {
> +                    VLOG_ERR("%s: Can't load XDP object file",
> +                             netdev_get_name(netdev));

Maybe combine this failure with the one in xsk_load_prog() and report 
both the device and filename

> +                    goto err;
> +                }
> +            }
> +            ret = xsk_configure_map(obj, xsk_info, i);
> +            if (ret) {
> +                VLOG_ERR("%s: Can't configure XDP map",
> +                         netdev_get_name(netdev));
> +                goto err;
> +            }
> +            bpf_set_link_xdp_fd(ifindex, prog_fd, dev->xdpmode);
> +        }
>      }
>
>      n_txq = netdev_n_txq(netdev);
> @@ -450,6 +531,8 @@ xsk_destroy_all(struct netdev *netdev)
>          free(dev->tx_locks);
>          dev->tx_locks = NULL;
>      }
> +    free(CONST_CAST(char *, dev->xdpobj));
> +    dev->xdpobj = NULL;
>  }

Do we need to adjust xsk_remove_xdp_program() to free the loaded 
program/maps if it gets reconfigured with another one?

>  int
> @@ -458,7 +541,9 @@ netdev_afxdp_set_config(struct netdev *netdev, 
> const struct smap *args,
>  {
>      struct netdev_linux *dev = netdev_linux_cast(netdev);
>      const char *str_xdpmode;
> +    const char *str_xdpobj;
>      int xdpmode, new_n_rxq;
> +    struct stat s;
>
>      ovs_mutex_lock(&dev->mutex);
>      new_n_rxq = MAX(smap_get_int(args, "n_rxq", NR_QUEUE), 1);
> @@ -481,10 +566,25 @@ netdev_afxdp_set_config(struct netdev *netdev, 
> const struct smap *args,
>          return EINVAL;
>      }
>
> +    str_xdpobj = smap_get_def(args, "xdpobj", NULL);
> +    if (str_xdpobj && stat(str_xdpobj, &s)) {
> +        VLOG_ERR("Invalid xdpobj '%s': %s", str_xdpobj,
> +                 ovs_strerror(errno));
> +        ovs_mutex_unlock(&dev->mutex);
Maybe move unlock above log?

> +        return EINVAL;
> +    } else if (str_xdpobj && !S_ISREG(s.st_mode)) {
> +        VLOG_ERR("xdpobj '%s' is not a regular file", str_xdpobj);
> +        ovs_mutex_unlock(&dev->mutex);
Maybe move unlock above log?

> +        return EINVAL;
> +    }
> +
>      if (dev->requested_n_rxq != new_n_rxq
> -        || dev->requested_xdpmode != xdpmode) {
> +        || dev->requested_xdpmode != xdpmode
> +        || !nullable_string_is_equal(dev->requested_xdpobj, 
> str_xdpobj)
> +        || str_xdpobj) {
>          dev->requested_n_rxq = new_n_rxq;
>          dev->requested_xdpmode = xdpmode;
> +        dev->requested_xdpobj = nullable_xstrdup(str_xdpobj);
>          netdev_request_reconfigure(netdev);
>      }
>      ovs_mutex_unlock(&dev->mutex);
> @@ -500,6 +600,7 @@ netdev_afxdp_get_config(const struct netdev 
> *netdev, struct smap *args)
>      smap_add_format(args, "n_rxq", "%d", netdev->n_rxq);
>      smap_add_format(args, "xdpmode", "%s",
>          dev->xdpmode == XDP_ZEROCOPY ? "drv" : "skb");
> +    smap_add_format(args, "xdpobj", "%s", dev->xdpobj);
>      ovs_mutex_unlock(&dev->mutex);
>      return 0;
>  }
> @@ -539,6 +640,8 @@ netdev_afxdp_reconfigure(struct netdev *netdev)
>           */
>      }
>
> +    dev->xdpobj = dev->requested_xdpobj;

Don’t we need to free the current dev->xdpobj before we override it?

>      err = xsk_configure_all(netdev);
>      if (err) {
>          VLOG_ERR("AF_XDP device %s reconfig failed.", 
> netdev_get_name(netdev));
> @@ -971,9 +1074,11 @@ netdev_afxdp_construct(struct netdev *netdev)
>      netdev->n_rxq = 0;
>      netdev->n_txq = 0;
>      dev->xdpmode = 0;
> +    dev->xdpobj = NULL;
>
>      dev->requested_n_rxq = NR_QUEUE;
>      dev->requested_xdpmode = XDP_COPY;
> +    dev->requested_xdpobj = NULL;
>
>      dev->xsks = NULL;
>      dev->tx_locks = NULL;
> diff --git a/lib/netdev-linux-private.h b/lib/netdev-linux-private.h
> index a350be151147..7ded0e4b4228 100644
> --- a/lib/netdev-linux-private.h
> +++ b/lib/netdev-linux-private.h
> @@ -103,6 +103,8 @@ struct netdev_linux {
>      int xdpmode;                /* AF_XDP running mode: driver or 
> skb. */
>      int requested_xdpmode;
>      struct ovs_spin *tx_locks;  /* spin lock array for TX queues. */
> +    const char *xdpobj;         /* XDP object file path. */
> +    const char *requested_xdpobj;
>  #endif
>  };
>
> -- 
> 2.7.4



More information about the dev mailing list