[ovs-dev] [PATCHv3] netdev-afxdp: Enable loading XDP program.
Eelco Chaudron
echaudro at redhat.com
Tue Nov 5 16:06:10 UTC 2019
Hi William,
See some comments inline.
//Eelco
On 1 Nov 2019, at 21:14, William Tu wrote:
> Now netdev-afxdp always forwards all packets to userspace because
> it is using libbpf's default XDP program, see 'xsk_load_xdp_prog'.
> There are some cases when users want to keep packets in kernel instead
> of sending to userspace, for example, management traffic such as SSH
> should be processed in kernel.
>
> The patch enables loading the user-provide XDP program by doing
> $ovs-vsctl -- set int afxdp-p0 options:xdp-obj=<path/to/xdp/obj>
>
> So users can implement their filtering logic or traffic steering idea
> in their XDP program, and rest of the traffic passes to AF_XDP socket
> handled by OVS.
>
> Signed-off-by: William Tu <u9012063 at gmail.com>
> ---
> v3:
> Feedbacks from Eelco.
> - keep using xdpobj not xdp-obj (because we alread use xdpmode)
> or we change both to xdp-obj and xdp-mode?
> - log a info message when using external program for better
> debugging
> - combine some failure messages
> - update doc
> NEW:
> - add options:xdpobj=__default__, to set back to libbpf default
> prog
> - Tested-at:
> https://travis-ci.org/williamtu/ovs-travis/builds/606153231
>
> v2:
> A couple fixes and remove RFC
> ---
> Documentation/intro/install/afxdp.rst | 49 +++++++++++
> lib/netdev-afxdp.c | 148
> +++++++++++++++++++++++++++++-----
> lib/netdev-linux-private.h | 2 +
> 3 files changed, 181 insertions(+), 18 deletions(-)
>
> diff --git a/Documentation/intro/install/afxdp.rst
> b/Documentation/intro/install/afxdp.rst
> index a136db0c950a..89bc97da9548 100644
> --- a/Documentation/intro/install/afxdp.rst
> +++ b/Documentation/intro/install/afxdp.rst
> @@ -273,6 +273,55 @@ Or, use OVS pmd tool::
> ovs-appctl dpif-netdev/pmd-stats-show
>
>
> +Loading Custom XDP Program
> +--------------------------
> +By defailt, netdev-afxdp always forwards all packets to userspace
> because
> +it is using libbpf's default XDP program. There are some cases when
> users
> +want to keep packets in kernel instead of sending to userspace, for
> example,
> +management traffic such as SSH should be processed in kernel. This
> can be
> +done by loading the user-provided XDP program::
> +
> + ovs-vsctl -- set int afxdp-p0 options:xdpobj=<path/to/xdp/obj>
> +
> +So users can implement their filtering logic or traffic steering idea
> +in their XDP program, and rest of the traffic passes to AF_XDP socket
> +handled by OVS. To set it back to default, use::
> +
> + ovs-vsctl -- set int afxdp-p0 options:xdpobj=__default__
> +
> +Below is a sample C program compiled under kernel's samples/bpf/.
> +
> +.. code-block:: c
> +
> + #include <uapi/linux/bpf.h>
> + #include "bpf_helpers.h"
> +
You mentioned you would add the below but did not, any reason, or just
forgot about it?
#if LINUX_VERSION_CODE < KERNEL_VERSION(5,3,0)
/* Kernel version before 5.3 needed an additional map */
struct bpf_map_def SEC("maps") qidconf_map = {
.type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(int),
.value_size = sizeof(int),
.max_entries = 64,
};
#endif
> + /* OVS will associate map 'xsks_map' to xsk socket. */
> + struct bpf_map_def SEC("maps") xsks_map = {
> + .type = BPF_MAP_TYPE_XSKMAP,
> + .key_size = sizeof(int),
> + .value_size = sizeof(int),
> + .max_entries = 32,
> + };
> +
> + SEC("xdp_sock")
> + int xdp_sock_prog(struct xdp_md *ctx)
> + {
> + int index = ctx->rx_queue_index;
> +
> + /* Customized by user.
> + * For example
> + * 1) filter out all SSH traffic and return XDP_PASS
> + * for kernel to process.
> + * 2) Drop unwanted packet by returning XDP_DROP.
> + */
> +
> + /* Rest of packets goes to AF_XDP. */
> + return bpf_redirect_map(&xsks_map, index, 0);
> + }
> + char _license[] SEC("license") = "GPL";
> +
> +
> Example Script
> --------------
>
> diff --git a/lib/netdev-afxdp.c b/lib/netdev-afxdp.c
> index af654d498a88..be74527946a4 100644
> --- a/lib/netdev-afxdp.c
> +++ b/lib/netdev-afxdp.c
> @@ -21,6 +21,7 @@
> #include "netdev-afxdp.h"
> #include "netdev-afxdp-pool.h"
>
> +#include <bpf/bpf.h>
> #include <errno.h>
> #include <inttypes.h>
> #include <linux/rtnetlink.h>
> @@ -30,6 +31,7 @@
> #include <stdlib.h>
> #include <sys/resource.h>
> #include <sys/socket.h>
> +#include <sys/stat.h>
> #include <sys/types.h>
> #include <unistd.h>
>
> @@ -88,8 +90,11 @@ BUILD_ASSERT_DECL(PROD_NUM_DESCS ==
> CONS_NUM_DESCS);
>
> #define UMEM2DESC(elem, base) ((uint64_t)((char *)elem - (char
> *)base))
>
> +#define LIBBPF_XDP_PROGRAM "__default__"
> +
> static struct xsk_socket_info *xsk_configure(int ifindex, int
> xdp_queue_id,
> - int mode, bool
> use_need_wakeup);
> + int mode, bool
> use_need_wakeup,
> + bool use_default_xdp);
> static void dest(uint32_t ifindex, int xdpmode);
> static void xsk_destroy(struct xsk_socket_info *xsk);
> static int xsk_configure_all(struct netdev *netdev);
> @@ -213,6 +218,50 @@ netdev_afxdp_sweep_unused_pools(void *aux
> OVS_UNUSED)
> ovs_mutex_unlock(&unused_pools_mutex);
> }
>
> +static int
> +xsk_load_prog(const char *path, struct bpf_object **obj,
> + int *prog_fd)
> +{
> + struct bpf_prog_load_attr attr = {
> + .prog_type = BPF_PROG_TYPE_XDP,
> + .file = path,
> + };
> +
> + if (bpf_prog_load_xattr(&attr, obj, prog_fd)) {
> + VLOG_ERR("Can't load XDP program at '%s'", path);
> + return EINVAL;
> + }
> +
> + return 0;
> +}
> +
> +static int
> +xsk_configure_map(struct bpf_object *obj, struct xsk_socket_info
> *xsk,
> + int queue_id)
> +{
> + struct bpf_map *map;
> + int xsks_map_fd;
> + int xsk_fd;
> + int ret;
> +
> + map = bpf_object__find_map_by_name(obj, "xsks_map");
> + if (!map) {
> + VLOG_WARN("XDP map 'xsks_map' not found");
> + return EINVAL;
> + }
> +
> + xsks_map_fd = bpf_map__fd(map);
> + xsk_fd = xsk_socket__fd(xsk->xsk);
> +
> + ret = bpf_map_update_elem(xsks_map_fd, &queue_id, &xsk_fd, 0);
> + if (ret) {
> + VLOG_WARN("Can't set XSK map, map fd: %d", xsks_map_fd);
I think this VLOG and the one above should be set to ERR, because when
this happens we end up in the re-configure loop, and no idea why (until
you change the log level).
> + return EINVAL;
> + }
> +
> + return 0;
> +}
> +
> static struct xsk_umem_info *
> xsk_configure_umem(void *buffer, uint64_t size, int xdpmode)
> {
> @@ -290,7 +339,8 @@ xsk_configure_umem(void *buffer, uint64_t size,
> int xdpmode)
>
> static struct xsk_socket_info *
> xsk_configure_socket(struct xsk_umem_info *umem, uint32_t ifindex,
> - uint32_t queue_id, int xdpmode, bool
> use_need_wakeup)
> + uint32_t queue_id, int xdpmode, bool
> use_need_wakeup,
> + bool use_default_xdp)
> {
> struct xsk_socket_config cfg;
> struct xsk_socket_info *xsk;
> @@ -319,6 +369,10 @@ xsk_configure_socket(struct xsk_umem_info *umem,
> uint32_t ifindex,
> }
> #endif
>
> + if (!use_default_xdp) {
> + cfg.libbpf_flags |= XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD;
Instead of using this flag, and the bpf_map_update_elem() function
above, would it not be easier just let the API take care of this?
See earlier comment,
xsk_socket__create()->xsk_setup_xdp_prog()->xsk_lookup_bpf_maps(),
however I won’t mind keeping it as is.
Maybe this also takes care of cleaning up the program on unload/cleanup?
This is a link to an example:
https://github.com/chaudron/xdp-tutorial/blob/master/advanced03-AF_XDP/af_xdp_user.c
> + }
> +
> if (if_indextoname(ifindex, devname) == NULL) {
> VLOG_ERR("ifindex %d to devname failed (%s)",
> ifindex, ovs_strerror(errno));
> @@ -339,17 +393,19 @@ xsk_configure_socket(struct xsk_umem_info *umem,
> uint32_t ifindex,
> return NULL;
> }
>
> - /* Make sure the built-in AF_XDP program is loaded. */
> - ret = bpf_get_link_xdp_id(ifindex, &prog_id, cfg.xdp_flags);
> - if (ret || !prog_id) {
> - if (ret) {
> - VLOG_ERR("Get XDP prog ID failed (%s)",
> ovs_strerror(errno));
> - } else {
> - VLOG_ERR("No XDP program is loaded at ifindex %d",
> ifindex);
> + if (use_default_xdp) {
> + /* Make sure the built-in AF_XDP program is loaded. */
> + ret = bpf_get_link_xdp_id(ifindex, &prog_id, cfg.xdp_flags);
> + if (ret || !prog_id) {
> + if (ret) {
> + VLOG_ERR("Get XDP prog ID failed (%s)",
> ovs_strerror(errno));
> + } else {
> + VLOG_ERR("No XDP program is loaded at ifindex %d",
> ifindex);
> + }
> + xsk_socket__delete(xsk->xsk);
> + free(xsk);
> + return NULL;
> }
> - xsk_socket__delete(xsk->xsk);
> - free(xsk);
> - return NULL;
> }
>
> while (!xsk_ring_prod__reserve(&xsk->umem->fq,
> @@ -376,7 +432,7 @@ xsk_configure_socket(struct xsk_umem_info *umem,
> uint32_t ifindex,
>
> static struct xsk_socket_info *
> xsk_configure(int ifindex, int xdp_queue_id, int xdpmode,
> - bool use_need_wakeup)
> + bool use_need_wakeup, bool use_default_xdp)
> {
> struct xsk_socket_info *xsk;
> struct xsk_umem_info *umem;
> @@ -400,7 +456,7 @@ xsk_configure(int ifindex, int xdp_queue_id, int
> xdpmode,
> VLOG_DBG("Allocated umem pool at 0x%"PRIxPTR, (uintptr_t) umem);
>
> xsk = xsk_configure_socket(umem, ifindex, xdp_queue_id, xdpmode,
> - use_need_wakeup);
> + use_need_wakeup, use_default_xdp);
> if (!xsk) {
> /* Clean up umem and xpacket pool. */
> if (xsk_umem__delete(umem->umem)) {
> @@ -420,6 +476,10 @@ xsk_configure_all(struct netdev *netdev)
> struct netdev_linux *dev = netdev_linux_cast(netdev);
> struct xsk_socket_info *xsk_info;
> int i, ifindex, n_rxq, n_txq;
> + bool use_default_xdp;
> + struct bpf_object *obj;
> + int prog_fd = 0;
> + int ret;
>
> ifindex = linux_get_ifindex(netdev_get_name(netdev));
>
> @@ -428,6 +488,7 @@ xsk_configure_all(struct netdev *netdev)
>
> n_rxq = netdev_n_rxq(netdev);
> dev->xsks = xcalloc(n_rxq, sizeof *dev->xsks);
> + use_default_xdp = dev->xdpobj ? false : true;
>
> /* Configure each queue. */
> for (i = 0; i < n_rxq; i++) {
> @@ -436,7 +497,7 @@ xsk_configure_all(struct netdev *netdev)
> dev->xdpmode == XDP_COPY ? "SKB" : "DRV",
> dev->use_need_wakeup ? "true" : "false");
> xsk_info = xsk_configure(ifindex, i, dev->xdpmode,
> - dev->use_need_wakeup);
> + dev->use_need_wakeup,
> use_default_xdp);
> if (!xsk_info) {
> VLOG_ERR("Failed to create AF_XDP socket on queue %d.",
> i);
> dev->xsks[i] = NULL;
> @@ -446,6 +507,28 @@ xsk_configure_all(struct netdev *netdev)
> atomic_init(&xsk_info->tx_dropped, 0);
> xsk_info->outstanding_tx = 0;
> xsk_info->available_rx = PROD_NUM_DESCS;
> +
> + if (dev->xdpobj) {
> + if (prog_fd == 0) {
> + /* XDP program is per-netdev, so all queues share
> + the same XDP program. */
> + ret = xsk_load_prog(dev->xdpobj, &obj, &prog_fd);
> + if (ret) {
> + goto err;
> + }
> + }
> + ret = xsk_configure_map(obj, xsk_info, i);
> + if (ret) {
> + goto err;
> + }
> +
> + /* Clear existing XDP program and its associated map. */
> + bpf_set_link_xdp_fd(ifindex, -1, dev->xdpmode);
> +
> + bpf_set_link_xdp_fd(ifindex, prog_fd, dev->xdpmode);
> + VLOG_INFO("%s: Load custom XDP program at %s.",
> + netdev_get_name(netdev), dev->xdpobj);
> + }
> }
>
> n_txq = netdev_n_txq(netdev);
> @@ -519,6 +602,8 @@ xsk_destroy_all(struct netdev *netdev)
> free(dev->tx_locks);
> dev->tx_locks = NULL;
> }
> + free(CONST_CAST(char *, dev->xdpobj));
> + dev->xdpobj = NULL;
> }
We need to make sure we close all the open BPF programs or else they
will all remain in memory.
I did a lot of tries when debugging and this is what I see with bpftool:
$ bpftool prog
2023: xdp name xdp_prog_simple tag 3b185187f1855c4c gpl
loaded_at 2019-11-05T10:54:04-0500 uid 0
xlated 16B jited 35B memlock 4096B
2027: xdp name xdp_prog_simple tag 3b185187f1855c4c gpl
loaded_at 2019-11-05T10:54:04-0500 uid 0
xlated 16B jited 35B memlock 4096B
2031: xdp name xdp_prog_simple tag 3b185187f1855c4c gpl
loaded_at 2019-11-05T10:54:04-0500 uid 0
xlated 16B jited 35B memlock 4096B
2035: xdp name xdp_prog_simple tag 3b185187f1855c4c gpl
loaded_at 2019-11-05T10:54:04-0500 uid 0
xlated 16B jited 35B memlock 4096B
2039: xdp name xdp_prog_simple tag 3b185187f1855c4c gpl
loaded_at 2019-11-05T10:54:05-0500 uid 0
xlated 16B jited 35B memlock 4096B
2043: xdp name xdp_prog_simple tag 3b185187f1855c4c gpl
loaded_at 2019-11-05T10:54:05-0500 uid 0
xlated 16B jited 35B memlock 4096B
2047: xdp name xdp_prog_simple tag 3b185187f1855c4c gpl
loaded_at 2019-11-05T10:54:05-0500 uid 0
xlated 16B jited 35B memlock 4096B
2051: xdp name xdp_prog_simple tag 3b185187f1855c4c gpl
loaded_at 2019-11-05T10:54:05-0500 uid 0
xlated 16B jited 35B memlock 4096B
...
...
Guess same thing for the maps:
$ bpftool map
1094: (null) name xsks_map flags 0x0
key 4B value 4B max_entries 32 memlock 4096B
1097: (null) name xsks_map flags 0x0
key 4B value 4B max_entries 32 memlock 4096B
1098: (null) name xsks_map flags 0x0
key 4B value 4B max_entries 63 memlock 4096B
...
...
>
> int
> @@ -527,8 +612,10 @@ netdev_afxdp_set_config(struct netdev *netdev,
> const struct smap *args,
> {
> struct netdev_linux *dev = netdev_linux_cast(netdev);
> const char *str_xdpmode;
> + const char *str_xdpobj;
> int xdpmode, new_n_rxq;
> bool need_wakeup;
> + struct stat s;
>
> ovs_mutex_lock(&dev->mutex);
> new_n_rxq = MAX(smap_get_int(args, "n_rxq", NR_QUEUE), 1);
> @@ -545,9 +632,9 @@ netdev_afxdp_set_config(struct netdev *netdev,
> const struct smap *args,
> } else if (!strcasecmp(str_xdpmode, "skb")) {
> xdpmode = XDP_COPY;
> } else {
> + ovs_mutex_unlock(&dev->mutex);
> VLOG_ERR("%s: Incorrect xdpmode (%s).",
> netdev_get_name(netdev), str_xdpmode);
> - ovs_mutex_unlock(&dev->mutex);
> return EINVAL;
> }
>
> @@ -559,12 +646,30 @@ netdev_afxdp_set_config(struct netdev *netdev,
> const struct smap *args,
> }
> #endif
>
> + str_xdpobj = smap_get_def(args, "xdpobj", NULL);
> + if (str_xdpobj) {
> + if (!strcmp(str_xdpobj, LIBBPF_XDP_PROGRAM)) {
> + str_xdpobj = NULL;
> + } else if (stat(str_xdpobj, &s)) {
> + ovs_mutex_unlock(&dev->mutex);
> + VLOG_ERR("Invalid xdpobj '%s': %s.", str_xdpobj,
> + ovs_strerror(errno));
> + return EINVAL;
> + } else if (!S_ISREG(s.st_mode)) {
> + ovs_mutex_unlock(&dev->mutex);
> + VLOG_ERR("xdpobj '%s' is not a regular file.",
> str_xdpobj);
> + return EINVAL;
> + }
> + }
> +
> if (dev->requested_n_rxq != new_n_rxq
> || dev->requested_xdpmode != xdpmode
> - || dev->requested_need_wakeup != need_wakeup) {
> + || dev->requested_need_wakeup != need_wakeup
> + || !nullable_string_is_equal(dev->requested_xdpobj,
> str_xdpobj)) {
> dev->requested_n_rxq = new_n_rxq;
> dev->requested_xdpmode = xdpmode;
> dev->requested_need_wakeup = need_wakeup;
> + dev->requested_xdpobj = nullable_xstrdup(str_xdpobj);
> netdev_request_reconfigure(netdev);
> }
> ovs_mutex_unlock(&dev->mutex);
> @@ -582,6 +687,8 @@ netdev_afxdp_get_config(const struct netdev
> *netdev, struct smap *args)
> dev->xdpmode == XDP_ZEROCOPY ? "drv" : "skb");
> smap_add_format(args, "use-need-wakeup", "%s",
> dev->use_need_wakeup ? "true" : "false");
> + smap_add_format(args, "xdpobj", "%s",
> + dev->xdpobj ? dev->xdpobj : LIBBPF_XDP_PROGRAM);
> ovs_mutex_unlock(&dev->mutex);
> return 0;
> }
> @@ -598,7 +705,8 @@ netdev_afxdp_reconfigure(struct netdev *netdev)
> if (netdev->n_rxq == dev->requested_n_rxq
> && dev->xdpmode == dev->requested_xdpmode
> && dev->use_need_wakeup == dev->requested_need_wakeup
> - && dev->xsks) {
> + && dev->xsks
> + && nullable_string_is_equal(dev->xdpobj,
> dev->requested_xdpobj)) {
> goto out;
> }
>
> @@ -616,6 +724,8 @@ netdev_afxdp_reconfigure(struct netdev *netdev)
> }
> dev->use_need_wakeup = dev->requested_need_wakeup;
>
> + dev->xdpobj = dev->requested_xdpobj;
> +
> err = xsk_configure_all(netdev);
> if (err) {
> VLOG_ERR("AF_XDP device %s reconfig failed.",
> netdev_get_name(netdev));
> @@ -1053,10 +1163,12 @@ netdev_afxdp_construct(struct netdev *netdev)
> netdev->n_rxq = 0;
> netdev->n_txq = 0;
> dev->xdpmode = 0;
> + dev->xdpobj = NULL;
>
> dev->requested_n_rxq = NR_QUEUE;
> dev->requested_xdpmode = XDP_COPY;
> dev->requested_need_wakeup = NEED_WAKEUP_DEFAULT;
> + dev->requested_xdpobj = NULL;
>
> dev->xsks = NULL;
> dev->tx_locks = NULL;
> diff --git a/lib/netdev-linux-private.h b/lib/netdev-linux-private.h
> index c14f2fb81bb0..29dacb0ec017 100644
> --- a/lib/netdev-linux-private.h
> +++ b/lib/netdev-linux-private.h
> @@ -105,6 +105,8 @@ struct netdev_linux {
> bool use_need_wakeup;
> bool requested_need_wakeup;
> struct ovs_spin *tx_locks; /* spin lock array for TX queues. */
> + const char *xdpobj; /* XDP object file path. */
> + const char *requested_xdpobj;
> #endif
> };
>
> --
> 2.7.4
More information about the dev
mailing list