[ovs-dev] [RFC v2 PATCH 4/4] bpf: Add reference XDP program implementation for netdev-offload-xdp
William Tu
u9012063 at gmail.com
Sun May 3 16:22:18 UTC 2020
On Tue, Apr 21, 2020 at 11:47:04PM +0900, Toshiaki Makita wrote:
> This adds a reference program, flowtable_afxdp.o, which can be used to
> offload flows to XDP through netdev-offload-xdp.
> The program will be compiled when using --enable-bpf switch.
>
> Signed-off-by: Toshiaki Makita <toshiaki.makita1 at gmail.com>
Hi Toshiaki,
Thanks for your patch, I haven't tried to run it but I have
questions about miniflow.
> ---
> Makefile.am | 9 +-
> acinclude.m4 | 56 +++++
> bpf/.gitignore | 4 +
> bpf/Makefile.am | 59 +++++
> bpf/bpf_miniflow.h | 179 +++++++++++++++
> bpf/bpf_netlink.h | 34 +++
> bpf/bpf_workaround.h | 28 +++
> bpf/flowtable_afxdp.c | 515 ++++++++++++++++++++++++++++++++++++++++++
> configure.ac | 2 +
> 9 files changed, 884 insertions(+), 2 deletions(-)
> create mode 100644 bpf/.gitignore
> create mode 100644 bpf/Makefile.am
> create mode 100644 bpf/bpf_miniflow.h
> create mode 100644 bpf/bpf_netlink.h
> create mode 100644 bpf/bpf_workaround.h
> create mode 100644 bpf/flowtable_afxdp.c
>
> diff --git a/Makefile.am b/Makefile.am
> index b279303d1..f18bfefde 100644
> --- a/Makefile.am
> +++ b/Makefile.am
> @@ -8,6 +8,9 @@
> AUTOMAKE_OPTIONS = foreign subdir-objects
> ACLOCAL_AMFLAGS = -I m4
> SUBDIRS = datapath
> +if HAVE_BPF
> +SUBDIRS += bpf
> +endif
>
> AM_CPPFLAGS = $(SSL_CFLAGS)
> AM_LDFLAGS = $(SSL_LDFLAGS)
> @@ -198,7 +201,9 @@ ALL_LOCAL += dist-hook-git
> dist-hook-git: distfiles
> @if test -e $(srcdir)/.git && (git --version) >/dev/null 2>&1; then \
> (cd datapath && $(MAKE) distfiles); \
> - (cat distfiles; sed 's|^|datapath/|' datapath/distfiles) | \
> + (cd bpf && $(MAKE) distfiles); \
> + (cat distfiles; sed 's|^|datapath/|' datapath/distfiles; \
> + sed 's|^|bpf/|' bpf/distfiles) | \
> LC_ALL=C sort -u > all-distfiles; \
> (cd $(srcdir) && git ls-files) | grep -v '\.gitignore$$' | \
> grep -v '\.gitattributes$$' | \
> @@ -234,7 +239,7 @@ config-h-check:
> @cd $(srcdir); \
> if test -e .git && (git --version) >/dev/null 2>&1 && \
> git --no-pager grep -L '#include <config\.h>' `git ls-files | grep '\.c$$' | \
> - grep -vE '^datapath|^lib/sflow|^third-party|^datapath-windows|^python'`; \
> + grep -vE '^datapath|^lib/sflow|^third-party|^datapath-windows|^python|^bpf'`; \
> then \
> echo "See above for list of violations of the rule that"; \
> echo "every C source file must #include <config.h>."; \
> diff --git a/acinclude.m4 b/acinclude.m4
> index 0901f2870..2fb2f385f 100644
> --- a/acinclude.m4
> +++ b/acinclude.m4
> @@ -301,6 +301,62 @@ AC_DEFUN([OVS_CHECK_LINUX_AF_XDP], [
> AM_CONDITIONAL([HAVE_AF_XDP], test "$AF_XDP_ENABLE" = true)
> ])
>
> +dnl OVS_CHECK_LINUX_BPF
> +dnl
> +dnl Check both llvm and libbpf support
> +AC_DEFUN([OVS_CHECK_LINUX_BPF], [
> + AC_ARG_ENABLE([bpf],
> + [AC_HELP_STRING([--enable-bpf],
> + [Compile reference eBPF programs for XDP])],
> + [], [enable_bpf=no])
> + AC_MSG_CHECKING([whether BPF is enabled])
> + if test "$enable_bpf" != yes; then
> + AC_MSG_RESULT([no])
> + BPF_ENABLE=false
> + else
> + AC_MSG_RESULT([yes])
> + BPF_ENABLE=true
> +
> + AC_CHECK_PROG(CLANG_CHECK, clang, yes)
> + AS_IF([test X"$CLANG_CHECK" != X"yes"],
> + [AC_MSG_ERROR([unable to find clang to compile BPF program])])
> +
> + AC_CHECK_PROG(LLC_CHECK, llc, yes)
> + AS_IF([test X"$LLC_CHECK" != X"yes"],
> + [AC_MSG_ERROR([unable to find llc to compile BPF program])])
> +
> + AC_CHECK_HEADER([bpf/bpf_helpers.h], [],
> + [AC_MSG_ERROR([unable to find bpf/bpf_helpers.h to compile BPF program])])
> +
> + AC_CHECK_HEADER([linux/bpf.h], [],
> + [AC_MSG_ERROR([unable to find linux/bpf.h to compile BPF program])])
> +
> + AC_MSG_CHECKING([for LLVM bpf target support])
> + if llc -march=bpf -mattr=help >/dev/null 2>&1; then
> + AC_MSG_RESULT([yes])
> + else
> + AC_MSG_RESULT([no])
> + AC_MSG_ERROR([LLVM does not support bpf target])
> + fi
> +
> + AC_MSG_CHECKING([for BTF DATASEC support])
> + AC_LANG_CONFTEST(
> + [AC_LANG_SOURCE([__attribute__((section("_x"), used)) int x;])])
> + if clang -g -O2 -S -target bpf -emit-llvm -c conftest.c -o conftest.ll && \
> + llc -march=bpf -filetype=obj -o conftest.o conftest.ll && \
> + readelf -p.BTF conftest.o 2>/dev/null | grep -q -w _x; then
> + AC_MSG_RESULT([yes])
> + else
> + AC_MSG_RESULT([no])
> + AC_MSG_ERROR([LLVM does not support BTF DATASEC])
> + fi
> +
> + AC_DEFINE([HAVE_BPF], [1],
> + [Define to 1 if BPF compilation is available and enabled.])
> + fi
> + AM_CONDITIONAL([HAVE_BPF], test "$BPF_ENABLE" = true)
> +])
> +
> dnl OVS_CHECK_DPDK
> dnl
> dnl Configure DPDK source tree
> diff --git a/bpf/.gitignore b/bpf/.gitignore
> new file mode 100644
> index 000000000..ab0f2d9e4
> --- /dev/null
> +++ b/bpf/.gitignore
> @@ -0,0 +1,4 @@
> +*.ll
> +/distfiles
> +/Makefile
> +/Makefile.in
> diff --git a/bpf/Makefile.am b/bpf/Makefile.am
> new file mode 100644
> index 000000000..f485b17f0
> --- /dev/null
> +++ b/bpf/Makefile.am
> @@ -0,0 +1,59 @@
> +AUTOMAKE_OPTIONS = foreign
> +
> +EXTRA_DIST = flowtable_afxdp.c bpf_miniflow.h bpf_netlink.h bpf_workaround.h
> +
> +# The following is based on commands for the Automake "distdir" target.
> +distfiles: Makefile
> + @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
> + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
> + list='$(DISTFILES)'; \
> + for file in $$list; do echo $$file; done | \
> + sed -e "s|^$$srcdirstrip/||;t" \
> + -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t" | \
> + LC_ALL=C sort -u > $@
> +CLEANFILES = distfiles
> +
> +CLANG ?= clang
> +LLC ?= llc
> +
> +AM_CPPFLAGS = -I $(top_srcdir)/include
> +AM_CPPFLAGS += -I $(top_builddir)/include
> +AM_CPPFLAGS += -I $(top_srcdir)/lib
> +AM_CPPFLAGS += -I $(top_builddir)/lib
> +
> +AM_CFLAGS = -Wall
> +AM_CFLAGS += -Wextra
> +AM_CFLAGS += -Wno-sign-compare
> +AM_CFLAGS += -Wformat -Wformat-security
> +AM_CFLAGS += -Wswitch-enum
> +AM_CFLAGS += -Wunused-parameter
> +AM_CFLAGS += -Wbad-function-cast
> +AM_CFLAGS += -Wcast-align
> +AM_CFLAGS += -Wstrict-prototypes
> +AM_CFLAGS += -Wold-style-definition
> +AM_CFLAGS += -Wmissing-field-initializers
> +AM_CFLAGS += -fno-strict-aliasing
> +AM_CFLAGS += -Wswitch-bool
> +AM_CFLAGS += -Wlogical-not-parentheses
> +AM_CFLAGS += -Wsizeof-array-argument
> +AM_CFLAGS += -Wshift-negative-value
> +AM_CFLAGS += -Wshadow
> +AM_CFLAGS += -Wcast-align
> +AM_CFLAGS += -Wno-unused-value
> +AM_CFLAGS += -Wno-compare-distinct-pointer-types
> +AM_CFLAGS += -g
> +AM_CFLAGS += -O2
> +
> +SUFFIXES = .ll
> +%.ll: %.c
> + $(CLANG) $(AM_CPPFLAGS) $(AM_CFLAGS) -S \
> + -target bpf -emit-llvm -c $< -o $@
> +CLEANFILES += *.ll
> +
> +%.o: %.ll
> + $(LLC) -march=bpf $(LLC_FLAGS) -filetype=obj -o $@ $<
> +CLEANFILES += *.o
> +
> +flowtable_afxdp.o: flowtable_afxdp.c bpf_miniflow.h bpf_netlink.h bpf_workaround.h
> +
> +all-local: flowtable_afxdp.o
> diff --git a/bpf/bpf_miniflow.h b/bpf/bpf_miniflow.h
> new file mode 100644
> index 000000000..090a9c7be
> --- /dev/null
> +++ b/bpf/bpf_miniflow.h
> @@ -0,0 +1,179 @@
> +/*
> + * Copyright (c) 2020 NTT Corp.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#ifndef BPF_MINIFLOW_H
> +#define BPF_MINIFLOW_H 1
> +
> +#include "flow.h"
> +
> +struct bpf_mf_ctx {
> + struct flowmap map;
> + uint64_t *data;
> +};
> +
> +static inline void
> +miniflow_set_maps(struct bpf_mf_ctx *ctx, size_t ofs, size_t n_words)
> +{
> + flowmap_set(&ctx->map, ofs, n_words);
> +}
> +
> +static inline void
> +miniflow_set_map(struct bpf_mf_ctx *ctx, size_t ofs)
> +{
> + flowmap_set(&ctx->map, ofs, 1);
> +}
> +
> +static inline void
> +miniflow_push_uint8_(struct bpf_mf_ctx *ctx, size_t ofs, uint8_t value)
> +{
> + size_t ofs8 = ofs % 8;
> +
> + if (ofs8 == 0) {
> + miniflow_set_map(ctx, ofs / 8);
> + }
> + *((uint8_t *)ctx->data + ofs8) = value;
> + if (ofs8 == 7) {
> + ctx->data++;
> + }
> +}
> +
> +static inline void
> +miniflow_push_uint16_(struct bpf_mf_ctx *ctx, size_t ofs, uint16_t value)
> +{
> + size_t ofs8 = ofs % 8;
> +
> + if (ofs8 == 0) {
> + miniflow_set_map(ctx, ofs / 8);
> + *(uint16_t *)ctx->data = value;
> + } else if (ofs8 == 2) {
> + *((uint16_t *)ctx->data + 1) = value;
> + } else if (ofs8 == 4) {
> + *((uint16_t *)ctx->data + 2) = value;
> + } else if (ofs8 == 6) {
> + *((uint16_t *)ctx->data + 3) = value;
> + ctx->data++;
> + }
> +}
> +
> +static inline void
> +miniflow_push_uint32_(struct bpf_mf_ctx *ctx, size_t ofs, uint32_t value)
> +{
> + size_t ofs8 = ofs % 8;
> +
> + if (ofs8 == 0) {
> + miniflow_set_map(ctx, ofs / 8);
> + *(uint32_t *)ctx->data = value;
> + } else if (ofs8 == 4) {
> + *((uint32_t *)ctx->data + 1) = value;
> + ctx->data++;
> + }
> +}
> +
> +static inline void
> +ether_addr_copy(struct eth_addr *dst, const struct eth_addr *src)
> +{
> + ovs_be16 *a = dst->be16;
> + const ovs_be16 *b = src->be16;
> +
> + a[0] = b[0];
> + a[1] = b[1];
> + a[2] = b[2];
> +}
> +
> +/* 'valuep' is 16-aligned */
> +/* data must start 64-aligned and must be followed by other data or padding */
> +static inline void
> +miniflow_push_macs_(struct bpf_mf_ctx *ctx, size_t ofs,
> + const struct eth_addr *valuep)
> +{
> + miniflow_set_maps(ctx, ofs / 8, 2);
> + ether_addr_copy((struct eth_addr *)ctx->data, valuep);
> + ether_addr_copy((struct eth_addr *)ctx->data + 1, valuep + 1);
> + ctx->data++; /* First word only. */
> +}
> +
> +/* data must start 64-aligned and must be followed by other data */
> +static inline void
> +miniflow_pad_from_64_(struct bpf_mf_ctx *ctx, size_t ofs)
> +{
> + size_t ofs8 = ofs % 8;
> + size_t ofs4 = ofs % 4;
> + size_t ofs2 = ofs % 2;
> + void *cdata = ctx->data;
> +
> + miniflow_set_map(ctx, ofs / 8);
> +
> + if (ofs8 >= 4) {
> + *(uint32_t *)cdata = 0;
> + cdata += 4;
> + }
> + if (ofs4 >= 2) {
> + *(uint16_t *)cdata = 0;
> + cdata += 2;
> + }
> + if (ofs2 == 1) {
> + *(uint8_t *)cdata = 0;
> + }
> +}
> +
> +static inline void
> +miniflow_pad_to_64_(struct bpf_mf_ctx *ctx, size_t ofs)
> +{
> + size_t ofs8 = ofs % 8;
> + size_t ofs4 = ofs % 4;
> + size_t ofs2 = ofs % 2;
> + void *cdata = ctx->data;
> +
> + cdata += ofs8;
> + if (ofs2 == 1) {
> + *(uint8_t *)cdata = 0;
> + cdata++;
> + }
> + if (ofs4 <= 2) {
> + *(uint16_t *)cdata = 0;
> + cdata += 2;
> + }
> + if (ofs8 <= 4) {
> + *(uint32_t *)cdata = 0;
> + }
> + ctx->data++;
> +}
> +
> +#define miniflow_push_uint8(CTX, FIELD, VALUE) \
> + miniflow_push_uint8_(CTX, offsetof(struct flow, FIELD), VALUE)
> +
> +#define miniflow_push_be16_(CTX, OFS, VALUE) \
> + miniflow_push_uint16_(CTX, OFS, (OVS_FORCE uint16_t)VALUE)
> +
> +#define miniflow_push_be16(CTX, FIELD, VALUE) \
> + miniflow_push_be16_(CTX, offsetof(struct flow, FIELD), VALUE) \
> +
> +#define miniflow_push_be32_(CTX, OFS, VALUE) \
> + miniflow_push_uint32_(CTX, OFS, (OVS_FORCE uint32_t)VALUE)
> +
> +#define miniflow_push_be32(CTX, FIELD, VALUE) \
> + miniflow_push_be32_(CTX, offsetof(struct flow, FIELD), VALUE) \
> +
> +#define miniflow_push_macs(CTX, FIELD, VALUEP) \
> + miniflow_push_macs_(CTX, offsetof(struct flow, FIELD), VALUEP)
> +
> +#define miniflow_pad_from_64(CTX, FIELD) \
> + miniflow_pad_from_64_(CTX, offsetof(struct flow, FIELD))
> +
> +#define miniflow_pad_to_64(CTX, FIELD) \
> + miniflow_pad_to_64_(CTX, OFFSETOFEND(struct flow, FIELD))
> +
> +#endif /* bpf_miniflow.h */
> diff --git a/bpf/bpf_netlink.h b/bpf/bpf_netlink.h
> new file mode 100644
> index 000000000..091926c61
> --- /dev/null
> +++ b/bpf/bpf_netlink.h
> @@ -0,0 +1,34 @@
> +/*
> + * Copyright (c) 2020 NTT Corp.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#ifndef BPF_NETLINK_H
> +#define BPF_NETLINK_H 1
> +
> +#include "netlink.h"
> +
> +static inline int
> +bpf_nl_attr_type(const struct nlattr *nla)
> +{
> + return nla->nla_type & NLA_TYPE_MASK;
> +}
> +
> +static inline const void *
> +bpf_nl_attr_get(const struct nlattr *nla)
> +{
> + return nla + 1;
> +}
> +
> +#endif /* bpf_netlink.h */
> diff --git a/bpf/bpf_workaround.h b/bpf/bpf_workaround.h
> new file mode 100644
> index 000000000..cb072a7e6
> --- /dev/null
> +++ b/bpf/bpf_workaround.h
> @@ -0,0 +1,28 @@
> +/*
> + * Copyright (c) 2020 NTT Corp.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#ifndef BPF_WORKAROUND_H
> +#define BPF_WORKAROUND_H
> +
> +/* On Linux x86/x64 systems bits/wordsize.h included from stdint.h cannot
> + * correctly determine __WORDSIZE for bpf, which causes incorrect UINTPTR_MAX
> + */
> +#if __UINTPTR_MAX__ == __UINT64_MAX__ && defined(UINTPTR_MAX)
> +#undef UINTPTR_MAX
> +#define UINTPTR_MAX UINT64_MAX
> +#endif
> +
> +#endif /* bpf_workaround.h */
> diff --git a/bpf/flowtable_afxdp.c b/bpf/flowtable_afxdp.c
> new file mode 100644
> index 000000000..7a4767333
> --- /dev/null
> +++ b/bpf/flowtable_afxdp.c
> @@ -0,0 +1,515 @@
> +/*
> + * Copyright (c) 2020 NTT Corp.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +/* linux/types.h is necessary for bpf_helpers.h as it's not self-contained */
> +#include <linux/types.h>
> +#include <bpf/bpf_helpers.h>
> +#include <linux/bpf.h>
> +
> +/* Workaround for incorrect macros for bpf in stdint.h */
> +#include <stdint.h>
> +#include "bpf_workaround.h"
> +
> +#include "bpf_miniflow.h"
> +#include "bpf_netlink.h"
> +#include "netdev-offload-xdp.h"
> +#include "packets.h"
> +#include "util.h"
> +
> +/* Supported keys. Need to keep same 64-align as struct flow for miniflow */
> +struct xdp_flow {
> + struct eth_addr dl_dst;
> + struct eth_addr dl_src;
> + ovs_be16 dl_type;
> + uint8_t pad1[2];
> +
> + union flow_vlan_hdr vlans[1];
> + uint8_t pad2[4];
> +
> + ovs_be32 nw_src;
> + ovs_be32 nw_dst;
> +
> + uint8_t pad3[4];
> + uint8_t nw_frag;
> + uint8_t nw_tos;
> + uint8_t nw_ttl;
> + uint8_t nw_proto;
> +
> + ovs_be16 tp_src;
> + ovs_be16 tp_dst;
> + uint8_t pad4[4];
> +};
> +
> +/* Size of xdp_flow must be 64-aligned for key comparison */
> +BUILD_ASSERT_DECL(sizeof(struct xdp_flow) % sizeof(uint64_t) == 0);
> +
> +#define XDP_FLOW_U64S (sizeof(struct xdp_flow) / sizeof(uint64_t))
> +
> +#define XDP_MAX_SUBTABLE_FLOWS 1024
> +#define XDP_MAX_ACTIONS_LEN 256
> +
> +/* Actual key in each subtable. miniflow map is omitted as it's identical to
> + * mask map */
> +struct xdp_flow_key {
> + union {
> + uint64_t miniflow_buf[XDP_FLOW_U64S];
> + struct xdp_flow _flow; /* Need this to keep xdp_flow in BTF */
> + };
> +};
> +
> +/* Value for subtable mask array */
> +struct xdp_subtable_mask {
> + struct xdp_subtable_mask_header header;
> + uint64_t buf[XDP_FLOW_U64S];
> +};
> +
> +/* miniflow for packet */
> +struct xdp_miniflow {
> + struct miniflow mf;
> + struct xdp_flow_key value;
> +};
> +
> +/* Used when the action only modifies the packet */
> +#define _XDP_ACTION_CONTINUE -1
> +
> +/* Supported actions */
> +/* XXX: This size should be uint16_t but needs to be int as kernel has a bug
> + * in btf_enum_check_member() that assumes enum size is sizeof(int), which
> + * causes an error when loading BTF if we use uint16_t here */
> +enum action_attrs : uint32_t {
> + XDP_ACTION_OUTPUT = OVS_ACTION_ATTR_OUTPUT,
> + XDP_ACTION_PUSH_VLAN = OVS_ACTION_ATTR_PUSH_VLAN,
> + XDP_ACTION_POP_VLAN = OVS_ACTION_ATTR_POP_VLAN,
> +};
> +
> +/* Identical to struct nlattr. Need this to keep enum action_attrs in BTF */
> +struct xdp_action_nlattr {
> + uint16_t nla_len;
> + enum action_attrs action_type;
> +};
> +
> +struct xdp_flow_actions {
> + struct xdp_flow_actions_header header;
> + uint8_t data[XDP_MAX_ACTIONS_LEN - sizeof(struct xdp_flow_actions_header)];
> + /* Dummy. Need xdp_action_nlattr to keep enum action_attrs in BTF */
> + struct xdp_action_nlattr _xdp_actions[];
> +};
> +
> +
> +/* Map definitions */
> +
> +struct {
> + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
> + __uint(max_entries, 256);
> + __type(key, uint32_t);
> + __type(value, long);
> +} debug_stats SEC(".maps");
> +
> +/* Temporary storage for packet miniflow. Need this because verifier does not
> + * allow access to array variable in stack with variable index. Such access
> + * happens in mask_key() */
> +struct {
> + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
> + __uint(max_entries, 1);
> + __type(key, uint32_t);
> + __type(value, struct xdp_miniflow);
> +} pkt_mf_tbl SEC(".maps");
> +
> +struct {
> + __uint(type, BPF_MAP_TYPE_XSKMAP);
> + __uint(max_entries, 1); /* This should be redefined by userspace */
> + __uint(key_size, sizeof(int));
> + __uint(value_size, sizeof(int));
> +} xsks_map SEC(".maps");
> +
> +struct {
> + __uint(type, BPF_MAP_TYPE_DEVMAP);
> + __uint(max_entries, XDP_MAX_PORTS);
> + __uint(key_size, sizeof(int));
> + __uint(value_size, sizeof(int));
> +} output_map SEC(".maps");
> +
> +/* Head index for subtbl_masks list */
> +/* TODO: Use global variable */
> +struct {
> + __uint(type, BPF_MAP_TYPE_ARRAY);
> + __uint(max_entries, 1);
> + __type(key, uint32_t);
> + __type(value, int);
> +} subtbl_masks_hd SEC(".maps");
> +
> +/* Information about subtable mask. A list implemented using array */
> +struct {
> + __uint(type, BPF_MAP_TYPE_ARRAY);
> + __uint(max_entries, XDP_MAX_SUBTABLES);
> + __type(key, uint32_t);
> + __type(value, struct xdp_subtable_mask);
> +} subtbl_masks SEC(".maps");
> +
> +/* Template for subtable hash-map. This will be used in userspace to create
> + * flow_table array-of-maps. */
> +struct {
> + __uint(type, BPF_MAP_TYPE_HASH);
> + __uint(max_entries, XDP_MAX_SUBTABLE_FLOWS);
> + __type(key, struct xdp_flow_key);
> + __type(value, struct xdp_flow_actions);
> +} subtbl_template SEC(".maps");
> +
> +/* Array-of-maps whose entry contains subtable hash-map. */
> +struct {
> + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
> + __uint(max_entries, XDP_MAX_SUBTABLES);
> + __uint(key_size, sizeof(uint32_t));
> + __uint(value_size, sizeof(uint32_t));
> +} flow_table SEC(".maps");
> +
> +
> +static inline void
> +account_debug(int idx)
> +{
> + long *cnt;
> +
> + cnt = bpf_map_lookup_elem(&debug_stats, &idx);
> + if (cnt) {
> + *cnt += 1;
> + }
> +}
> +
> +static inline void
> +account_action(enum action_attrs act)
> +{
> + account_debug(act + 1);
> +}
> +
> +/* Derived from xsk_load_xdp_prog() in libbpf */
> +static inline int
> +upcall(struct xdp_md *ctx)
> +{
> + int ret, index = ctx->rx_queue_index;
> +
> + ret = bpf_redirect_map(&xsks_map, index, XDP_ABORTED);
> + if (ret > 0) {
> + return ret;
> + }
> +
> + /* Fallback for kernel <= 5.3 not supporting default action in flags */
> + if (bpf_map_lookup_elem(&xsks_map, &index)) {
> + return bpf_redirect_map(&xsks_map, index, 0);
> + }
> +
> + return XDP_ABORTED;
> +}
> +
> +static inline int
> +action_output(int tx_port)
> +{
> + account_action(XDP_ACTION_OUTPUT);
> +
> + return bpf_redirect_map(&output_map, tx_port, 0);
> +}
> +
> +static inline int
> +action_vlan_push(struct xdp_md *ctx OVS_UNUSED,
> + const struct ovs_action_push_vlan *vlan OVS_UNUSED)
> +{
> + account_action(XDP_ACTION_PUSH_VLAN);
> +
> + /* TODO: implement this */
> + return XDP_ABORTED;
> +}
> +
> +static inline int
> +action_vlan_pop(struct xdp_md *ctx OVS_UNUSED)
> +{
> + account_action(XDP_ACTION_POP_VLAN);
> +
> + /* TODO: implement this */
> + return XDP_ABORTED;
> +}
> +
> +/* TODO: Add more actions */
> +
> +
> +struct nw_params {
> + union {
> + ovs_be32 params;
> + struct {
> + uint8_t nw_frag;
> + uint8_t nw_tos;
> + uint8_t nw_ttl;
> + uint8_t nw_proto;
> + };
> + };
> +};
> +
> +static inline int
> +parse_ipv4(void *data, uint64_t *nh_off, void *data_end,
> + struct bpf_mf_ctx *mf_ctx, struct nw_params *nw_params)
> +{
> + struct ip_header *ip = data + *nh_off;
> +
> + if (ip + 1 > data_end) {
> + return 1;
> + }
> +
> + /* Linux network drivers ensure that IP header is 4-byte aligned or
> + * the platform can handle unaligned access */
> + miniflow_push_be32(mf_ctx, nw_src, *(ovs_be32 *)(void *)&ip->ip_src);
> + miniflow_push_be32(mf_ctx, nw_dst, *(ovs_be32 *)(void *)&ip->ip_dst);
> +
> + if (OVS_UNLIKELY(IP_IS_FRAGMENT(ip->ip_frag_off))) {
> + nw_params->nw_frag = FLOW_NW_FRAG_ANY;
> + if (ip->ip_frag_off & htons(IP_FRAG_OFF_MASK)) {
> + nw_params->nw_frag |= FLOW_NW_FRAG_LATER;
> + }
> + } else {
> + nw_params->nw_frag = 0;
> + }
> + nw_params->nw_tos = ip->ip_tos;
> + nw_params->nw_ttl = ip->ip_ttl;
> + nw_params->nw_proto = ip->ip_proto;
> +
> + *nh_off += IP_IHL(ip->ip_ihl_ver) * 4;
> +
> + return 0;
> +}
> +
> +static inline int
> +xdp_miniflow_extract(struct xdp_md *ctx, struct xdp_miniflow *pkt_mf)
> +{
> + void *data = (void *)(long)ctx->data;
> + void *data_end = (void *)(long)ctx->data_end;
> + struct eth_header *eth = data;
> + struct vlan_header *vlan = NULL;
> + ovs_be16 dl_type;
> + uint64_t nh_off;
> + struct nw_params nw_params;
> + struct bpf_mf_ctx mf_ctx = { {{ 0 }}, (uint64_t *)&pkt_mf->value };
> +
> + nh_off = sizeof *eth;
> + if (data + nh_off > data_end) {
> + return 1;
> + }
> +
> + miniflow_push_macs(&mf_ctx, dl_dst, ð->eth_dst);
> +
> + if (eth_type_vlan(eth->eth_type)) {
> + vlan = data + nh_off;
> + nh_off += sizeof(*vlan);
> + if (data + nh_off > data_end) {
> + return 1;
> + }
> + dl_type = vlan->vlan_next_type;
> + } else {
> + dl_type = eth->eth_type;
> + }
> + miniflow_push_be16(&mf_ctx, dl_type, dl_type);
> + miniflow_pad_to_64(&mf_ctx, dl_type);
> +
> + if (vlan) {
> + const ovs_16aligned_be32 *qp;
> + union flow_vlan_hdr vlan_hdr;
> +
> + qp = (ovs_16aligned_be32 *)ð->eth_type;
> + vlan_hdr.qtag = get_16aligned_be32(qp);
> + vlan_hdr.tci |= htons(VLAN_CFI);
> + miniflow_push_be32(&mf_ctx, vlans, vlan_hdr.qtag);
> + miniflow_push_be32_(&mf_ctx,
> + offsetof(struct flow, vlans) + sizeof(ovs_be32),
> + 0);
> + }
> +
> + if (dl_type == htons(ETH_TYPE_IP)) {
> + if (parse_ipv4(data, &nh_off, data_end, &mf_ctx, &nw_params)) {
> + return 1;
> + }
> + } else {
> + goto out;
> + }
> + miniflow_pad_from_64(&mf_ctx, nw_frag);
> + miniflow_push_be32(&mf_ctx, nw_frag, &nw_params.params);
> +
> + if (nw_params.nw_proto == IPPROTO_TCP) {
> + struct tcp_header *tcp = data + nh_off;
> +
> + if (tcp + 1 > data_end) {
> + return 1;
> + }
> +
> + miniflow_push_be16(&mf_ctx, tp_src, tcp->tcp_src);
> + miniflow_push_be16(&mf_ctx, tp_dst, tcp->tcp_dst);
> + } else if (nw_params.nw_proto == IPPROTO_UDP) {
> + struct udp_header *udp = data + nh_off;
> +
> + if (udp + 1 > data_end) {
> + return 1;
> + }
> +
> + miniflow_push_be16(&mf_ctx, tp_src, udp->udp_src);
> + miniflow_push_be16(&mf_ctx, tp_dst, udp->udp_dst);
> + }
> +out:
> + pkt_mf->mf.map = mf_ctx.map;
> + return 0;
> +}
> +
> +#define for_each_subtable_mask(subtable_mask, head, idx, cnt) \
> + for (subtable_mask = bpf_map_lookup_elem(&subtbl_masks, (head)), \
> + idx = *(head), cnt = 0; \
> + subtable_mask != NULL && cnt < XDP_MAX_SUBTABLES; \
> + idx = subtable_mask->header.next, \
> + subtable_mask = bpf_map_lookup_elem(&subtbl_masks, &idx), cnt++)
> +
> +/* Returns false if an error happens */
> +static inline int
> +mask_key(uint64_t *mkey, const struct miniflow *pkt_mf,
> + const struct xdp_subtable_mask_header *tbl_mask)
> +{
> + const struct miniflow *tbl_mf = &tbl_mask->mask.masks;
> + const uint64_t *tbl_blocks = miniflow_get_values(tbl_mf);
> + const uint64_t *pkt_blocks = miniflow_get_values(pkt_mf);
> + uint64_t tbl_mf_bits = tbl_mf->map.bits[0];
> + uint64_t pkt_mf_bits = pkt_mf->map.bits[0];
> + uint8_t tbl_mf_bits_u0 = tbl_mask->mf_bits_u0;
> + uint8_t tbl_mf_bits_u1 = tbl_mask->mf_bits_u1;
> + unsigned int pkt_ofs = 0;
> + int i = 0;
> +
> + /* This sensitive loop easily exceeds verifier limit 1M insns so
> + * need to be careful when modifying.
> + * E.g. increasing XDP_FLOW_U64S by adding keys to struct xdp_flow
> + * increases verifier cost and may be rejected due to 1M insns exceeds */
> + for (; i < tbl_mf_bits_u0 + tbl_mf_bits_u1 && i < XDP_FLOW_U64S; i++) {
> + uint64_t mf_mask;
> + uint64_t idx_bits;
> + unsigned int pkt_idx;
> + uint64_t lowest_bit;
> +
> + if (i == tbl_mf_bits_u0) {
> + tbl_mf_bits = tbl_mf->map.bits[1];
> + pkt_mf_bits = pkt_mf->map.bits[1];
> + pkt_ofs = count_1bits(pkt_mf->map.bits[0]);
> + }
> +
> + lowest_bit = tbl_mf_bits & -tbl_mf_bits;
> + tbl_mf_bits &= ~lowest_bit;
> + if (!(lowest_bit & pkt_mf_bits)) {
> + mkey[i] = 0;
> + continue;
> + }
> + mf_mask = lowest_bit - 1;
> + idx_bits = mf_mask & pkt_mf_bits;
> + pkt_idx = count_1bits(idx_bits) + pkt_ofs;
> + if (pkt_idx >= XDP_FLOW_U64S) {
> + /* xdp flow api provider (userspace) BUG */
> + return false;
> + }
> +
> + mkey[i] = pkt_blocks[pkt_idx] & tbl_blocks[i];
> + }
> +
> + return true;
> +}
> +
> +SEC("xdp") int
> +flowtable_afxdp(struct xdp_md *ctx)
> +{
> + struct xdp_miniflow *pkt_mf;
> + struct xdp_subtable_mask *subtable_mask;
> + int *head;
> + struct xdp_flow_actions *xdp_actions = NULL;
> + struct nlattr *a;
> + unsigned int left;
> + int cnt, idx, zero = 0;
> +
> + account_debug(0);
> +
> + head = bpf_map_lookup_elem(&subtbl_masks_hd, &zero);
> + if (!head) {
> + return XDP_ABORTED;
> + }
> + if (*head == XDP_SUBTABLES_TAIL) {
> + /* Offload not enabled */
> + goto upcall;
> + }
> +
> + /* Get temporary storage for storing packet miniflow */
> + pkt_mf = bpf_map_lookup_elem(&pkt_mf_tbl, &zero);
> + if (!pkt_mf) {
> + return XDP_ABORTED;
> + }
> +
I start to wonder what's the benefit of using miniflow in XDP?
miniflow tries to compress the large flow key into smaller memory,
and with a flowmap indicating which offset in bits are valid.
And when adding a new subtable at flow_put, the subtable's key
size can be smaller with only the needed fields.
But in the case of XDP/eBPF, we have to statically allocated fixed
key size (the subtbl_template has key as struct xdp_flow_key), so
each subtable is always having key with full key fields. (not saving
memory space) Moreover with miniflow, it makes the 'mask_key' function
below pretty complicated.
What if we don't compress the flow key?
unlike to OVS userspace, more like the kernel's implementation of OVS
megaflow (ex: masked_flow_lookup in net/openvswitch/flow_table.c)
Where it has a list of masks (like your 'subtbl_masks' map) but
only use 1 hash table (so we don't need map-in-map).
We lookup the hash table multiple times, each
time applied different masks in the mask list until finding a match.
However, I'm not sure whether this will hit some limitations of verifier.
Please correct me if I understand wrong here.
Thanks
William
> + /* Extract miniflow from packet */
> + if (xdp_miniflow_extract(ctx, pkt_mf)) {
> + return XDP_DROP;
> + }
> +
> + /* Lookup each subtable */
> + for_each_subtable_mask(subtable_mask, head, idx, cnt) {
> + struct xdp_flow_key mkey = { 0 };
> + void *subtable;
> +
> + subtable = bpf_map_lookup_elem(&flow_table, &idx);
> + if (!subtable) {
> + return XDP_ABORTED;
> + }
> +
> + if (!mask_key(mkey.miniflow_buf, &pkt_mf->mf,
> + &subtable_mask->header)) {
> + continue;
> + }
> +
> + xdp_actions = bpf_map_lookup_elem(subtable, &mkey);
> + if (xdp_actions) {
> + break;
> + }
> + }
> +
> + if (!xdp_actions) {
> + /* Flow entry miss */
> +upcall:
> + return upcall(ctx);
> + }
> +
> + /* Execute actions */
> + NL_ATTR_FOR_EACH_UNSAFE(a, left, xdp_flow_actions(&xdp_actions->header),
> + xdp_actions->header.actions_len) {
> + uint16_t type = bpf_nl_attr_type(a);
> + int act;
> +
> + switch ((enum action_attrs)type) {
> + case XDP_ACTION_OUTPUT:
> + /* Note: userspace ensures there is no multiple output in actions */
> + return action_output(*(int *)bpf_nl_attr_get(a));
> + case XDP_ACTION_PUSH_VLAN:
> + act = action_vlan_push(ctx, bpf_nl_attr_get(a));
> + break;
> + case XDP_ACTION_POP_VLAN:
> + act = action_vlan_pop(ctx);
> + break;
> + default:
> + return XDP_ABORTED;
> + }
> + if (act != _XDP_ACTION_CONTINUE) {
> + return act;
> + }
> + }
> +
> + account_debug(1);
> + return XDP_DROP;
> +}
> +
> +char _license[] SEC("license") = "Apache-2.0";
> diff --git a/configure.ac b/configure.ac
> index 1877aae56..99a93ce00 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -99,6 +99,7 @@ OVS_CHECK_DOT
> OVS_CHECK_IF_DL
> OVS_CHECK_STRTOK_R
> OVS_CHECK_LINUX_AF_XDP
> +OVS_CHECK_LINUX_BPF
> AC_CHECK_DECLS([sys_siglist], [], [], [[#include <signal.h>]])
> AC_CHECK_MEMBERS([struct stat.st_mtim.tv_nsec, struct stat.st_mtimensec],
> [], [], [[#include <sys/stat.h>]])
> @@ -198,6 +199,7 @@ AC_CONFIG_FILES(datapath/Makefile)
> AC_CONFIG_FILES(datapath/linux/Kbuild)
> AC_CONFIG_FILES(datapath/linux/Makefile)
> AC_CONFIG_FILES(datapath/linux/Makefile.main)
> +AC_CONFIG_FILES(bpf/Makefile)
> AC_CONFIG_FILES(tests/atlocal)
> AC_CONFIG_FILES(lib/libopenvswitch.pc)
> AC_CONFIG_FILES(lib/libsflow.pc)
> --
> 2.25.1
>
More information about the dev
mailing list