[ovs-dev] [PATCH v4 5/7] dpif-lookup: add avx512 gather implementation.

Sat Jun 27 18:27:19 UTC 2020

On Thu, Jun 18, 2020 at 9:53 AM Harry van Haaren
<harry.van.haaren at intel.com> wrote:
>
> This commit adds an AVX-512 dpcls lookup implementation.
> It uses the AVX-512 SIMD ISA to perform multiple miniflow
> operations in parallel.
>
> To run this implementation, the "avx512f" and "bmi2" ISAs are
> required. These ISA checks are performed at runtime while
> probing the subtable implementation. If a CPU does not provide
> both "avx512f" and "bmi2", then this code does not execute.
>
> The avx512 code is built as a seperate static library, with added
> CFLAGS to enable the required ISA features. By building only this
> static library with avx512 enabled, it is ensured that the main OVS
> core library is *not* using avx512, and that OVS continues to run
> as before on CPUs that do not support avx512.
>
> The approach taken in this implementation is to use the
> gather instruction to access the packet miniflow, allowing
> any miniflow blocks to be loaded into an AVX-512 register.
> This maximises the usefulness of the register, and hence this
> implementation handles any subtable with up to miniflow 8 bits.
>
> Note that specialization of these avx512 lookup routines
> still provides performance value, as the hashing of the
> resulting data is performed in scalar code, and compile-time
> loop unrolling occurs when specialized to miniflow bits.
>
> Signed-off-by: Harry van Haaren <harry.van.haaren at intel.com>
>
> ---
>
> v4:
> - Remove TODO comment on prio-set command (was accidentally
>   added to this commit in v3)
> - Fixup v3 changlog to not include #warning comment (William Tu)
> - Remove #define for debugging in lookup.h
> - Fix builds on older gcc versions that don't support -mavx512f.
>   Solution involves CC_CHECK and #ifdefs in code (OVS Robot, William Tu)
>
> v3:
> - Improve function name for _any subtable lookup
> - Use "" include not <> for immintrin.h
> - Add checks for SSE42 instructions in core OVS for CRC32 based hashing
>   If not available, disable AVX512 lookup implementation as it requires
>   uses CRC32 for hashing, and the hashing algorithm must match core OVS.
> - Rework ovs_asserts() into function selection time check
> - Add #define for magic number 8, number of u64 blocks in AVX512 register
> - Add #if CHECKER around AVX code, sparse doesn't like checking it
> - Simplify avx512 enabled building, fixes builds with --enable-shared
> ---
>  configure.ac                           |   2 +
>  lib/automake.mk                        |  17 ++
>  lib/dpif-netdev-lookup-avx512-gather.c | 265 +++++++++++++++++++++++++
>  lib/dpif-netdev-lookup.c               |  17 ++
>  lib/dpif-netdev-lookup.h               |   4 +
>  5 files changed, 305 insertions(+)
>  create mode 100644 lib/dpif-netdev-lookup-avx512-gather.c
>
> diff --git a/configure.ac b/configure.ac
> index 81893e56e..1367c868b 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -178,6 +178,8 @@ OVS_ENABLE_OPTION([-Wno-null-pointer-arithmetic])
>  OVS_ENABLE_OPTION([-Warray-bounds-pointer-arithmetic])
>  OVS_CONDITIONAL_CC_OPTION([-Wno-unused], [HAVE_WNO_UNUSED])
>  OVS_CONDITIONAL_CC_OPTION([-Wno-unused-parameter], [HAVE_WNO_UNUSED_PARAMETER])
> +OVS_CONDITIONAL_CC_OPTION([-mavx512f], [HAVE_AVX512F])
> +OVS_CHECK_CC_OPTION([-mavx512f], [CFLAGS="$CFLAGS -DHAVE_AVX512F"])

Do you need both checks?
I thought the first one OVS_CONDITIONAL_CC_OPTION([-mavx512f], [HAVE_AVX512F])
is good enough since at lib/automake.mk, you add the -mavx512f to CFLAGS.

>  OVS_ENABLE_WERROR
>  OVS_ENABLE_SPARSE
>  OVS_CTAGS_IDENTIFIERS
> diff --git a/lib/automake.mk b/lib/automake.mk
> index 1fc1a209e..fab056b8a 100644
> --- a/lib/automake.mk
> +++ b/lib/automake.mk
> @@ -11,6 +11,7 @@ lib_libopenvswitch_la_LIBADD = $(SSL_LIBS)
>  lib_libopenvswitch_la_LIBADD += $(CAPNG_LDADD)
>  lib_libopenvswitch_la_LIBADD += $(LIBBPF_LDADD)
>
> +
>  if WIN32
>  lib_libopenvswitch_la_LIBADD += ${PTHREAD_LIBS}
>  endif
> @@ -20,6 +21,22 @@ lib_libopenvswitch_la_LDFLAGS = \
>          -Wl,--version-script=$(top_builddir)/lib/libopenvswitch.sym \
>          $(AM_LDFLAGS)
>
> +if HAVE_AVX512F
> +# Build library of avx512 code with CPU ISA CFLAGS enabled. This allows the
> +# compiler to use the ISA features required for the ISA optimized code-paths.
> +lib_LTLIBRARIES += lib/libopenvswitchavx512.la
> +lib_libopenvswitch_la_LIBADD += lib/libopenvswitchavx512.la
> +lib_libopenvswitchavx512_la_CFLAGS = \
> +       -mavx512f \
> +       -mavx512bw \
> +       -mavx512dq \
> +       -mbmi2 \
> +       $(AM_CFLAGS)
> +lib_libopenvswitchavx512_la_SOURCES = \
> +       lib/dpif-netdev-lookup-avx512-gather.c
> +endif
> +
> +# Build core vswitch libraries as before
>  lib_libopenvswitch_la_SOURCES = \
>         lib/aes128.c \
>         lib/aes128.h \
> diff --git a/lib/dpif-netdev-lookup-avx512-gather.c b/lib/dpif-netdev-lookup-avx512-gather.c
> new file mode 100644
> index 000000000..754cd0e3c
> --- /dev/null
> +++ b/lib/dpif-netdev-lookup-avx512-gather.c
> @@ -0,0 +1,265 @@
> +/*
> + * Copyright (c) 2020, Intel Corperation.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#ifdef __x86_64__
> +#if !defined(__CHECKER__)
> +
> +#include <config.h>
> +
> +#include "dpif-netdev.h"
> +#include "dpif-netdev-lookup.h"
> +#include "dpif-netdev-private.h"
> +#include "cmap.h"
> +#include "flow.h"
> +#include "pvector.h"
> +#include "openvswitch/vlog.h"
> +
> +#include "immintrin.h"
> +
> +/* Each AVX512 register (zmm register in assembly notation) can contain up to
> + * 512 bits, which is equivelent to 8 uint64_t variables. This is the maximum

typo: equivalent

> + * number of miniflow blocks that can be processed in a single pass of the
> + * AVX512 code at a time.
> + */
> +#define NUM_U64_IN_ZMM_REG (8)
> +#define BLOCKS_CACHE_SIZE (NETDEV_MAX_BURST * NUM_U64_IN_ZMM_REG)
> +
> +
> +VLOG_DEFINE_THIS_MODULE(dpif_lookup_avx512_gather);
> +
> +static inline __m512i
> +_mm512_popcnt_epi64_manual(__m512i v_in)
> +{
> +    static const uint8_t pop_lut[64] = {
> +        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
> +        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
> +        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
> +        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
> +    };
> +    __m512i v_pop_lut = _mm512_loadu_si512(pop_lut);
> +
> +    __m512i v_in_srl8 = _mm512_srli_epi64(v_in, 4);
> +    __m512i v_nibble_mask = _mm512_set1_epi8(0xF);
> +    __m512i v_in_lo = _mm512_and_si512(v_in, v_nibble_mask);
> +    __m512i v_in_hi = _mm512_and_si512(v_in_srl8, v_nibble_mask);
> +
> +    __m512i v_lo_pop = _mm512_shuffle_epi8(v_pop_lut, v_in_lo);
> +    __m512i v_hi_pop = _mm512_shuffle_epi8(v_pop_lut, v_in_hi);
> +    __m512i v_u8_pop = _mm512_add_epi8(v_lo_pop, v_hi_pop);
> +
> +    return _mm512_sad_epu8(v_u8_pop, _mm512_setzero_si512());
> +}

I forgot whether you mentioned this or not.
But why create this manual popcnt?
Isn't there a _mm512_popcnt_* in the library?

The rest looks good to me,
Thanks

William