[ovs-dev] [v13 04/12] dpif-avx512: Add ISA implementation of dpif.

Ferriter, Cian cian.ferriter at intel.com
Thu Jun 24 11:44:45 UTC 2021


Hi Flavio,

Thanks for the testing here. My responses are inline.

Cian

> -----Original Message-----
> From: Flavio Leitner <fbl at sysclose.org>
> Sent: Thursday 24 June 2021 05:06
> To: Ferriter, Cian <cian.ferriter at intel.com>
> Cc: ovs-dev at openvswitch.org; Amber, Kumar <kumar.amber at intel.com>; i.maximets at ovn.org
> Subject: Re: [ovs-dev] [v13 04/12] dpif-avx512: Add ISA implementation of dpif.
> 
> On Thu, Jun 17, 2021 at 05:18:17PM +0100, Cian Ferriter wrote:
> > From: Harry van Haaren <harry.van.haaren at intel.com>
> >
> > This commit adds the AVX512 implementation of DPIF functionality,
> > specifically the dp_netdev_input_outer_avx512 function. This function
> > only handles outer (no re-circulations), and is optimized to use the
> > AVX512 ISA for packet batching and other DPIF work.
> >
> > Sparse is not able to handle the AVX512 intrinsics, causing compile
> > time failures, so it is disabled for this file.
> >
> > Signed-off-by: Harry van Haaren <harry.van.haaren at intel.com>
> > Co-authored-by: Cian Ferriter <cian.ferriter at intel.com>
> > Signed-off-by: Cian Ferriter <cian.ferriter at intel.com>
> > Co-authored-by: Kumar Amber <kumar.amber at intel.com>
> > Signed-off-by: Kumar Amber <kumar.amber at intel.com>
> >
> > ---
> >
> > v13:
> > - Squash "Add HWOL support" commit into this commit.
> > - Add NEWS item about this feature here rather than in a later commit.
> > - Add #define NUM_U64_IN_ZMM_REG 8.
> > - Add comment describing operation of while loop handling HWOL->EMC->SMC
> >   lookups in dp_netdev_input_outer_avx512().
> > - Add EMC and SMC batch insert functions for better handling of EMC and
> >   SMC in AVX512 DPIF.
> > - Minor code refactor to address review comments.
> > ---
> >  NEWS                             |   2 +
> >  lib/automake.mk                  |   5 +-
> >  lib/dpif-netdev-avx512.c         | 327 +++++++++++++++++++++++++++++++
> >  lib/dpif-netdev-private-dfc.h    |  25 +++
> >  lib/dpif-netdev-private-dpif.h   |  32 +++
> >  lib/dpif-netdev-private-thread.h |  11 +-
> >  lib/dpif-netdev-private.h        |  25 +++
> >  lib/dpif-netdev.c                | 103 ++++++++--
> >  8 files changed, 514 insertions(+), 16 deletions(-)
> >  create mode 100644 lib/dpif-netdev-avx512.c
> >  create mode 100644 lib/dpif-netdev-private-dpif.h
> >
> > diff --git a/NEWS b/NEWS
> > index 96b3a61c8..6a4a7b76d 100644
> > --- a/NEWS
> > +++ b/NEWS
> > @@ -10,6 +10,8 @@ Post-v2.15.0
> >       * Auto load balancing of PMDs now partially supports cross-NUMA polling
> >         cases, e.g if all PMD threads are running on the same NUMA node.
> >       * Refactor lib/dpif-netdev.c to multiple header files.
> > +     * Add avx512 implementation of dpif which can process non recirculated
> > +       packets. It supports partial HWOL, EMC, SMC and DPCLS lookups.
> >     - ovs-ctl:
> >       * New option '--no-record-hostname' to disable hostname configuration
> >         in ovsdb on startup.
> > diff --git a/lib/automake.mk b/lib/automake.mk
> > index 3a33cdd5c..660cd07f0 100644
> > --- a/lib/automake.mk
> > +++ b/lib/automake.mk
> > @@ -33,11 +33,13 @@ lib_libopenvswitchavx512_la_CFLAGS = \
> >  	-mavx512f \
> >  	-mavx512bw \
> >  	-mavx512dq \
> > +	-mbmi \
> >  	-mbmi2 \
> >  	-fPIC \
> >  	$(AM_CFLAGS)
> >  lib_libopenvswitchavx512_la_SOURCES = \
> > -	lib/dpif-netdev-lookup-avx512-gather.c
> > +	lib/dpif-netdev-lookup-avx512-gather.c \
> > +	lib/dpif-netdev-avx512.c
> >  lib_libopenvswitchavx512_la_LDFLAGS = \
> >  	-static
> >  endif
> > @@ -114,6 +116,7 @@ lib_libopenvswitch_la_SOURCES = \
> >  	lib/dpif-netdev-private-dfc.c \
> >  	lib/dpif-netdev-private-dfc.h \
> >  	lib/dpif-netdev-private-dpcls.h \
> > +	lib/dpif-netdev-private-dpif.h \
> >  	lib/dpif-netdev-private-flow.h \
> >  	lib/dpif-netdev-private-hwol.h \
> >  	lib/dpif-netdev-private-thread.h \
> > diff --git a/lib/dpif-netdev-avx512.c b/lib/dpif-netdev-avx512.c
> > new file mode 100644
> > index 000000000..0e55b0be2
> > --- /dev/null
> > +++ b/lib/dpif-netdev-avx512.c
> > @@ -0,0 +1,327 @@
> > +/*
> > + * Copyright (c) 2021 Intel Corporation.
> > + *
> > + * Licensed under the Apache License, Version 2.0 (the "License");
> > + * you may not use this file except in compliance with the License.
> > + * You may obtain a copy of the License at:
> > + *
> > + *     http://www.apache.org/licenses/LICENSE-2.0
> > + *
> > + * Unless required by applicable law or agreed to in writing, software
> > + * distributed under the License is distributed on an "AS IS" BASIS,
> > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> > + * See the License for the specific language governing permissions and
> > + * limitations under the License.
> > + */
> > +
> > +#ifdef __x86_64__
> > +/* Sparse cannot handle the AVX512 instructions. */
> > +#if !defined(__CHECKER__)
> > +
> > +#include <config.h>
> > +
> > +#include "dpif-netdev.h"
> > +#include "dpif-netdev-perf.h"
> > +
> > +#include "dpif-netdev-private.h"
> > +#include "dpif-netdev-private-dpcls.h"
> > +#include "dpif-netdev-private-flow.h"
> > +#include "dpif-netdev-private-thread.h"
> > +#include "dpif-netdev-private-hwol.h"
> > +
> > +#include "dp-packet.h"
> > +#include "netdev.h"
> > +
> > +#include "immintrin.h"
> > +
> > +/* Each AVX512 register (zmm register in assembly notation) can contain up to
> > + * 512 bits, which is equivalent to 8 uint64_t variables. This is the maximum
> > + * number of miniflow blocks that can be processed in a single pass of the
> > + * AVX512 code at a time.
> > + */
> > +#define NUM_U64_IN_ZMM_REG (8)
> > +
> > +/* Structure to contain per-packet metadata that must be attributed to the
> > + * dp netdev flow. This is unfortunate to have to track per packet, however
> > + * it's a bit awkward to maintain them in a performant way. This structure
> > + * helps to keep two variables on a single cache line per packet.
> > + */
> > +struct pkt_flow_meta {
> > +    uint16_t bytes;
> > +    uint16_t tcp_flags;
> > +};
> > +
> > +/* Structure of heap allocated memory for DPIF internals. */
> > +struct dpif_userdata {
> > +    OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
> > +        struct netdev_flow_key keys[NETDEV_MAX_BURST];
> > +    OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
> > +        struct netdev_flow_key *key_ptrs[NETDEV_MAX_BURST];
> > +    OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
> > +        struct pkt_flow_meta pkt_meta[NETDEV_MAX_BURST];
> > +};
> > +
> > +int32_t
> > +dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread *pmd,
> > +                             struct dp_packet_batch *packets,
> > +                             odp_port_t in_port)
> > +{
> > +    /* Allocate DPIF userdata. */
> > +    if (OVS_UNLIKELY(!pmd->netdev_input_func_userdata)) {
> > +        pmd->netdev_input_func_userdata =
> > +                xmalloc_pagealign(sizeof(struct dpif_userdata));
> > +    }
> > +
> > +    struct dpif_userdata *ud = pmd->netdev_input_func_userdata;
> > +    struct netdev_flow_key *keys = ud->keys;
> > +    struct netdev_flow_key **key_ptrs = ud->key_ptrs;
> > +    struct pkt_flow_meta *pkt_meta = ud->pkt_meta;
> > +
> > +    /* The AVX512 DPIF implementation handles rules in a way that is optimized
> > +     * for reducing data-movement between HWOL/EMC/SMC and DPCLS. This is
> > +     * achieved by separating the rule arrays. Bitmasks are kept for each
> > +     * packet, indicating if it matched in the HWOL/EMC/SMC array or DPCLS
> > +     * array. Later the two arrays are merged by AVX-512 expand instructions.
> > +     */
> > +
> > +    /* Stores the computed output: a rule pointer for each packet. */
> > +    /* Used initially for HWOL/EMC/SMC. */
> > +    struct dpcls_rule *rules[NETDEV_MAX_BURST];
> > +    /* Used for DPCLS. */
> > +    struct dpcls_rule *dpcls_rules[NETDEV_MAX_BURST];
> > +
> > +    uint32_t dpcls_key_idx = 0;
> > +
> > +    for (uint32_t i = 0; i < NETDEV_MAX_BURST; i += NUM_U64_IN_ZMM_REG) {
> > +        _mm512_storeu_si512(&rules[i], _mm512_setzero_si512());
> > +        _mm512_storeu_si512(&dpcls_rules[i], _mm512_setzero_si512());
> > +    }
> > +
> > +    /* Prefetch each packet's metadata. */
> > +    const size_t batch_size = dp_packet_batch_size(packets);
> > +    for (int i = 0; i < batch_size; i++) {
> > +        struct dp_packet *packet = packets->packets[i];
> > +        OVS_PREFETCH(dp_packet_data(packet));
> > +        pkt_metadata_prefetch_init(&packet->md);
> > +    }
> > +
> > +    /* Check if EMC or SMC are enabled. */
> > +    struct dfc_cache *cache = &pmd->flow_cache;
> > +    const uint32_t emc_enabled = pmd->ctx.emc_insert_min != 0;
> > +    const uint32_t smc_enabled = pmd->ctx.smc_enable_db;
> > +
> > +    uint32_t emc_hits = 0;
> > +    uint32_t smc_hits = 0;
> > +
> > +    /* A 1 bit in this mask indicates a hit, so no DPCLS lookup on the pkt. */
> > +    uint32_t hwol_emc_smc_hitmask = 0;
> > +    uint32_t smc_hitmask = 0;
> > +
> > +    /* The below while loop is based on the 'iter' variable which has a number
> > +     * of bits set representing packets that we want to process
> > +     * (HWOL->MFEX->EMC->SMC). As each packet is processed, we clear (set to 0)
> > +     * the bit representing that packet using '_blsr_u64()'. The
> > +     * '__builtin_ctz()' will give us the correct index into the 'packets',
> > +     * 'pkt_meta', 'keys' and 'rules' arrays.
> > +     *
> > +     * For one iteration of the while loop, here's some pseudocode as an
> > +     * example where 'iter' is represented in binary:
> > +     *
> > +     * while (iter) { // iter = 1100
> > +     *     uint32_t i = __builtin_ctz(iter); // i = 2
> > +     *     iter = _blsr_u64(iter); // iter = 1000
> > +     *     // do all processing (HWOL->MFEX->EMC->SMC)
> > +     * }
> > +     */
> > +    uint32_t lookup_pkts_bitmask = (1ULL << batch_size) - 1;
> > +    uint32_t iter = lookup_pkts_bitmask;
> > +    while (iter) {
> > +        uint32_t i = __builtin_ctz(iter);
> > +        iter = _blsr_u64(iter);
> > +
> > +        /* Get packet pointer from bitmask and packet md. */
> > +        struct dp_packet *packet = packets->packets[i];
> > +        pkt_metadata_init(&packet->md, in_port);
> > +
> > +        struct dp_netdev_flow *f = NULL;
> > +
> > +        /* Check for partial hardware offload mark. */
> > +        uint32_t mark;
> > +        if (dp_packet_has_flow_mark(packet, &mark)) {
> > +            f = mark_to_flow_find(pmd, mark);
> > +            if (f) {
> > +                rules[i] = &f->cr;
> > +                pkt_meta[i].tcp_flags = parse_tcp_flags(packet);
> > +                pkt_meta[i].bytes = dp_packet_size(packet);
> > +                hwol_emc_smc_hitmask |= (1 << i);
> > +                continue;
> > +            }
> > +        }
> > +
> > +        /* Do miniflow extract into keys. */
> > +        struct netdev_flow_key *key = &keys[i];
> > +        miniflow_extract(packet, &key->mf);
> > +
> > +        /* Cache TCP and byte values for all packets. */
> > +        pkt_meta[i].bytes = dp_packet_size(packet);
> > +        pkt_meta[i].tcp_flags = miniflow_get_tcp_flags(&key->mf);
> > +
> > +        key->len = netdev_flow_key_size(miniflow_n_values(&key->mf));
> > +        key->hash = dpif_netdev_packet_get_rss_hash_orig_pkt(packet, &key->mf);
> > +
> > +        if (emc_enabled) {
> > +            f = emc_lookup(&cache->emc_cache, key);
> > +
> > +            if (f) {
> > +                rules[i] = &f->cr;
> > +                emc_hits++;
> > +                hwol_emc_smc_hitmask |= (1 << i);
> > +                continue;
> > +            }
> > +        }
> > +
> > +        if (smc_enabled && !f) {
> > +            f = smc_lookup_single(pmd, packet, key);
> > +            if (f) {
> > +                rules[i] = &f->cr;
> > +                smc_hits++;
> > +                smc_hitmask |= (1 << i);
> > +                continue;
> > +            }
> > +        }
> > +
> > +        /* The flow pointer was not found in HWOL/EMC/SMC, so add it to the
> > +         * dpcls input keys array for batch lookup later.
> > +         */
> > +        key_ptrs[dpcls_key_idx] = &keys[i];
> > +        dpcls_key_idx++;
> > +    }
> > +
> > +    hwol_emc_smc_hitmask |= smc_hitmask;
> > +
> > +    /* DPCLS handles any packets missed by HWOL/EMC/SMC. It operates on the
> > +     * key_ptrs[] for input miniflows to match, storing results in the
> > +     * dpcls_rules[] array.
> > +     */
> > +    if (dpcls_key_idx > 0) {
> > +        struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
> > +        if (OVS_UNLIKELY(!cls)) {
> > +            return -1;
> > +        }
> > +        bool any_miss =
> > +            !dpcls_lookup(cls, (const struct netdev_flow_key **) key_ptrs,
> > +                          dpcls_rules, dpcls_key_idx, NULL);
> > +        if (OVS_UNLIKELY(any_miss)) {
> > +            return -1;
> > +        }
> > +
> > +        /* Merge DPCLS rules and HWOL/EMC/SMC rules. */
> > +        uint32_t dpcls_idx = 0;
> > +        for (int i = 0; i < NETDEV_MAX_BURST; i += NUM_U64_IN_ZMM_REG) {
> > +            /* Indexing here is somewhat complicated due to DPCLS output rule
> > +             * load index depending on the hitmask of HWOL/EMC/SMC. More
> > +             * packets from HWOL/EMC/SMC bitmask means less DPCLS rules are
> > +             * used.
> > +             */
> > +            __m512i v_cache_rules = _mm512_loadu_si512(&rules[i]);
> > +            __m512i v_merged_rules =
> > +                        _mm512_mask_expandloadu_epi64(v_cache_rules,
> > +                                                      ~hwol_emc_smc_hitmask,
> > +                                                      &dpcls_rules[dpcls_idx]);
> > +            _mm512_storeu_si512(&rules[i], v_merged_rules);
> > +
> > +            /* Update DPCLS load index and bitmask for HWOL/EMC/SMC hits.
> > +             * There are NUM_U64_IN_ZMM_REG output pointers per register,
> > +             * subtract the HWOL/EMC/SMC lanes equals the number of DPCLS rules
> > +             * consumed.
> > +             */
> > +            uint32_t hitmask_FF = (hwol_emc_smc_hitmask & 0xFF);
> > +            dpcls_idx += NUM_U64_IN_ZMM_REG - __builtin_popcountll(hitmask_FF);
> > +            hwol_emc_smc_hitmask =
> > +                (hwol_emc_smc_hitmask >> NUM_U64_IN_ZMM_REG);
> > +        }
> > +    }
> > +
> > +    /* At this point we have a 1:1 pkt to rules mapping, so update EMC/SMC
> > +     * if required.
> > +     */
> > +    /* Insert SMC and DPCLS hits into EMC. */
> > +    /* Insert DPCLS hits into SMC. */
> > +    if (emc_enabled) {
> > +        uint32_t emc_insert_mask = smc_hitmask | ~hwol_emc_smc_hitmask;
> > +        emc_insert_mask &= lookup_pkts_bitmask;
> > +        emc_probabilistic_insert_batch(pmd, keys, &rules[0], emc_insert_mask);
> > +    }
> > +    if (smc_enabled) {
> > +        uint32_t smc_insert_mask = ~hwol_emc_smc_hitmask;
> > +        smc_insert_mask &= lookup_pkts_bitmask;
> > +        smc_insert_batch(pmd, keys, &rules[0], smc_insert_mask);
> > +    }
> > +
> > +    /* At this point we don't return error anymore, so commit stats here. */
> > +    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_RECV, batch_size);
> > +    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_EXACT_HIT, emc_hits);
> > +    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SMC_HIT, smc_hits);
> > +    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_HIT,
> > +                            dpcls_key_idx);
> > +    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_LOOKUP,
> > +                            dpcls_key_idx);
> > +
> > +    /* Initialize the "Action Batch" for each flow handled below. */
> > +    struct dp_packet_batch action_batch;
> > +    action_batch.trunc = 0;
> > +
> > +    while (lookup_pkts_bitmask) {
> > +        uint32_t rule_pkt_idx = __builtin_ctz(lookup_pkts_bitmask);
> > +        uint64_t needle = (uintptr_t) rules[rule_pkt_idx];
> > +
> > +        /* Parallel compare NUM_U64_IN_ZMM_REG flow* 's to the needle, create a
> > +         * bitmask.
> > +         */
> > +        uint32_t batch_bitmask = 0;
> > +        for (uint32_t j = 0; j < NETDEV_MAX_BURST; j += NUM_U64_IN_ZMM_REG) {
> > +            /* Pre-calculate store addr. */
> > +            uint32_t num_pkts_in_batch = __builtin_popcountll(batch_bitmask);
> > +            void *store_addr = &action_batch.packets[num_pkts_in_batch];
> > +
> > +            /* Search for identical flow* in burst, update bitmask. */
> > +            __m512i v_needle = _mm512_set1_epi64(needle);
> > +            __m512i v_hay = _mm512_loadu_si512(&rules[j]);
> > +            __mmask8 k_cmp_bits = _mm512_cmpeq_epi64_mask(v_needle, v_hay);
> > +            uint32_t cmp_bits = k_cmp_bits;
> > +            batch_bitmask |= cmp_bits << j;
> > +
> > +            /* Compress and store the batched packets. */
> > +            struct dp_packet **packets_ptrs = &packets->packets[j];
> > +            __m512i v_pkt_ptrs = _mm512_loadu_si512(packets_ptrs);
> > +            _mm512_mask_compressstoreu_epi64(store_addr, cmp_bits, v_pkt_ptrs);
> > +        }
> > +
> > +        /* Strip all packets in this batch from the lookup_pkts_bitmask. */
> > +        lookup_pkts_bitmask &= (~batch_bitmask);
> > +        action_batch.count = __builtin_popcountll(batch_bitmask);
> > +
> > +        /* Loop over all packets in this batch, to gather the byte and tcp_flag
> > +         * values, and pass them to the execute function. It would be nice to
> > +         * optimize this away, however it is not easy to refactor in dpif.
> > +         */
> > +        uint32_t bytes = 0;
> > +        uint16_t tcp_flags = 0;
> > +        uint32_t bitmask_iter = batch_bitmask;
> > +        for (int i = 0; i < action_batch.count; i++) {
> > +            uint32_t idx = __builtin_ctzll(bitmask_iter);
> > +            bitmask_iter = _blsr_u64(bitmask_iter);
> > +
> > +            bytes += pkt_meta[idx].bytes;
> > +            tcp_flags |= pkt_meta[idx].tcp_flags;
> > +        }
> > +
> > +        dp_netdev_batch_execute(pmd, &action_batch, rules[rule_pkt_idx],
> > +                                bytes, tcp_flags);
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > +#endif
> > +#endif
> > diff --git a/lib/dpif-netdev-private-dfc.h b/lib/dpif-netdev-private-dfc.h
> > index 6a672d1b3..d5d4da7ea 100644
> > --- a/lib/dpif-netdev-private-dfc.h
> > +++ b/lib/dpif-netdev-private-dfc.h
> > @@ -81,6 +81,14 @@ extern "C" {
> >  #define DEFAULT_EM_FLOW_INSERT_MIN (UINT32_MAX /                     \
> >                                      DEFAULT_EM_FLOW_INSERT_INV_PROB)
> >
> > +/* Forward declaration for SMC function prototype that requires access to
> > + * 'struct dp_netdev_pmd_thread'. */
> > +struct dp_netdev_pmd_thread;
> > +
> > +/* Forward declaration for EMC and SMC batch insert function prototypes that
> > + * require access to 'struct dpcls_rule'. */
> > +struct dpcls_rule;
> > +
> >  struct emc_entry {
> >      struct dp_netdev_flow *flow;
> >      struct netdev_flow_key key;   /* key.hash used for emc hash value. */
> > @@ -168,6 +176,23 @@ emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key)
> >      return NULL;
> >  }
> >
> > +/* Insert a batch of keys/flows into the EMC and SMC caches. */
> > +void
> > +emc_probabilistic_insert_batch(struct dp_netdev_pmd_thread *pmd,
> > +                               const struct netdev_flow_key *keys,
> > +                               struct dpcls_rule **rules,
> > +                               uint32_t emc_insert_mask);
> > +
> > +void
> > +smc_insert_batch(struct dp_netdev_pmd_thread *pmd,
> > +                               const struct netdev_flow_key *keys,
> > +                               struct dpcls_rule **rules,
> > +                               uint32_t smc_insert_mask);
> > +
> > +struct dp_netdev_flow *
> > +smc_lookup_single(struct dp_netdev_pmd_thread *pmd,
> > +                  struct dp_packet *packet,
> > +                  struct netdev_flow_key *key);
> >
> >  #ifdef  __cplusplus
> >  }
> > diff --git a/lib/dpif-netdev-private-dpif.h b/lib/dpif-netdev-private-dpif.h
> > new file mode 100644
> > index 000000000..2fd7cc400
> > --- /dev/null
> > +++ b/lib/dpif-netdev-private-dpif.h
> > @@ -0,0 +1,32 @@
> > +/*
> > + * Copyright (c) 2021 Intel Corporation.
> > + *
> > + * Licensed under the Apache License, Version 2.0 (the "License");
> > + * you may not use this file except in compliance with the License.
> > + * You may obtain a copy of the License at:
> > + *
> > + *     http://www.apache.org/licenses/LICENSE-2.0
> > + *
> > + * Unless required by applicable law or agreed to in writing, software
> > + * distributed under the License is distributed on an "AS IS" BASIS,
> > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> > + * See the License for the specific language governing permissions and
> > + * limitations under the License.
> > + */
> > +
> > +#ifndef DPIF_NETDEV_PRIVATE_DPIF_H
> > +#define DPIF_NETDEV_PRIVATE_DPIF_H 1
> > +
> > +#include "openvswitch/types.h"
> > +
> > +/* Forward declarations to avoid including files. */
> > +struct dp_netdev_pmd_thread;
> > +struct dp_packet_batch;
> > +
> > +/* Available implementations for dpif work. */
> > +int32_t
> > +dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread *pmd,
> > +                             struct dp_packet_batch *packets,
> > +                             odp_port_t in_port);
> > +
> > +#endif /* netdev-private.h */
> > diff --git a/lib/dpif-netdev-private-thread.h b/lib/dpif-netdev-private-thread.h
> > index 0d674ab83..17356d5e2 100644
> > --- a/lib/dpif-netdev-private-thread.h
> > +++ b/lib/dpif-netdev-private-thread.h
> > @@ -45,14 +45,19 @@ struct dp_netdev_pmd_thread_ctx {
> >      struct dp_netdev_rxq *last_rxq;
> >      /* EMC insertion probability context for the current processing cycle. */
> >      uint32_t emc_insert_min;
> > +    /* Enable the SMC cache from ovsdb config. */
> > +    bool smc_enable_db;
> >  };
> >
> >  /* Forward declaration for typedef. */
> >  struct dp_netdev_pmd_thread;
> >
> > -typedef void (*dp_netdev_input_func)(struct dp_netdev_pmd_thread *pmd,
> > -                                     struct dp_packet_batch *packets,
> > -                                     odp_port_t port_no);
> > +/* Typedef for DPIF functions.
> > + * Returns a bitmask of packets to handle, possibly including upcall/misses.
> > + */
> > +typedef int32_t (*dp_netdev_input_func)(struct dp_netdev_pmd_thread *pmd,
> > +                                        struct dp_packet_batch *packets,
> > +                                        odp_port_t port_no);
> >
> >  /* PMD: Poll modes drivers.  PMD accesses devices via polling to eliminate
> >   * the performance overhead of interrupt processing.  Therefore netdev can
> > diff --git a/lib/dpif-netdev-private.h b/lib/dpif-netdev-private.h
> > index d7b6fd7ec..0315b5bf6 100644
> > --- a/lib/dpif-netdev-private.h
> > +++ b/lib/dpif-netdev-private.h
> > @@ -31,4 +31,29 @@
> >  #include "dpif-netdev-private-dfc.h"
> >  #include "dpif-netdev-private-thread.h"
> >
> > +/* Allow other implementations to lookup the DPCLS instances. */
> > +struct dpcls *
> > +dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
> > +                           odp_port_t in_port);
> > +
> > +/* Allow other implementations to call dpcls_lookup() for subtable search. */
> > +bool
> > +dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key *keys[],
> > +             struct dpcls_rule **rules, const size_t cnt,
> > +             int *num_lookups_p);
> > +
> > +/* Allow other implementations to execute actions on a batch. */
> > +void
> > +dp_netdev_batch_execute(struct dp_netdev_pmd_thread *pmd,
> > +                        struct dp_packet_batch *packets,
> > +                        struct dpcls_rule *rule,
> > +                        uint32_t bytes,
> > +                        uint16_t tcp_flags);
> > +
> > +/* Available implementations for dpif work. */
> > +int32_t
> > +dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread *pmd,
> > +                             struct dp_packet_batch *packets,
> > +                             odp_port_t in_port);
> > +
> >  #endif /* netdev-private.h */
> > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> > index e6486417e..1f15af882 100644
> > --- a/lib/dpif-netdev.c
> > +++ b/lib/dpif-netdev.c
> > @@ -183,10 +183,6 @@ static uint32_t dpcls_subtable_lookup_reprobe(struct dpcls *cls);
> >  static void dpcls_insert(struct dpcls *, struct dpcls_rule *,
> >                           const struct netdev_flow_key *mask);
> >  static void dpcls_remove(struct dpcls *, struct dpcls_rule *);
> > -static bool dpcls_lookup(struct dpcls *cls,
> > -                         const struct netdev_flow_key *keys[],
> > -                         struct dpcls_rule **rules, size_t cnt,
> > -                         int *num_lookups_p);
> >
> >  /* Set of supported meter flags */
> >  #define DP_SUPPORTED_METER_FLAGS_MASK \
> > @@ -483,7 +479,7 @@ static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
> >                                        const struct flow *flow,
> >                                        const struct nlattr *actions,
> >                                        size_t actions_len);
> > -static void dp_netdev_input(struct dp_netdev_pmd_thread *,
> > +static int32_t dp_netdev_input(struct dp_netdev_pmd_thread *,
> >                              struct dp_packet_batch *, odp_port_t port_no);
> >  static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
> >                                    struct dp_packet_batch *);
> > @@ -555,7 +551,7 @@ dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
> >                                 bool purge);
> >  static int dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
> >                                        struct tx_port *tx);
> > -static inline struct dpcls *
> > +inline struct dpcls *
> >  dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
> >                             odp_port_t in_port);
> >
> > @@ -1920,7 +1916,7 @@ void dp_netdev_flow_unref(struct dp_netdev_flow *flow)
> >      }
> >  }
> >
> > -static inline struct dpcls *
> > +inline struct dpcls *
> >  dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
> >                             odp_port_t in_port)
> >  {
> > @@ -2714,13 +2710,46 @@ smc_insert(struct dp_netdev_pmd_thread *pmd,
> >      bucket->flow_idx[i] = index;
> >  }
> >
> > +inline void
> > +emc_probabilistic_insert_batch(struct dp_netdev_pmd_thread *pmd,
> > +                               const struct netdev_flow_key *keys,
> > +                               struct dpcls_rule **rules,
> > +                               uint32_t emc_insert_mask)
> > +{
> > +    while (emc_insert_mask) {
> > +        uint32_t i = __builtin_ctz(emc_insert_mask);
> 
> I got an error on Windows:
> 
> [...]
> libtool: compile:  build-aux/cccl -DHAVE_CONFIG_H -I. -I ./include/windows -I ./datapath-
> windows/include -Ic:/PTHREADS-BUILT//include -O2 -I ./include -I ./include -I ./lib -I ./lib -
> IC:/OpenSSL-Win64/include -Wstrict-prototypes -Wall -Wextra -Wno-sign-compare -Wpointer-arith -Wformat
> -Wformat-security -Wswitch-enum -Wunused-parameter -Wbad-function-cast -Wcast-align -Wstrict-
> prototypes -Wold-style-definition -Wmissing-prototypes -Wmissing-field-initializers -Wthread-safety -
> fno-strict-aliasing -Wswitch-bool -Wlogical-not-parentheses -Wsizeof-array-argument -Wbool-compare -
> Wshift-negative-value -Wduplicated-cond -Qunused-arguments -Wshadow -Wmultistatement-macros -Wcast-
> align=strict -Wno-null-pointer-arithmetic -Warray-bounds-pointer-arithmetic -g -DHAVE_AVX512F -c
> lib/dpif-netdev.c
> libtool: compile: mv -f "dpif-netdev-lookup-autovalidator.obj" "lib/dpif-netdev-lookup-
> autovalidator.obj"
> c:\PTHREADS-BUILT\include\_ptw32.h(120): warning C4005: 'HAVE_STRUCT_TIMESPEC': macro redefinition
> c:\openvswitch_compile\config.h(207): note: see previous definition of 'HAVE_STRUCT_TIMESPEC'
> c:\openvswitch_compile\lib\ovs-rcu.h(215): warning C4311: 'type cast': pointer truncation from 'void
> *' to 'long'
> libtool: compile: mv -f "dpif-netdev-lookup-generic.obj" "lib/dpif-netdev-lookup-generic.obj"
> dpif-netdev.c
> \
> 	source='lib/dpif-netdev-private-dfc.c' object='lib/dpif-netdev-private-dfc.lo' libtool=yes \
> 	DEPDIR=.deps depmode=none /bin/sh ./build-aux/depcomp \
> 	/bin/sh ./libtool  --tag=CC   --mode=compile build-aux/cccl -DHAVE_CONFIG_H -I.   -I
> ./include/windows -I ./datapath-windows/include -Ic:/PTHREADS-BUILT//include -O2 -I ./include -I
> ./include -I ./lib -I ./lib -IC:/OpenSSL-Win64/include   -Wstrict-prototypes -Wall -Wextra -Wno-sign-
> compare -Wpointer-arith -Wformat -Wformat-security -Wswitch-enum -Wunused-parameter -Wbad-function-
> cast -Wcast-align -Wstrict-prototypes -Wold-style-definition -Wmissing-prototypes -Wmissing-field-
> initializers -Wthread-safety -fno-strict-aliasing -Wswitch-bool -Wlogical-not-parentheses -Wsizeof-
> array-argument -Wbool-compare -Wshift-negative-value -Wduplicated-cond -Qunused-arguments -Wshadow -
> Wmultistatement-macros -Wcast-align=strict -Wno-null-pointer-arithmetic -Warray-bounds-pointer-
> arithmetic    -g -DHAVE_AVX512F -c -o lib/dpif-netdev-private-dfc.lo lib/dpif-netdev-private-dfc.c
> c:\PTHREADS-BUILT\include\_ptw32.h(120): warning C4005: 'HAVE_STRUCT_TIMESPEC': macro redefinition
> c:\openvswitch_compile\config.h(207): note: see previous definition of 'HAVE_STRUCT_TIMESPEC'
> c:\openvswitch_compile\lib\ovs-rcu.h(215): warning C4311: 'type cast': pointer truncation from 'void
> *' to 'long'
> c:\openvswitch_compile\config.h(207): warning C4005: 'HAVE_STRUCT_TIMESPEC': macro redefinition
> c:\PTHREADS-BUILT\include\_ptw32.h(120): note: see previous definition of 'HAVE_STRUCT_TIMESPEC'
> lib/dpif-netdev.c(2826): error C4013: '__builtin_ctz' undefined; assuming extern returning int
> lib/dpif-netdev.c(2919): warning C4311: 'type cast': pointer truncation from 'const char *const ' to
> 'long'
> \
> 	source='lib/dpif-netdev-private-dpif.c' object='lib/dpif-netdev-private-dpif.lo' libtool=yes \
> 	DEPDIR=.deps depmode=none /bin/sh ./build-aux/depcomp \
> 	/bin/sh ./libtool  --tag=CC   --mode=compile build-aux/cccl -DHAVE_CONFIG_H -I.   -I
> ./include/windows -I ./datapath-windows/include -Ic:/PTHREADS-BUILT//include -O2 -I ./include -I
> ./include -I ./lib -I ./lib -IC:/OpenSSL-Win64/include   -Wstrict-prototypes -Wall -Wextra -Wno-sign-
> compare -Wpointer-arith -Wformat -Wformat-security -Wswitch-enum -Wunused-parameter -Wbad-function-
> cast -Wcast-align -Wstrict-prototypes -Wold-style-definition -Wmissing-prototypes -Wmissing-field-
> initializers -Wthread-safety -fno-strict-aliasing -Wswitch-bool -Wlogical-not-parentheses -Wsizeof-
> array-argument -Wbool-compare -Wshift-negative-value -Wduplicated-cond -Qunused-arguments -Wshadow -
> Wmultistatement-macros -Wcast-align=strict -Wno-null-pointer-arithmetic -Warray-bounds-pointer-
> arithmetic    -g -DHAVE_AVX512F -c -o lib/dpif-netdev-private-dpif.lo lib/dpif-netdev-private-dpif.c
> make[2]: *** [lib/dpif-netdev.lo] Error 1
> make[2]: *** Waiting for unfinished jobs....
> 

Thanks for testing and finding this. We don't have the Windows machines set up to test this.

We need to use OVS's raw_ctz(), which wraps the uses of __builtin_ctz and __builtin_ctzll in a portable way. This should fix the above error. I'll fix this in the next revision.

> Thanks,
> fbl
> 
> 
> > +        emc_insert_mask &= emc_insert_mask - 1;
> > +        /* Get the required parameters for EMC/SMC from the rule. */
> > +        struct dp_netdev_flow *flow = dp_netdev_flow_cast(rules[i]);
> > +        /* Insert the key into EMC/SMC. */
> > +        emc_probabilistic_insert(pmd, &keys[i], flow);
> > +    }
> > +}
> > +
> > +inline void
> > +smc_insert_batch(struct dp_netdev_pmd_thread *pmd,
> > +                 const struct netdev_flow_key *keys,
> > +                 struct dpcls_rule **rules,
> > +                 uint32_t smc_insert_mask)
> > +{
> > +    while (smc_insert_mask) {
> > +        uint32_t i = __builtin_ctz(smc_insert_mask);
> > +        smc_insert_mask &= smc_insert_mask - 1;
> > +        /* Get the require parameters for EMC/SMC from the rule */
> > +        struct dp_netdev_flow *flow = dp_netdev_flow_cast(rules[i]);
> > +        uint32_t hash = dp_netdev_flow_hash(&flow->ufid);
> > +        /* Insert the key into EMC/SMC. */
> > +        smc_insert(pmd, &keys[i], hash);
> > +    }
> > +}
> > +
> >  static struct dp_netdev_flow *
> >  dp_netdev_pmd_lookup_flow(struct dp_netdev_pmd_thread *pmd,
> >                            const struct netdev_flow_key *key,
> >                            int *lookup_num_p)
> >  {
> >      struct dpcls *cls;
> > -    struct dpcls_rule *rule;
> > +    struct dpcls_rule *rule = NULL;
> >      odp_port_t in_port = u32_to_odp(MINIFLOW_GET_U32(&key->mf,
> >                                                       in_port.odp_port));
> >      struct dp_netdev_flow *netdev_flow = NULL;
> > @@ -4233,7 +4262,10 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
> >          }
> >
> >          /* Process packet batch. */
> > -        pmd->netdev_input_func(pmd, &batch, port_no);
> > +        int32_t ret = pmd->netdev_input_func(pmd, &batch, port_no);
> > +        if (ret) {
> > +            dp_netdev_input(pmd, &batch, port_no);
> > +        }
> >
> >          /* Assign processing cycles to rx queue. */
> >          cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
> > @@ -5251,6 +5283,8 @@ dpif_netdev_run(struct dpif *dpif)
> >                      non_pmd->ctx.emc_insert_min = 0;
> >                  }
> >
> > +                non_pmd->ctx.smc_enable_db = dp->smc_enable_db;
> > +
> >                  for (i = 0; i < port->n_rxq; i++) {
> >
> >                      if (!netdev_rxq_enabled(port->rxqs[i].rx)) {
> > @@ -5522,6 +5556,8 @@ reload:
> >                  pmd->ctx.emc_insert_min = 0;
> >              }
> >
> > +            pmd->ctx.smc_enable_db = pmd->dp->smc_enable_db;
> > +
> >              process_packets =
> >                  dp_netdev_process_rxq_port(pmd, poll_list[i].rxq,
> >                                             poll_list[i].port_no);
> > @@ -6415,6 +6451,24 @@ packet_batch_per_flow_execute(struct packet_batch_per_flow *batch,
> >                                actions->actions, actions->size);
> >  }
> >
> > +void
> > +dp_netdev_batch_execute(struct dp_netdev_pmd_thread *pmd,
> > +                        struct dp_packet_batch *packets,
> > +                        struct dpcls_rule *rule,
> > +                        uint32_t bytes,
> > +                        uint16_t tcp_flags)
> > +{
> > +    /* Gets action* from the rule. */
> > +    struct dp_netdev_flow *flow = dp_netdev_flow_cast(rule);
> > +    struct dp_netdev_actions *actions = dp_netdev_flow_get_actions(flow);
> > +
> > +    dp_netdev_flow_used(flow, dp_packet_batch_size(packets), bytes,
> > +                        tcp_flags, pmd->ctx.now / 1000);
> > +    const uint32_t steal = 1;
> > +    dp_netdev_execute_actions(pmd, packets, steal, &flow->flow,
> > +                              actions->actions, actions->size);
> > +}
> > +
> >  static inline void
> >  dp_netdev_queue_batches(struct dp_packet *pkt,
> >                          struct dp_netdev_flow *flow, uint16_t tcp_flags,
> > @@ -6519,6 +6573,30 @@ smc_lookup_batch(struct dp_netdev_pmd_thread *pmd,
> >      pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SMC_HIT, n_smc_hit);
> >  }
> >
> > +struct dp_netdev_flow *
> > +smc_lookup_single(struct dp_netdev_pmd_thread *pmd,
> > +                  struct dp_packet *packet,
> > +                  struct netdev_flow_key *key)
> > +{
> > +    const struct cmap_node *flow_node = smc_entry_get(pmd, key->hash);
> > +
> > +    if (OVS_LIKELY(flow_node != NULL)) {
> > +        struct dp_netdev_flow *flow = NULL;
> > +
> > +        CMAP_NODE_FOR_EACH (flow, node, flow_node) {
> > +            /* Since we dont have per-port megaflow to check the port
> > +             * number, we need to verify that the input ports match. */
> > +            if (OVS_LIKELY(dpcls_rule_matches_key(&flow->cr, key) &&
> > +                flow->flow.in_port.odp_port == packet->md.in_port.odp_port)) {
> > +
> > +                return (void *) flow;
> > +            }
> > +        }
> > +    }
> > +
> > +    return NULL;
> > +}
> > +
> >  /* Try to process all ('cnt') the 'packets' using only the datapath flow cache
> >   * 'pmd->flow_cache'. If a flow is not found for a packet 'packets[i]', the
> >   * miniflow is copied into 'keys' and the packet pointer is moved at the
> > @@ -6924,12 +7002,13 @@ dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
> >      }
> >  }
> >
> > -static void
> > +static int32_t
> >  dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
> >                  struct dp_packet_batch *packets,
> >                  odp_port_t port_no)
> >  {
> >      dp_netdev_input__(pmd, packets, false, port_no);
> > +    return 0;
> >  }
> >
> >  static void
> > @@ -8369,7 +8448,7 @@ dpcls_flow_key_gen_masks(const struct netdev_flow_key *tbl,
> >
> >  /* Returns true if 'target' satisfies 'key' in 'mask', that is, if each 1-bit
> >   * in 'mask' the values in 'key' and 'target' are the same. */
> > -bool
> > +inline bool ALWAYS_INLINE
> >  dpcls_rule_matches_key(const struct dpcls_rule *rule,
> >                         const struct netdev_flow_key *target)
> >  {
> > @@ -8395,7 +8474,7 @@ dpcls_rule_matches_key(const struct dpcls_rule *rule,
> >   * priorities, instead returning any rule which matches the flow.
> >   *
> >   * Returns true if all miniflows found a corresponding rule. */
> > -static bool
> > +bool
> >  dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key *keys[],
> >               struct dpcls_rule **rules, const size_t cnt,
> >               int *num_lookups_p)
> > --
> > 2.32.0
> >
> > _______________________________________________
> > dev mailing list
> > dev at openvswitch.org
> > https://mail.openvswitch.org/mailman/listinfo/ovs-dev
> 
> --
> fbl


More information about the dev mailing list