[ovs-dev] [v4 01/12] dpif-netdev: Add command line and function pointer for miniflow extract

Flavio Leitner fbl at sysclose.org
Mon Jun 28 02:49:51 UTC 2021


Hi,

I am reviewing this patch ignoring the things that were
already pointed out in other reviews in the ML.

On Thu, Jun 17, 2021 at 09:57:43PM +0530, Kumar Amber wrote:
> This patch introduces the mfex function pointers which allows
> the user to switch between different miniflow extract implementations
> which are provided by the OVS based on optimized ISA CPU.
> 
> The user can query for the available minflow extract variants available
> for that CPU by following commands:
> 
> $ovs-appctl dpif-netdev/miniflow-parser-get
> 
> Similarly an user can set the miniflow implementation by the following
> command :
> 
> $ ovs-appctl dpif-netdev/miniflow-parser-set name
> 
> This allow for more performance and flexibility to the user to choose
> the miniflow implementation according to the needs.
> 
> Signed-off-by: Kumar Amber <kumar.amber at intel.com>
> Co-authored-by: Harry van Haaren <harry.van.haaren at intel.com>
> Signed-off-by: Harry van Haaren <harry.van.haaren at intel.com>
> ---
>  lib/automake.mk                   |   2 +
>  lib/dpif-netdev-avx512.c          |  32 ++++++--
>  lib/dpif-netdev-private-extract.c |  86 ++++++++++++++++++++
>  lib/dpif-netdev-private-extract.h |  94 ++++++++++++++++++++++
>  lib/dpif-netdev-private-thread.h  |   4 +
>  lib/dpif-netdev.c                 | 126 +++++++++++++++++++++++++++++-
>  6 files changed, 337 insertions(+), 7 deletions(-)
>  create mode 100644 lib/dpif-netdev-private-extract.c
>  create mode 100644 lib/dpif-netdev-private-extract.h
> 
> diff --git a/lib/automake.mk b/lib/automake.mk
> index 49f42c2a3..6657b9ae5 100644
> --- a/lib/automake.mk
> +++ b/lib/automake.mk
> @@ -118,6 +118,8 @@ lib_libopenvswitch_la_SOURCES = \
>  	lib/dpif-netdev-private-dpcls.h \
>  	lib/dpif-netdev-private-dpif.c \
>  	lib/dpif-netdev-private-dpif.h \
> +	lib/dpif-netdev-private-extract.c \
> +	lib/dpif-netdev-private-extract.h \
>  	lib/dpif-netdev-private-flow.h \
>  	lib/dpif-netdev-private-hwol.h \
>  	lib/dpif-netdev-private-thread.h \
> diff --git a/lib/dpif-netdev-avx512.c b/lib/dpif-netdev-avx512.c
> index f9b199637..bb99b23ff 100644
> --- a/lib/dpif-netdev-avx512.c
> +++ b/lib/dpif-netdev-avx512.c
> @@ -148,6 +148,15 @@ dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread *pmd,
>       *     // do all processing (HWOL->MFEX->EMC->SMC)
>       * }
>       */
> +
> +    /* Do a batch minfilow extract into keys. */
> +    uint32_t mf_mask = 0;
> +    if (pmd->miniflow_extract_opt) {
> +        mf_mask = pmd->miniflow_extract_opt(packets, keys,
> +                                            batch_size, in_port,
> +                                            (void *) pmd);
> +    }
> +    /* Perform first packet interation */
>      uint32_t lookup_pkts_bitmask = (1ULL << batch_size) - 1;
>      uint32_t iter = lookup_pkts_bitmask;
>      while (iter) {
> @@ -159,6 +168,12 @@ dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread *pmd,
>          pkt_metadata_init(&packet->md, in_port);
>  
>          struct dp_netdev_flow *f = NULL;
> +        struct netdev_flow_key *key = &keys[i];
> +
> +        /* Check the minfiflow mask to see if the packet was correctly
> +        * classifed by vector mfex else do a scalar miniflow extract
> +        * for that packet. */
> +        uint32_t mfex_hit = (mf_mask & (1 << i));

This is actually a bool.

>  
>          /* Check for partial hardware offload mark. */
>          uint32_t mark;
> @@ -166,7 +181,13 @@ dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread *pmd,
>              f = mark_to_flow_find(pmd, mark);
>              if (f) {
>                  rules[i] = &f->cr;
> -                pkt_meta[i].tcp_flags = parse_tcp_flags(packet);
> +                /* If AVX512 MFEX already classified the packet, use it. */
> +                if (mfex_hit) {
> +                    pkt_meta[i].tcp_flags = miniflow_get_tcp_flags(&key->mf);
> +                } else {
> +                    pkt_meta[i].tcp_flags = parse_tcp_flags(packet);
> +                }
> +
>                  pkt_meta[i].bytes = dp_packet_size(packet);
>                  phwol_hits++;
>                  hwol_emc_smc_hitmask |= (1 << i);
> @@ -174,11 +195,12 @@ dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread *pmd,
>              }
>          }
>  
> -        /* Do miniflow extract into keys. */
> -        struct netdev_flow_key *key = &keys[i];
> -        miniflow_extract(packet, &key->mf);
> +        if (!mfex_hit) {
> +            /* Do a scalar miniflow extract into keys */
> +            miniflow_extract(packet, &key->mf);
> +        }
>  
> -        /* Cache TCP and byte values for all packets. */
> +        /* Cache TCP and byte values for all packets */
>          pkt_meta[i].bytes = dp_packet_size(packet);
>          pkt_meta[i].tcp_flags = miniflow_get_tcp_flags(&key->mf);
>  
> diff --git a/lib/dpif-netdev-private-extract.c b/lib/dpif-netdev-private-extract.c
> new file mode 100644
> index 000000000..fcc56ef26
> --- /dev/null
> +++ b/lib/dpif-netdev-private-extract.c
> @@ -0,0 +1,86 @@
> +/*
> + * Copyright (c) 2021 Intel.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#include <config.h>
> +#include <errno.h>
> +#include <stdint.h>
> +#include <string.h>
> +
> +#include "dp-packet.h"
> +#include "dpif-netdev-private-dpcls.h"
> +#include "dpif-netdev-private-extract.h"
> +#include "dpif-netdev-private-thread.h"
> +#include "flow.h"
> +#include "openvswitch/vlog.h"
> +#include "ovs-thread.h"
> +#include "util.h"
> +
> +VLOG_DEFINE_THIS_MODULE(dpif_netdev_extract);
> +
> +/* Implementations of available extract options. */
> +static struct dpif_miniflow_extract_impl mfex_impls[] = {
> +    {
> +        .probe = NULL,
> +        .extract_func = NULL,
> +        .name = "disable",
> +    },

I find 'ovs-appctl dpif-netdev/miniflow-parser-set disable'
misleading because we are not disabling miniflow parser at all.
It is just using scalar function instead.


> +};
> +
> +BUILD_ASSERT_DECL(MFEX_IMPLS_MAX_SIZE > ARRAY_SIZE(mfex_impls));
> +
> +int32_t
> +dpif_miniflow_extract_opt_get(const char *name,
> +                              struct dpif_miniflow_extract_impl **opt)
> +{
> +    ovs_assert(opt);
> +    ovs_assert(name);
> +
> +    uint32_t i;
> +    for (i = 0; i < ARRAY_SIZE(mfex_impls); i++) {
> +        if (strcmp(name, mfex_impls[i].name) == 0) {
> +            *opt = &mfex_impls[i];
> +            return 0;
> +        }
> +    }
> +    return -ENOTSUP;
> +}
> +
> +void
> +dpif_miniflow_extract_init(void)
> +{
> +    /* Call probe on each impl, and cache the result. */
> +    uint32_t i;
> +    for (i = 0; i < ARRAY_SIZE(mfex_impls); i++) {
> +        int avail = 1;

Can this be a bool instead?

> +        if (mfex_impls[i].probe) {
> +            /* Return zero is success, non-zero means error. */
> +            avail = (mfex_impls[i].probe() == 0);
> +        }
> +        VLOG_INFO("Miniflow Extract implementation %s (available: %s)\n",
> +                  mfex_impls[i].name, avail ? "True" : "False");
> +        mfex_impls[i].available = avail;
> +    }
> +}
> +
> +int32_t
> +dpif_miniflow_extract_info_get(struct dpif_miniflow_extract_impl **out_ptr)
> +{
> +    if (out_ptr == NULL) {
> +        return -EINVAL;
> +    }

missing space.

> +    *out_ptr = mfex_impls;
> +    return ARRAY_SIZE(mfex_impls);
> +}
> diff --git a/lib/dpif-netdev-private-extract.h b/lib/dpif-netdev-private-extract.h
> new file mode 100644
> index 000000000..b7b0b2be4
> --- /dev/null
> +++ b/lib/dpif-netdev-private-extract.h
> @@ -0,0 +1,94 @@
> +/*
> + * Copyright (c) 2021 Intel.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#ifndef DPIF_NETDEV_AVX512_EXTRACT
> +#define DPIF_NETDEV_AVX512_EXTRACT 1
> +
> +#include <sys/types.h>
> +
> +#include "openvswitch/types.h"
> +
> +/* Max size of dpif_miniflow_extract_impl array. */
> +#define MFEX_IMPLS_MAX_SIZE (16)
> +
> +/* Forward declarations. */
> +struct dp_packet;
> +struct miniflow;
> +struct dp_netdev_pmd_thread;
> +struct dp_packet_batch;
> +struct netdev_flow_key;
> +
> +/* Function pointer prototype to be implemented in the optimized miniflow
> + * extract code.
> + * returns the hitmask of the processed packets on success.
> + * returns zero on failure.
> + */
> +typedef uint32_t (*miniflow_extract_func)(struct dp_packet_batch *batch,
> +                                          struct netdev_flow_key *keys,
> +                                          uint32_t keys_size,
> +                                          odp_port_t in_port,
> +                                          void *pmd_handle);

The pmd_handle is void here, but some callers in future patches
cast it to struct dp_netdev_pmd_thread. After the refactoring,
that struct is declared in dpif-netdev-private-thread.h, so can
we use that instead?


> +
> +/* Probe function is used to detect if this CPU has the ISA required
> + * to run the optimized miniflow implementation.
> + * returns one on successful probe.
> + * returns zero on failure.
> + */
> +typedef int32_t (*miniflow_extract_probe)(void);
> +
> +/* Structure representing the attributes of an optimized implementation. */
> +struct dpif_miniflow_extract_impl {
> +    /* When non-zero, this impl has passed the probe() checks. */
> +    uint8_t available;

Can this be of type bool?

> +
> +    /* Probe function is used to detect if this CPU has the ISA required
> +     * to run the optimized miniflow implementation.
> +     */
> +    miniflow_extract_probe probe;
> +
> +    /* Function to call to extract miniflows for a burst of packets. */
> +    miniflow_extract_func extract_func;
> +
> +    /* Name of the optimized implementation. */
> +    char *name;
> +};
> +
> +/* Retrieve the opt structure for the requested implementation by name.
> + * Returns zero on success, and opt points to a valid struct, or
> + * returns a negative failure status.
> + * -ENOTSUP : invalid name requested
> + */
> +int32_t
> +dpif_miniflow_extract_opt_get(const char *name,
> +                              struct dpif_miniflow_extract_impl **opt);
> +
> +/* Initializes the available miniflow extract implementations by probing for
> + * the CPU ISA requirements. As the runtime available CPU ISA does not change
> + * and the required ISA of the implementation also does not change, it is safe
> + * to cache the probe() results, and not call probe() at runtime.
> + */
> +void
> +dpif_miniflow_extract_init(void);
> +
> +/* Retrieve the array of miniflow implementations for iteration.
> + * On error, returns a negative number.
> + * On success, returns the size of the arrays pointed to by the out parameter.
> + */
> +int32_t
> +dpif_miniflow_extract_info_get(struct dpif_miniflow_extract_impl **out_ptr);
> +
> +
> +#endif /* DPIF_NETDEV_AVX512_EXTRACT */
> diff --git a/lib/dpif-netdev-private-thread.h b/lib/dpif-netdev-private-thread.h
> index f89b1ddaa..119eb7396 100644
> --- a/lib/dpif-netdev-private-thread.h
> +++ b/lib/dpif-netdev-private-thread.h
> @@ -28,6 +28,7 @@
>  #include "dpif-netdev-private-dpif.h"
>  #include "dpif-netdev-perf.h"
>  #include "openvswitch/thread.h"
> +#include "dpif-netdev-private-extract.h"
>  
>  #ifdef  __cplusplus
>  extern "C" {
> @@ -110,6 +111,9 @@ struct dp_netdev_pmd_thread {
>      /* Pointer for per-DPIF implementation scratch space. */
>      void *netdev_input_func_userdata;
>  
> +    /* Function pointer to call for miniflow_extract() functionality. */
> +    miniflow_extract_func miniflow_extract_opt;
> +
>      struct seq *reload_seq;
>      uint64_t last_reload_seq;
>  
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index f316835a4..567ebd952 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -46,6 +46,7 @@
>  #include "dpif.h"
>  #include "dpif-netdev-lookup.h"
>  #include "dpif-netdev-perf.h"
> +#include "dpif-netdev-private-extract.h"
>  #include "dpif-provider.h"
>  #include "dummy.h"
>  #include "fat-rwlock.h"
> @@ -1089,6 +1090,102 @@ dpif_netdev_impl_set(struct unixctl_conn *conn, int argc,
>      ds_destroy(&reply);
>  }
>  
> +static void
> +dpif_miniflow_extract_impl_get(struct unixctl_conn *conn, int argc OVS_UNUSED,
> +                     const char *argv[] OVS_UNUSED, void *aux OVS_UNUSED)
> +{
> +    struct dpif_miniflow_extract_impl *mfex_impls;
> +    uint32_t count = dpif_miniflow_extract_info_get(&mfex_impls);
> +    if (count == 0) {
> +        unixctl_command_reply_error(conn, "error getting mfex names");
> +        return;
> +    }
> +
> +    /* Add all mfex functions to reply string. */
> +    struct ds reply = DS_EMPTY_INITIALIZER;
> +    ds_put_cstr(&reply, "Available Optimized Miniflow Extracts:\n");
> +    for (uint32_t i = 0; i < count; i++) {
> +        ds_put_format(&reply, "  %s (available: %s)\n",
> +                      mfex_impls[i].name, mfex_impls[i].available ?
> +                      "True" : "False");
> +    }

Shouldn't this be inside private-extract.c ?
See the comments for the DPIF dp_netdev_impl_get command:
https://mail.openvswitch.org/pipermail/ovs-dev/2021-June/384341.html



> +    unixctl_command_reply(conn, ds_cstr(&reply));
> +    ds_destroy(&reply);
> +}
> +
> +static void
> +dpif_miniflow_extract_impl_set(struct unixctl_conn *conn, int argc,
> +                     const char *argv[], void *aux OVS_UNUSED)
> +{
> +    /* This function requires just one parameter, the miniflow name.
> +     * A second optional parameter can identify the datapath instance.
> +     */
> +    const char *mfex_name = argv[1];
> +
> +    static const char *error_description[2] = {
> +        "Unknown miniflow implementation",
> +        "implementation doesn't exist",
> +    };
> +    struct dpif_miniflow_extract_impl *opt;
> +    miniflow_extract_func new_func;
> +    int32_t err = dpif_miniflow_extract_opt_get(mfex_name, &opt);
> +    if (err) {
> +        struct ds reply = DS_EMPTY_INITIALIZER;
> +        ds_put_format(&reply,
> +                    "Miniflow implementation not available: %s %s.\n",
> +                    error_description[ (err == -ENOTSUP) ], mfex_name);
> +        const char *reply_str = ds_cstr(&reply);
> +        unixctl_command_reply(conn, reply_str);
> +        VLOG_INFO("%s", reply_str);
> +        ds_destroy(&reply);
> +        return;
> +    }
> +    new_func = opt->extract_func;
> +    /* argv[2] is optional datapath instance. If no datapath name is provided.
> +     * and only one datapath exists, the one existing datapath is reprobed.
> +     */
> +    ovs_mutex_lock(&dp_netdev_mutex);
> +    struct dp_netdev *dp = NULL;
> +
> +    if (argc == 3) {
> +        dp = shash_find_data(&dp_netdevs, argv[2]);
> +    } else if (shash_count(&dp_netdevs) == 1) {
> +        dp = shash_first(&dp_netdevs)->data;
> +    }
> +
> +    if (!dp) {
> +        ovs_mutex_unlock(&dp_netdev_mutex);
> +        unixctl_command_reply_error(conn,
> +                                    "please specify an existing datapath");
> +        return;
> +    }
> +
> +    /* Get PMD threads list. */
> +    size_t n;
> +    struct dp_netdev_pmd_thread **pmd_list;
> +    sorted_poll_thread_list(dp, &pmd_list, &n);
> +
> +    for (size_t i = 0; i < n; i++) {
> +        struct dp_netdev_pmd_thread *pmd = pmd_list[i];
> +        if (pmd->core_id == NON_PMD_CORE_ID) {
> +            continue;
> +        }
> +
> +        /* Set PMD threads miniflow implementation to requested one. */
> +        pmd->miniflow_extract_opt = *new_func;
> +    };
> +
> +    ovs_mutex_unlock(&dp_netdev_mutex);
> +
> +    /* Reply with success to command. */
> +    struct ds reply = DS_EMPTY_INITIALIZER;
> +    ds_put_format(&reply, "Miniflow implementation set to %s.\n", mfex_name);
> +    const char *reply_str = ds_cstr(&reply);
> +    unixctl_command_reply(conn, reply_str);
> +    VLOG_INFO("%s", reply_str);
> +    ds_destroy(&reply);

This is very similar to the dpif_netdev_impl_set() reviewed in
https://mail.openvswitch.org/pipermail/ovs-dev/2021-June/384336.html

In this case you still need to provide dpif_miniflow_extract_opt_get(),
however the callers seem to be all in -*-extract*.c


> +}
> +
>  static void
>  dpif_netdev_pmd_rebalance(struct unixctl_conn *conn, int argc,
>                            const char *argv[], void *aux OVS_UNUSED)
> @@ -1315,9 +1412,16 @@ dpif_netdev_init(void)
>                               "dpif_implementation_name [dp]",
>                               1, 2, dpif_netdev_impl_set,
>                               NULL);
> +    unixctl_command_register("dpif-netdev/miniflow-parser-set",
> +                             "miniflow implementation name [dp]",
> +                             1, 2, dpif_miniflow_extract_impl_set,
> +                             NULL);
>      unixctl_command_register("dpif-netdev/dpif-get", "",
>                               0, 0, dpif_netdev_impl_get,
>                               NULL);
> +    unixctl_command_register("dpif-netdev/miniflow-parser-get", "",
> +                             0, 0, dpif_miniflow_extract_impl_get,
> +                             NULL);

Already mentioned in another review, but please don't separate
get and set functions.

>      return 0;
>  }
>  
> @@ -1461,6 +1565,8 @@ create_dp_netdev(const char *name, const struct dpif_class *class,
>  
>      dp->conntrack = conntrack_init();
>  
> +    dpif_miniflow_extract_init();
> +

This is using a different logic than DPIF. The DPIF calls probe
when the implementation is requested. This one seems better
assuming that calling .probe() are harmless.


>      atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN);
>      atomic_init(&dp->tx_flush_interval, DEFAULT_TX_FLUSH_INTERVAL);
>  
> @@ -6176,6 +6282,9 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
>      /* Initialize DPIF function pointer to the default configured version. */
>      pmd->netdev_input_func = dp_netdev_impl_get_default();
>  
> +    /*Init default miniflow_extract function */
> +    pmd->miniflow_extract_opt = NULL;
> +

That should come from dpif_miniflow_extract_get_default().
I see that you are actually adding that on one later patch, but
can we have a function returning NULL for now? And the comment
needs fixing.

>      /* init the 'flow_cache' since there is no
>       * actual thread created for NON_PMD_CORE_ID. */
>      if (core_id == NON_PMD_CORE_ID) {
> @@ -6730,10 +6839,12 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd,
>  {
>      struct netdev_flow_key *key = &keys[0];
>      size_t n_missed = 0, n_emc_hit = 0, n_phwol_hit = 0;
> +    struct dp_packet_batch single_packet;
>      struct dfc_cache *cache = &pmd->flow_cache;
>      struct dp_packet *packet;
>      const size_t cnt = dp_packet_batch_size(packets_);
>      uint32_t cur_min = pmd->ctx.emc_insert_min;
> +    int mf_ret;
>      int i;
>      uint16_t tcp_flags;
>      bool smc_enable_db;
> @@ -6786,8 +6897,19 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd,
>                  continue;
>              }
>          }
> -
> -        miniflow_extract(packet, &key->mf);
> +        /* Set the count and packet for miniflow_opt with batch_size 1. */
> +        if ((pmd->miniflow_extract_opt) && (!md_is_valid)) {

The single_packet and mf_ret are not in reverse xmas tree order, and
they are not needed outside of this scope. My suggestion to improve
readability and fix the coding style issue is to declare both at this
point. Does that make sense?


> +            single_packet.count = 1;
> +            single_packet.packets[0] = packet;
> +            mf_ret = pmd->miniflow_extract_opt(&single_packet, key, 1,
> +                                               port_no, (void *) pmd);
> +            /* Fallback to original miniflow_extract if there is a miss. */
> +            if (!mf_ret) {
> +                miniflow_extract(packet, &key->mf);
> +            }
> +        } else {
> +            miniflow_extract(packet, &key->mf);
> +        }
>          key->len = 0; /* Not computed yet. */
>          key->hash =
>                  (md_is_valid == false)
> -- 
> 2.25.1
> 
> _______________________________________________
> dev mailing list
> dev at openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev

-- 
fbl


More information about the dev mailing list