[ovs-dev] [PATCH v2] dpdk: Support running PMD threads on cores > RTE_MAX_LCORE.
Ilya Maximets
i.maximets at ovn.org
Fri Dec 13 17:34:11 UTC 2019
On 10.12.2019 11:10, David Marchand wrote:
> Most DPDK components make the assumption that rte_lcore_id() returns
> a valid lcore_id in [0..RTE_MAX_LCORE[ range (with the exception of
> the LCORE_ID_ANY special value).
>
> OVS does not currently check which value is set in
> RTE_PER_LCORE(_lcore_id) which exposes us to potential crashes on DPDK
> side.
>
> Introduce a lcore allocator in OVS for PMD threads and map them to
> unused lcores from DPDK à la --lcores.
>
> The physical cores on which the PMD threads are running still
> constitute important information when debugging, so still keep
> those in the PMD thread names but add a new debug log when starting
> them.
> Mapping between OVS threads and DPDK lcores can be dumped with a new
> dpdk/dump-lcores command.
>
> Synchronize DPDK internals on numa and cpuset for the PMD threads by
> registering them via the rte_thread_set_affinity() helper.
>
> Signed-off-by: David Marchand <david.marchand at redhat.com>
> ---
> Changelog since v1:
> - rewired existing configuration 'dpdk-lcore-mask' to use --lcores,
> - switched to a bitmap to track lcores,
> - added a command to dump current mapping (Flavio): used an experimental
> API to get DPDK lcores cpuset since it is the most reliable/portable
> information,
> - used the same code for the logs when starting DPDK/PMD threads,
> - addressed Ilya comments,
>
This version looks scary.
Some comments inline.
> ---
> lib/dpdk-stub.c | 8 ++-
> lib/dpdk.c | 163 ++++++++++++++++++++++++++++++++++++++++++++--
> lib/dpdk.h | 5 +-
> lib/dpif-netdev.c | 3 +-
> 4 files changed, 170 insertions(+), 9 deletions(-)
>
> diff --git a/lib/dpdk-stub.c b/lib/dpdk-stub.c
> index c332c217c..90473bc8e 100644
> --- a/lib/dpdk-stub.c
> +++ b/lib/dpdk-stub.c
> @@ -39,7 +39,13 @@ dpdk_init(const struct smap *ovs_other_config)
> }
>
> void
> -dpdk_set_lcore_id(unsigned cpu OVS_UNUSED)
> +dpdk_init_thread_context(unsigned cpu OVS_UNUSED)
> +{
> + /* Nothing */
> +}
> +
> +void
> +dpdk_uninit_thread_context(void)
> {
> /* Nothing */
> }
> diff --git a/lib/dpdk.c b/lib/dpdk.c
> index 37ea2973c..0173366a0 100644
> --- a/lib/dpdk.c
> +++ b/lib/dpdk.c
> @@ -30,6 +30,7 @@
> #include <rte_pdump.h>
> #endif
>
> +#include "bitmap.h"
> #include "dirs.h"
> #include "fatal-signal.h"
> #include "netdev-dpdk.h"
> @@ -39,6 +40,7 @@
> #include "ovs-numa.h"
> #include "smap.h"
> #include "svec.h"
> +#include "unixctl.h"
> #include "util.h"
> #include "vswitch-idl.h"
>
> @@ -54,6 +56,9 @@ static bool dpdk_initialized = false; /* Indicates successful initialization
> * of DPDK. */
> static bool per_port_memory = false; /* Status of per port memory support */
>
> +static struct ovs_mutex lcore_bitmap_mutex = OVS_MUTEX_INITIALIZER;
> +static unsigned long *lcore_bitmap OVS_GUARDED_BY(lcore_bitmap_mutex);
> +
> static int
> process_vhost_flags(char *flag, const char *default_val, int size,
> const struct smap *ovs_other_config,
> @@ -94,6 +99,38 @@ args_contains(const struct svec *args, const char *value)
> return false;
> }
>
> +static void
> +construct_dpdk_lcore_option(const struct smap *ovs_other_config,
> + struct svec *args)
> +{
> + const char *cmask = smap_get(ovs_other_config, "dpdk-lcore-mask");
> + struct svec lcores = SVEC_EMPTY_INITIALIZER;
> + struct ovs_numa_info_core *core;
> + struct ovs_numa_dump *cores;
> + int index = 0;
> +
> + if (!cmask) {
> + return;
> + }
> + if (args_contains(args, "-c") || args_contains(args, "-l") ||
> + args_contains(args, "--lcores")) {
> + VLOG_WARN("Ignoring database defined option 'dpdk-lcore-mask' "
> + "due to dpdk-extra config");
> + return;
> + }
> +
> + cores = ovs_numa_dump_cores_with_cmask(cmask);
> + FOR_EACH_CORE_ON_DUMP(core, cores) {
> + svec_add_nocopy(&lcores, xasprintf("%d@%d", index, core->core_id));
What's the point of re-mapping these cores?
Just let DPDK fail initialization and the user will adjust cores.
This automatic re-mapping only complicates everything.
IMHO, user should not configure DPDK cores at all. So, it's users' problem
if cores configured incorrectly.
> + index++;
> + }
> + svec_terminate(&lcores);
> + ovs_numa_dump_destroy(cores);
> + svec_add(args, "--lcores");
> + svec_add_nocopy(args, svec_join(&lcores, ",", ""));
> + svec_destroy(&lcores);
> +}
> +
> static void
> construct_dpdk_options(const struct smap *ovs_other_config, struct svec *args)
> {
> @@ -103,7 +140,6 @@ construct_dpdk_options(const struct smap *ovs_other_config, struct svec *args)
> bool default_enabled;
> const char *default_value;
> } opts[] = {
> - {"dpdk-lcore-mask", "-c", false, NULL},
> {"dpdk-hugepage-dir", "--huge-dir", false, NULL},
> {"dpdk-socket-limit", "--socket-limit", false, NULL},
> };
> @@ -224,6 +260,7 @@ construct_dpdk_args(const struct smap *ovs_other_config, struct svec *args)
> svec_parse_words(args, extra_configuration);
> }
>
> + construct_dpdk_lcore_option(ovs_other_config, args);
> construct_dpdk_options(ovs_other_config, args);
> construct_dpdk_mutex_options(ovs_other_config, args);
> }
> @@ -264,6 +301,58 @@ static cookie_io_functions_t dpdk_log_func = {
> .write = dpdk_log_write,
> };
>
> +static void
> +dpdk_dump_lcore(struct ds *ds, unsigned lcore)
> +{
> + struct svec cores = SVEC_EMPTY_INITIALIZER;
> + rte_cpuset_t cpuset;
> + unsigned cpu;
> +
> + cpuset = rte_lcore_cpuset(lcore);
> + for (cpu = 0; cpu < CPU_SETSIZE; cpu++) {
> + if (!CPU_ISSET(cpu, &cpuset)) {
> + continue;
> + }
> + svec_add_nocopy(&cores, xasprintf("%u", cpu));
> + }
> + svec_terminate(&cores);
> + ds_put_format(ds, "lcore%u (%s) is running on core %s\n", lcore,
> + rte_eal_lcore_role(lcore) != ROLE_OFF ? "DPDK" : "OVS",
> + svec_join(&cores, ",", ""));
> + svec_destroy(&cores);
> +}
> +
> +static void
> +dpdk_dump_lcores(struct unixctl_conn *conn, int argc, const char *argv[],
> + void *aux OVS_UNUSED)
> +{
> + struct ds ds = DS_EMPTY_INITIALIZER;
> + unsigned lcore;
> +
> + ovs_mutex_lock(&lcore_bitmap_mutex);
> + if (lcore_bitmap == NULL) {
> + unixctl_command_reply_error(conn, "DPDK has not been initialised");
> + goto out;
> + }
> + if (argc > 1) {
> + if (!str_to_uint(argv[1], 0, &lcore) || lcore >= RTE_MAX_LCORE ||
> + !bitmap_is_set(lcore_bitmap, lcore)) {
> + unixctl_command_reply_error(conn, "incorrect lcoreid");
> + goto out;
> + }
> + dpdk_dump_lcore(&ds, lcore);
> + } else for (lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {
Not a good style to have 'else for'.
DPDK lcore iterator?
> + if (!bitmap_is_set(lcore_bitmap, lcore)) {
> + continue;
> + }
> + dpdk_dump_lcore(&ds, lcore);
> + }
> + unixctl_command_reply(conn, ds_cstr(&ds));
> + ds_destroy(&ds);
> +out:
> + ovs_mutex_unlock(&lcore_bitmap_mutex);
> +}
> +
> static bool
> dpdk_init__(const struct smap *ovs_other_config)
> {
> @@ -345,7 +434,8 @@ dpdk_init__(const struct smap *ovs_other_config)
> }
> }
>
> - if (args_contains(&args, "-c") || args_contains(&args, "-l")) {
> + if (args_contains(&args, "-c") || args_contains(&args, "-l") ||
> + args_contains(&args, "--lcores")) {
> auto_determine = false;
> }
>
> @@ -371,8 +461,8 @@ dpdk_init__(const struct smap *ovs_other_config)
> * thread affintity - default to core #0 */
> VLOG_ERR("Thread getaffinity failed. Using core #0");
> }
> - svec_add(&args, "-l");
> - svec_add_nocopy(&args, xasprintf("%d", cpu));
> + svec_add(&args, "--lcores");
> + svec_add_nocopy(&args, xasprintf("0@%d", cpu));
> }
>
> svec_terminate(&args);
> @@ -428,6 +518,24 @@ dpdk_init__(const struct smap *ovs_other_config)
> }
> }
>
> + ovs_mutex_lock(&lcore_bitmap_mutex);
> + lcore_bitmap = bitmap_allocate(RTE_MAX_LCORE);
> + /* Mark DPDK threads. */
> + for (uint32_t lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {
Shouldn't we use DPDK lcore iterator instead?
> + struct ds ds = DS_EMPTY_INITIALIZER;
> +
> + if (rte_eal_lcore_role(lcore) == ROLE_OFF) {
> + continue;
> + }
> + bitmap_set1(lcore_bitmap, lcore);
> + dpdk_dump_lcore(&ds, lcore);
> + VLOG_INFO("%s", ds_cstr(&ds));
> + ds_destroy(&ds);
> + }
> + unixctl_command_register("dpdk/dump-lcores", "[lcore]", 0, 1,
> + dpdk_dump_lcores, NULL);
We might need this command if we're doing automatic re-mapping
of lcores from the EAL arguments, but if not, I don't think we need it.
Simple logging on PMD start should be enough.
> + ovs_mutex_unlock(&lcore_bitmap_mutex);
> +
> /* We are called from the main thread here */
> RTE_PER_LCORE(_lcore_id) = NON_PMD_CORE_ID;
>
> @@ -512,11 +620,54 @@ dpdk_available(void)
> }
>
> void
> -dpdk_set_lcore_id(unsigned cpu)
> +dpdk_init_thread_context(unsigned cpu)
> {
> + struct ds ds = DS_EMPTY_INITIALIZER;
> + rte_cpuset_t cpuset;
> + unsigned lcore;
> +
> /* NON_PMD_CORE_ID is reserved for use by non pmd threads. */
> ovs_assert(cpu != NON_PMD_CORE_ID);
> - RTE_PER_LCORE(_lcore_id) = cpu;
> +
> + ovs_mutex_lock(&lcore_bitmap_mutex);
> + if (lcore_bitmap == NULL) {
> + lcore = NON_PMD_CORE_ID;
> + } else {
> + lcore = bitmap_scan(lcore_bitmap, 0, 0, RTE_MAX_LCORE);
> + if (lcore == RTE_MAX_LCORE) {
> + VLOG_WARN("Reached maximum number of DPDK lcores, core %u will "
> + "have lower performance", cpu);
> + lcore = NON_PMD_CORE_ID;
> + } else {
> + bitmap_set1(lcore_bitmap, lcore);
> + }
> + }
> + ovs_mutex_unlock(&lcore_bitmap_mutex);
> +
> + RTE_PER_LCORE(_lcore_id) = lcore;
> +
> + if (lcore == NON_PMD_CORE_ID) {
> + return;
> + }
> +
> + CPU_ZERO(&cpuset);
> + CPU_SET(cpu, &cpuset);
> + rte_thread_set_affinity(&cpuset);
> + dpdk_dump_lcore(&ds, lcore);
> + VLOG_INFO("%s", ds_cstr(&ds));
> + ds_destroy(&ds);
> +}
> +
> +void
> +dpdk_uninit_thread_context(void)
> +{
> + if (RTE_PER_LCORE(_lcore_id) == NON_PMD_CORE_ID) {
> + return;
> + }
> +
> + ovs_mutex_lock(&lcore_bitmap_mutex);
> + bitmap_set0(lcore_bitmap, RTE_PER_LCORE(_lcore_id));
> + ovs_mutex_unlock(&lcore_bitmap_mutex);
> }
>
> void
> diff --git a/lib/dpdk.h b/lib/dpdk.h
> index 736a64279..eab7a926b 100644
> --- a/lib/dpdk.h
> +++ b/lib/dpdk.h
> @@ -22,7 +22,9 @@
> #ifdef DPDK_NETDEV
>
> #include <rte_config.h>
> +#define ALLOW_EXPERIMENTAL_API
Not for this patch, but for DPDK API:
I think, ALLOW_EXPERIMENTAL_API is too generic name for
external definition. It should have some special prefix.
> #include <rte_lcore.h>
> +#undef ALLOW_EXPERIMENTAL_API
>
> #define NON_PMD_CORE_ID LCORE_ID_ANY
>
> @@ -36,7 +38,8 @@ struct smap;
> struct ovsrec_open_vswitch;
>
> void dpdk_init(const struct smap *ovs_other_config);
> -void dpdk_set_lcore_id(unsigned cpu);
> +void dpdk_init_thread_context(unsigned cpu);
> +void dpdk_uninit_thread_context(void);
> const char *dpdk_get_vhost_sock_dir(void);
> bool dpdk_vhost_iommu_enabled(void);
> bool dpdk_vhost_postcopy_enabled(void);
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 7ebf4ef87..91e98919c 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -5470,7 +5470,7 @@ pmd_thread_main(void *f_)
> /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
> ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
> ovs_numa_thread_setaffinity_core(pmd->core_id);
> - dpdk_set_lcore_id(pmd->core_id);
> + dpdk_init_thread_context(pmd->core_id);
Since this function will actually change the cpu affinity, which is not
expected by OVS, it must be called before setting affinity via
ovs_numa_thread_setaffinity_core(). This way OVS will guarantee
correct affinity for its threads.
> poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
> dfc_cache_init(&pmd->flow_cache);
> pmd_alloc_static_tx_qid(pmd);
> @@ -5590,6 +5590,7 @@ reload:
> dfc_cache_uninit(&pmd->flow_cache);
> free(poll_list);
> pmd_free_cached_ports(pmd);
> + dpdk_uninit_thread_context();
> return NULL;
> }
>
>
More information about the dev
mailing list