[ovs-dev] [PATCH v2] dpdk: Support running PMD threads on cores > RTE_MAX_LCORE.
David Marchand
david.marchand at redhat.com
Tue Dec 17 09:22:43 UTC 2019
On Fri, Dec 13, 2019 at 6:34 PM Ilya Maximets <i.maximets at ovn.org> wrote:
> > @@ -94,6 +99,38 @@ args_contains(const struct svec *args, const char *value)
> > return false;
> > }
> >
> > +static void
> > +construct_dpdk_lcore_option(const struct smap *ovs_other_config,
> > + struct svec *args)
> > +{
> > + const char *cmask = smap_get(ovs_other_config, "dpdk-lcore-mask");
> > + struct svec lcores = SVEC_EMPTY_INITIALIZER;
> > + struct ovs_numa_info_core *core;
> > + struct ovs_numa_dump *cores;
> > + int index = 0;
> > +
> > + if (!cmask) {
> > + return;
> > + }
> > + if (args_contains(args, "-c") || args_contains(args, "-l") ||
> > + args_contains(args, "--lcores")) {
> > + VLOG_WARN("Ignoring database defined option 'dpdk-lcore-mask' "
> > + "due to dpdk-extra config");
> > + return;
> > + }
> > +
> > + cores = ovs_numa_dump_cores_with_cmask(cmask);
> > + FOR_EACH_CORE_ON_DUMP(core, cores) {
> > + svec_add_nocopy(&lcores, xasprintf("%d@%d", index, core->core_id));
>
> What's the point of re-mapping these cores?
> Just let DPDK to fail initialization and user will adjust cores.
> This automatic re-mapping only complicates everything.
Deal with existing user of this parameter.
Example: https://bugzilla.redhat.com/show_bug.cgi?id=1753432
>
> IMHO, user should not configure DPDK cores at all. So, it's users' problem
> if cores configured incorrectly.
If the user explicitly had set a dpdk option, ok, this is his problem.
But here, this is an OVS option, handling this internally makes more
sense to me.
> > + index++;
> > + }
> > + svec_terminate(&lcores);
> > + ovs_numa_dump_destroy(cores);
> > + svec_add(args, "--lcores");
> > + svec_add_nocopy(args, svec_join(&lcores, ",", ""));
> > + svec_destroy(&lcores);
> > +}
> > +
> > static void
> > construct_dpdk_options(const struct smap *ovs_other_config, struct svec *args)
> > {
> > @@ -103,7 +140,6 @@ construct_dpdk_options(const struct smap *ovs_other_config, struct svec *args)
> > bool default_enabled;
> > const char *default_value;
> > } opts[] = {
> > - {"dpdk-lcore-mask", "-c", false, NULL},
> > {"dpdk-hugepage-dir", "--huge-dir", false, NULL},
> > {"dpdk-socket-limit", "--socket-limit", false, NULL},
> > };
> > @@ -224,6 +260,7 @@ construct_dpdk_args(const struct smap *ovs_other_config, struct svec *args)
> > svec_parse_words(args, extra_configuration);
> > }
> >
> > + construct_dpdk_lcore_option(ovs_other_config, args);
> > construct_dpdk_options(ovs_other_config, args);
> > construct_dpdk_mutex_options(ovs_other_config, args);
> > }
> > @@ -264,6 +301,58 @@ static cookie_io_functions_t dpdk_log_func = {
> > .write = dpdk_log_write,
> > };
> >
> > +static void
> > +dpdk_dump_lcore(struct ds *ds, unsigned lcore)
> > +{
> > + struct svec cores = SVEC_EMPTY_INITIALIZER;
> > + rte_cpuset_t cpuset;
> > + unsigned cpu;
> > +
> > + cpuset = rte_lcore_cpuset(lcore);
> > + for (cpu = 0; cpu < CPU_SETSIZE; cpu++) {
> > + if (!CPU_ISSET(cpu, &cpuset)) {
> > + continue;
> > + }
> > + svec_add_nocopy(&cores, xasprintf("%u", cpu));
> > + }> + svec_terminate(&cores);
> > + ds_put_format(ds, "lcore%u (%s) is running on core %s\n", lcore,
> > + rte_eal_lcore_role(lcore) != ROLE_OFF ? "DPDK" : "OVS",
> > + svec_join(&cores, ",", ""));
> > + svec_destroy(&cores);
> > +}
> > +
> > +static void
> > +dpdk_dump_lcores(struct unixctl_conn *conn, int argc, const char *argv[],
> > + void *aux OVS_UNUSED)
> > +{
> > + struct ds ds = DS_EMPTY_INITIALIZER;
> > + unsigned lcore;
> > +
> > + ovs_mutex_lock(&lcore_bitmap_mutex);
> > + if (lcore_bitmap == NULL) {
> > + unixctl_command_reply_error(conn, "DPDK has not been initialised");
> > + goto out;
> > + }
> > + if (argc > 1) {
> > + if (!str_to_uint(argv[1], 0, &lcore) || lcore >= RTE_MAX_LCORE ||
> > + !bitmap_is_set(lcore_bitmap, lcore)) {
> > + unixctl_command_reply_error(conn, "incorrect lcoreid");
> > + goto out;
> > + }
> > + dpdk_dump_lcore(&ds, lcore);
> > + } else for (lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {
>
> Not a good style to have 'else for'.
Tried to be creative :-) (the additional indent is really unnecessary).
But, if this is a blocker for this patch, ok.
>
> DPDK lcore iterator?
We are iterating over lcores owned by DPDK and OVS.
Example:
other_config : {dpdk-init="true", dpdk-lcore-mask="0x1",
pmd-cpu-mask="0x00008002"}
# ovs-appctl dpdk/dump-lcores
lcore0 (DPDK) is running on core 0
lcore1 (OVS) is running on core 1
lcore2 (OVS) is running on core 15
>
> > + if (!bitmap_is_set(lcore_bitmap, lcore)) {
> > + continue;
> > + }
> > + dpdk_dump_lcore(&ds, lcore);
> > + }
> > + unixctl_command_reply(conn, ds_cstr(&ds));
> > + ds_destroy(&ds);
> > +out:
> > + ovs_mutex_unlock(&lcore_bitmap_mutex);
> > +}
> > +
> > static bool
> > dpdk_init__(const struct smap *ovs_other_config)
> > {
> > @@ -428,6 +518,24 @@ dpdk_init__(const struct smap *ovs_other_config)
> > }
> > }
> >
> > + ovs_mutex_lock(&lcore_bitmap_mutex);
> > + lcore_bitmap = bitmap_allocate(RTE_MAX_LCORE);
> > + /* Mark DPDK threads. */
> > + for (uint32_t lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {
>
> Shouldn't we use DPDK lcore iterator instead?
The DPDK lcore iterator skips "service" cores.
We would end up reusing those lcores if the user had set something exotic.
Example:
other_config : {dpdk-extra="-c 0x5 -s 0x4", dpdk-init="true",
pmd-cpu-mask="0x00008002"}
# ovs-appctl dpdk/dump-lcores
lcore0 (DPDK) is running on core 0
lcore1 (OVS) is running on core 1
lcore2 (DPDK) is running on core 2
lcore3 (OVS) is running on core 15
>
> > + struct ds ds = DS_EMPTY_INITIALIZER;
> > +
> > + if (rte_eal_lcore_role(lcore) == ROLE_OFF) {
> > + continue;
> > + }
> > + bitmap_set1(lcore_bitmap, lcore);
> > + dpdk_dump_lcore(&ds, lcore);
> > + VLOG_INFO("%s", ds_cstr(&ds));
> > + ds_destroy(&ds);
> > + }
> > + unixctl_command_register("dpdk/dump-lcores", "[lcore]", 0, 1,
> > + dpdk_dump_lcores, NULL);
>
> We might need this command if we're doing automatic re-mapping
> of lcores from the EAL arguments, but if not, I don't think we need it.
> Simple logging on PMD start should be enough.
Flavio mentioned that the logs could have been rotated.
> > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> > index 7ebf4ef87..91e98919c 100644
> > --- a/lib/dpif-netdev.c
> > +++ b/lib/dpif-netdev.c
> > @@ -5470,7 +5470,7 @@ pmd_thread_main(void *f_)
> > /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
> > ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
> > ovs_numa_thread_setaffinity_core(pmd->core_id);
> > - dpdk_set_lcore_id(pmd->core_id);
> > + dpdk_init_thread_context(pmd->core_id);
>
> Since this function will atually change cpu affinity that is not
This is the same affinity than what OVS just applied.
The call to pthread_setaffinity_np in dpdk should be a noop.
> expected by OVS it must be called before setting affinity by
> ovs_numa_thread_setaffinity_core(). This way OVS will guarantee
> correct affinity for its threads.
ovs_numa_thread_setaffinity_core() return value is not checked.
And I can't see what calling this function changes to the ovs-numa internals.
We could even remove it and only rely on dpdk.
--
David Marchand
More information about the dev
mailing list