[ovs-dev] [PATCH] dpdk: Support running PMD threads on cores > RTE_MAX_LCORE.
Ilya Maximets
i.maximets at ovn.org
Wed Dec 4 21:22:52 UTC 2019
On 03.12.2019 14:00, David Marchand wrote:
> On Tue, Dec 3, 2019 at 12:34 PM Ilya Maximets <i.maximets at ovn.org> wrote:
>>
>> On 02.12.2019 17:03, David Marchand wrote:
>>> Most DPDK components make the assumption that rte_lcore_id() returns
>>> a valid lcore_id in [0..RTE_MAX_LCORE] range (with the exception of
>>> the LCORE_ID_ANY special value).
>>
>> Do you think that makes practical sense? In real virtualization
>> environments users are usually using cpus with lower numbers and it's
>> most likely possible to change NUMA layout to have CPUs from all the
>> nodes enumerated from the low CPU numbers.
>
> That is if you can reconfigure the NUMA layout.
> How do you achieve this?
> Bios/firmware configuration?
It's usually a BIOS configuration. Modern servers provide configuration
options for 2-4 types of core enumeration along with choosing NUMA topology.
> Kernel boot options (did not find)?
>
> I imagine it would be a pain to enforce this on a fleet of servers
> with different hw specs.
From my experience, you almost never could just take a new server and
run your virtualization software. You'll have to enable some HW
capabilities or enable "Highly Optimized Virtualization Configuration
Set" in BIOS anyway.
>
>
>> It's uncommon also to use all the CPUs for just OVS on a big system
>> with huge number of cores.
>
> Yes, obviously, you don't want to use all your cores for OVS :-).
>
>
>>
>> Another thing is can we really do this on a DPDK level? Maybe it'll
>> be a step to dynamic registering/unregistering of non-EAL threads?
>
> I'd like to ultimately get to a dynamic register mechanism.
> But I first wanted to fix the existing code...
>
>
>> Since you're wiping off the meaning of a lcore as a CPU core, it
>> becomes just a thread_id in a DPDK point of view. So, maybe application
>> could call a new API function instead of managing these strange
>> mappings by itself?
>
> ... as it is unlikely we will backport an experimental API to previous
> dpdk versions.
>
>
>>
>> One comment inline (not a full review).
>>
>> Best regards, Ilya Maximets.
>>
>>>
>>> OVS does not currently check which value is set in
>>> RTE_PER_LCORE(_lcore_id) which exposes us to potential crashes on DPDK
>>> side.
>>>
>>> Introduce a lcore allocator in OVS for PMD threads and map them to
>>> unused lcores from DPDK à la --lcores.
>>>
>>> The physical cores on which the PMD threads are running still
>>> constitutes an important information when debugging, so still keep
>>> those in the PMD thread names but add a new debug log when starting
>>> them.
>>>
>>> Synchronize DPDK internals on numa and cpuset for the PMD threads by
>>> registering them via the rte_thread_set_affinity() helper.
>>>
>>> Signed-off-by: David Marchand <david.marchand at redhat.com>
>>> ---
>>> lib/dpdk-stub.c | 8 +++++-
>>> lib/dpdk.c | 69 +++++++++++++++++++++++++++++++++++++++++++----
>>> lib/dpdk.h | 3 ++-
>>> lib/dpif-netdev.c | 3 ++-
>>> 4 files changed, 75 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/lib/dpdk-stub.c b/lib/dpdk-stub.c
>>> index c332c217c..90473bc8e 100644
>>> --- a/lib/dpdk-stub.c
>>> +++ b/lib/dpdk-stub.c
>>> @@ -39,7 +39,13 @@ dpdk_init(const struct smap *ovs_other_config)
>>> }
>>>
>>> void
>>> -dpdk_set_lcore_id(unsigned cpu OVS_UNUSED)
>>> +dpdk_init_thread_context(unsigned cpu OVS_UNUSED)
>>> +{
>>> + /* Nothing */
>>> +}
>>> +
>>> +void
>>> +dpdk_uninit_thread_context(void)
>>> {
>>> /* Nothing */
>>> }
>>> diff --git a/lib/dpdk.c b/lib/dpdk.c
>>> index 21dd47e80..771baa413 100644
>>> --- a/lib/dpdk.c
>>> +++ b/lib/dpdk.c
>>> @@ -33,6 +33,7 @@
>>>
>>> #include "dirs.h"
>>> #include "fatal-signal.h"
>>> +#include "id-pool.h"
>>> #include "netdev-dpdk.h"
>>> #include "netdev-offload-provider.h"
>>> #include "openvswitch/dynamic-string.h"
>>> @@ -55,6 +56,9 @@ static bool dpdk_initialized = false; /* Indicates successful initialization
>>> * of DPDK. */
>>> static bool per_port_memory = false; /* Status of per port memory support */
>>>
>>> +static struct id_pool *lcore_id_pool;
OVS_GUARDED_BY() would be nice.
>>> +static struct ovs_mutex lcore_id_pool_mutex = OVS_MUTEX_INITIALIZER;
>>> +
>>> static int
>>> process_vhost_flags(char *flag, const char *default_val, int size,
>>> const struct smap *ovs_other_config,
>>> @@ -346,7 +350,8 @@ dpdk_init__(const struct smap *ovs_other_config)
>>> }
>>> }
>>>
>>> - if (args_contains(&args, "-c") || args_contains(&args, "-l")) {
>>> + if (args_contains(&args, "-c") || args_contains(&args, "-l") ||
>>> + args_contains(&args, "--lcores")) {
>>> auto_determine = false;
>>> }
>>>
>>> @@ -372,8 +377,8 @@ dpdk_init__(const struct smap *ovs_other_config)
>>> * thread affintity - default to core #0 */
>>> VLOG_ERR("Thread getaffinity failed. Using core #0");
>>> }
>>> - svec_add(&args, "-l");
>>> - svec_add_nocopy(&args, xasprintf("%d", cpu));
>>> + svec_add(&args, "--lcores");
>>> + svec_add_nocopy(&args, xasprintf("0@%d", cpu));
>>> }
>>>
>>> svec_terminate(&args);
>>> @@ -429,6 +434,23 @@ dpdk_init__(const struct smap *ovs_other_config)
>>> }
>>> }
>>>
>>> + ovs_mutex_lock(&lcore_id_pool_mutex);
>>> + lcore_id_pool = id_pool_create(0, RTE_MAX_LCORE);
>>> + /* Empty the whole pool... */
>>> + for (uint32_t lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {
>>> + uint32_t lcore_id;
>>> +
>>> + id_pool_alloc_id(lcore_id_pool, &lcore_id);
>>> + }
>>> + /* ...and release the unused spots. */
>>> + for (uint32_t lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {
>>> + if (rte_eal_lcore_role(lcore) != ROLE_OFF) {
>>> + continue;
>>> + }
>>> + id_pool_free_id(lcore_id_pool, lcore);
>>> + }
>>> + ovs_mutex_unlock(&lcore_id_pool_mutex);
>>> +
>>> /* We are called from the main thread here */
>>> RTE_PER_LCORE(_lcore_id) = NON_PMD_CORE_ID;
>>>
>>> @@ -522,11 +544,48 @@ dpdk_available(void)
>>> }
>>>
>>> void
>>> -dpdk_set_lcore_id(unsigned cpu)
>>> +dpdk_init_thread_context(unsigned cpu)
>>> {
>>> + cpu_set_t cpuset;
I think it should be rte_cpuset_t.
For portability and also to follow DPDK API.
>>> + unsigned lcore;
>>> + int err;
>>> +
>>> /* NON_PMD_CORE_ID is reserved for use by non pmd threads. */
>>> ovs_assert(cpu != NON_PMD_CORE_ID);
>>> - RTE_PER_LCORE(_lcore_id) = cpu;
>>> +
>>> + ovs_mutex_lock(&lcore_id_pool_mutex);
>>> + if (lcore_id_pool == NULL || !id_pool_alloc_id(lcore_id_pool, &lcore)) {
>>> + lcore = NON_PMD_CORE_ID;
>>
>> If PMD thread will get NON_PMD_CORE_ID it will smash the non-PMD
>> mempool caches since it doesn't take a 'non_pmd_mutex'.
>
> Ok, need to look at this again.
> I missed something.
> I don't understand how you would create PMD threads with cpu !=
> NON_PMD_CORE_ID without dpdk enabled but still call the mempool code.
I meant the case where you try to create more PMD threads than RTE_MAX_LCORE.
For example, if RTE_MAX_LCORE == 32 and you want to create 40 PMD threads
(assuming your system has 64 CPUs), 8 PMD threads and non-PMD thread(s)
will have lcore == NON_PMD_CORE_ID == LCORE_ID_ANY. I re-checked and
this will not cause sharing of the same mempool caches because threads
with LCORE_ID_ANY don't use it; however, I think that simultaneous calls
of DPDK API might still be somewhat dangerous. Anyway, performance will
suffer. At least, we should warn about that.
> + }
> + ovs_mutex_unlock(&lcore_id_pool_mutex);
> +
> + RTE_PER_LCORE(_lcore_id) = lcore;
> +
> + /* DPDK is not initialised, nothing more to do. */
> + if (lcore == NON_PMD_CORE_ID) {
> + return;
> + }
> +
> + CPU_ZERO(&cpuset);
> + err = pthread_getaffinity_np(pthread_self(), sizeof cpuset, &cpuset);
We know the core already. No need to make additional syscall.
We could just set it with CPU_SET(). This will also help with
portability in the future.
> + if (err) {
> + VLOG_ABORT("Thread getaffinity error: %s", ovs_strerror(err));
> + }
> +
> + rte_thread_set_affinity(&cpuset);
> + VLOG_INFO("Initialised lcore %u for core %u", lcore, cpu);
> +}
> +
> +void
> +dpdk_uninit_thread_context(void)
> +{
> + if (RTE_PER_LCORE(_lcore_id) == NON_PMD_CORE_ID) {
> + return;
> + }
> +
> + ovs_mutex_lock(&lcore_id_pool_mutex);
> + id_pool_free_id(lcore_id_pool, RTE_PER_LCORE(_lcore_id));
> + ovs_mutex_unlock(&lcore_id_pool_mutex);
> }
>
> void
> diff --git a/lib/dpdk.h b/lib/dpdk.h
> index 736a64279..404ac1a4b 100644
> --- a/lib/dpdk.h
> +++ b/lib/dpdk.h
> @@ -36,7 +36,8 @@ struct smap;
> struct ovsrec_open_vswitch;
>
> void dpdk_init(const struct smap *ovs_other_config);
> -void dpdk_set_lcore_id(unsigned cpu);
> +void dpdk_init_thread_context(unsigned cpu);
> +void dpdk_uninit_thread_context(void);
> const char *dpdk_get_vhost_sock_dir(void);
> bool dpdk_vhost_iommu_enabled(void);
> bool dpdk_vhost_postcopy_enabled(void);
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 5142bad1d..c40031a78 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -5472,7 +5472,7 @@ pmd_thread_main(void *f_)
> /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
> ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
> ovs_numa_thread_setaffinity_core(pmd->core_id);
> - dpdk_set_lcore_id(pmd->core_id);
> + dpdk_init_thread_context(pmd->core_id);
> poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
> dfc_cache_init(&pmd->flow_cache);
> pmd_alloc_static_tx_qid(pmd);
> @@ -5592,6 +5592,7 @@ reload:
> dfc_cache_uninit(&pmd->flow_cache);
> free(poll_list);
> pmd_free_cached_ports(pmd);
> + dpdk_uninit_thread_context();
> return NULL;
> }
More information about the dev
mailing list