[ovs-dev] [PATCH] dpdk: Support running PMD threads on cores > RTE_MAX_LCORE.

Ilya Maximets i.maximets at ovn.org
Wed Dec 4 21:22:52 UTC 2019


On 03.12.2019 14:00, David Marchand wrote:
> On Tue, Dec 3, 2019 at 12:34 PM Ilya Maximets <i.maximets at ovn.org> wrote:
>>
>> On 02.12.2019 17:03, David Marchand wrote:
>>> Most DPDK components make the assumption that rte_lcore_id() returns
>>> a valid lcore_id in [0..RTE_MAX_LCORE] range (with the exception of
>>> the LCORE_ID_ANY special value).
>>
>> Do you think that makes a practical sense?  In a real virtualization
>> environments users are usually using cpus with lower numbers and it's
>> most likely possible to change NUMA layout to have CPUs from all the
>> nodes enumerated from the low CPU numbers.
> 
> That is if you can reconfigure the NUMA layout.
> How do you achieve this?
> Bios/firmware configuration?

It's usually a BIOS configuration.  Modern servers provide configuration
options for 2-4 types of core enumeration, along with a choice of NUMA topology.

> Kernel boot options (did not find)?
> 
> I imagine it would be a pain to enforce this on a fleet of servers
> with different hw specs.

From my experience, you almost never could just take a new server and
run your virtualization software.  You'll have to enable some HW
capabilities or enable "Highly Optimized Virtualization Configuration
Set" in BIOS anyway.


> 
> 
>> It's uncommon also to use all the CPUs for just OVS on a big system
>> with huge number of cores.
> 
> Yes, obviously, you don't want to use all your cores for OVS :-).
> 
> 
>>
>> Another thing is can we really do this on a DPDK level?  Maybe it'll
>> be a step to dynamic registering/unregistering of non-EAL threads?
> 
> I'd like to ultimately get to a dynamic register mechanism.
> But I first wanted to fix the existing code...
> 
> 
>> Since you're wiping off the meaning of a lcore as a CPU core, it
>> becomes just a thread_id in a DPDK point of view.  So, maybe application
>> could call a new API function instead of managing these strange
>> mappings by itself?
> 
> ... as it is unlikely we will backport an experimental API to previous
> dpdk versions.
> 
> 
>>
>> One comment inline (not a full review).
>>
>> Best regards, Ilya Maximets.
>>
>>>
>>> OVS does not currently check which value is set in
>>> RTE_PER_LCORE(_lcore_id) which exposes us to potential crashes on DPDK
>>> side.
>>>
>>> Introduce a lcore allocator in OVS for PMD threads and map them to
>>> unused lcores from DPDK à la --lcores.
>>>
>>> The physical cores on which the PMD threads are running still
>>> constitutes an important information when debugging, so still keep
>>> those in the PMD thread names but add a new debug log when starting
>>> them.
>>>
>>> Synchronize DPDK internals on numa and cpuset for the PMD threads by
>>> registering them via the rte_thread_set_affinity() helper.
>>>
>>> Signed-off-by: David Marchand <david.marchand at redhat.com>
>>> ---
>>>  lib/dpdk-stub.c   |  8 +++++-
>>>  lib/dpdk.c        | 69 +++++++++++++++++++++++++++++++++++++++++++----
>>>  lib/dpdk.h        |  3 ++-
>>>  lib/dpif-netdev.c |  3 ++-
>>>  4 files changed, 75 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/lib/dpdk-stub.c b/lib/dpdk-stub.c
>>> index c332c217c..90473bc8e 100644
>>> --- a/lib/dpdk-stub.c
>>> +++ b/lib/dpdk-stub.c
>>> @@ -39,7 +39,13 @@ dpdk_init(const struct smap *ovs_other_config)
>>>  }
>>>
>>>  void
>>> -dpdk_set_lcore_id(unsigned cpu OVS_UNUSED)
>>> +dpdk_init_thread_context(unsigned cpu OVS_UNUSED)
>>> +{
>>> +    /* Nothing */
>>> +}
>>> +
>>> +void
>>> +dpdk_uninit_thread_context(void)
>>>  {
>>>      /* Nothing */
>>>  }
>>> diff --git a/lib/dpdk.c b/lib/dpdk.c
>>> index 21dd47e80..771baa413 100644
>>> --- a/lib/dpdk.c
>>> +++ b/lib/dpdk.c
>>> @@ -33,6 +33,7 @@
>>>
>>>  #include "dirs.h"
>>>  #include "fatal-signal.h"
>>> +#include "id-pool.h"
>>>  #include "netdev-dpdk.h"
>>>  #include "netdev-offload-provider.h"
>>>  #include "openvswitch/dynamic-string.h"
>>> @@ -55,6 +56,9 @@ static bool dpdk_initialized = false; /* Indicates successful initialization
>>>                                         * of DPDK. */
>>>  static bool per_port_memory = false; /* Status of per port memory support */
>>>
>>> +static struct id_pool *lcore_id_pool;

OVS_GUARDED_BY() would be nice.

>>> +static struct ovs_mutex lcore_id_pool_mutex = OVS_MUTEX_INITIALIZER;
>>> +
>>>  static int
>>>  process_vhost_flags(char *flag, const char *default_val, int size,
>>>                      const struct smap *ovs_other_config,
>>> @@ -346,7 +350,8 @@ dpdk_init__(const struct smap *ovs_other_config)
>>>          }
>>>      }
>>>
>>> -    if (args_contains(&args, "-c") || args_contains(&args, "-l")) {
>>> +    if (args_contains(&args, "-c") || args_contains(&args, "-l") ||
>>> +        args_contains(&args, "--lcores")) {
>>>          auto_determine = false;
>>>      }
>>>
>>> @@ -372,8 +377,8 @@ dpdk_init__(const struct smap *ovs_other_config)
>>>               * thread affintity - default to core #0 */
>>>              VLOG_ERR("Thread getaffinity failed. Using core #0");
>>>          }
>>> -        svec_add(&args, "-l");
>>> -        svec_add_nocopy(&args, xasprintf("%d", cpu));
>>> +        svec_add(&args, "--lcores");
>>> +        svec_add_nocopy(&args, xasprintf("0@%d", cpu));
>>>      }
>>>
>>>      svec_terminate(&args);
>>> @@ -429,6 +434,23 @@ dpdk_init__(const struct smap *ovs_other_config)
>>>          }
>>>      }
>>>
>>> +    ovs_mutex_lock(&lcore_id_pool_mutex);
>>> +    lcore_id_pool = id_pool_create(0, RTE_MAX_LCORE);
>>> +    /* Empty the whole pool... */
>>> +    for (uint32_t lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {
>>> +        uint32_t lcore_id;
>>> +
>>> +        id_pool_alloc_id(lcore_id_pool, &lcore_id);
>>> +    }
>>> +    /* ...and release the unused spots. */
>>> +    for (uint32_t lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {
>>> +        if (rte_eal_lcore_role(lcore) != ROLE_OFF) {
>>> +             continue;
>>> +        }
>>> +        id_pool_free_id(lcore_id_pool, lcore);
>>> +    }
>>> +    ovs_mutex_unlock(&lcore_id_pool_mutex);
>>> +
>>>      /* We are called from the main thread here */
>>>      RTE_PER_LCORE(_lcore_id) = NON_PMD_CORE_ID;
>>>
>>> @@ -522,11 +544,48 @@ dpdk_available(void)
>>>  }
>>>
>>>  void
>>> -dpdk_set_lcore_id(unsigned cpu)
>>> +dpdk_init_thread_context(unsigned cpu)
>>>  {
>>> +    cpu_set_t cpuset;

I think it should be rte_cpuset_t.
For portability and also to follow DPDK API.

>>> +    unsigned lcore;
>>> +    int err;
>>> +
>>>      /* NON_PMD_CORE_ID is reserved for use by non pmd threads. */
>>>      ovs_assert(cpu != NON_PMD_CORE_ID);
>>> -    RTE_PER_LCORE(_lcore_id) = cpu;
>>> +
>>> +    ovs_mutex_lock(&lcore_id_pool_mutex);
>>> +    if (lcore_id_pool == NULL || !id_pool_alloc_id(lcore_id_pool, &lcore)) {
>>> +        lcore = NON_PMD_CORE_ID;
>>
>> If PMD thread will get NON_PMD_CORE_ID it will smash the non-PMD
>> mempool caches since it doesn't take a 'non_pmd_mutex'.
> 
> Ok, need to look at this again.
> I missed something.
> I don't understand how you would create PMD threads with cpu !=
> NON_PMD_CORE_ID without dpdk enabled but still call the mempool code.

I meant the case where you try to create more PMD threads than RTE_MAX_LCORE.
For example, if RTE_MAX_LCORE == 32 and you want to create 40 PMD threads
(assuming your system has 64 CPUs), 8 PMD threads and non-PMD thread(s)
will have lcore == NON_PMD_CORE_ID == LCORE_ID_ANY.  I re-checked, and
this will not cause sharing of the same mempool caches, because threads
with LCORE_ID_ANY don't use them; however, I think that simultaneous calls
of the DPDK API might still be dangerous in some way.  In any case, performance
will suffer.  At the very least, we should warn about that.

> +    }
> +    ovs_mutex_unlock(&lcore_id_pool_mutex);
> +
> +    RTE_PER_LCORE(_lcore_id) = lcore;
> +
> +    /* DPDK is not initialised, nothing more to do. */
> +    if (lcore == NON_PMD_CORE_ID) {
> +        return;
> +    }
> +
> +    CPU_ZERO(&cpuset);
> +    err = pthread_getaffinity_np(pthread_self(), sizeof cpuset, &cpuset);

We know the core already.  No need to make an additional syscall.
We could just set it with CPU_SET().  This will also help with
portability in the future.

> +    if (err) {
> +        VLOG_ABORT("Thread getaffinity error: %s", ovs_strerror(err));
> +    }
> +
> +    rte_thread_set_affinity(&cpuset);
> +    VLOG_INFO("Initialised lcore %u for core %u", lcore, cpu);
> +}
> +
> +void
> +dpdk_uninit_thread_context(void)
> +{
> +    if (RTE_PER_LCORE(_lcore_id) == NON_PMD_CORE_ID) {
> +        return;
> +    }
> +
> +    ovs_mutex_lock(&lcore_id_pool_mutex);
> +    id_pool_free_id(lcore_id_pool, RTE_PER_LCORE(_lcore_id));
> +    ovs_mutex_unlock(&lcore_id_pool_mutex);
>  }
>  
>  void
> diff --git a/lib/dpdk.h b/lib/dpdk.h
> index 736a64279..404ac1a4b 100644
> --- a/lib/dpdk.h
> +++ b/lib/dpdk.h
> @@ -36,7 +36,8 @@ struct smap;
>  struct ovsrec_open_vswitch;
>  
>  void dpdk_init(const struct smap *ovs_other_config);
> -void dpdk_set_lcore_id(unsigned cpu);
> +void dpdk_init_thread_context(unsigned cpu);
> +void dpdk_uninit_thread_context(void);
>  const char *dpdk_get_vhost_sock_dir(void);
>  bool dpdk_vhost_iommu_enabled(void);
>  bool dpdk_vhost_postcopy_enabled(void);
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 5142bad1d..c40031a78 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -5472,7 +5472,7 @@ pmd_thread_main(void *f_)
>      /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
>      ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
>      ovs_numa_thread_setaffinity_core(pmd->core_id);
> -    dpdk_set_lcore_id(pmd->core_id);
> +    dpdk_init_thread_context(pmd->core_id);
>      poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
>      dfc_cache_init(&pmd->flow_cache);
>      pmd_alloc_static_tx_qid(pmd);
> @@ -5592,6 +5592,7 @@ reload:
>      dfc_cache_uninit(&pmd->flow_cache);
>      free(poll_list);
>      pmd_free_cached_ports(pmd);
> +    dpdk_uninit_thread_context();
>      return NULL;
>  }


More information about the dev mailing list