[ovs-dev] [PATCH v2] dpdk: Support running PMD threads on cores > RTE_MAX_LCORE.

David Marchand david.marchand at redhat.com
Tue Dec 10 10:10:12 UTC 2019


Most DPDK components make the assumption that rte_lcore_id() returns
a valid lcore_id in [0..RTE_MAX_LCORE[ range (with the exception of
the LCORE_ID_ANY special value).

OVS does not currently check which value is set in
RTE_PER_LCORE(_lcore_id) which exposes us to potential crashes on DPDK
side.

Introduce a lcore allocator in OVS for PMD threads and map them to
unused lcores from DPDK à la --lcores.

The physical cores on which the PMD threads are running still
constitutes an important information when debugging, so still keep
those in the PMD thread names but add a new debug log when starting
them.
Mapping between OVS threads and DPDK lcores can be dumped with a new
dpdk/dump-lcores command.

Synchronize DPDK internals on numa and cpuset for the PMD threads by
registering them via the rte_thread_set_affinity() helper.

Signed-off-by: David Marchand <david.marchand at redhat.com>
---
Changelog since v1:
- rewired existing configuration 'dpdk-lcore-mask' to use --lcores,
- switched to a bitmap to track lcores,
- added a command to dump current mapping (Flavio): used an experimental
  API to get DPDK lcores cpuset since it is the most reliable/portable
  information,
- used the same code for the logs when starting DPDK/PMD threads,
- addressed Ilya comments,

---
 lib/dpdk-stub.c   |   8 ++-
 lib/dpdk.c        | 163 ++++++++++++++++++++++++++++++++++++++++++++--
 lib/dpdk.h        |   5 +-
 lib/dpif-netdev.c |   3 +-
 4 files changed, 170 insertions(+), 9 deletions(-)

diff --git a/lib/dpdk-stub.c b/lib/dpdk-stub.c
index c332c217c..90473bc8e 100644
--- a/lib/dpdk-stub.c
+++ b/lib/dpdk-stub.c
@@ -39,7 +39,13 @@ dpdk_init(const struct smap *ovs_other_config)
 }
 
 void
-dpdk_set_lcore_id(unsigned cpu OVS_UNUSED)
+dpdk_init_thread_context(unsigned cpu OVS_UNUSED)
+{
+    /* Nothing */
+}
+
+void
+dpdk_uninit_thread_context(void)
 {
     /* Nothing */
 }
diff --git a/lib/dpdk.c b/lib/dpdk.c
index 37ea2973c..0173366a0 100644
--- a/lib/dpdk.c
+++ b/lib/dpdk.c
@@ -30,6 +30,7 @@
 #include <rte_pdump.h>
 #endif
 
+#include "bitmap.h"
 #include "dirs.h"
 #include "fatal-signal.h"
 #include "netdev-dpdk.h"
@@ -39,6 +40,7 @@
 #include "ovs-numa.h"
 #include "smap.h"
 #include "svec.h"
+#include "unixctl.h"
 #include "util.h"
 #include "vswitch-idl.h"
 
@@ -54,6 +56,9 @@ static bool dpdk_initialized = false; /* Indicates successful initialization
                                        * of DPDK. */
 static bool per_port_memory = false; /* Status of per port memory support */
 
+static struct ovs_mutex lcore_bitmap_mutex = OVS_MUTEX_INITIALIZER;
+static unsigned long *lcore_bitmap OVS_GUARDED_BY(lcore_bitmap_mutex);
+
 static int
 process_vhost_flags(char *flag, const char *default_val, int size,
                     const struct smap *ovs_other_config,
@@ -94,6 +99,38 @@ args_contains(const struct svec *args, const char *value)
     return false;
 }
 
+static void
+construct_dpdk_lcore_option(const struct smap *ovs_other_config,
+                            struct svec *args)
+{
+    const char *cmask = smap_get(ovs_other_config, "dpdk-lcore-mask");
+    struct svec lcores = SVEC_EMPTY_INITIALIZER;
+    struct ovs_numa_info_core *core;
+    struct ovs_numa_dump *cores;
+    int index = 0;
+
+    if (!cmask) {
+        return;
+    }
+    if (args_contains(args, "-c") || args_contains(args, "-l") ||
+        args_contains(args, "--lcores")) {
+                VLOG_WARN("Ignoring database defined option 'dpdk-lcore-mask' "
+                          "due to dpdk-extra config");
+        return;
+    }
+
+    cores = ovs_numa_dump_cores_with_cmask(cmask);
+    FOR_EACH_CORE_ON_DUMP(core, cores) {
+        svec_add_nocopy(&lcores, xasprintf("%d@%d", index, core->core_id));
+        index++;
+    }
+    svec_terminate(&lcores);
+    ovs_numa_dump_destroy(cores);
+    svec_add(args, "--lcores");
+    svec_add_nocopy(args, svec_join(&lcores, ",", ""));
+    svec_destroy(&lcores);
+}
+
 static void
 construct_dpdk_options(const struct smap *ovs_other_config, struct svec *args)
 {
@@ -103,7 +140,6 @@ construct_dpdk_options(const struct smap *ovs_other_config, struct svec *args)
         bool default_enabled;
         const char *default_value;
     } opts[] = {
-        {"dpdk-lcore-mask",   "-c",             false, NULL},
         {"dpdk-hugepage-dir", "--huge-dir",     false, NULL},
         {"dpdk-socket-limit", "--socket-limit", false, NULL},
     };
@@ -224,6 +260,7 @@ construct_dpdk_args(const struct smap *ovs_other_config, struct svec *args)
         svec_parse_words(args, extra_configuration);
     }
 
+    construct_dpdk_lcore_option(ovs_other_config, args);
     construct_dpdk_options(ovs_other_config, args);
     construct_dpdk_mutex_options(ovs_other_config, args);
 }
@@ -264,6 +301,58 @@ static cookie_io_functions_t dpdk_log_func = {
     .write = dpdk_log_write,
 };
 
+static void
+dpdk_dump_lcore(struct ds *ds, unsigned lcore)
+{
+    struct svec cores = SVEC_EMPTY_INITIALIZER;
+    rte_cpuset_t cpuset;
+    unsigned cpu;
+
+    cpuset = rte_lcore_cpuset(lcore);
+    for (cpu = 0; cpu < CPU_SETSIZE; cpu++) {
+        if (!CPU_ISSET(cpu, &cpuset)) {
+            continue;
+        }
+        svec_add_nocopy(&cores, xasprintf("%u", cpu));
+    }
+    svec_terminate(&cores);
+    ds_put_format(ds, "lcore%u (%s) is running on core %s\n", lcore,
+                  rte_eal_lcore_role(lcore) != ROLE_OFF ? "DPDK" : "OVS",
+                  svec_join(&cores, ",", ""));
+    svec_destroy(&cores);
+}
+
+static void
+dpdk_dump_lcores(struct unixctl_conn *conn, int argc, const char *argv[],
+                 void *aux OVS_UNUSED)
+{
+    struct ds ds = DS_EMPTY_INITIALIZER;
+    unsigned lcore;
+
+    ovs_mutex_lock(&lcore_bitmap_mutex);
+    if (lcore_bitmap == NULL) {
+        unixctl_command_reply_error(conn, "DPDK has not been initialised");
+        goto out;
+    }
+    if (argc > 1) {
+        if (!str_to_uint(argv[1], 0, &lcore) || lcore >= RTE_MAX_LCORE ||
+            !bitmap_is_set(lcore_bitmap, lcore)) {
+            unixctl_command_reply_error(conn, "incorrect lcoreid");
+            goto out;
+        }
+        dpdk_dump_lcore(&ds, lcore);
+    } else for (lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {
+        if (!bitmap_is_set(lcore_bitmap, lcore)) {
+            continue;
+        }
+        dpdk_dump_lcore(&ds, lcore);
+    }
+    unixctl_command_reply(conn, ds_cstr(&ds));
+    ds_destroy(&ds);
+out:
+    ovs_mutex_unlock(&lcore_bitmap_mutex);
+}
+
 static bool
 dpdk_init__(const struct smap *ovs_other_config)
 {
@@ -345,7 +434,8 @@ dpdk_init__(const struct smap *ovs_other_config)
         }
     }
 
-    if (args_contains(&args, "-c") || args_contains(&args, "-l")) {
+    if (args_contains(&args, "-c") || args_contains(&args, "-l") ||
+        args_contains(&args, "--lcores")) {
         auto_determine = false;
     }
 
@@ -371,8 +461,8 @@ dpdk_init__(const struct smap *ovs_other_config)
              * thread affintity - default to core #0 */
             VLOG_ERR("Thread getaffinity failed. Using core #0");
         }
-        svec_add(&args, "-l");
-        svec_add_nocopy(&args, xasprintf("%d", cpu));
+        svec_add(&args, "--lcores");
+        svec_add_nocopy(&args, xasprintf("0@%d", cpu));
     }
 
     svec_terminate(&args);
@@ -428,6 +518,24 @@ dpdk_init__(const struct smap *ovs_other_config)
         }
     }
 
+    ovs_mutex_lock(&lcore_bitmap_mutex);
+    lcore_bitmap = bitmap_allocate(RTE_MAX_LCORE);
+    /* Mark DPDK threads. */
+    for (uint32_t lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {
+        struct ds ds = DS_EMPTY_INITIALIZER;
+
+        if (rte_eal_lcore_role(lcore) == ROLE_OFF) {
+             continue;
+        }
+        bitmap_set1(lcore_bitmap, lcore);
+        dpdk_dump_lcore(&ds, lcore);
+        VLOG_INFO("%s", ds_cstr(&ds));
+        ds_destroy(&ds);
+    }
+    unixctl_command_register("dpdk/dump-lcores", "[lcore]", 0, 1,
+                             dpdk_dump_lcores, NULL);
+    ovs_mutex_unlock(&lcore_bitmap_mutex);
+
     /* We are called from the main thread here */
     RTE_PER_LCORE(_lcore_id) = NON_PMD_CORE_ID;
 
@@ -512,11 +620,54 @@ dpdk_available(void)
 }
 
 void
-dpdk_set_lcore_id(unsigned cpu)
+dpdk_init_thread_context(unsigned cpu)
 {
+    struct ds ds = DS_EMPTY_INITIALIZER;
+    rte_cpuset_t cpuset;
+    unsigned lcore;
+
     /* NON_PMD_CORE_ID is reserved for use by non pmd threads. */
     ovs_assert(cpu != NON_PMD_CORE_ID);
-    RTE_PER_LCORE(_lcore_id) = cpu;
+
+    ovs_mutex_lock(&lcore_bitmap_mutex);
+    if (lcore_bitmap == NULL) {
+        lcore = NON_PMD_CORE_ID;
+    } else {
+        lcore = bitmap_scan(lcore_bitmap, 0, 0, RTE_MAX_LCORE);
+        if (lcore == RTE_MAX_LCORE) {
+            VLOG_WARN("Reached maximum number of DPDK lcores, core %u will "
+                      "have lower performance", cpu);
+            lcore = NON_PMD_CORE_ID;
+        } else {
+            bitmap_set1(lcore_bitmap, lcore);
+        }
+    }
+    ovs_mutex_unlock(&lcore_bitmap_mutex);
+
+    RTE_PER_LCORE(_lcore_id) = lcore;
+
+    if (lcore == NON_PMD_CORE_ID) {
+        return;
+    }
+
+    CPU_ZERO(&cpuset);
+    CPU_SET(cpu, &cpuset);
+    rte_thread_set_affinity(&cpuset);
+    dpdk_dump_lcore(&ds, lcore);
+    VLOG_INFO("%s", ds_cstr(&ds));
+    ds_destroy(&ds);
+}
+
+void
+dpdk_uninit_thread_context(void)
+{
+    if (RTE_PER_LCORE(_lcore_id) == NON_PMD_CORE_ID) {
+        return;
+    }
+
+    ovs_mutex_lock(&lcore_bitmap_mutex);
+    bitmap_set0(lcore_bitmap, RTE_PER_LCORE(_lcore_id));
+    ovs_mutex_unlock(&lcore_bitmap_mutex);
 }
 
 void
diff --git a/lib/dpdk.h b/lib/dpdk.h
index 736a64279..eab7a926b 100644
--- a/lib/dpdk.h
+++ b/lib/dpdk.h
@@ -22,7 +22,9 @@
 #ifdef DPDK_NETDEV
 
 #include <rte_config.h>
+#define ALLOW_EXPERIMENTAL_API
 #include <rte_lcore.h>
+#undef ALLOW_EXPERIMENTAL_API
 
 #define NON_PMD_CORE_ID LCORE_ID_ANY
 
@@ -36,7 +38,8 @@ struct smap;
 struct ovsrec_open_vswitch;
 
 void dpdk_init(const struct smap *ovs_other_config);
-void dpdk_set_lcore_id(unsigned cpu);
+void dpdk_init_thread_context(unsigned cpu);
+void dpdk_uninit_thread_context(void);
 const char *dpdk_get_vhost_sock_dir(void);
 bool dpdk_vhost_iommu_enabled(void);
 bool dpdk_vhost_postcopy_enabled(void);
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 7ebf4ef87..91e98919c 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -5470,7 +5470,7 @@ pmd_thread_main(void *f_)
     /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
     ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
     ovs_numa_thread_setaffinity_core(pmd->core_id);
-    dpdk_set_lcore_id(pmd->core_id);
+    dpdk_init_thread_context(pmd->core_id);
     poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
     dfc_cache_init(&pmd->flow_cache);
     pmd_alloc_static_tx_qid(pmd);
@@ -5590,6 +5590,7 @@ reload:
     dfc_cache_uninit(&pmd->flow_cache);
     free(poll_list);
     pmd_free_cached_ports(pmd);
+    dpdk_uninit_thread_context();
     return NULL;
 }
 
-- 
2.23.0



More information about the dev mailing list