[ovs-dev] [PATCH v7 1/1] ovs-numa: Support non-contiguous numa nodes and offline CPU cores

David Wilder dwilder at us.ibm.com
Tue Jun 22 18:53:08 UTC 2021


This change removes the assumption that NUMA nodes and cores are numbered
contiguously in Linux.  This is required to support some Power systems.

A check has been added to verify that cores are online; offline cores
result in non-contiguously numbered core IDs.
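
The patch treats a core as usable only if its sysfs topology/core_id entry
exists (cpu_detected() below); offline or absent cores lack that entry.  A
minimal standalone sketch of the same check, for illustration only and not
part of the patch:

#include <limits.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
    /* Report which of the first 16 cpu ids are present and online, using
     * the same sysfs test as cpu_detected(): an offline core has no
     * topology/core_id entry. */
    for (unsigned int cpu = 0; cpu < 16; cpu++) {
        char path[PATH_MAX];

        snprintf(path, sizeof path,
                 "/sys/devices/system/cpu/cpu%u/topology/core_id", cpu);
        printf("cpu%u: %s\n", cpu,
               access(path, F_OK) ? "offline/absent" : "online");
    }
    return 0;
}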

DPDK EAL option generation is updated to work with non-contiguous NUMA nodes.
These options can be seen in ovs-vswitchd.log.  For example, a system
containing only NUMA nodes 0 and 8 will generate the following:

EAL ARGS: ovs-vswitchd --socket-mem 1024,0,0,0,0,0,0,0,1024 \
                         --socket-limit 1024,0,0,0,0,0,0,0,1024 -l 0
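
The string is built by padding each gap in the node numbering with "0".  A
simplified, standalone sketch of that padding (the format_socket_mem()
helper and fixed-size buffer are illustrative only; the real
construct_dpdk_socket_mem() below builds the node list with
ovs_numa_dump_n_cores_per_numa(), sorts it, and appends to a dynamic
string):

#include <stdio.h>
#include <string.h>

/* Emit 'def_value' for each numa node present in the sorted 'nodes' list
 * and "0" for every gap in the numbering, e.g. nodes {0, 8} give
 * "1024,0,0,0,0,0,0,0,1024". */
static void
format_socket_mem(const int *nodes, int n_nodes, const char *def_value,
                  char *buf, size_t size)
{
    int next = 0;               /* Next socket id expected in the output. */

    buf[0] = '\0';
    for (int i = 0; i < n_nodes; i++) {
        for (; next < nodes[i]; next++) {
            snprintf(buf + strlen(buf), size - strlen(buf), "%s0",
                     next ? "," : "");
        }
        snprintf(buf + strlen(buf), size - strlen(buf), "%s%s",
                 next ? "," : "", def_value);
        next++;
    }
}

int
main(void)
{
    const int nodes[] = { 0, 8 };   /* e.g. a Power box with nodes 0 and 8. */
    char buf[128];

    format_socket_mem(nodes, 2, "1024", buf, sizeof buf);
    printf("--socket-mem %s\n", buf);    /* 1024,0,0,0,0,0,0,0,1024 */
    return 0;
}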

Tests for pmd and dpif-netdev have been updated to validate non-contiguously
numbered nodes.

Signed-off-by: David Wilder <dwilder at us.ibm.com>
---
 lib/dpdk.c           | 57 +++++++++++++++++++++++++++++++++++------
 lib/ovs-numa.c       | 51 ++++++++++++++++++++++++------------
 lib/ovs-numa.h       |  2 ++
 tests/dpif-netdev.at |  2 +-
 tests/pmd.at         | 61 ++++++++++++++++++++++++++++++++++++++++----
 5 files changed, 142 insertions(+), 31 deletions(-)

diff --git a/lib/dpdk.c b/lib/dpdk.c
index 2eaaa569c..238d0fffb 100644
--- a/lib/dpdk.c
+++ b/lib/dpdk.c
@@ -130,22 +130,63 @@ construct_dpdk_options(const struct smap *ovs_other_config, struct svec *args)
     }
 }
 
+static int
+compare_numa_node_list(const void *a_, const void *b_)
+{
+    int a = *(const int *) a_;
+    int b = *(const int *) b_;
+
+    if (a < b) {
+        return -1;
+    }
+    if (a > b) {
+        return 1;
+    }
+    return 0;
+}
+
 static char *
 construct_dpdk_socket_mem(void)
 {
     const char *def_value = "1024";
-    int numa, numa_nodes = ovs_numa_get_n_numas();
+    struct ovs_numa_dump *dump;
+    const struct ovs_numa_info_numa *node;
+    int k = 0, last_node = 0, n_numa_nodes, *numa_node_list;
     struct ds dpdk_socket_mem = DS_EMPTY_INITIALIZER;
 
-    if (numa_nodes == 0 || numa_nodes == OVS_NUMA_UNSPEC) {
-        numa_nodes = 1;
-    }
+    /* Build a list of all numa nodes with at least one core. */
+    dump = ovs_numa_dump_n_cores_per_numa(1);
+    n_numa_nodes = hmap_count(&dump->numas);
+    numa_node_list = xcalloc(n_numa_nodes, sizeof *numa_node_list);
 
-    ds_put_cstr(&dpdk_socket_mem, def_value);
-    for (numa = 1; numa < numa_nodes; ++numa) {
-        ds_put_format(&dpdk_socket_mem, ",%s", def_value);
+    FOR_EACH_NUMA_ON_DUMP(node, dump) {
+        if (k >= n_numa_nodes) {
+            break;
+        }
+        numa_node_list[k++] = node->numa_id;
     }
-
+    qsort(numa_node_list, k, sizeof *numa_node_list, compare_numa_node_list);
+
+    for (int i = 0; i < n_numa_nodes; i++) {
+        while (numa_node_list[i] > last_node &&
+               numa_node_list[i] != OVS_NUMA_UNSPEC &&
+               numa_node_list[i] <= MAX_NUMA_NODES) {
+            if (last_node == 0) {
+                ds_put_format(&dpdk_socket_mem, "%s", "0");
+            } else {
+                ds_put_format(&dpdk_socket_mem, ",%s", "0");
+            }
+            last_node++;
+        }
+        if (numa_node_list[i] == 0) {
+            ds_put_format(&dpdk_socket_mem, "%s", def_value);
+        } else {
+            ds_put_format(&dpdk_socket_mem, ",%s", def_value);
+        }
+        last_node++;
+    }
+    free(numa_node_list);
+    ovs_numa_dump_destroy(dump);
     return ds_cstr(&dpdk_socket_mem);
 }
 
diff --git a/lib/ovs-numa.c b/lib/ovs-numa.c
index 6d0a68522..b825ecbdd 100644
--- a/lib/ovs-numa.c
+++ b/lib/ovs-numa.c
@@ -42,21 +42,22 @@ VLOG_DEFINE_THIS_MODULE(ovs_numa);
  * This module stores the affinity information of numa nodes and cpu cores.
  * It also provides functions to bookkeep the pin of threads on cpu cores.
  *
- * It is assumed that the numa node ids and cpu core ids all start from 0 and
- * range continuously.  So, for example, if 'ovs_numa_get_n_cores()' returns N,
- * user can assume core ids from 0 to N-1 are all valid and there is a
- * 'struct cpu_core' for each id.
+ * It is assumed that the numa node ids and cpu core ids all start from 0.
+ * There is no guarantee that node and cpu ids are numbered consecutively
+ * (this is a change from earlier versions of the code).  So, for example,
+ * if two nodes exist with ids 0 and 8, 'ovs_numa_get_n_numas()' will
+ * return 2, but no assumption about node numbering should be made.
  *
  * NOTE, this module should only be used by the main thread.
  *
- * NOTE, the assumption above will fail when cpu hotplug is used.  In that
- * case ovs-numa will not function correctly.  For now, add a TODO entry
- * for addressing it in the future.
+ * NOTE, if cpu hotplug is used, 'all_numa_nodes' and 'all_cpu_cores' must be
+ * invalidated whenever the system topology changes.  Support for detecting
+ * topology changes has not been included.  For now, add a TODO entry for
+ * addressing it in the future.
  *
  * TODO: Fix ovs-numa when cpu hotplug is used.
  */
 
-#define MAX_NUMA_NODES 128
 
 /* numa node. */
 struct numa_node {
@@ -130,15 +131,14 @@ insert_new_cpu_core(struct numa_node *n, unsigned core_id)
  * - "0,0,0,0": four cores on numa socket 0.
  * - "0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1": 16 cores on two numa sockets.
  * - "0,0,0,0,1,1,1,1": 8 cores on two numa sockets.
- *
- * The different numa ids must be consecutives or the function will abort. */
+ * - "0,0,0,0,8,8,8,8": 8 cores on two numa sockets, non-contiguous.
+ */
 static void
 discover_numa_and_core_dummy(void)
 {
     char *conf = xstrdup(dummy_config);
     char *id, *saveptr = NULL;
     unsigned i = 0;
-    long max_numa_id = 0;
 
     for (id = strtok_r(conf, ",", &saveptr); id;
          id = strtok_r(NULL, ",", &saveptr)) {
@@ -152,8 +152,6 @@ discover_numa_and_core_dummy(void)
             continue;
         }
 
-        max_numa_id = MAX(max_numa_id, numa_id);
-
         hnode = hmap_first_with_hash(&all_numa_nodes, hash_int(numa_id, 0));
 
         if (hnode) {
@@ -169,10 +167,27 @@ discover_numa_and_core_dummy(void)
 
     free(conf);
 
-    if (max_numa_id + 1 != hmap_count(&all_numa_nodes)) {
-        ovs_fatal(0, "dummy numa contains non consecutive numa ids");
+}
+
+#ifdef __linux__
+/* Check if a cpu is detected and online. */
+static int
+cpu_detected(unsigned int core_id)
+{
+    char path[PATH_MAX];
+    int len = snprintf(path, sizeof(path),
+                       "/sys/devices/system/cpu/cpu%d/topology/core_id",
+                       core_id);
+    if (len <= 0 || (unsigned) len >= sizeof(path)) {
+        return 0;
+    }
+    if (access(path, F_OK) != 0) {
+        return 0;
     }
+
+    return 1;
 }
+#endif /* __linux__ */
 
 /* Discovers all numa nodes and the corresponding cpu cores.
  * Constructs the 'struct numa_node' and 'struct cpu_core'. */
@@ -219,7 +234,9 @@ discover_numa_and_core(void)
                     unsigned core_id;
 
                     core_id = strtoul(subdir->d_name + 3, NULL, 10);
-                    insert_new_cpu_core(n, core_id);
+                    if (cpu_detected(core_id)) {
+                        insert_new_cpu_core(n, core_id);
+                    }
                 }
             }
             closedir(dir);
@@ -229,7 +246,7 @@ discover_numa_and_core(void)
         }
 
         free(path);
-        if (!dir || !numa_supported) {
+        if (!numa_supported) {
             break;
         }
     }
diff --git a/lib/ovs-numa.h b/lib/ovs-numa.h
index 8f2ea3430..ecc251a7f 100644
--- a/lib/ovs-numa.h
+++ b/lib/ovs-numa.h
@@ -26,6 +26,8 @@
 #define OVS_CORE_UNSPEC INT_MAX
 #define OVS_NUMA_UNSPEC INT_MAX
 
+#define MAX_NUMA_NODES 128
+
 /* Dump of a list of 'struct ovs_numa_info'. */
 struct ovs_numa_dump {
     struct hmap cores;
diff --git a/tests/dpif-netdev.at b/tests/dpif-netdev.at
index 16402ebae..53eee185a 100644
--- a/tests/dpif-netdev.at
+++ b/tests/dpif-netdev.at
@@ -98,7 +98,7 @@ m4_define([DPIF_NETDEV_DUMMY_IFACE],
                      fail-mode=secure -- \
       add-port br1 p2 -- set interface p2 type=$1 options:stream=unix:$OVS_RUNDIR/p0.sock ofport_request=2 -- \
       add-port br1 p8 -- set interface p8 ofport_request=8 type=$1 --], [], [],
-      [m4_if([$1], [dummy-pmd], [--dummy-numa="0,0,0,0,1,1,1,1"], [])])
+      [m4_if([$1], [dummy-pmd], [--dummy-numa="0,0,0,0,8,8,8,8"], [])])
    AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg])
 
    AT_CHECK([ovs-ofctl add-flow br0 action=normal])
diff --git a/tests/pmd.at b/tests/pmd.at
index cc5371d5a..faf02f158 100644
--- a/tests/pmd.at
+++ b/tests/pmd.at
@@ -361,8 +361,8 @@ AT_SETUP([PMD - change numa node])
 OVS_VSWITCHD_START(
   [add-port br0 p1 -- set Interface p1 type=dummy-pmd ofport_request=1 options:n_rxq=2 -- \
    add-port br0 p2 -- set Interface p2 type=dummy-pmd ofport_request=2 options:n_rxq=2 -- \
-   set Open_vSwitch . other_config:pmd-cpu-mask=3
-], [], [], [--dummy-numa 0,1])
+   set Open_vSwitch . other_config:pmd-cpu-mask=7
+], [], [], [--dummy-numa 0,1,8])
 AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg])
 
 AT_CHECK([ovs-ofctl add-flow br0 action=controller])
@@ -432,6 +432,40 @@ NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=106 in_port=2 (via action) data_l
 icmp,vlan_tci=0x0000,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,nw_src=10.0.0.2,nw_dst=10.0.0.1,nw_tos=0,nw_ecn=0,nw_ttl=64,icmp_type=8,icmp_code=0 icmp_csum:13fc
 ])
 
+AT_CHECK([ovs-vsctl set Interface p1 options:numa_id=8])
+
+AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
+p1 0 8 2
+p1 1 8 2
+p2 0 1 1
+p2 1 1 1
+])
+
+AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl --detach --no-chdir --pidfile 2> ofctl_monitor.log])
+
+AT_CHECK([ovs-appctl netdev-dummy/receive p1 --qid 1 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)'])
+
+OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 2])
+OVS_WAIT_UNTIL([ovs-appctl -t ovs-ofctl exit])
+
+AT_CHECK([cat ofctl_monitor.log], [0], [dnl
+NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=106 in_port=1 (via action) data_len=106 (unbuffered)
+icmp,vlan_tci=0x0000,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,nw_src=10.0.0.2,nw_dst=10.0.0.1,nw_tos=0,nw_ecn=0,nw_ttl=64,icmp_type=8,icmp_code=0 icmp_csum:13fc
+])
+
+AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl --detach --no-chdir --pidfile 2> ofctl_monitor.log])
+
+AT_CHECK([ovs-appctl netdev-dummy/receive p2 --qid 0 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)'])
+
+OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 2])
+OVS_WAIT_UNTIL([ovs-appctl -t ovs-ofctl exit])
+
+AT_CHECK([cat ofctl_monitor.log], [0], [dnl
+NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=106 in_port=2 (via action) data_len=106 (unbuffered)
+icmp,vlan_tci=0x0000,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,nw_src=10.0.0.2,nw_dst=10.0.0.1,nw_tos=0,nw_ecn=0,nw_ttl=64,icmp_type=8,icmp_code=0 icmp_csum:13fc
+])
+
+
 OVS_VSWITCHD_STOP
 AT_CLEANUP
 
@@ -584,7 +618,7 @@ AT_CLEANUP
 
 AT_SETUP([PMD - rxq affinity - NUMA])
 OVS_VSWITCHD_START(
-  [], [], [], [--dummy-numa 0,0,0,1,1])
+  [], [], [], [--dummy-numa 0,0,0,1,1,8,8])
 AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg])
 
 AT_CHECK([ovs-ofctl add-flow br0 actions=controller])
@@ -601,21 +635,38 @@ p1 1 0 2
 
 AT_CHECK([ovs-vsctl set Interface p1 other_config:pmd-rxq-affinity="0:3,1:4"])
 
-dnl We moved the queues to different numa node. Expecting threads on
+dnl We moved the queues to different contiguous numa node. Expecting threads on
 dnl NUMA node 1 to be created.
 AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
 p1 0 1 3
 p1 1 1 4
 ])
 
+AT_CHECK([ovs-vsctl set Interface p1 other_config:pmd-rxq-affinity="0:5,1:6"])
+
+dnl We moved the queues to different non-contiguous numa node. Expecting threads on
+dnl NUMA node 8 to be created.
+AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
+p1 0 8 5
+p1 1 8 6
+])
+
 AT_CHECK([ovs-vsctl set Interface p1 other_config:pmd-rxq-affinity="0:3,1:1"])
 
-dnl Queues splitted between NUMA nodes.
+dnl Queues splitted between contiguous NUMA nodes.
 AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
 p1 0 1 3
 p1 1 0 1
 ])
 
+AT_CHECK([ovs-vsctl set Interface p1 other_config:pmd-rxq-affinity="0:5,1:1"])
+
+dnl Queues splitted between non-contiguous NUMA nodes.
+AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
+p1 0 8 5
+p1 1 0 1
+])
+
 AT_CHECK([ovs-vsctl remove Interface p1 other_config pmd-rxq-affinity])
 
 dnl We removed the rxq-affinity request.  dpif-netdev should assign queues
-- 
2.27.0


