[ovs-dev] [PATCH v7 1/1] ovs-numa: Support non-contiguous numa nodes and offline CPU cores
David Wilder
dwilder at us.ibm.com
Tue Jun 22 18:53:08 UTC 2021
This change removes the assumption that NUMA nodes and cores are numbered
contiguously in Linux. This change is required to support some Power
systems.
A check has been added to verify that cores are online;
offline cores result in non-contiguous core numbering.
DPDK EAL option generation is updated to work with non-contiguous NUMA nodes.
These options can be seen in the ovs-vswitchd.log. For example,
a system containing only NUMA nodes 0 and 8 will generate the following:
EAL ARGS: ovs-vswitchd --socket-mem 1024,0,0,0,0,0,0,0,1024 \
--socket-limit 1024,0,0,0,0,0,0,0,1024 -l 0
Tests for pmd and dpif-netdev have been updated to validate non-contiguously
numbered NUMA nodes.
Signed-off-by: David Wilder <dwilder at us.ibm.com>
---
lib/dpdk.c | 57 +++++++++++++++++++++++++++++++++++------
lib/ovs-numa.c | 51 ++++++++++++++++++++++++------------
lib/ovs-numa.h | 2 ++
tests/dpif-netdev.at | 2 +-
tests/pmd.at | 61 ++++++++++++++++++++++++++++++++++++++++----
5 files changed, 142 insertions(+), 31 deletions(-)
diff --git a/lib/dpdk.c b/lib/dpdk.c
index 2eaaa569c..238d0fffb 100644
--- a/lib/dpdk.c
+++ b/lib/dpdk.c
@@ -130,22 +130,63 @@ construct_dpdk_options(const struct smap *ovs_other_config, struct svec *args)
}
}
+static int
+compare_numa_node_list(const void *a_, const void *b_)
+{
+ int a = *(const int *) a_;
+ int b = *(const int *) b_;
+
+ if (a < b) {
+ return -1;
+ }
+ if (a > b) {
+ return 1;
+ }
+ return 0;
+}
+
static char *
construct_dpdk_socket_mem(void)
{
const char *def_value = "1024";
- int numa, numa_nodes = ovs_numa_get_n_numas();
+ struct ovs_numa_dump *dump;
+ const struct ovs_numa_info_numa *node;
+ int k = 0, last_node = 0, n_numa_nodes, *numa_node_list;
struct ds dpdk_socket_mem = DS_EMPTY_INITIALIZER;
- if (numa_nodes == 0 || numa_nodes == OVS_NUMA_UNSPEC) {
- numa_nodes = 1;
- }
+ /* Build a list of all numa nodes with at least one core. */
+ dump = ovs_numa_dump_n_cores_per_numa(1);
+ n_numa_nodes = hmap_count(&dump->numas);
+ numa_node_list = xcalloc(n_numa_nodes, sizeof *numa_node_list);
- ds_put_cstr(&dpdk_socket_mem, def_value);
- for (numa = 1; numa < numa_nodes; ++numa) {
- ds_put_format(&dpdk_socket_mem, ",%s", def_value);
+ FOR_EACH_NUMA_ON_DUMP(node, dump) {
+ if (k >= n_numa_nodes) {
+ break;
+ }
+ numa_node_list[k++] = node->numa_id;
}
-
+ qsort(numa_node_list, k, sizeof *numa_node_list, compare_numa_node_list);
+
+ for (int i = 0; i < n_numa_nodes; i++) {
+ while (numa_node_list[i] > last_node &&
+ numa_node_list[i] != OVS_NUMA_UNSPEC &&
+ numa_node_list[i] <= MAX_NUMA_NODES){
+ if (last_node == 0) {
+ ds_put_format(&dpdk_socket_mem, "%s", "0");
+ } else {
+ ds_put_format(&dpdk_socket_mem, ",%s", "0");
+ }
+ last_node++;
+ }
+ if (numa_node_list[i] == 0) {
+ ds_put_format(&dpdk_socket_mem, "%s", def_value);
+ } else {
+ ds_put_format(&dpdk_socket_mem, ",%s", def_value);
+ }
+ last_node++;
+ }
+ free(numa_node_list);
+ ovs_numa_dump_destroy(dump);
return ds_cstr(&dpdk_socket_mem);
}
diff --git a/lib/ovs-numa.c b/lib/ovs-numa.c
index 6d0a68522..b825ecbdd 100644
--- a/lib/ovs-numa.c
+++ b/lib/ovs-numa.c
@@ -42,21 +42,22 @@ VLOG_DEFINE_THIS_MODULE(ovs_numa);
* This module stores the affinity information of numa nodes and cpu cores.
* It also provides functions to bookkeep the pin of threads on cpu cores.
*
- * It is assumed that the numa node ids and cpu core ids all start from 0 and
- * range continuously. So, for example, if 'ovs_numa_get_n_cores()' returns N,
- * user can assume core ids from 0 to N-1 are all valid and there is a
- * 'struct cpu_core' for each id.
+ * It is assumed that the numa node ids and cpu core ids all start from 0.
+ * There is no guarantee that node and cpu ids are numbered consecutively
+ * (this is a change from earlier versions of the code). So, for example,
+ * if two nodes exist with ids 0 and 8, 'ovs_numa_get_n_numas()' will
+ * return 2; no assumption of node numbering should be made.
*
* NOTE, this module should only be used by the main thread.
*
- * NOTE, the assumption above will fail when cpu hotplug is used. In that
- * case ovs-numa will not function correctly. For now, add a TODO entry
- * for addressing it in the future.
+ * NOTE, if cpu hotplug is used 'all_numa_nodes' and 'all_cpu_cores' must be
+ * invalidated whenever the system topology changes. Support for detecting
+ * topology changes has not been included. For now, add a TODO entry for
+ * addressing it in the future.
*
* TODO: Fix ovs-numa when cpu hotplug is used.
*/
-#define MAX_NUMA_NODES 128
/* numa node. */
struct numa_node {
@@ -130,15 +131,14 @@ insert_new_cpu_core(struct numa_node *n, unsigned core_id)
* - "0,0,0,0": four cores on numa socket 0.
* - "0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1": 16 cores on two numa sockets.
* - "0,0,0,0,1,1,1,1": 8 cores on two numa sockets.
- *
- * The different numa ids must be consecutives or the function will abort. */
+ * - "0,0,0,0,8,8,8,8": 8 cores on two numa sockets, non-contiguous.
+ */
static void
discover_numa_and_core_dummy(void)
{
char *conf = xstrdup(dummy_config);
char *id, *saveptr = NULL;
unsigned i = 0;
- long max_numa_id = 0;
for (id = strtok_r(conf, ",", &saveptr); id;
id = strtok_r(NULL, ",", &saveptr)) {
@@ -152,8 +152,6 @@ discover_numa_and_core_dummy(void)
continue;
}
- max_numa_id = MAX(max_numa_id, numa_id);
-
hnode = hmap_first_with_hash(&all_numa_nodes, hash_int(numa_id, 0));
if (hnode) {
@@ -169,10 +167,27 @@ discover_numa_and_core_dummy(void)
free(conf);
- if (max_numa_id + 1 != hmap_count(&all_numa_nodes)) {
- ovs_fatal(0, "dummy numa contains non consecutive numa ids");
+}
+
+#ifdef __linux__
+/* Check if a cpu is detected and online. */
+static int
+cpu_detected(unsigned int core_id)
+{
+ char path[PATH_MAX];
+ int len = snprintf(path, sizeof(path),
+ "/sys/devices/system/cpu/cpu%d/topology/core_id",
+ core_id);
+ if (len <= 0 || (unsigned) len >= sizeof(path)) {
+ return 0;
+ }
+ if (access(path, F_OK) != 0) {
+ return 0;
}
+
+ return 1;
}
+#endif /* __linux__ */
/* Discovers all numa nodes and the corresponding cpu cores.
* Constructs the 'struct numa_node' and 'struct cpu_core'. */
@@ -219,7 +234,9 @@ discover_numa_and_core(void)
unsigned core_id;
core_id = strtoul(subdir->d_name + 3, NULL, 10);
- insert_new_cpu_core(n, core_id);
+ if (cpu_detected(core_id)) {
+ insert_new_cpu_core(n, core_id);
+ }
}
}
closedir(dir);
@@ -229,7 +246,7 @@ discover_numa_and_core(void)
}
free(path);
- if (!dir || !numa_supported) {
+ if (!numa_supported) {
break;
}
}
diff --git a/lib/ovs-numa.h b/lib/ovs-numa.h
index 8f2ea3430..ecc251a7f 100644
--- a/lib/ovs-numa.h
+++ b/lib/ovs-numa.h
@@ -26,6 +26,8 @@
#define OVS_CORE_UNSPEC INT_MAX
#define OVS_NUMA_UNSPEC INT_MAX
+#define MAX_NUMA_NODES 128
+
/* Dump of a list of 'struct ovs_numa_info'. */
struct ovs_numa_dump {
struct hmap cores;
diff --git a/tests/dpif-netdev.at b/tests/dpif-netdev.at
index 16402ebae..53eee185a 100644
--- a/tests/dpif-netdev.at
+++ b/tests/dpif-netdev.at
@@ -98,7 +98,7 @@ m4_define([DPIF_NETDEV_DUMMY_IFACE],
fail-mode=secure -- \
add-port br1 p2 -- set interface p2 type=$1 options:stream=unix:$OVS_RUNDIR/p0.sock ofport_request=2 -- \
add-port br1 p8 -- set interface p8 ofport_request=8 type=$1 --], [], [],
- [m4_if([$1], [dummy-pmd], [--dummy-numa="0,0,0,0,1,1,1,1"], [])])
+ [m4_if([$1], [dummy-pmd], [--dummy-numa="0,0,0,0,8,8,8,8"], [])])
AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg])
AT_CHECK([ovs-ofctl add-flow br0 action=normal])
diff --git a/tests/pmd.at b/tests/pmd.at
index cc5371d5a..faf02f158 100644
--- a/tests/pmd.at
+++ b/tests/pmd.at
@@ -361,8 +361,8 @@ AT_SETUP([PMD - change numa node])
OVS_VSWITCHD_START(
[add-port br0 p1 -- set Interface p1 type=dummy-pmd ofport_request=1 options:n_rxq=2 -- \
add-port br0 p2 -- set Interface p2 type=dummy-pmd ofport_request=2 options:n_rxq=2 -- \
- set Open_vSwitch . other_config:pmd-cpu-mask=3
-], [], [], [--dummy-numa 0,1])
+ set Open_vSwitch . other_config:pmd-cpu-mask=7
+], [], [], [--dummy-numa 0,1,8])
AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg])
AT_CHECK([ovs-ofctl add-flow br0 action=controller])
@@ -432,6 +432,40 @@ NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=106 in_port=2 (via action) data_l
icmp,vlan_tci=0x0000,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,nw_src=10.0.0.2,nw_dst=10.0.0.1,nw_tos=0,nw_ecn=0,nw_ttl=64,icmp_type=8,icmp_code=0 icmp_csum:13fc
])
+AT_CHECK([ovs-vsctl set Interface p1 options:numa_id=8])
+
+AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
+p1 0 8 2
+p1 1 8 2
+p2 0 1 1
+p2 1 1 1
+])
+
+AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl --detach --no-chdir --pidfile 2> ofctl_monitor.log])
+
+AT_CHECK([ovs-appctl netdev-dummy/receive p1 --qid 1 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)'])
+
+OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 2])
+OVS_WAIT_UNTIL([ovs-appctl -t ovs-ofctl exit])
+
+AT_CHECK([cat ofctl_monitor.log], [0], [dnl
+NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=106 in_port=1 (via action) data_len=106 (unbuffered)
+icmp,vlan_tci=0x0000,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,nw_src=10.0.0.2,nw_dst=10.0.0.1,nw_tos=0,nw_ecn=0,nw_ttl=64,icmp_type=8,icmp_code=0 icmp_csum:13fc
+])
+
+AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl --detach --no-chdir --pidfile 2> ofctl_monitor.log])
+
+AT_CHECK([ovs-appctl netdev-dummy/receive p2 --qid 0 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)'])
+
+OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 2])
+OVS_WAIT_UNTIL([ovs-appctl -t ovs-ofctl exit])
+
+AT_CHECK([cat ofctl_monitor.log], [0], [dnl
+NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=106 in_port=2 (via action) data_len=106 (unbuffered)
+icmp,vlan_tci=0x0000,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,nw_src=10.0.0.2,nw_dst=10.0.0.1,nw_tos=0,nw_ecn=0,nw_ttl=64,icmp_type=8,icmp_code=0 icmp_csum:13fc
+])
+
+
OVS_VSWITCHD_STOP
AT_CLEANUP
@@ -584,7 +618,7 @@ AT_CLEANUP
AT_SETUP([PMD - rxq affinity - NUMA])
OVS_VSWITCHD_START(
- [], [], [], [--dummy-numa 0,0,0,1,1])
+ [], [], [], [--dummy-numa 0,0,0,1,1,8,8])
AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg])
AT_CHECK([ovs-ofctl add-flow br0 actions=controller])
@@ -601,21 +635,38 @@ p1 1 0 2
AT_CHECK([ovs-vsctl set Interface p1 other_config:pmd-rxq-affinity="0:3,1:4"])
-dnl We moved the queues to different numa node. Expecting threads on
+dnl We moved the queues to a different contiguous numa node. Expecting threads on
dnl NUMA node 1 to be created.
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
p1 0 1 3
p1 1 1 4
])
+AT_CHECK([ovs-vsctl set Interface p1 other_config:pmd-rxq-affinity="0:5,1:6"])
+
+dnl We moved the queues to a different non-contiguous numa node. Expecting threads on
+dnl NUMA node 8 to be created.
+AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
+p1 0 8 5
+p1 1 8 6
+])
+
AT_CHECK([ovs-vsctl set Interface p1 other_config:pmd-rxq-affinity="0:3,1:1"])
-dnl Queues splitted between NUMA nodes.
+dnl Queues split between contiguous NUMA nodes.
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
p1 0 1 3
p1 1 0 1
])
+AT_CHECK([ovs-vsctl set Interface p1 other_config:pmd-rxq-affinity="0:5,1:1"])
+
+dnl Queues split between non-contiguous NUMA nodes.
+AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
+p1 0 8 5
+p1 1 0 1
+])
+
AT_CHECK([ovs-vsctl remove Interface p1 other_config pmd-rxq-affinity])
dnl We removed the rxq-affinity request. dpif-netdev should assign queues
--
2.27.0
More information about the dev
mailing list