[ovs-dev] [PATCH 5/6] dpif-netdev: Allow configuring number of PMD threads.
Daniele Di Proietto
diproiettod at vmware.com
Thu Mar 12 18:04:36 UTC 2015
Dealing with CPU masks can be confusing and unnecessary for simple
configurations. This commit introduces the 'other_config:n-pmd-cores'
key to specify the desired number of CPU cores reserved to the PMD
threads. The 'other_config:pmd-cpu-mask' (if specified) overrides this
parameter.
Signed-off-by: Daniele Di Proietto <diproiettod at vmware.com>
---
lib/dpif-netdev.c | 122 +++++++++++++++++++++++++++++++--------------
lib/dpif-provider.h | 7 ++-
lib/dpif.c | 6 ++-
lib/dpif.h | 2 +-
lib/ovs-numa.c | 15 ++----
lib/ovs-numa.h | 8 +--
ofproto/ofproto-dpif.c | 4 +-
ofproto/ofproto-provider.h | 2 +
ofproto/ofproto.c | 7 +++
ofproto/ofproto.h | 1 +
vswitchd/bridge.c | 2 +
vswitchd/vswitch.xml | 31 +++++++++++-
12 files changed, 146 insertions(+), 61 deletions(-)
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 1657621..3506432 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -209,10 +209,15 @@ struct dp_netdev {
* 'struct dp_netdev_pmd_thread' in 'per_pmd_key'. */
ovsthread_key_t per_pmd_key;
- /* Number of rx queues for each dpdk interface and the cpu mask
- * for pin of pmd threads. */
+ /* Number of rx queues for each dpdk interface */
size_t n_dpdk_rxqs;
+ /* Maximum number of PMD threads. Ignored if 'pmd_cmask' != NULL */
+ int n_pmd_threads;
+ /* CPU cores used for PMD threads. There will be one thread for each core
+ * set in the CPU mask. */
char *pmd_cmask;
+ /* Non pmd threads will be restricted to use the CPU set specified by this
+ * mask */
char *nonpmd_cmask;
uint64_t last_tnl_conf_seq;
};
@@ -436,12 +441,12 @@ static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev *dp,
static struct dp_netdev_pmd_thread *
dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos);
static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp);
-static void dp_netdev_del_pmds_on_numa(struct dp_netdev *dp, int numa_id);
-static void dp_netdev_set_pmds_on_numa(struct dp_netdev *dp, int numa_id);
+static void dp_netdev_set_pmds_on_numa(struct dp_netdev *dp, int numa_id, int);
static void dp_netdev_reset_pmd_threads(struct dp_netdev *dp);
static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd);
static void dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd);
static void dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd);
+static int get_n_pmd_threads_on_numa(struct dp_netdev *dp, int numa_id);
static inline bool emc_entry_alive(struct emc_entry *ce);
static void emc_clear_entry(struct emc_entry *ce);
@@ -623,10 +628,11 @@ create_dp_netdev(const char *name, const struct dpif_class *class,
ovs_mutex_init_recursive(&dp->non_pmd_mutex);
ovsthread_key_create(&dp->per_pmd_key, NULL);
- /* Reserves the core NON_PMD_CORE_ID for all non-pmd threads. */
- ovs_numa_try_pin_core_specific(NON_PMD_CORE_ID);
+ /* There can never be a pmd thread of NON_PMD_CORE_ID. */
+ ovs_numa_core_disable_pmd(NON_PMD_CORE_ID);
dp_netdev_set_nonpmd(dp);
dp->n_dpdk_rxqs = NR_QUEUE;
+ dp->n_pmd_threads = NR_PMD_THREADS;
ovs_mutex_lock(&dp->port_mutex);
error = do_add_port(dp, name, "internal", ODPP_LOCAL);
@@ -901,9 +907,20 @@ do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
cmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
if (netdev_is_pmd(netdev)) {
- dp_netdev_set_pmds_on_numa(dp, netdev_get_numa_id(netdev));
- dp_netdev_reload_pmds(dp);
- dp_netdev_set_nonpmd_affinity();
+ int dev_numa_id = netdev_get_numa_id(netdev);
+
+ if (!get_n_pmd_threads_on_numa(dp, dev_numa_id)) {
+ /* There weren't pmd threads on numa domain 'dev_numa_id'.
+ * Reset all the pmd threads to distribute the pmd threads
+ * among numa domains */
+ dp_netdev_destroy_all_pmds(dp);
+ dp_netdev_reset_pmd_threads(dp);
+ } else {
+ /* There are already pmd threads on numa domain 'dev_numa_id'.
+ * Redistribute the queues */
+ dp_netdev_reload_pmds(dp);
+ dp_netdev_set_nonpmd_affinity();
+ }
}
seq_change(dp->port_seq);
@@ -1090,7 +1107,8 @@ do_del_port(struct dp_netdev *dp, struct dp_netdev_port *port)
/* If there is no netdev on the numa node, deletes the pmd threads
* for that numa. Else, just reloads the queues. */
if (!has_pmd_port_for_numa(dp, numa_id)) {
- dp_netdev_del_pmds_on_numa(dp, numa_id);
+ dp_netdev_destroy_all_pmds(dp);
+ dp_netdev_reset_pmd_threads(dp);
}
dp_netdev_reload_pmds(dp);
}
@@ -2135,17 +2153,25 @@ dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops)
* is changed. */
static bool
pmd_config_changed(const struct dp_netdev *dp, size_t rxqs,
- const char *cmask_pmd)
+ const char *cmask_pmd, int n_pmd_cores)
{
if (dp->n_dpdk_rxqs != rxqs) {
return true;
- } else {
- if (dp->pmd_cmask != NULL && cmask_pmd != NULL) {
- return strcmp(dp->pmd_cmask, cmask_pmd);
- } else {
- return (dp->pmd_cmask != NULL || cmask_pmd != NULL);
- }
}
+
+ if (dp->pmd_cmask != NULL && cmask_pmd != NULL) {
+ return strcmp(dp->pmd_cmask, cmask_pmd);
+ }
+
+ if (dp->pmd_cmask != NULL || cmask_pmd != NULL) {
+ return true;
+ }
+
+ if (dp->pmd_cmask == NULL) {
+ return dp->n_pmd_threads != n_pmd_cores;
+ }
+
+ return false;
}
/* Returns true if the configuration for nonpmd cpu mask is changed */
@@ -2162,11 +2188,12 @@ nonpmd_config_changed(const struct dp_netdev *dp, const char *cmask_nonpmd)
/* Resets pmd threads if the configuration for 'rxq's or cpu mask changes. */
static int
dpif_netdev_pmd_set(struct dpif *dpif, unsigned int n_rxqs,
- const char *cmask_pmd, const char *cmask_nonpmd)
+ int n_pmd_cores, const char *cmask_pmd,
+ const char *cmask_nonpmd)
{
struct dp_netdev *dp = get_dp_netdev(dpif);
- if (pmd_config_changed(dp, n_rxqs, cmask_pmd)) {
+ if (pmd_config_changed(dp, n_rxqs, cmask_pmd, n_pmd_cores)) {
struct dp_netdev_port *port;
dp_netdev_destroy_all_pmds(dp);
@@ -2200,6 +2227,7 @@ dpif_netdev_pmd_set(struct dpif *dpif, unsigned int n_rxqs,
}
}
dp->n_dpdk_rxqs = n_rxqs;
+ dp->n_pmd_threads = n_pmd_cores;
/* Reconfigures the cpu mask. */
ovs_numa_set_cpu_mask_pmd(cmask_pmd);
@@ -2208,6 +2236,7 @@ dpif_netdev_pmd_set(struct dpif *dpif, unsigned int n_rxqs,
free(dp->nonpmd_cmask);
dp->pmd_cmask = cmask_pmd ? xstrdup(cmask_pmd) : NULL;
dp->nonpmd_cmask = cmask_nonpmd ? xstrdup(cmask_nonpmd) : NULL;
+ ovs_numa_core_disable_pmd(NON_PMD_CORE_ID);
/* Restores the non-pmd. */
dp_netdev_set_nonpmd(dp);
@@ -2630,23 +2659,10 @@ dp_netdev_destroy_all_pmds(struct dp_netdev *dp)
}
}
-/* Deletes all pmd threads on numa node 'numa_id'. */
-static void
-dp_netdev_del_pmds_on_numa(struct dp_netdev *dp, int numa_id)
-{
- struct dp_netdev_pmd_thread *pmd;
-
- CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
- if (pmd->numa_id == numa_id) {
- dp_netdev_del_pmd(pmd);
- }
- }
-}
-
/* Checks the numa node id of 'netdev' and starts pmd threads for
* the numa node. */
static void
-dp_netdev_set_pmds_on_numa(struct dp_netdev *dp, int numa_id)
+dp_netdev_set_pmds_on_numa(struct dp_netdev *dp, int numa_id, int nr_threads)
{
int n_pmds;
@@ -2672,8 +2688,9 @@ dp_netdev_set_pmds_on_numa(struct dp_netdev *dp, int numa_id)
}
/* If cpu mask is specified, uses all unpinned cores, otherwise
- * tries creating NR_PMD_THREADS pmd threads. */
- can_have = dp->pmd_cmask ? n_unpinned : MIN(n_unpinned, NR_PMD_THREADS);
+ * tries creating 'nr_threads' pmd threads. */
+ can_have = dp->pmd_cmask ? n_unpinned : MIN(n_unpinned, nr_threads);
+
for (i = 0; i < can_have; i++) {
struct dp_netdev_pmd_thread *pmd = xzalloc(sizeof *pmd);
int core_id = ovs_numa_get_unpinned_core_on_numa(numa_id);
@@ -2694,15 +2711,46 @@ static void
dp_netdev_reset_pmd_threads(struct dp_netdev *dp)
{
struct dp_netdev_port *port;
+ int max_numa = ovs_numa_get_n_numas();
+ unsigned long *numabitmap;
+ int numa_id, nr_numa;
+
+ if (max_numa < 1) {
+ max_numa = 1;
+ }
+
+ numabitmap = bitmap_allocate(max_numa);
CMAP_FOR_EACH (port, node, &dp->ports) {
if (netdev_is_pmd(port->netdev)) {
- int numa_id = netdev_get_numa_id(port->netdev);
+ numa_id = netdev_get_numa_id(port->netdev);
- dp_netdev_set_pmds_on_numa(dp, numa_id);
+ bitmap_set1(numabitmap, numa_id);
+ }
+ }
+
+ nr_numa = bitmap_count1(numabitmap, max_numa);
+ if (nr_numa) {
+ int n_threads_per_numa, n_threads_remainder;
+
+ if (dp->n_pmd_threads == 0) {
+ /* Default: just create one pmd thread per numa node */
+ n_threads_per_numa = 1;
+ n_threads_remainder = 0;
+ } else {
+ n_threads_per_numa = dp->n_pmd_threads / nr_numa;
+ n_threads_remainder = dp->n_pmd_threads % nr_numa;
+ }
+
+ BITMAP_FOR_EACH_1(numa_id, max_numa, numabitmap) {
+ dp_netdev_set_pmds_on_numa(dp, numa_id, n_threads_per_numa
+ + n_threads_remainder);
+ n_threads_remainder = 0;
}
}
dp_netdev_set_nonpmd_affinity();
+
+ bitmap_free(numabitmap);
}
static char *
diff --git a/lib/dpif-provider.h b/lib/dpif-provider.h
index 3612766..d33d7ed 100644
--- a/lib/dpif-provider.h
+++ b/lib/dpif-provider.h
@@ -310,9 +310,12 @@ struct dpif_class {
* configuration. 'n_rxqs' configures the number of rx_queues, which
* are distributed among threads. 'cmask' configures the cpu mask
* for setting the polling threads' cpu affinity. 'cmask_nonpmd'
- * configures the cpumask of the remaining OVS threads */
+ * configures the cpumask of the remaining OVS threads.
+ * If 'cmask' is NULL, 'n_pmd_cores' cores will be distributed among
+ * the numa domains, otherwise 'n_pmd_cores' will be ignored */
int (*poll_threads_set)(struct dpif *dpif, unsigned int n_rxqs,
- const char *cmask, const char *cmask_nonpmd);
+ int n_pmd_cores, const char *cmask,
+ const char *cmask_nonpmd);
/* Translates OpenFlow queue ID 'queue_id' (in host byte order) into a
* priority value used for setting packet priority. */
diff --git a/lib/dpif.c b/lib/dpif.c
index 32bc005..cbb7399 100644
--- a/lib/dpif.c
+++ b/lib/dpif.c
@@ -1383,12 +1383,14 @@ dpif_print_packet(struct dpif *dpif, struct dpif_upcall *upcall)
* configuration. */
int
dpif_poll_threads_set(struct dpif *dpif, unsigned int n_rxqs,
- const char *cmask, const char *cmask_nonpmd)
+ int n_pmd_cores, const char *cmask,
+ const char *cmask_nonpmd)
{
int error = 0;
if (dpif->dpif_class->poll_threads_set) {
- error = dpif->dpif_class->poll_threads_set(dpif, n_rxqs, cmask,
+ error = dpif->dpif_class->poll_threads_set(dpif, n_rxqs,
+ n_pmd_cores, cmask,
cmask_nonpmd);
if (error) {
log_operation(dpif, "poll_threads_set", error);
diff --git a/lib/dpif.h b/lib/dpif.h
index 68774bf..4ee1a69 100644
--- a/lib/dpif.h
+++ b/lib/dpif.h
@@ -819,7 +819,7 @@ void dpif_register_upcall_cb(struct dpif *, upcall_callback *, void *aux);
int dpif_recv_set(struct dpif *, bool enable);
int dpif_handlers_set(struct dpif *, uint32_t n_handlers);
-int dpif_poll_threads_set(struct dpif *, unsigned int n_rxqs,
+int dpif_poll_threads_set(struct dpif *, unsigned int n_rxqs, int n_pmd_cores,
const char *cmask, const char *cmask_nonpmd);
int dpif_recv(struct dpif *, uint32_t handler_id, struct dpif_upcall *,
struct ofpbuf *);
diff --git a/lib/ovs-numa.c b/lib/ovs-numa.c
index 3b432f1..8c6a6e1 100644
--- a/lib/ovs-numa.c
+++ b/lib/ovs-numa.c
@@ -287,22 +287,15 @@ ovs_numa_get_n_unpinned_cores_on_numa(int numa_id)
return OVS_CORE_UNSPEC;
}
-/* Given 'core_id', tries to pin that core. Returns true, if succeeds.
- * False, if the core has already been pinned, or if it is invalid or
- * not available. */
-bool
-ovs_numa_try_pin_core_specific(int core_id)
+/* Removes the core 'core_id' from the pmd cpu mask */
+void
+ovs_numa_core_disable_pmd(int core_id)
{
struct cpu_core *core = get_core_by_core_id(core_id);
if (core) {
- if (core->available_pmd && !core->pinned) {
- core->pinned = true;
- return true;
- }
+ core->available_pmd = false;
}
-
- return false;
}
/* Searches through all cores for an unpinned and available core. Returns
diff --git a/lib/ovs-numa.h b/lib/ovs-numa.h
index 04f598a..ea7a759 100644
--- a/lib/ovs-numa.h
+++ b/lib/ovs-numa.h
@@ -52,7 +52,7 @@ int ovs_numa_get_n_cores(void);
int ovs_numa_get_numa_id(int core_id);
int ovs_numa_get_n_cores_on_numa(int numa_id);
int ovs_numa_get_n_unpinned_cores_on_numa(int numa_id);
-bool ovs_numa_try_pin_core_specific(int core_id);
+void ovs_numa_core_disable_pmd(int core_id);
int ovs_numa_get_unpinned_core_any(void);
int ovs_numa_get_unpinned_core_on_numa(int numa_id);
void ovs_numa_unpin_core(int core_id);
@@ -131,10 +131,10 @@ ovs_numa_get_n_unpinned_cores_on_numa(int numa_id OVS_UNUSED)
return OVS_CORE_UNSPEC;
}
-static inline bool
-ovs_numa_try_pin_core_specific(int core_id OVS_UNUSED)
+static inline void
+ovs_numa_core_disable_pmd(int core_id OVS_UNUSED)
{
- return false;
+ /* Nothing */
}
static inline int
diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c
index 0a90f9e..5c86dbb 100644
--- a/ofproto/ofproto-dpif.c
+++ b/ofproto/ofproto-dpif.c
@@ -569,8 +569,8 @@ type_run(const char *type)
udpif_set_threads(backer->udpif, n_handlers, n_revalidators);
}
- dpif_poll_threads_set(backer->dpif, n_dpdk_rxqs, pmd_cpu_mask,
- nonpmd_cpu_mask);
+ dpif_poll_threads_set(backer->dpif, n_dpdk_rxqs, n_pmd_cores,
+ pmd_cpu_mask, nonpmd_cpu_mask);
if (backer->need_revalidate) {
struct ofproto_dpif *ofproto;
diff --git a/ofproto/ofproto-provider.h b/ofproto/ofproto-provider.h
index 856491d..c953576 100644
--- a/ofproto/ofproto-provider.h
+++ b/ofproto/ofproto-provider.h
@@ -457,6 +457,8 @@ extern size_t n_handlers, n_revalidators;
/* Number of rx queues to be created for each dpdk interface. */
extern size_t n_dpdk_rxqs;
+/* Maximum number of pmd threads. Ignored if 'pmd_cpu_mask' != NULL */
+extern int n_pmd_cores;
/* Cpu mask for pmd threads. */
extern char *pmd_cpu_mask;
extern char *nonpmd_cpu_mask;
diff --git a/ofproto/ofproto.c b/ofproto/ofproto.c
index ecb7bb8..92e4038 100644
--- a/ofproto/ofproto.c
+++ b/ofproto/ofproto.c
@@ -305,6 +305,7 @@ unsigned ofproto_max_idle = OFPROTO_MAX_IDLE_DEFAULT;
size_t n_handlers, n_revalidators;
size_t n_dpdk_rxqs;
+int n_pmd_cores;
char *pmd_cpu_mask;
char *nonpmd_cpu_mask;
@@ -742,6 +743,12 @@ ofproto_set_n_dpdk_rxqs(int n_rxqs)
}
void
+ofproto_set_n_pmd_cores(int n_cores)
+{
+ n_pmd_cores = MAX(n_cores, 0);
+}
+
+void
ofproto_set_pmd_cpu_mask(const char *cmask)
{
free(pmd_cpu_mask);
diff --git a/ofproto/ofproto.h b/ofproto/ofproto.h
index bc7359b..3dada59 100644
--- a/ofproto/ofproto.h
+++ b/ofproto/ofproto.h
@@ -317,6 +317,7 @@ int ofproto_port_set_mcast_snooping(struct ofproto *ofproto, void *aux,
const struct ofproto_mcast_snooping_port_settings *s);
void ofproto_set_threads(int n_handlers, int n_revalidators);
void ofproto_set_n_dpdk_rxqs(int n_rxqs);
+void ofproto_set_n_pmd_cores(int n_cores);
void ofproto_set_pmd_cpu_mask(const char *cmask);
void ofproto_set_nonpmd_cpu_mask(const char *cmask);
void ofproto_set_dp_desc(struct ofproto *, const char *dp_desc);
diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c
index 1bbd6af..571a036 100644
--- a/vswitchd/bridge.c
+++ b/vswitchd/bridge.c
@@ -565,6 +565,8 @@ bridge_reconfigure(const struct ovsrec_open_vswitch *ovs_cfg)
OFPROTO_MAX_IDLE_DEFAULT));
ofproto_set_n_dpdk_rxqs(smap_get_int(&ovs_cfg->other_config,
"n-dpdk-rxqs", 0));
+ ofproto_set_n_pmd_cores(smap_get_int(&ovs_cfg->other_config,
+ "n-pmd-cores", 0));
ofproto_set_pmd_cpu_mask(smap_get(&ovs_cfg->other_config,
"pmd-cpu-mask"));
ofproto_set_nonpmd_cpu_mask(smap_get(&ovs_cfg->other_config,
diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
index 5d14487..c36d252 100644
--- a/vswitchd/vswitch.xml
+++ b/vswitchd/vswitch.xml
@@ -161,6 +161,24 @@
</p>
</column>
+ <column name="other_config" key="n-pmd-cores"
+ type='{"type": "integer", "minInteger": 0}'>
+ <p>
+ Specifies the maximum number of cores that the userspace datapath is
+ allowed to use to process packets (i.e. the maximum number of PMD
+ threads that will be created by OVS). The special value 0 means
+ that OVS will use a single core per NUMA domain.
+ </p>
+ <p>
+ If ``other_config:pmd-cpu-mask'' is specified, this value will be
+ ignored and the CPU mask will be honored.
+ </p>
+ <p>
+ The default is 0. It means that OVS will use only one core per
+ NUMA domain to process packets in the userspace datapath.
+ </p>
+ </column>
+
<column name="other_config" key="pmd-cpu-mask">
<p>
Specifies CPU mask for setting the cpu affinity of PMD (Poll
@@ -175,8 +193,13 @@
those uncovered cores are considered not set.
</p>
<p>
- If not specified, one pmd thread will be created for each numa node
- and pinned to any available core on the numa node by default.
+ Please note that core 0 (NON_PMD_CORE_ID in the code) is reserved
+ and will never be used for a PMD thread. If set in the mask, it will
+ be ignored.
+ </p>
+ <p>
+ If this mask is not specified, the ``other_config:n-pmd-cores'' key will
+ be honored. If this mask is specified, ``other_config:n-pmd-cores'' will
+ be ignored.
</p>
</column>
@@ -195,6 +218,10 @@
the cores used for PMD threads operations.
</p>
<p>
+ Please note that core 0 (NON_PMD_CORE_ID in the code) is always used
+ for non PMD threads, even if unset in this mask.
+ </p>
+ <p>
If not specified, the non PMD threads will be bound to every core
not used for pmd operations.
</p>
--
2.1.4
More information about the dev
mailing list