[ovs-dev] [PATCH 1/4] dpif-netdev: Refactor rxq auto-lb dry-run code.

anurag2k at gmail.com anurag2k at gmail.com
Tue Jun 29 11:27:53 UTC 2021


From: Anurag Agarwal <anurag.agarwal at ericsson.com>

The current functions performing a dry-run of the allocation of
non-pinned rxqs to PMDs during rxq auto load-balancing duplicate most
of the code of the rxq_scheduling() function used during actual rxq
reconfiguration.

This is difficult to maintain and there are actually cases today where
the dry-run behaves differently to rxq_scheduling() which can cause the
auto-lb function to fail.

This patch refactors the pmd_rebalance_dry_run() function to rely on
the rxq_scheduling() function to perform the dry-run and only implement
the comparison of current and predicted PMD load on top. The resulting
code is not only shorter but also easier to understand.

It makes use of the fact that the "pmd" member of struct dp_netdev_rxq,
which is set as part of rxq_scheduling(), is only used under the
protection of dp->port_mutex and it is safe to temporarily change it
during the dry-run. Before the end of the dry-run the original values
are restored.

Signed-off-by: Anurag Agarwal <anurag.agarwal at ericsson.com>
Signed-off-by: Jan Scheurich <jan.scheurich at ericsson.com>
Signed-off-by: Rudra Surya Bhaskara Rao <rudrasurya.r at acldigital.com>
---
 lib/dpif-netdev.c | 317 +++++++++++++++++++++---------------------------------
 1 file changed, 123 insertions(+), 194 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index c5ab35d..fd192db 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -4922,12 +4922,6 @@ struct rr_numa {
     bool idx_inc;
 };
 
-static size_t
-rr_numa_list_count(struct rr_numa_list *rr)
-{
-    return hmap_count(&rr->numas);
-}
-
 static struct rr_numa *
 rr_numa_list_lookup(struct rr_numa_list *rr, int numa_id)
 {
@@ -5075,9 +5069,11 @@ compare_rxq_cycles(const void *a, const void *b)
  * pmds to unpinned queues.
  *
  * The function doesn't touch the pmd threads, it just stores the assignment
- * in the 'pmd' member of each rxq. */
+ * in the 'pmd' member of each rxq. Skip logging in the case of an auto
+ * load-balancing dry_run. */
 static void
-rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
+rxq_scheduling(struct dp_netdev *dp, bool pinned, bool dry_run)
+    OVS_REQUIRES(dp->port_mutex)
 {
     struct dp_netdev_port *port;
     struct rr_numa_list rr;
@@ -5152,38 +5148,44 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
                Round robin on the NUMA nodes that do have pmds. */
             non_local_numa = rr_numa_list_next(&rr, non_local_numa);
             if (!non_local_numa) {
-                VLOG_ERR("There is no available (non-isolated) pmd "
-                         "thread for port \'%s\' queue %d. This queue "
-                         "will not be polled. Is pmd-cpu-mask set to "
-                         "zero? Or are all PMDs isolated to other "
-                         "queues?", netdev_rxq_get_name(rxqs[i]->rx),
-                         netdev_rxq_get_queue_id(rxqs[i]->rx));
+                if (!dry_run) {
+                    VLOG_ERR("There is no available (non-isolated) pmd "
+                             "thread for port \'%s\' queue %d. This queue "
+                             "will not be polled. Is pmd-cpu-mask set to "
+                             "zero? Or are all PMDs isolated to other "
+                             "queues?", netdev_rxq_get_name(rxqs[i]->rx),
+                             netdev_rxq_get_queue_id(rxqs[i]->rx));
+                }
                 continue;
             }
             rxqs[i]->pmd = rr_numa_get_pmd(non_local_numa, assign_cyc);
-            VLOG_WARN("There's no available (non-isolated) pmd thread "
-                      "on numa node %d. Queue %d on port \'%s\' will "
-                      "be assigned to the pmd on core %d "
-                      "(numa node %d). Expect reduced performance.",
-                      numa_id, netdev_rxq_get_queue_id(rxqs[i]->rx),
-                      netdev_rxq_get_name(rxqs[i]->rx),
-                      rxqs[i]->pmd->core_id, rxqs[i]->pmd->numa_id);
+            if (!dry_run) {
+                VLOG_WARN("There's no available (non-isolated) pmd thread "
+                          "on numa node %d. Queue %d on port \'%s\' will "
+                          "be assigned to the pmd on core %d "
+                          "(numa node %d). Expect reduced performance.",
+                          numa_id, netdev_rxq_get_queue_id(rxqs[i]->rx),
+                          netdev_rxq_get_name(rxqs[i]->rx),
+                          rxqs[i]->pmd->core_id, rxqs[i]->pmd->numa_id);
+            }
         } else {
             rxqs[i]->pmd = rr_numa_get_pmd(numa, assign_cyc);
-            if (assign_cyc) {
-                VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
-                          "rx queue %d "
-                          "(measured processing cycles %"PRIu64").",
-                          rxqs[i]->pmd->core_id, numa_id,
-                          netdev_rxq_get_name(rxqs[i]->rx),
-                          netdev_rxq_get_queue_id(rxqs[i]->rx),
-                          dp_netdev_rxq_get_cycles(rxqs[i],
-                                                   RXQ_CYCLES_PROC_HIST));
-            } else {
-                VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
-                          "rx queue %d.", rxqs[i]->pmd->core_id, numa_id,
-                          netdev_rxq_get_name(rxqs[i]->rx),
-                          netdev_rxq_get_queue_id(rxqs[i]->rx));
+            if (!dry_run) {
+                if (assign_cyc) {
+                    VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
+                              "rx queue %d "
+                              "(measured processing cycles %"PRIu64").",
+                              rxqs[i]->pmd->core_id, numa_id,
+                              netdev_rxq_get_name(rxqs[i]->rx),
+                              netdev_rxq_get_queue_id(rxqs[i]->rx),
+                              dp_netdev_rxq_get_cycles(rxqs[i],
+                                                       RXQ_CYCLES_PROC_HIST));
+                } else {
+                    VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
+                              "rx queue %d.", rxqs[i]->pmd->core_id, numa_id,
+                              netdev_rxq_get_name(rxqs[i]->rx),
+                              netdev_rxq_get_queue_id(rxqs[i]->rx));
+                }
             }
         }
     }
@@ -5441,10 +5443,10 @@ reconfigure_datapath(struct dp_netdev *dp)
     }
 
     /* Add pinned queues and mark pmd threads isolated. */
-    rxq_scheduling(dp, true);
+    rxq_scheduling(dp, true, false);
 
     /* Add non-pinned queues. */
-    rxq_scheduling(dp, false);
+    rxq_scheduling(dp, false, false);
 
     /* Step 5: Remove queues not compliant with new scheduling. */
 
@@ -5547,7 +5549,7 @@ ports_require_restart(const struct dp_netdev *dp)
 }
 
 /* Calculates variance in the values stored in array 'a'. 'n' is the number
- * of elements in array to be considered for calculating vairance.
+ * of elements in array to be considered for calculating variance.
  * Usage example: data array 'a' contains the processing load of each pmd and
  * 'n' is the number of PMDs. It returns the variance in processing load of
  * PMDs*/
@@ -5575,134 +5577,10 @@ variance(uint64_t a[], int n)
             sqDiff += (a[i] - mean)*(a[i] - mean);
         }
     }
-    return (sqDiff ? (sqDiff / n) : 0);
+    return sqDiff / n;
 }
 
 
-/* Returns the variance in the PMDs usage as part of dry run of rxqs
- * assignment to PMDs. */
-static bool
-get_dry_run_variance(struct dp_netdev *dp, uint32_t *core_list,
-                     uint32_t num_pmds, uint64_t *predicted_variance)
-    OVS_REQUIRES(dp->port_mutex)
-{
-    struct dp_netdev_port *port;
-    struct dp_netdev_pmd_thread *pmd;
-    struct dp_netdev_rxq **rxqs = NULL;
-    struct rr_numa *numa = NULL;
-    struct rr_numa_list rr;
-    int n_rxqs = 0;
-    bool ret = false;
-    uint64_t *pmd_usage;
-
-    if (!predicted_variance) {
-        return ret;
-    }
-
-    pmd_usage = xcalloc(num_pmds, sizeof(uint64_t));
-
-    HMAP_FOR_EACH (port, node, &dp->ports) {
-        if (!netdev_is_pmd(port->netdev)) {
-            continue;
-        }
-
-        for (int qid = 0; qid < port->n_rxq; qid++) {
-            struct dp_netdev_rxq *q = &port->rxqs[qid];
-            uint64_t cycle_hist = 0;
-
-            if (q->pmd->isolated) {
-                continue;
-            }
-
-            if (n_rxqs == 0) {
-                rxqs = xmalloc(sizeof *rxqs);
-            } else {
-                rxqs = xrealloc(rxqs, sizeof *rxqs * (n_rxqs + 1));
-            }
-
-            /* Sum the queue intervals and store the cycle history. */
-            for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
-                cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
-            }
-            dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST,
-                                         cycle_hist);
-            /* Store the queue. */
-            rxqs[n_rxqs++] = q;
-        }
-    }
-    if (n_rxqs > 1) {
-        /* Sort the queues in order of the processing cycles
-         * they consumed during their last pmd interval. */
-        qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
-    }
-    rr_numa_list_populate(dp, &rr);
-
-    for (int i = 0; i < n_rxqs; i++) {
-        int numa_id = netdev_get_numa_id(rxqs[i]->port->netdev);
-        numa = rr_numa_list_lookup(&rr, numa_id);
-        /* If there is no available pmd on the local numa but there is only one
-         * numa for cross-numa polling, we can estimate the dry run. */
-        if (!numa && rr_numa_list_count(&rr) == 1) {
-            numa = rr_numa_list_next(&rr, NULL);
-        }
-        if (!numa) {
-            VLOG_DBG("PMD auto lb dry run: "
-                     "There's no available (non-isolated) PMD thread on NUMA "
-                     "node %d for port '%s' and there are PMD threads on more "
-                     "than one NUMA node available for cross-NUMA polling. "
-                     "Aborting.", numa_id, netdev_rxq_get_name(rxqs[i]->rx));
-            goto cleanup;
-        }
-
-        pmd = rr_numa_get_pmd(numa, true);
-        VLOG_DBG("PMD auto lb dry run. Predicted: Core %d on numa node %d "
-                  "to be assigned port \'%s\' rx queue %d "
-                  "(measured processing cycles %"PRIu64").",
-                  pmd->core_id, numa_id,
-                  netdev_rxq_get_name(rxqs[i]->rx),
-                  netdev_rxq_get_queue_id(rxqs[i]->rx),
-                  dp_netdev_rxq_get_cycles(rxqs[i], RXQ_CYCLES_PROC_HIST));
-
-        for (int id = 0; id < num_pmds; id++) {
-            if (pmd->core_id == core_list[id]) {
-                /* Add the processing cycles of rxq to pmd polling it. */
-                pmd_usage[id] += dp_netdev_rxq_get_cycles(rxqs[i],
-                                        RXQ_CYCLES_PROC_HIST);
-            }
-        }
-    }
-
-    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
-        uint64_t total_cycles = 0;
-
-        if ((pmd->core_id == NON_PMD_CORE_ID) || pmd->isolated) {
-            continue;
-        }
-
-        /* Get the total pmd cycles for an interval. */
-        atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
-        /* Estimate the cycles to cover all intervals. */
-        total_cycles *= PMD_RXQ_INTERVAL_MAX;
-        for (int id = 0; id < num_pmds; id++) {
-            if (pmd->core_id == core_list[id]) {
-                if (pmd_usage[id]) {
-                    pmd_usage[id] = (pmd_usage[id] * 100) / total_cycles;
-                }
-                VLOG_DBG("PMD auto lb dry run. Predicted: Core %d, "
-                         "usage %"PRIu64"", pmd->core_id, pmd_usage[id]);
-            }
-        }
-    }
-    *predicted_variance = variance(pmd_usage, num_pmds);
-    ret = true;
-
-cleanup:
-    rr_numa_list_destroy(&rr);
-    free(rxqs);
-    free(pmd_usage);
-    return ret;
-}
-
 /* Does the dry run of Rxq assignment to PMDs and returns true if it gives
  * better distribution of load on PMDs. */
 static bool
@@ -5710,81 +5588,132 @@ pmd_rebalance_dry_run(struct dp_netdev *dp)
     OVS_REQUIRES(dp->port_mutex)
 {
     struct dp_netdev_pmd_thread *pmd;
+    struct dp_netdev_port *port;
+    struct dp_netdev_rxq *q;
     uint64_t *curr_pmd_usage;
-
+    uint64_t *pred_pmd_usage;
     uint64_t curr_variance;
-    uint64_t new_variance;
+    uint64_t pred_variance;
     uint64_t improvement = 0;
     uint32_t num_pmds;
-    uint32_t *pmd_corelist;
+    uint32_t pmd_idx;
     struct rxq_poll *poll;
-    bool ret;
+    bool ret = false;
 
     num_pmds = cmap_count(&dp->poll_threads);
 
-    if (num_pmds > 1) {
-        curr_pmd_usage = xcalloc(num_pmds, sizeof(uint64_t));
-        pmd_corelist = xcalloc(num_pmds, sizeof(uint32_t));
-    } else {
+    if (num_pmds <= 1) {
         return false;
     }
 
-    num_pmds = 0;
+    curr_pmd_usage = xcalloc(num_pmds, sizeof(uint64_t));
+    pred_pmd_usage = xcalloc(num_pmds, sizeof(uint64_t));
+
+    /* Perform a dry-run rxq allocation of non-pinned rxqs.
+     * This will refresh each rxq's processing cycle history count.
+     * The dry-run will temporarily update the pmd pointer of non-pinned
+     * rxqs, but that will be restored from the actual polling lists
+     * further down. This is safe as the pmd pointer is not used outside
+     * the dp->port_mutex. */
+    rxq_scheduling(dp, false, true);
+
+    /* Calculate the current and predicted PMD loads. */
+    pmd_idx = 0;
     CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
-        uint64_t total_cycles = 0;
-        uint64_t total_proc = 0;
 
+        /* Skip non-DPDK and isolated pmds. */
         if ((pmd->core_id == NON_PMD_CORE_ID) || pmd->isolated) {
             continue;
         }
 
         /* Get the total pmd cycles for an interval. */
+        uint64_t total_cycles;
         atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
         /* Estimate the cycles to cover all intervals. */
         total_cycles *= PMD_RXQ_INTERVAL_MAX;
 
+        /* Sum up predicted pmd load from all rxqs assigned to the pmd
+         * in the dry-run. */
+        uint64_t pred_proc = 0;
+        HMAP_FOR_EACH (port, node, &dp->ports) {
+            if (!netdev_is_pmd(port->netdev)) {
+                continue;
+            }
+            for (int qid = 0; qid < port->n_rxq; qid++) {
+                q = &port->rxqs[qid];
+                if (q->pmd != pmd) {
+                    continue;
+                }
+                uint64_t cycles =
+                    dp_netdev_rxq_get_cycles(q, RXQ_CYCLES_PROC_HIST);
+                pred_proc += cycles;
+                VLOG_DBG("PMD auto lb dry run. Predicted: Core %d on numa "
+                          "node %d to be assigned port \'%s\' rx queue %d "
+                          "(measured processing cycles %"PRIu64").",
+                          pmd->core_id, pmd->numa_id,
+                          netdev_rxq_get_name(q->rx),
+                          netdev_rxq_get_queue_id(q->rx),
+                          cycles);
+            }
+        }
+
+        /* Sum up the current pmd load from the currently polled rxqs. */
+        uint64_t curr_proc = 0;
         ovs_mutex_lock(&pmd->port_mutex);
         HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
-            for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
-                total_proc += dp_netdev_rxq_get_intrvl_cycles(poll->rxq, i);
-            }
+            q = poll->rxq;
+            curr_proc += dp_netdev_rxq_get_cycles(q, RXQ_CYCLES_PROC_HIST);
         }
         ovs_mutex_unlock(&pmd->port_mutex);
 
-        if (total_proc) {
-            curr_pmd_usage[num_pmds] = (total_proc * 100) / total_cycles;
+        if (total_cycles > 0) {
+            pred_pmd_usage[pmd_idx] = (pred_proc * 100) / total_cycles;
+            curr_pmd_usage[pmd_idx] = (curr_proc * 100) / total_cycles;
+        } else {
+            pred_pmd_usage[pmd_idx] = 0;
+            curr_pmd_usage[pmd_idx] = 0;
         }
 
         VLOG_DBG("PMD auto lb dry run. Current: Core %d, usage %"PRIu64"",
-                  pmd->core_id, curr_pmd_usage[num_pmds]);
+                  pmd->core_id, curr_pmd_usage[pmd_idx]);
+        VLOG_DBG("PMD auto lb dry run. Predicted: Core %d, usage %"PRIu64"",
+                  pmd->core_id, pred_pmd_usage[pmd_idx]);
 
+        /* Reset the PMD overload indication. */
         if (atomic_count_get(&pmd->pmd_overloaded)) {
             atomic_count_set(&pmd->pmd_overloaded, 0);
         }
 
-        pmd_corelist[num_pmds] = pmd->core_id;
-        num_pmds++;
+        pmd_idx++;
+    }
+
+    /* Restore the original pmd from before the dry-run. */
+    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
+        /* Skip non-DPDK and isolated pmds. */
+        if ((pmd->core_id == NON_PMD_CORE_ID) || pmd->isolated) {
+            continue;
+        }
+        HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
+            poll->rxq->pmd = pmd;
+        }
     }
 
     curr_variance = variance(curr_pmd_usage, num_pmds);
-    ret = get_dry_run_variance(dp, pmd_corelist, num_pmds, &new_variance);
+    pred_variance = variance(pred_pmd_usage, num_pmds);
 
-    if (ret) {
-        VLOG_DBG("PMD auto lb dry run. Current PMD variance: %"PRIu64","
-                  " Predicted PMD variance: %"PRIu64"",
-                  curr_variance, new_variance);
+    VLOG_DBG("PMD auto lb dry run. Current PMD variance: %"PRIu64","
+              " Predicted PMD variance: %"PRIu64"",
+              curr_variance, pred_variance);
 
-        if (new_variance < curr_variance) {
-            improvement =
-                ((curr_variance - new_variance) * 100) / curr_variance;
-        }
-        if (improvement < dp->pmd_alb.rebalance_improve_thresh) {
+    if (pred_variance < curr_variance) {
+        improvement =
+            ((curr_variance - pred_variance) * 100) / curr_variance;
-            ret = false;
-        }
     }
+    ret = improvement >= dp->pmd_alb.rebalance_improve_thresh;
 
     free(curr_pmd_usage);
-    free(pmd_corelist);
+    free(pred_pmd_usage);
     return ret;
 }
 
-- 
2.7.4



More information about the dev mailing list