[ovs-dev] [PATCH 2/4] dpif-netdev: Least-loaded scheduling algorithm for rxqs

anurag2k at gmail.com
Tue Jun 29 11:27:54 UTC 2021


From: Anurag Agarwal <anurag.agarwal at ericsson.com>

The current algorithm for balancing unpinned rxqs over non-isolated pmds
assigns the rxqs in descending load order to pmds in a "zig-zag" fashion
(e.g. A,B,C,C,B,A,A,B,...). This simple heuristic relies on the pmds
being initially "empty" and produces optimal results only if the rxq loads
are not too disparate (e.g. a pattern like 100,10,10,10,10,10,10 is
handled poorly).
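
As a hypothetical illustration with three non-isolated pmds A, B and C:
for these loads the zig-zag order A,B,C,C,B,A,A yields per-pmd loads of
120, 20 and 20, whereas 100, 30 and 30 would be achievable.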

The first prerequisite will no longer be fulfilled when the isolation
of pmds with pinned rxqs is made optional in a subsequent patch.

To prepare for this change we introduce a least-loaded scheduling
algorithm. During rxq scheduling, we keep track of the number of assigned
rxqs and their processing cycles per non-isolated pmd and maintain the
array of pmds per numa node sorted according to their load. We still
assign the rxqs in descending load order, but always assign to the least
loaded pmd. This deals with the case of pmds with a priori load due to
pinned rxqs and, as an additional benefit, handles disparate rxq loads better.
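
The following is a minimal standalone C sketch of the least-loaded
assignment, for illustration only (it is not part of this patch). The
names rxq_load, pmd_load, n_rxqs and n_pmds are hypothetical, and it
scans for the least loaded pmd instead of maintaining a sorted pmd array
as the patch does:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Sort helper: descending order of rxq load. */
    static int
    cmp_load_desc(const void *a, const void *b)
    {
        uint64_t la = *(const uint64_t *) a;
        uint64_t lb = *(const uint64_t *) b;

        return (la < lb) - (la > lb);
    }

    int
    main(void)
    {
        /* Example rxq loads (processing cycles), the disparate case above. */
        uint64_t rxq_load[] = { 100, 10, 10, 10, 10, 10, 10 };
        const int n_rxqs = sizeof rxq_load / sizeof rxq_load[0];
        enum { n_pmds = 3 };
        uint64_t pmd_load[n_pmds] = { 0 };

        /* Assign the rxqs in descending load order ... */
        qsort(rxq_load, n_rxqs, sizeof rxq_load[0], cmp_load_desc);

        for (int i = 0; i < n_rxqs; i++) {
            /* ... always to the currently least loaded pmd. */
            int ll = 0;

            for (int p = 1; p < n_pmds; p++) {
                if (pmd_load[p] < pmd_load[ll]) {
                    ll = p;
                }
            }
            pmd_load[ll] += rxq_load[i];
            printf("rxq load %3" PRIu64 " -> pmd %d\n", rxq_load[i], ll);
        }
        for (int p = 0; p < n_pmds; p++) {
            printf("pmd %d total load %" PRIu64 "\n", p, pmd_load[p]);
        }
        return 0;
    }

With the example loads above this produces per-pmd loads of 100, 30 and 30.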

If rxq processing cycles are not used for rxq scheduling, the estimated
pmd load is based on the number of assigned rxqs. In this case, the
least-loaded scheduling algorithm effectively results in a round-robin
assignment of rxqs to pmds, so the legacy code for round-robin
assignment can be removed.
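
For example, with all rxq cycle counts zero and three non-isolated pmds,
say A, B and C, every rxq is assigned to the pmd with the fewest rxqs so
far, so the queues are handed out as A,B,C,A,B,C,... exactly as the
round-robin walk being removed would do.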

Signed-off-by: Anurag Agarwal <anurag.agarwal at ericsson.com>
Signed-off-by: Jan Scheurich <jan.scheurich at ericsson.com>
Signed-off-by: Rudra Surya Bhaskara Rao <rudrasurya.r at acldigital.com>
---
 lib/dpif-netdev.c | 267 +++++++++++++++++++++++++++++++++---------------------
 tests/pmd.at      |   4 +-
 2 files changed, 166 insertions(+), 105 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index fd192db..1c458b2 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -800,6 +800,10 @@ struct dp_netdev_pmd_thread {
 
     /* Next time when PMD should try RCU quiescing. */
     long long next_rcu_quiesce;
+
+    /* Number and cycles of assigned rxqs during rxq scheduling. */
+    int ll_n_rxq;
+    uint64_t ll_cycles;
 };
 
 /* Interface to netdev-based datapath. */
@@ -4905,6 +4909,62 @@ port_reconfigure(struct dp_netdev_port *port)
     return 0;
 }
 
+/* Sort PMDs in ascending order of load. If processing cycles are not used for
+ * rxq scheduling, the ll_cycles are zero. In that case the count of
+ * assigned rxqs is taken as PMD load, resulting in round-robin scheduling. */
+static int
+compare_pmd_load(const void *a, const void *b)
+{
+    struct dp_netdev_pmd_thread *pmda;
+    struct dp_netdev_pmd_thread *pmdb;
+
+    pmda = *(struct dp_netdev_pmd_thread **) a;
+    pmdb = *(struct dp_netdev_pmd_thread **) b;
+
+    if (pmda->ll_cycles != pmdb->ll_cycles) {
+        return (pmda->ll_cycles > pmdb->ll_cycles) ? 1 : -1;
+    } else if (pmda->ll_n_rxq != pmdb->ll_n_rxq) {
+        return (pmda->ll_n_rxq > pmdb->ll_n_rxq) ? 1 : -1;
+    } else {
+        /* Cycles and rxqs are the same so tiebreak on core ID.
+         * Tiebreaking (as opposed to return 0) ensures consistent
+         * sort results across multiple OS's. */
+        return (pmda->core_id > pmdb->core_id) ? 1 : -1;
+    }
+}
+
+/* Sort Rx Queues in descending order of processing cycles they
+ * are consuming. */
+static int
+compare_rxq_cycles(const void *a, const void *b)
+{
+    struct dp_netdev_rxq *qa;
+    struct dp_netdev_rxq *qb;
+    uint64_t cycles_qa, cycles_qb;
+
+    qa = *(struct dp_netdev_rxq **) a;
+    qb = *(struct dp_netdev_rxq **) b;
+
+    cycles_qa = dp_netdev_rxq_get_cycles(qa, RXQ_CYCLES_PROC_HIST);
+    cycles_qb = dp_netdev_rxq_get_cycles(qb, RXQ_CYCLES_PROC_HIST);
+
+    if (cycles_qa != cycles_qb) {
+        return (cycles_qa < cycles_qb) ? 1 : -1;
+    } else {
+        /* Cycles are the same so tiebreak on port/queue id.
+         * Tiebreaking (as opposed to return 0) ensures consistent
+         * sort results across multiple OS's. */
+        uint32_t port_qa = odp_to_u32(qa->port->port_no);
+        uint32_t port_qb = odp_to_u32(qb->port->port_no);
+        if (port_qa != port_qb) {
+            return (port_qa > port_qb) ? 1 : -1;
+        } else {
+            return (netdev_rxq_get_queue_id(qa->rx)
+                    - netdev_rxq_get_queue_id(qb->rx));
+        }
+    }
+}
+
 struct rr_numa_list {
     struct hmap numas;  /* Contains 'struct rr_numa' */
 };
@@ -4917,9 +4977,6 @@ struct rr_numa {
     /* Non isolated pmds on numa node 'numa_id' */
     struct dp_netdev_pmd_thread **pmds;
     int n_pmds;
-
-    int cur_index;
-    bool idx_inc;
 };
 
 static struct rr_numa *
@@ -4976,49 +5033,12 @@ rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr)
         numa->n_pmds++;
         numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof *numa->pmds);
         numa->pmds[numa->n_pmds - 1] = pmd;
-        /* At least one pmd so initialise curr_idx and idx_inc. */
-        numa->cur_index = 0;
-        numa->idx_inc = true;
     }
-}
-
-/*
- * Returns the next pmd from the numa node.
- *
- * If 'updown' is 'true' it will alternate between selecting the next pmd in
- * either an up or down walk, switching between up/down when the first or last
- * core is reached. e.g. 1,2,3,3,2,1,1,2...
- *
- * If 'updown' is 'false' it will select the next pmd wrapping around when last
- * core reached. e.g. 1,2,3,1,2,3,1,2...
- */
-static struct dp_netdev_pmd_thread *
-rr_numa_get_pmd(struct rr_numa *numa, bool updown)
-{
-    int numa_idx = numa->cur_index;
 
-    if (numa->idx_inc == true) {
-        /* Incrementing through list of pmds. */
-        if (numa->cur_index == numa->n_pmds-1) {
-            /* Reached the last pmd. */
-            if (updown) {
-                numa->idx_inc = false;
-            } else {
-                numa->cur_index = 0;
-            }
-        } else {
-            numa->cur_index++;
-        }
-    } else {
-        /* Decrementing through list of pmds. */
-        if (numa->cur_index == 0) {
-            /* Reached the first pmd. */
-            numa->idx_inc = true;
-        } else {
-            numa->cur_index--;
-        }
+    /* Sort the pmds of each numa according to their load. */
+    HMAP_FOR_EACH (numa, node, &rr->numas) {
+        qsort(numa->pmds, numa->n_pmds, sizeof *numa->pmds, compare_pmd_load);
     }
-    return numa->pmds[numa_idx];
 }
 
 static void
@@ -5033,57 +5053,67 @@ rr_numa_list_destroy(struct rr_numa_list *rr)
     hmap_destroy(&rr->numas);
 }
 
-/* Sort Rx Queues by the processing cycles they are consuming. */
-static int
-compare_rxq_cycles(const void *a, const void *b)
+/*
+ * Returns the least loaded pmd of the numa node.
+ */
+static struct dp_netdev_pmd_thread *
+rr_numa_assign_least_loaded_pmd(struct rr_numa *numa, uint64_t cycles)
 {
-    struct dp_netdev_rxq *qa;
-    struct dp_netdev_rxq *qb;
-    uint64_t cycles_qa, cycles_qb;
-
-    qa = *(struct dp_netdev_rxq **) a;
-    qb = *(struct dp_netdev_rxq **) b;
-
-    cycles_qa = dp_netdev_rxq_get_cycles(qa, RXQ_CYCLES_PROC_HIST);
-    cycles_qb = dp_netdev_rxq_get_cycles(qb, RXQ_CYCLES_PROC_HIST);
+    struct dp_netdev_pmd_thread *ll_pmd;
+    struct dp_netdev_pmd_thread *pmd;
+    int i;
 
-    if (cycles_qa != cycles_qb) {
-        return (cycles_qa < cycles_qb) ? 1 : -1;
-    } else {
-        /* Cycles are the same so tiebreak on port/queue id.
-         * Tiebreaking (as opposed to return 0) ensures consistent
-         * sort results across multiple OS's. */
-        uint32_t port_qa = odp_to_u32(qa->port->port_no);
-        uint32_t port_qb = odp_to_u32(qb->port->port_no);
-        if (port_qa != port_qb) {
-            return port_qa > port_qb ? 1 : -1;
-        } else {
-            return netdev_rxq_get_queue_id(qa->rx)
-                    - netdev_rxq_get_queue_id(qb->rx);
+    /* By construction each numa has at least one non-isolated pmd.
+     * The least loaded is in position 0 of the sorted pmd array. */
+    ll_pmd = numa->pmds[0];
+    ll_pmd->ll_n_rxq++;
+    ll_pmd->ll_cycles += cycles;
+
+    /* Bubble the updated pmd up in the array to maintain the sort order.
+     * Move the pmd to the end of the list of pmds with equal load to ensure
+     * least loaded scheduling based on number of rxqs behaves identically to
+     * round-robin scheduling. */
+    for (i = 1; i < numa->n_pmds; i++) {
+        if (compare_pmd_load(&numa->pmds[i - 1], &numa->pmds[i]) < 0) {
+            break;
         }
+        pmd = numa->pmds[i];
+        numa->pmds[i] = numa->pmds[i - 1];
+        numa->pmds[i - 1] = pmd;
     }
+
+    return ll_pmd;
 }
 
-/* Assign pmds to queues.  If 'pinned' is true, assign pmds to pinned
- * queues and marks the pmds as isolated.  Otherwise, assign non isolated
- * pmds to unpinned queues.
+/*
+ * Assign rxqs to pmds in two phases. In phase 1, assign pinned rxqs and mark
+ * the pmds as isolated. Also initialize the data in pmd and rxq structs for
+ * the second phase.
  *
- * The function doesn't touch the pmd threads, it just stores the assignment
- * in the 'pmd' member of each rxq. Skip logging in the case of an auto
- * load-balancing dry_run. */
+ * In phase 2 assign the unpinned rxqs to non-isolated pmds. To balance the
+ * resulting pmd loads, assign the rxqs in descending load order to the least
+ * loaded pmd on the chosen numa node. The pmd load is either the processing
+ * cycles of the assigned rx queues or the number of assigned rx queues.
+ *
+ * The functions don't reconfigure the running pmd threads. They just store the
+ * assignment in the 'pmd' member of each rxq.
+ */
+
+
 static void
-rxq_scheduling(struct dp_netdev *dp, bool pinned, bool dry_run)
+prepare_rxq_scheduling(struct dp_netdev *dp)
     OVS_REQUIRES(dp->port_mutex)
 {
     struct dp_netdev_port *port;
-    struct rr_numa_list rr;
-    struct rr_numa *non_local_numa = NULL;
-    struct dp_netdev_rxq ** rxqs = NULL;
-    int n_rxqs = 0;
-    struct rr_numa *numa = NULL;
-    int numa_id;
+    struct dp_netdev_pmd_thread *pmd;
     bool assign_cyc = dp->pmd_rxq_assign_cyc;
 
+    /* Reset the pmd attributes for least-loaded scheduling. */
+    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
+        pmd->ll_n_rxq = 0;
+        pmd->ll_cycles = 0;
+    }
+
     HMAP_FOR_EACH (port, node, &dp->ports) {
         if (!netdev_is_pmd(port->netdev)) {
             continue;
@@ -5091,10 +5121,18 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned, bool dry_run)
 
         for (int qid = 0; qid < port->n_rxq; qid++) {
             struct dp_netdev_rxq *q = &port->rxqs[qid];
+            uint64_t cycles = 0;
 
-            if (pinned && q->core_id != OVS_CORE_UNSPEC) {
-                struct dp_netdev_pmd_thread *pmd;
+            if (assign_cyc) {
+                /* Sum the queue intervals and store the cycle history. */
+                for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
+                    cycles += dp_netdev_rxq_get_intrvl_cycles(q, i);
+                }
+                dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST,
+                                         cycles);
+            }
 
+            if (q->core_id != OVS_CORE_UNSPEC) {
                 pmd = dp_netdev_get_pmd(dp, q->core_id);
                 if (!pmd) {
                     VLOG_WARN("There is no PMD thread on core %d. Queue "
@@ -5103,30 +5141,48 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned, bool dry_run)
                 } else {
                     q->pmd = pmd;
                     pmd->isolated = true;
+                    pmd->ll_n_rxq++;
+                    pmd->ll_cycles += cycles;
                     VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
                               "rx queue %d.", pmd->core_id, pmd->numa_id,
                               netdev_rxq_get_name(q->rx),
                               netdev_rxq_get_queue_id(q->rx));
                     dp_netdev_pmd_unref(pmd);
                 }
-            } else if (!pinned && q->core_id == OVS_CORE_UNSPEC) {
-                uint64_t cycle_hist = 0;
+            }
+        }
+    }
+}
+
+static void
+rxq_scheduling(struct dp_netdev *dp, bool dry_run)
+    OVS_REQUIRES(dp->port_mutex)
+{
+    struct dp_netdev_port *port;
+    struct dp_netdev_rxq ** rxqs = NULL;
+    struct rr_numa_list rr;
+    struct rr_numa *numa = NULL;
+    struct rr_numa *non_local_numa = NULL;
+    int n_rxqs = 0;
+    int numa_id;
+    bool assign_cyc = dp->pmd_rxq_assign_cyc;
+
+    HMAP_FOR_EACH (port, node, &dp->ports) {
+        if (!netdev_is_pmd(port->netdev)) {
+            continue;
+        }
+
+        for (int qid = 0; qid < port->n_rxq; qid++) {
+            struct dp_netdev_rxq *q = &port->rxqs[qid];
 
+            if (q->core_id == OVS_CORE_UNSPEC) {
                 if (n_rxqs == 0) {
                     rxqs = xmalloc(sizeof *rxqs);
                 } else {
                     rxqs = xrealloc(rxqs, sizeof *rxqs * (n_rxqs + 1));
                 }
 
-                if (assign_cyc) {
-                    /* Sum the queue intervals and store the cycle history. */
-                    for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
-                        cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
-                    }
-                    dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST,
-                                             cycle_hist);
-                }
-                /* Store the queue. */
+                /* Store the unpinned queue for scheduling. */
                 rxqs[n_rxqs++] = q;
             }
         }
@@ -5139,9 +5195,13 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned, bool dry_run)
     }
 
     rr_numa_list_populate(dp, &rr);
-    /* Assign the sorted queues to pmds in round robin. */
+    /* Assign the sorted queues to pmds in least loaded fashion. */
     for (int i = 0; i < n_rxqs; i++) {
+        uint64_t cycles = 0;
+
         numa_id = netdev_get_numa_id(rxqs[i]->port->netdev);
+        cycles = dp_netdev_rxq_get_cycles(rxqs[i], RXQ_CYCLES_PROC_HIST);
+
         numa = rr_numa_list_lookup(&rr, numa_id);
         if (!numa) {
             /* There are no pmds on the queue's local NUMA node.
@@ -5158,7 +5218,8 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned, bool dry_run)
                 }
                 continue;
             }
-            rxqs[i]->pmd = rr_numa_get_pmd(non_local_numa, assign_cyc);
+            rxqs[i]->pmd =
+                    rr_numa_assign_least_loaded_pmd(non_local_numa, cycles);
             if (!dry_run) {
                 VLOG_WARN("There's no available (non-isolated) pmd thread "
                           "on numa node %d. Queue %d on port \'%s\' will "
@@ -5169,7 +5230,7 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned, bool dry_run)
                           rxqs[i]->pmd->core_id, rxqs[i]->pmd->numa_id);
             }
         } else {
-            rxqs[i]->pmd = rr_numa_get_pmd(numa, assign_cyc);
+            rxqs[i]->pmd = rr_numa_assign_least_loaded_pmd(numa, cycles);
             if (!dry_run) {
                 if (assign_cyc) {
                     VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
@@ -5178,8 +5239,7 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned, bool dry_run)
                               rxqs[i]->pmd->core_id, numa_id,
                               netdev_rxq_get_name(rxqs[i]->rx),
                               netdev_rxq_get_queue_id(rxqs[i]->rx),
-                              dp_netdev_rxq_get_cycles(rxqs[i],
-                                                       RXQ_CYCLES_PROC_HIST));
+                              cycles);
                 } else {
                     VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
                               "rx queue %d.", rxqs[i]->pmd->core_id, numa_id,
@@ -5189,7 +5249,6 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned, bool dry_run)
             }
         }
     }
-
     rr_numa_list_destroy(&rr);
     free(rxqs);
 }
@@ -5443,10 +5502,10 @@ reconfigure_datapath(struct dp_netdev *dp)
     }
 
     /* Add pinned queues and mark pmd threads isolated. */
-    rxq_scheduling(dp, true, false);
+    prepare_rxq_scheduling(dp);
 
     /* Add non-pinned queues. */
-    rxq_scheduling(dp, false, false);
+    rxq_scheduling(dp, false);
 
     /* Step 5: Remove queues not compliant with new scheduling. */
 
@@ -5599,6 +5658,7 @@ pmd_rebalance_dry_run(struct dp_netdev *dp)
     uint32_t pmd_idx;
     struct rxq_poll *poll;
     bool ret = false;
+    bool dry_run;
 
     num_pmds = cmap_count(&dp->poll_threads);
 
@@ -5615,7 +5675,8 @@ pmd_rebalance_dry_run(struct dp_netdev *dp)
      * rxqs, but that will be restored from the actual polling lists
      * further down. This is safe as the pmd pointer is not used outside
      * the dp->port_mutex. */
-    rxq_scheduling(dp, false, true);
+    prepare_rxq_scheduling(dp);
+    rxq_scheduling(dp, dry_run = true);
 
     /* Calculate the current and predicted PMD loads. */
     pmd_idx = 0;
diff --git a/tests/pmd.at b/tests/pmd.at
index e0d54c1..93a0bad 100644
--- a/tests/pmd.at
+++ b/tests/pmd.at
@@ -150,8 +150,8 @@ AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=0x3])
 CHECK_PMD_THREADS_CREATED([2], [], [+$TMP])
 
 AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | awk '/AVAIL$/ { printf("%s\t", $0); next } 1' | parse_pmd_rxq_show_group | sort], [0], [dnl
-port: p0 queue-id: 0 3 4 7
-port: p0 queue-id: 1 2 5 6
+port: p0 queue-id: 0 2 4 6
+port: p0 queue-id: 1 3 5 7
 ])
 
 AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-rxq-assign=roundrobin])
-- 
2.7.4


