[ovs-dev] [PATCH v6 4/6] dpif-netdev: Change rxq_scheduling to use rxq processing cycles.

Ilya Maximets i.maximets at samsung.com
Mon Aug 28 13:29:05 UTC 2017


> Previously rxqs were assigned to pmds by round robin in
> port/queue order.
> 
> Now that we have the processing cycles used for existing rxqs,
> use that information to try to produce a better-balanced
> distribution of rxqs across pmds. i.e. given multiple pmds, the
> rxqs which have consumed the largest amount of processing cycles
> will be placed on different pmds.
> 
> The rxqs are sorted by their processing cycles and assigned (in
> sorted order) round robin across pmds.


One high-level concern about this change:

Any change in any port, or the addition/removal of a PMD thread, will
likely lead to rescheduling of all the ports to different PMD threads,
especially if the load across threads is nearly equal. This will cause a
full stop of packet processing for the duration of the reconfiguration,
followed by cache misses.
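
For example (hypothetical numbers): with two pmds and four rxqs measured
at 40/30/20/10 kcycles, the sorted round robin yields {40, 20} on one pmd
and {30, 10} on the other; if the 40 and 30 kcycle queues swap order in
the next measurement, every rxq moves to the other pmd.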

> 
> Signed-off-by: Kevin Traynor <ktraynor at redhat.com>
> ---
>  Documentation/howto/dpdk.rst |   7 +++
>  lib/dpif-netdev.c            | 123 ++++++++++++++++++++++++++++++++-----------
>  2 files changed, 99 insertions(+), 31 deletions(-)
> 
> diff --git a/Documentation/howto/dpdk.rst b/Documentation/howto/dpdk.rst
> index d7f6610..a67f3a1 100644
> --- a/Documentation/howto/dpdk.rst
> +++ b/Documentation/howto/dpdk.rst
> @@ -119,4 +119,11 @@ After that PMD threads on cores where RX queues was pinned will become
>    thread.
>  
> +If pmd-rxq-affinity is not set for rxqs, they will be assigned to pmds (cores)
> +automatically. The processing cycles that have been stored for each rxq
> +will be used where known to assign rxqs to pmds based on a round robin of the
> +sorted rxqs.
> +
> +Rxq to pmds assignment takes place whenever there are configuration changes.
> +
>  QoS
>  ---
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index cdf2662..4b1cb85 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -715,4 +715,6 @@ static void
>  dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
>                             unsigned long long cycles);
> +static uint64_t
> +dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx);
>  static void
>  dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
> @@ -3169,4 +3171,12 @@ dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
>  }
>  
> +static uint64_t
> +dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx)
> +{
> +    unsigned long long processing_cycles;
> +    atomic_read_relaxed(&rx->cycles_intrvl[idx], &processing_cycles);
> +    return processing_cycles;
> +}
> +
>  static int
>  dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
> @@ -3355,8 +3365,37 @@ rr_numa_list_destroy(struct rr_numa_list *rr)
>  }
>  
> +/* Sort Rx Queues by the processing cycles they are consuming. */
> +static int
> +rxq_cycle_sort(const void *a, const void *b)
> +{
> +    struct dp_netdev_rxq * qa;
> +    struct dp_netdev_rxq * qb;

The '*' should be placed next to the variable name.
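
i.e.:

    struct dp_netdev_rxq *qa;
    struct dp_netdev_rxq *qb;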

> +    uint64_t total_qa, total_qb;
> +    unsigned i;
> +
> +    qa = *(struct dp_netdev_rxq **) a;
> +    qb = *(struct dp_netdev_rxq **) b;
> +
> +    total_qa = total_qb = 0;
> +    for (i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
> +        total_qa += dp_netdev_rxq_get_intrvl_cycles(qa, i);
> +        total_qb += dp_netdev_rxq_get_intrvl_cycles(qb, i);

You're recalculating the sum of the stats each time qsort calls the
comparison function. Since the per-interval values may change between
calls, this can produce inconsistent comparisons and, in fact, an
undefined sort result. (See the sketch after the next comment.)

> +    }
> +    dp_netdev_rxq_set_cycles(qa, RXQ_CYCLES_PROC_HIST, total_qa);
> +    dp_netdev_rxq_set_cycles(qb, RXQ_CYCLES_PROC_HIST, total_qb);
> +
> +    if (total_qa >= total_qb) {
> +        return -1;
> +    }
> +    return 1;

The comparison function should also return '0' for equal values; as
written, the code above can likewise produce an undefined sorting result.
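
Something like the following might address both points. This is only a
sketch, reusing the helpers already introduced by this patch: sum each
queue's cycles once, before the qsort() call in rxq_scheduling(), and
have the comparator read only the stored totals:

    /* In rxq_scheduling(), before calling qsort(): sum each queue's
     * interval cycles once so that the sort key is stable. */
    for (i = 0; i < n_rxqs; i++) {
        uint64_t total = 0;

        for (unsigned j = 0; j < PMD_RXQ_INTERVAL_MAX; j++) {
            total += dp_netdev_rxq_get_intrvl_cycles(rxqs[i], j);
        }
        dp_netdev_rxq_set_cycles(rxqs[i], RXQ_CYCLES_PROC_HIST, total);
    }

    /* Comparator reads the precomputed totals and returns 0 on
     * equality, giving qsort a consistent (descending) order. */
    static int
    rxq_cycle_sort(const void *a, const void *b)
    {
        struct dp_netdev_rxq *qa = *(struct dp_netdev_rxq **) a;
        struct dp_netdev_rxq *qb = *(struct dp_netdev_rxq **) b;
        uint64_t ca = dp_netdev_rxq_get_cycles(qa, RXQ_CYCLES_PROC_HIST);
        uint64_t cb = dp_netdev_rxq_get_cycles(qb, RXQ_CYCLES_PROC_HIST);

        if (ca != cb) {
            return ca < cb ? 1 : -1;
        }
        return 0;
    }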

> +}
> +
>  /* Assign pmds to queues.  If 'pinned' is true, assign pmds to pinned
>   * queues and marks the pmds as isolated.  Otherwise, assign non isolated
>   * pmds to unpinned queues.
>   *
> + * If 'pinned' is false queues will be sorted by processing cycles they are
> + * consuming and then assigned to pmds in round robin order.
> + *
>   * The function doesn't touch the pmd threads, it just stores the assignment
>   * in the 'pmd' member of each rxq. */
> @@ -3367,18 +3406,14 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
>      struct rr_numa_list rr;
>      struct rr_numa *non_local_numa = NULL;
> -
> -    rr_numa_list_populate(dp, &rr);
> +    struct dp_netdev_rxq ** rxqs = NULL;
> +    int i, n_rxqs = 0;
> +    struct rr_numa *numa = NULL;
> +    int numa_id;

Can you merge some of the above declarations of the same type, or at
least group them next to each other?
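
E.g. (untested sketch of the merged declarations):

    struct dp_netdev_rxq **rxqs = NULL;
    struct rr_numa *numa = NULL;
    int i, n_rxqs = 0, numa_id;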

>  
>      HMAP_FOR_EACH (port, node, &dp->ports) {
> -        struct rr_numa *numa;
> -        int numa_id;
> -
>          if (!netdev_is_pmd(port->netdev)) {
>              continue;
>          }
>  
> -        numa_id = netdev_get_numa_id(port->netdev);
> -        numa = rr_numa_list_lookup(&rr, numa_id);
> -
>          for (int qid = 0; qid < port->n_rxq; qid++) {
>              struct dp_netdev_rxq *q = &port->rxqs[qid];
> @@ -3398,34 +3433,60 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
>                  }
>              } else if (!pinned && q->core_id == OVS_CORE_UNSPEC) {
> -                if (!numa) {
> -                    /* There are no pmds on the queue's local NUMA node.
> -                       Round-robin on the NUMA nodes that do have pmds. */
> -                    non_local_numa = rr_numa_list_next(&rr, non_local_numa);
> -                    if (!non_local_numa) {
> -                        VLOG_ERR("There is no available (non-isolated) pmd "
> -                                 "thread for port \'%s\' queue %d. This queue "
> -                                 "will not be polled. Is pmd-cpu-mask set to "
> -                                 "zero? Or are all PMDs isolated to other "
> -                                 "queues?", netdev_get_name(port->netdev),
> -                                 qid);
> -                        continue;
> -                    }
> -                    q->pmd = rr_numa_get_pmd(non_local_numa);
> -                    VLOG_WARN("There's no available (non-isolated) pmd thread "
> -                              "on numa node %d. Queue %d on port \'%s\' will "
> -                              "be assigned to the pmd on core %d "
> -                              "(numa node %d). Expect reduced performance.",
> -                              numa_id, qid, netdev_get_name(port->netdev),
> -                              q->pmd->core_id, q->pmd->numa_id);
> +                if (n_rxqs == 0) {
> +                    rxqs = xmalloc(sizeof *rxqs);
>                  } else {
> -                    /* Assign queue to the next (round-robin) PMD on it's local
> -                       NUMA node. */
> -                    q->pmd = rr_numa_get_pmd(numa);
> +                    rxqs = xrealloc(rxqs, sizeof *rxqs * (n_rxqs + 1));
>                  }
> +                /* Store the queue. */
> +                rxqs[n_rxqs++] = q;
>              }
>          }
>      }
>  
> +    if (n_rxqs > 1) {
> +        /* Sort the queues in order of the processing cycles
> +         * they consumed during their last pmd interval. */
> +        qsort(rxqs, n_rxqs, sizeof *rxqs, rxq_cycle_sort);
> +    }
> +
> +    rr_numa_list_populate(dp, &rr);
> +    /* Assign the sorted queues to pmds in round robin. */
> +    for (i = 0; i < n_rxqs; i++) {
> +        numa_id = netdev_get_numa_id(rxqs[i]->port->netdev);
> +        numa = rr_numa_list_lookup(&rr, numa_id);
> +        if (!numa) {
> +            /* There are no pmds on the queue's local NUMA node.
> +               Round robin on the NUMA nodes that do have pmds. */
> +            non_local_numa = rr_numa_list_next(&rr, non_local_numa);
> +            if (!non_local_numa) {
> +                VLOG_ERR("There is no available (non-isolated) pmd "
> +                         "thread for port \'%s\' queue %d. This queue "
> +                         "will not be polled. Is pmd-cpu-mask set to "
> +                         "zero? Or are all PMDs isolated to other "
> +                         "queues?", netdev_rxq_get_name(rxqs[i]->rx),
> +                         netdev_rxq_get_queue_id(rxqs[i]->rx));
> +                continue;
> +            }
> +            rxqs[i]->pmd = rr_numa_get_pmd(non_local_numa);
> +            VLOG_WARN("There's no available (non-isolated) pmd thread "
> +                      "on numa node %d. Queue %d on port \'%s\' will "
> +                      "be assigned to the pmd on core %d "
> +                      "(numa node %d). Expect reduced performance.",
> +                      numa_id, netdev_rxq_get_queue_id(rxqs[i]->rx),
> +                      netdev_rxq_get_name(rxqs[i]->rx),
> +                      rxqs[i]->pmd->core_id, rxqs[i]->pmd->numa_id);
> +        } else {
> +        rxqs[i]->pmd = rr_numa_get_pmd(numa);
> +        VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
> +                  "rx queue %d (measured processing cycles %"PRIu64").",
> +                  rxqs[i]->pmd->core_id, numa_id,
> +                  netdev_rxq_get_name(rxqs[i]->rx),
> +                  netdev_rxq_get_queue_id(rxqs[i]->rx),
> +                  dp_netdev_rxq_get_cycles(rxqs[i], RXQ_CYCLES_PROC_HIST));
> +        }
> +    }
> +
>      rr_numa_list_destroy(&rr);
> +    free(rxqs);
>  }
>  
> -- 
> 1.8.3.

