[ovs-dev] [PATCH v2 1/3] dpif-netdev: only poll enabled vhost queues

Ilya Maximets i.maximets at samsung.com
Tue Apr 16 13:41:24 UTC 2019


Hi.

One comment on the patch names. It's not a strict rule, but it's better
to make the <summary> part of the subject a complete sentence,
i.e. start with a capital letter and end with a period.

The same rule applies to comments in the code, but that case is
documented in the coding-style guide.

More comments inline.

Best regards, Ilya Maximets.

On 16.04.2019 12:45, David Marchand wrote:
> We currently poll all available queues based on the max queue count
> exchanged with the vhost peer and rely on the vhost library in DPDK to
> check the vring status beneath.
> This can lead to some overhead when we have a lot of unused queues.
> 
> To enhance the situation, we can skip the disabled queues.
> On rxq notifications, we make use of the netdev's change_seq number so
> that the pmd thread main loop can cache the queue state periodically.
> 
> $ ovs-appctl dpif-netdev/pmd-rxq-show
> pmd thread numa_id 0 core_id 1:
>   isolated : true
>   port: dpdk0             queue-id:  0  pmd usage:  0 %
> pmd thread numa_id 0 core_id 2:
>   isolated : true
>   port: vhost1            queue-id:  0  pmd usage:  0 %
>   port: vhost3            queue-id:  0  pmd usage:  0 %
> pmd thread numa_id 0 core_id 15:
>   isolated : true
>   port: dpdk1             queue-id:  0  pmd usage:  0 %
> pmd thread numa_id 0 core_id 16:
>   isolated : true
>   port: vhost0            queue-id:  0  pmd usage:  0 %
>   port: vhost2            queue-id:  0  pmd usage:  0 %
> 
> $ while true; do
>   ovs-appctl dpif-netdev/pmd-rxq-show |awk '
>   /port: / {
>     tot++;
>     if ($NF == "disabled") {
>       dis++;
>     }
>   }
>   END {
>     print "total: " tot ", enabled: " (tot - dis)
>   }'
>   sleep 1
> done
> 
> total: 6, enabled: 2
> total: 6, enabled: 2
> ...
> 
>  # Started vm, virtio devices are bound to kernel driver which enables
>  # F_MQ + all queue pairs
> total: 6, enabled: 2
> total: 66, enabled: 66
> ...
> 
>  # Unbound vhost0 and vhost1 from the kernel driver
> total: 66, enabled: 66
> total: 66, enabled: 34
> ...
> 
>  # Configured kernel bound devices to use only 1 queue pair
> total: 66, enabled: 34
> total: 66, enabled: 19
> total: 66, enabled: 4
> ...
> 
>  # While rebooting the vm
> total: 66, enabled: 4
> total: 66, enabled: 2
> ...
> total: 66, enabled: 66
> ...
> 
>  # After shutting down the vm
> total: 66, enabled: 66
> total: 66, enabled: 2
> 
> Signed-off-by: David Marchand <david.marchand at redhat.com>
> ---
> 
> Changes since v1:
> - only indicate disabled queues in dpif-netdev/pmd-rxq-show output
> - Ilya comments
>   - no need for a struct as we only need a boolean per rxq
>   - "rx_q" is generic, while we only care for this in vhost case,
>     renamed as "vhost_rxq_enabled",
>   - add missing rte_free on allocation error,
>   - vhost_rxq_enabled is freed in vhost destruct only,
> -   rxq0 is enabled at the virtio device activation to accommodate
>     legacy implementations which would not report per queue states
>     later,
>   - do not mix boolean with integer,
>   - do not use bit operand on boolean,
> 
> ---
>  lib/dpif-netdev.c     | 27 ++++++++++++++++++++++++
>  lib/netdev-dpdk.c     | 58 +++++++++++++++++++++++++++++++++++++++------------
>  lib/netdev-provider.h |  5 +++++
>  lib/netdev.c          | 10 +++++++++
>  lib/netdev.h          |  1 +
>  5 files changed, 88 insertions(+), 13 deletions(-)
> 
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 4d6d0c3..5bfa6ad 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -591,6 +591,8 @@ struct polled_queue {
>      struct dp_netdev_rxq *rxq;
>      odp_port_t port_no;
>      bool emc_enabled;
> +    bool enabled;

What do you think about renaming it to 'rxq_enabled'? It seems clearer.
Having both 'emc_enabled' and just 'enabled' is a bit confusing.

> +    uint64_t change_seq;
>  };
>  
>  /* Contained by struct dp_netdev_pmd_thread's 'poll_list' member. */
> @@ -1171,6 +1173,9 @@ pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
>              } else {
>                  ds_put_format(reply, "%s", "NOT AVAIL");
>              }
> +            if (!netdev_rxq_enabled(list[i].rxq->rx)) {
> +                ds_put_cstr(reply, "  polling: disabled");
> +            }
>              ds_put_cstr(reply, "\n");
>          }
>          ovs_mutex_unlock(&pmd->port_mutex);
> @@ -5198,6 +5203,11 @@ dpif_netdev_run(struct dpif *dpif)
>                  }
>  
>                  for (i = 0; i < port->n_rxq; i++) {
> +
> +                    if (!netdev_rxq_enabled(port->rxqs[i].rx)) {
> +                        continue;
> +                    }
> +
>                      if (dp_netdev_process_rxq_port(non_pmd,
>                                                     &port->rxqs[i],
>                                                     port->port_no)) {
> @@ -5371,6 +5381,9 @@ pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd,
>          poll_list[i].rxq = poll->rxq;
>          poll_list[i].port_no = poll->rxq->port->port_no;
>          poll_list[i].emc_enabled = poll->rxq->port->emc_enabled;
> +        poll_list[i].enabled = netdev_rxq_enabled(poll->rxq->rx);
> +        poll_list[i].change_seq =
> +                     netdev_get_change_seq(poll->rxq->port->netdev);
>          i++;
>      }
>  
> @@ -5436,6 +5449,10 @@ reload:
>  
>          for (i = 0; i < poll_cnt; i++) {
>  
> +            if (!poll_list[i].enabled) {
> +                continue;
> +            }
> +
>              if (poll_list[i].emc_enabled) {
>                  atomic_read_relaxed(&pmd->dp->emc_insert_min,
>                                      &pmd->ctx.emc_insert_min);
> @@ -5472,6 +5489,16 @@ reload:
>              if (reload) {
>                  break;
>              }
> +
> +            for (i = 0; i < poll_cnt; i++) {
> +                uint64_t current_seq =
> +                         netdev_get_change_seq(poll_list[i].rxq->port->netdev);
> +                if (poll_list[i].change_seq != current_seq) {
> +                    poll_list[i].change_seq = current_seq;
> +                    poll_list[i].enabled =
> +                                 netdev_rxq_enabled(poll_list[i].rxq->rx);
> +                }
> +            }
>          }
>          pmd_perf_end_iteration(s, rx_packets, tx_packets,
>                                 pmd_perf_metrics_enabled(pmd));
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index 47153dc..9ba8e67 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -424,6 +424,9 @@ struct netdev_dpdk {
>          OVSRCU_TYPE(struct ingress_policer *) ingress_policer;
>          uint32_t policer_rate;
>          uint32_t policer_burst;
> +
> +        /* Array of vhost rxq states, see vring_state_changed */

Should end with a period.

> +        bool *vhost_rxq_enabled;
>      );
>  
>      PADDED_MEMBERS(CACHE_LINE_SIZE,
> @@ -1235,8 +1238,14 @@ vhost_common_construct(struct netdev *netdev)
>      int socket_id = rte_lcore_to_socket_id(rte_get_master_lcore());
>      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>  
> +    dev->vhost_rxq_enabled = dpdk_rte_mzalloc(OVS_VHOST_MAX_QUEUE_NUM *
> +                                              sizeof *dev->vhost_rxq_enabled);
> +    if (!dev->vhost_rxq_enabled) {
> +        return ENOMEM;
> +    }
>      dev->tx_q = netdev_dpdk_alloc_txq(OVS_VHOST_MAX_QUEUE_NUM);
>      if (!dev->tx_q) {
> +        rte_free(dev->vhost_rxq_enabled);
>          return ENOMEM;
>      }
>  
> @@ -1448,6 +1457,7 @@ netdev_dpdk_vhost_destruct(struct netdev *netdev)
>      dev->vhost_id = NULL;
>  
>      common_destruct(dev);
> +    rte_free(dev->vhost_rxq_enabled);

Logically, 'common_destruct' should go after class-specific things.

>  
>      ovs_mutex_unlock(&dpdk_mutex);
>  
> @@ -2200,6 +2210,14 @@ netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
>      return 0;
>  }
>  
> +static bool
> +netdev_dpdk_vhost_rxq_enabled(struct netdev_rxq *rxq)
> +{
> +    struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
> +
> +    return dev->vhost_rxq_enabled[rxq->queue_id];
> +}
> +
>  static int
>  netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet_batch *batch,
>                       int *qfill)
> @@ -3563,6 +3581,8 @@ destroy_device(int vid)
>              ovs_mutex_lock(&dev->mutex);
>              dev->vhost_reconfigured = false;
>              ovsrcu_index_set(&dev->vid, -1);
> +            memset(dev->vhost_rxq_enabled, 0,
> +                   OVS_VHOST_MAX_QUEUE_NUM * sizeof *dev->vhost_rxq_enabled);

We need to clear only the first 'dev->up.n_rxq' queues.

>              netdev_dpdk_txq_map_clear(dev);
>  
>              netdev_change_seq_changed(&dev->up);
> @@ -3597,24 +3617,30 @@ vring_state_changed(int vid, uint16_t queue_id, int enable)
>      struct netdev_dpdk *dev;
>      bool exists = false;
>      int qid = queue_id / VIRTIO_QNUM;
> +    bool is_rx = (queue_id % VIRTIO_QNUM) == VIRTIO_TXQ;
>      char ifname[IF_NAME_SZ];
>  
>      rte_vhost_get_ifname(vid, ifname, sizeof ifname);
>  
> -    if (queue_id % VIRTIO_QNUM == VIRTIO_TXQ) {
> -        return 0;
> -    }
> -
>      ovs_mutex_lock(&dpdk_mutex);
>      LIST_FOR_EACH (dev, list_node, &dpdk_list) {
>          ovs_mutex_lock(&dev->mutex);
>          if (nullable_string_is_equal(ifname, dev->vhost_id)) {
> -            if (enable) {
> -                dev->tx_q[qid].map = qid;
> +            if (is_rx) {
> +                bool enabled = dev->vhost_rxq_enabled[qid];

It is also confusing to have 'enable' and 'enabled' in the same scope.
What do you think about renaming 'enabled' --> 'old_state'?

> +
> +                dev->vhost_rxq_enabled[qid] = enable != 0;
> +                if (enabled != dev->vhost_rxq_enabled[qid]) {
> +                    netdev_change_seq_changed(&dev->up);
> +                }
>              } else {
> -                dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED;
> +                if (enable) {
> +                    dev->tx_q[qid].map = qid;
> +                } else {
> +                    dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED;
> +                }
> +                netdev_dpdk_remap_txqs(dev);
>              }
> -            netdev_dpdk_remap_txqs(dev);
>              exists = true;
>              ovs_mutex_unlock(&dev->mutex);
>              break;
> @@ -3624,9 +3650,9 @@ vring_state_changed(int vid, uint16_t queue_id, int enable)
>      ovs_mutex_unlock(&dpdk_mutex);
>  
>      if (exists) {
> -        VLOG_INFO("State of queue %d ( tx_qid %d ) of vhost device '%s' "
> -                  "changed to \'%s\'", queue_id, qid, ifname,
> -                  (enable == 1) ? "enabled" : "disabled");
> +        VLOG_INFO("State of queue %d ( %s_qid %d ) of vhost device '%s' "
> +                  "changed to \'%s\'", queue_id, is_rx == true ? "rx" : "tx",
> +                  qid, ifname, (enable == 1) ? "enabled" : "disabled");
>      } else {
>          VLOG_INFO("vHost Device '%s' not found", ifname);
>          return -1;
> @@ -4085,6 +4112,10 @@ dpdk_vhost_reconfigure_helper(struct netdev_dpdk *dev)
>      dev->up.n_rxq = dev->requested_n_rxq;
>      int err;
>  
> +    /* Always keep RX queue 0 enabled for implementations that won't
> +     * report vring states. */
> +    dev->vhost_rxq_enabled[0] = true;
> +
>      /* Enable TX queue 0 by default if it wasn't disabled. */
>      if (dev->tx_q[0].map == OVS_VHOST_QUEUE_MAP_UNKNOWN) {
>          dev->tx_q[0].map = 0;
> @@ -4297,7 +4327,8 @@ static const struct netdev_class dpdk_vhost_class = {
>      .get_stats = netdev_dpdk_vhost_get_stats,
>      .get_status = netdev_dpdk_vhost_user_get_status,
>      .reconfigure = netdev_dpdk_vhost_reconfigure,
> -    .rxq_recv = netdev_dpdk_vhost_rxq_recv
> +    .rxq_recv = netdev_dpdk_vhost_rxq_recv,
> +    .rxq_enabled = netdev_dpdk_vhost_rxq_enabled,
>  };
>  
>  static const struct netdev_class dpdk_vhost_client_class = {
> @@ -4311,7 +4342,8 @@ static const struct netdev_class dpdk_vhost_client_class = {
>      .get_stats = netdev_dpdk_vhost_get_stats,
>      .get_status = netdev_dpdk_vhost_user_get_status,
>      .reconfigure = netdev_dpdk_vhost_client_reconfigure,
> -    .rxq_recv = netdev_dpdk_vhost_rxq_recv
> +    .rxq_recv = netdev_dpdk_vhost_rxq_recv,
> +    .rxq_enabled = netdev_dpdk_vhost_rxq_enabled,
>  };
>  
>  void
> diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h
> index fb0c27e..5faae0d 100644
> --- a/lib/netdev-provider.h
> +++ b/lib/netdev-provider.h
> @@ -789,6 +789,11 @@ struct netdev_class {
>      void (*rxq_destruct)(struct netdev_rxq *);
>      void (*rxq_dealloc)(struct netdev_rxq *);
>  
> +    /* A netdev can report if a queue won't get traffic and should be excluded
> +     * from polling (no callback implicitely means that the queue is enabled).
> +     */

I'm suggesting the following variant of this comment, to keep it similar
to the other function descriptions here:

    /* Retrieves the current state of the rx queue.  'false' means that the
     * queue won't get traffic in the short term and need not be polled.
     *
     * This function may be set to null if it would always return 'true'
     * anyhow. */

> +    bool (*rxq_enabled)(struct netdev_rxq *);
> +
>      /* Attempts to receive a batch of packets from 'rx'.  In 'batch', the
>       * caller supplies 'packets' as the pointer to the beginning of an array
>       * of NETDEV_MAX_BURST pointers to dp_packet.  If successful, the
> diff --git a/lib/netdev.c b/lib/netdev.c
> index 7d7ecf6..03a1b24 100644
> --- a/lib/netdev.c
> +++ b/lib/netdev.c
> @@ -682,6 +682,16 @@ netdev_rxq_close(struct netdev_rxq *rx)
>      }
>  }
>  
> +bool netdev_rxq_enabled(struct netdev_rxq *rx)
> +{
> +    bool enabled = true;
> +
> +    if (rx->netdev->netdev_class->rxq_enabled) {
> +        enabled = rx->netdev->netdev_class->rxq_enabled(rx);
> +    }
> +    return enabled;
> +}
> +
>  /* Attempts to receive a batch of packets from 'rx'.  'batch' should point to
>   * the beginning of an array of NETDEV_MAX_BURST pointers to dp_packet.  If
>   * successful, this function stores pointers to up to NETDEV_MAX_BURST
> diff --git a/lib/netdev.h b/lib/netdev.h
> index d94817f..bfcdf39 100644
> --- a/lib/netdev.h
> +++ b/lib/netdev.h
> @@ -183,6 +183,7 @@ enum netdev_pt_mode netdev_get_pt_mode(const struct netdev *);
>  /* Packet reception. */
>  int netdev_rxq_open(struct netdev *, struct netdev_rxq **, int id);
>  void netdev_rxq_close(struct netdev_rxq *);
> +bool netdev_rxq_enabled(struct netdev_rxq *);
>  
>  const char *netdev_rxq_get_name(const struct netdev_rxq *);
>  int netdev_rxq_get_queue_id(const struct netdev_rxq *);
> 


More information about the dev mailing list