[ovs-dev] [PATCH RFC] netdev-dpdk: vhost-user: Fix sending packets to queues not enabled by guest.

Flavio Leitner fbl at sysclose.org
Fri Feb 12 04:44:56 UTC 2016


Hi Ilya,

On Thu, 11 Feb 2016 16:04:12 +0300
Ilya Maximets <i.maximets at samsung.com> wrote:

> Currently, the virtio driver in the guest operating system has to be
> configured to use exactly the same number of queues. If the number of
> queues is smaller, some packets will get stuck in queues unused by the
> guest and will never be received.
> 
> Fix that by using new 'vring_state_changed' callback, which is
> available for vhost-user since DPDK 2.2.
> The implementation uses an additional mapping from the configured tx
> queues to the queues enabled by the virtio driver. This requires
> mandatory locking of TX queues in __netdev_dpdk_vhost_send(), but this
> locking was almost always performed anyway, because set_multiq is
> called with n_txq = 'ovs_numa_get_n_cores() + 1'.
> 
> Fixes: 4573fbd38fa1 ("netdev-dpdk: Add vhost-user multiqueue support")
> Signed-off-by: Ilya Maximets <i.maximets at samsung.com>
> ---
>  lib/netdev-dpdk.c | 108 +++++++++++++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 95 insertions(+), 13 deletions(-)
> 
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index e4f789b..f04f2bd 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -173,6 +173,8 @@ struct dpdk_tx_queue {
>                                      * from concurrent access.  It is used only
>                                      * if the queue is shared among different
>                                      * pmd threads (see 'txq_needs_locking'). */
> +    int map;                       /* Mapping of configured vhost-user
> +                                    * queues to those enabled by guest. */
>      uint64_t tsc;
>      struct rte_mbuf *burst_pkts[MAX_TX_QUEUE_LEN];
>  };
> @@ -559,6 +561,13 @@ netdev_dpdk_alloc_txq(struct netdev_dpdk *netdev, unsigned int n_txqs)
>      unsigned i;
>  
>      netdev->tx_q = dpdk_rte_mzalloc(n_txqs * sizeof *netdev->tx_q);
> +
> +    if (netdev->type == DPDK_DEV_VHOST) {
> +        for (i = 0; i < n_txqs; i++) {
> +            netdev->tx_q[i].map = -1;
> +        }
> +    }
> +

There is a similar loop further down that initializes the other
queue variables, so the above could be folded into that loop:

      for (i = 0; i < n_txqs; i++) {
          int numa_id = ovs_numa_get_numa_id(i);

          if (!netdev->txq_needs_locking) {
              netdev->tx_q[i].flush_tx = netdev->socket_id == numa_id;
          } else {
              netdev->tx_q[i].flush_tx = true;
          }

          /* Initialize map for vhost devices. */
          netdev->tx_q[i].map = -1;
          rte_spinlock_init(&netdev->tx_q[i].tx_lock);
      }

It seems cleaner to me, but I have no strong opinion here.

> @@ -1117,17 +1126,16 @@ __netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
>      unsigned int total_pkts = cnt;
>      uint64_t start = 0;
>  
> -    if (OVS_UNLIKELY(!is_vhost_running(virtio_dev))) {
> +    qid = vhost_dev->tx_q[qid % vhost_dev->real_n_txq].map;
> +
> +    if (OVS_UNLIKELY(!is_vhost_running(virtio_dev) || qid == -1)) {
>          rte_spinlock_lock(&vhost_dev->stats_lock);
>          vhost_dev->stats.tx_dropped+= cnt;
>          rte_spinlock_unlock(&vhost_dev->stats_lock);
>          goto out;
>      }
>  
> -    if (vhost_dev->txq_needs_locking) {
> -        qid = qid % vhost_dev->real_n_txq;
> -        rte_spinlock_lock(&vhost_dev->tx_q[qid].tx_lock);
> -    }
> +    rte_spinlock_lock(&vhost_dev->tx_q[qid].tx_lock);
>  
>      do {
>          int vhost_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ;
> @@ -1165,9 +1173,7 @@ __netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
>          }
>      } while (cnt);
>  
> -    if (vhost_dev->txq_needs_locking) {
> -        rte_spinlock_unlock(&vhost_dev->tx_q[qid].tx_lock);
> -    }
> +    rte_spinlock_unlock(&vhost_dev->tx_q[qid].tx_lock);
>  
>      rte_spinlock_lock(&vhost_dev->stats_lock);
>      netdev_dpdk_vhost_update_tx_counters(&vhost_dev->stats, pkts, total_pkts,
> @@ -1843,15 +1849,49 @@ netdev_dpdk_vhost_set_queues(struct netdev_dpdk *netdev, struct virtio_net *dev)
>  
>      netdev->real_n_rxq = qp_num;
>      netdev->real_n_txq = qp_num;
> -    if (netdev->up.n_txq > netdev->real_n_txq) {
> -        netdev->txq_needs_locking = true;
> -    } else {
> -        netdev->txq_needs_locking = false;
> -    }
> +    netdev->txq_needs_locking = true;
>  
>      return 0;
>  }
>  
> +/* Fixes mapping for vhost-user tx queues. */
> +static void
> +netdev_dpdk_remap_txqs(struct netdev_dpdk *netdev)
> +    OVS_REQUIRES(netdev->mutex)
> +{
> +    int *enabled_queues, n_enabled = 0;
> +    int i, k, total_txqs = netdev->real_n_txq;
> +
> +    enabled_queues = dpdk_rte_mzalloc(total_txqs * sizeof *enabled_queues);
> +
> +    for (i = 0; i < total_txqs; i++) {
> +        /* Enabled queues always mapped to themselves. */
> +        if (netdev->tx_q[i].map == i) {
> +            enabled_queues[n_enabled++] = i;
> +        }
> +    }
> +
> +    if (n_enabled == 0 && total_txqs != 0) {
> +        enabled_queues[0] = -1;
> +        n_enabled = 1;
> +    }
> +
> +    k = 0;
> +    for (i = 0; i < total_txqs; i++) {
> +        if (netdev->tx_q[i].map != i) {
> +            netdev->tx_q[i].map = enabled_queues[k];
> +            k = (k + 1) % n_enabled;
> +        }
> +    }
> +
> +    VLOG_DBG("TX queue mapping for %s\n", netdev->vhost_id);
> +    for (i = 0; i < total_txqs; i++) {
> +        VLOG_DBG("%2d --> %2d", i, netdev->tx_q[i].map);
> +    }
> +
> +    rte_free(enabled_queues);
> +}
> +
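
To illustrate the remapping for other readers, here is my standalone
sketch of the same round-robin logic (my illustration, not part of the
patch): with real_n_txq = 4 and only queues 0 and 2 enabled by the
guest, the disabled queues get pointed at the enabled ones in
round-robin order, giving 0 --> 0, 1 --> 0, 2 --> 2, 3 --> 2.

    #include <stdio.h>

    int
    main(void)
    {
        int map[4] = { 0, -1, 2, -1 };  /* Guest enabled queues 0 and 2. */
        int enabled[4], n_enabled = 0, i, k = 0;

        /* Enabled queues always map to themselves. */
        for (i = 0; i < 4; i++) {
            if (map[i] == i) {
                enabled[n_enabled++] = i;
            }
        }

        /* Spread the disabled queues over the enabled ones, round-robin. */
        for (i = 0; i < 4; i++) {
            if (map[i] != i) {
                map[i] = enabled[k];
                k = (k + 1) % n_enabled;
            }
        }

        for (i = 0; i < 4; i++) {
            printf("%2d --> %2d\n", i, map[i]);
        }
        return 0;
    }
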
>  /*
>   * A new virtio-net device is added to a vhost port.
>   */
> @@ -1941,6 +1981,47 @@ destroy_device(volatile struct virtio_net *dev)
>  
>  }
>  
> +static int
> +vring_state_changed(struct virtio_net *dev, uint16_t queue_id, int enable)
> +{
> +    struct netdev_dpdk *vhost_dev;
> +    bool exists = false;
> +    int qid = queue_id / VIRTIO_QNUM;
> +
> +    if (queue_id % VIRTIO_QNUM == VIRTIO_TXQ) {
> +        return 0;
> +    }
> +
> +    ovs_mutex_lock(&dpdk_mutex);
> +    LIST_FOR_EACH (vhost_dev, list_node, &dpdk_list) {
> +        if (strncmp(dev->ifname, vhost_dev->vhost_id, IF_NAME_SZ) == 0) {
> +            ovs_mutex_lock(&vhost_dev->mutex);
> +            if (enable) {
> +                vhost_dev->tx_q[qid].map = qid;
> +            } else {
> +                vhost_dev->tx_q[qid].map = -1;
> +            }
> +            netdev_dpdk_remap_txqs(vhost_dev);
> +            exists = true;
> +            ovs_mutex_unlock(&vhost_dev->mutex);
> +            break;
> +        }
> +    }
> +    ovs_mutex_unlock(&dpdk_mutex);
> +
> +    if (exists) {
> +        VLOG_INFO("State of queue %d ( tx_qid %d ) of vhost device '%s' %"
> +                  PRIu64" changed to \'%s\'", queue_id, qid, dev->ifname,
> +                  dev->device_fh, (enable == 1) ? "enabled" : "disabled");
> +    } else {
> +        VLOG_INFO("vHost Device '%s' %"PRIu64" not found", dev->ifname,
> +                  dev->device_fh);
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
>  struct virtio_net *
>  netdev_dpdk_get_virtio(const struct netdev_dpdk *dev)
>  {
> @@ -1955,6 +2036,7 @@ static const struct virtio_net_device_ops virtio_net_device_ops =
>  {
>      .new_device =  new_device,
>      .destroy_device = destroy_device,
> +    .vring_state_changed = vring_state_changed
>  };
>  
>  static void *
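
One note for readers unfamiliar with the vring numbering, since the
callback above depends on it: virtqueue ids interleave the guest RX
and TX rings per queue pair, which is why vring_state_changed()
divides by VIRTIO_QNUM and ignores the guest TX rings.  A quick
standalone sketch of that layout (my illustration, not part of the
patch):

    #include <stdio.h>

    #define VIRTIO_RXQ  0   /* Guest RX ring, i.e. our TX direction. */
    #define VIRTIO_TXQ  1   /* Guest TX ring, i.e. our RX direction. */
    #define VIRTIO_QNUM 2   /* Rings per queue pair. */

    int
    main(void)
    {
        int queue_id;

        for (queue_id = 0; queue_id < 8; queue_id++) {
            printf("vring %d -> queue pair %d (%s)\n", queue_id,
                   queue_id / VIRTIO_QNUM,
                   queue_id % VIRTIO_QNUM == VIRTIO_TXQ
                   ? "guest TX, ignored" : "guest RX, tracked");
        }
        return 0;
    }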

I had a plan to change how many TX queues we allocate, which would
then allow us to not use locking at all.  The reason for having
n_txq = 'ovs_numa_get_n_cores() + 1' is to make sure that, regardless
of the core a PMD thread is running on, it has a unique TX queue.
Since your patch proposes sequential queue ids, we wouldn't need to
allocate that many queues anymore.
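
To sketch what I mean (hypothetical helper, not real OVS code): with
n_txq = n_cores + 1, a PMD thread pinned to core 'core_id' can use the
core id directly as its queue id, so no two PMD threads ever share a
queue and no lock is needed; the extra queue serves non-PMD threads.

    #include <stdio.h>

    /* Hypothetical illustration only.  PMD threads own queues
     * [0, n_cores) exclusively; non-PMD threads share queue n_cores. */
    static int
    pmd_private_txq(unsigned core_id, unsigned n_cores)
    {
        return core_id < n_cores ? (int) core_id : (int) n_cores;
    }

    int
    main(void)
    {
        unsigned core;

        for (core = 0; core < 4; core++) {
            printf("PMD on core %u -> txq %d\n", core,
                   pmd_private_txq(core, 4));
        }
        printf("non-PMD thread -> txq %d\n", pmd_private_txq(4, 4));
        return 0;
    }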

Anyway, this patch is more important and it works for me.

Acked-by: Flavio Leitner <fbl at sysclose.org>

This should go in 2.5 but I am not sure how much time we have left for
the actual release.

Thanks,
-- 
fbl
