[ovs-dev] [PATCH v4 2/5] netdev-dpdk: Add netdev_dpdk_vhost_txq_flush function.

Ilya Maximets i.maximets at samsung.com
Wed Aug 9 12:22:26 UTC 2017


On 09.08.2017 13:03, Ilya Maximets wrote:
> One more comment inline.
> 
> On 09.08.2017 11:06, Ilya Maximets wrote:
>> Not a full review.
>> One comment inline.
>>
>>> Add netdev_dpdk_vhost_txq_flush(), that flushes packets on vHost User
>>> port queues. Also add netdev_dpdk_vhost_tx_burst() function that
>>> uses rte_vhost_enqueue_burst() to enqueue burst of packets on vHost User
>>> ports.
>>>
>>> Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy at intel.com>
>>> Signed-off-by: Antonio Fischetti <antonio.fischetti at intel.com>
>>> Co-authored-by: Antonio Fischetti <antonio.fischetti at intel.com>
>>> Acked-by: Eelco Chaudron <echaudro at redhat.com>
>>> ---
>>>  lib/netdev-dpdk.c | 112 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
>>>  1 file changed, 108 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
>>> index 50d6b29..d3892fe 100644
>>> --- a/lib/netdev-dpdk.c
>>> +++ b/lib/netdev-dpdk.c
>>> @@ -327,12 +327,22 @@ struct dpdk_tx_queue {
>>>                                      * pmd threads (see 'concurrent_txq'). */
>>>      int map;                       /* Mapping of configured vhost-user queues
>>>                                      * to enabled by guest. */
>>> -    int dpdk_pkt_cnt;              /* Number of buffered packets waiting to
>>> +    union {
>>> +        int dpdk_pkt_cnt;          /* Number of buffered packets waiting to
>>>                                        be sent on DPDK tx queue. */
>>> -    struct rte_mbuf *dpdk_burst_pkts[INTERIM_QUEUE_BURST_THRESHOLD];
>>> +        int vhost_pkt_cnt;         /* Number of buffered packets waiting to
>>> +                                      be sent on vhost port. */
>>> +    };
>>> +
>>> +    union {
>>> +        struct rte_mbuf *dpdk_burst_pkts[INTERIM_QUEUE_BURST_THRESHOLD];
>>>                                     /* Intermediate queue where packets can
>>>                                      * be buffered to amortize the cost of MMIO
>>>                                      * writes. */
>>> +        struct dp_packet *vhost_burst_pkts[INTERIM_QUEUE_BURST_THRESHOLD];
>>> +                                   /* Intermediate queue where packets can
>>> +                                    * be buffered for vhost ports. */
>>> +    };
>>>  };
>>>  
>>>  /* dpdk has no way to remove dpdk ring ethernet devices
>>> @@ -1756,6 +1766,88 @@ netdev_dpdk_vhost_update_tx_counters(struct netdev_stats *stats,
>>>      }
>>>  }
>>>  
>>> +static int
>>> +netdev_dpdk_vhost_tx_burst(struct netdev_dpdk *dev, int qid)
>>> +{
>>> +    struct dpdk_tx_queue *txq = &dev->tx_q[qid];
>>> +    struct rte_mbuf **cur_pkts = (struct rte_mbuf **)txq->vhost_burst_pkts;
>>> +
>>> +    int tx_vid = netdev_dpdk_get_vid(dev);
>>> +    int tx_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ;
>>> +    uint32_t sent = 0;
>>> +    uint32_t retries = 0;
>>> +    uint32_t sum, total_pkts;
>>> +
>>> +    total_pkts = sum = txq->vhost_pkt_cnt;
>>> +    do {
>>> +        uint32_t ret;
>>> +        ret = rte_vhost_enqueue_burst(tx_vid, tx_qid, &cur_pkts[sent], sum);
>>> +        if (OVS_UNLIKELY(!ret)) {
>>> +            /* No packets enqueued - do not retry. */
>>> +            break;
>>> +        } else {
>>> +            /* Packet have been sent. */
>>> +            sent += ret;
>>> +
>>> +            /* 'sum' packet have to be retransmitted. */
>>> +            sum -= ret;
>>> +        }
>>> +    } while (sum && (retries++ < VHOST_ENQ_RETRY_NUM));
>>> +
>>> +    for (int i = 0; i < total_pkts; i++) {
>>> +        dp_packet_delete(txq->vhost_burst_pkts[i]);
>>> +    }
>>> +
>>> +    /* Reset pkt count. */
>>> +    txq->vhost_pkt_cnt = 0;
>>> +
>>> +    /* 'sum' refers to packets dropped. */
>>> +    return sum;
>>> +}
>>> +
>>> +/* Flush the txq if there are any packets available. */
>>> +static int
>>> +netdev_dpdk_vhost_txq_flush(struct netdev *netdev, int qid,
>>> +                            bool concurrent_txq OVS_UNUSED)
>>> +{
>>> +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>>> +    struct dpdk_tx_queue *txq;
>>> +
>>> +    qid = dev->tx_q[qid % netdev->n_txq].map;
>>> +
>>> +    /* The qid may be disabled in the guest and has been set to
>>> +     * OVS_VHOST_QUEUE_DISABLED.
>>> +     */
>>> +    if (OVS_UNLIKELY(qid < 0)) {
>>> +        return 0;
>>> +    }
>>> +
>>> +    txq = &dev->tx_q[qid];
>>> +    /* Increment the drop count and free the memory. */
>>> +    if (OVS_UNLIKELY(!is_vhost_running(dev) ||
>>> +                     !(dev->flags & NETDEV_UP))) {
>>> +
>>> +        if (txq->vhost_pkt_cnt) {
>>> +            rte_spinlock_lock(&dev->stats_lock);
>>> +            dev->stats.tx_dropped+= txq->vhost_pkt_cnt;
>>> +            rte_spinlock_unlock(&dev->stats_lock);
>>> +
>>> +            for (int i = 0; i < txq->vhost_pkt_cnt; i++) {
>>> +                dp_packet_delete(txq->vhost_burst_pkts[i]);
>>
>> Spinlock (tx_lock) must be held here to avoid queue and mempool breakage.
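
For example, something along these lines (untested sketch, just to illustrate;
it takes the queue's 'tx_lock' around the whole drop path):

----------------------------------------------------------------
    rte_spinlock_lock(&txq->tx_lock);
    if (txq->vhost_pkt_cnt) {
        rte_spinlock_lock(&dev->stats_lock);
        dev->stats.tx_dropped += txq->vhost_pkt_cnt;
        rte_spinlock_unlock(&dev->stats_lock);

        /* Free the buffered packets while no one else can touch
         * the intermediate queue. */
        for (int i = 0; i < txq->vhost_pkt_cnt; i++) {
            dp_packet_delete(txq->vhost_burst_pkts[i]);
        }
        txq->vhost_pkt_cnt = 0;
    }
    rte_spinlock_unlock(&txq->tx_lock);
----------------------------------------------------------------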
>>
>>> +            }
>>> +            txq->vhost_pkt_cnt = 0;
>>> +        }
>>> +    }
>>> +
>>> +    if (OVS_LIKELY(txq->vhost_pkt_cnt)) {
>>> +        rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
>>> +        netdev_dpdk_vhost_tx_burst(dev, qid);
>>> +        rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
>>> +    }
>>> +
>>> +    return 0;
>>> +}
>>> +
>>>  static void
>>>  __netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
>>>                           struct dp_packet **pkts, int cnt)
>>> @@ -2799,6 +2891,17 @@ vring_state_changed(int vid, uint16_t queue_id, int enable)
>>>              if (enable) {
>>>                  dev->tx_q[qid].map = qid;
> 
> Flushing is required here too, because we're possibly enabling a previously
> remapped queue.
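
For example, something like this in the 'enable' branch (untested; it simply
mirrors the flush call added in the 'else' branch below, so the currently
mapped queue gets flushed before the map is updated):

----------------------------------------------------------------
            if (enable) {
                /* Flush whatever 'qid' is currently mapped to before
                 * taking the queue back, so no buffered packets are
                 * left behind when the mapping changes. */
                netdev_dpdk_vhost_txq_flush(&dev->up, qid, 0);
                dev->tx_q[qid].map = qid;
            }
----------------------------------------------------------------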
> 
>>>              } else {
>>> +                /* If the queue is disabled in the guest, the corresponding qid
>>> +                 * map shall be set to OVS_VHOST_QUEUE_DISABLED(-2).
>>> +                 *
>>> +                 * The packets that were queued in 'qid' could be potentially
>>> +                 * stuck and needs to be dropped.
>>> +                 *
>>> +                 * XXX: The queues may be already disabled in the guest so
>>> +                 * flush function in this case only helps in updating stats
>>> +                 * and freeing memory.
>>> +                 */
>>> +                netdev_dpdk_vhost_txq_flush(&dev->up, qid, 0);
>>>                  dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED;
>>>              }
>>>              netdev_dpdk_remap_txqs(dev);
> 
> 'netdev_dpdk_remap_txqs()', actually, is able to change the mapping for all
> of the queues disabled in the guest. So, we need to flush all of them while
> remapping, somewhere inside that function.
> Another thing is that there is a race window between the flush and the
> mapping update, where another thread can enqueue more packets into the
> just-flushed queue. The order of operations should be changed, or both of
> them should be done under the same tx_lock. I think it's required to make
> the tx_q[].map field atomic to fix the race condition, because the send
> function reads 'map' and only then locks the corresponding queue. It wasn't
> an issue before, because in case of a race the packets were simply dropped
> on the attempt to send to a disabled queue, but with this patch applied they
> will be enqueued into the intermediate queue and get stuck there.

Making 'map' atomic will not help. To solve the race we should make
'read map + enqueue' a single atomic operation, protected by a spinlock.
Like this:

vhost_send:
----------------------------------------------------------------
    qid = qid % netdev->n_txq;
    rte_spinlock_lock(&dev->tx_q[qid].tx_lock);

    mapped_qid = dev->tx_q[qid].map;

    if (qid != mapped_qid) {
        rte_spinlock_lock(&dev->tx_q[mapped_qid].tx_lock);
    }

    tx_enqueue(mapped_qid, pkts, cnt);

    if (qid != mapped_qid) {
        rte_spinlock_unlock(&dev->tx_q[mapped_qid].tx_lock);
    }

    rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
----------------------------------------------------------------

txq remapping inside 'netdev_dpdk_remap_txqs()' or 'vring_state_changed()':
----------------------------------------------------------------
    qid - queue we need to remap.
    new_qid - queue we need to remap to.

    rte_spinlock_lock(&dev->tx_q[qid].tx_lock);

    mapped_qid = dev->tx_q[qid].map;
    if (qid != mapped_qid) {
        rte_spinlock_lock(&dev->tx_q[mapped_qid].tx_lock);
    }

    tx_flush(mapped_qid)

    if (qid != mapped_qid) {
        rte_spinlock_unlock(&dev->tx_q[mapped_qid].tx_lock);
    }

    dev->tx_q[qid].map = new_qid;

    rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
----------------------------------------------------------------

The above scheme should work without races, but it looks kind of ugly and
requires taking an additional spinlock on each send.
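
To make the send side of that scheme a bit more concrete, an untested sketch
(here 'vhost_tx_enqueue()' is only a placeholder for whatever ends up
buffering packets into the intermediate queue; drop handling for the disabled
case is omitted):

----------------------------------------------------------------
static void
__netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
                         struct dp_packet **pkts, int cnt)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    int mapped_qid;

    qid = qid % netdev->n_txq;

    /* Hold the configured queue's lock for the whole
     * 'read map + enqueue' operation. */
    rte_spinlock_lock(&dev->tx_q[qid].tx_lock);

    mapped_qid = dev->tx_q[qid].map;
    if (OVS_UNLIKELY(mapped_qid < 0)) {
        /* Queue disabled in the guest; stats update and
         * dp_packet_delete() of 'pkts' omitted in this sketch. */
        rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
        return;
    }

    if (qid != mapped_qid) {
        rte_spinlock_lock(&dev->tx_q[mapped_qid].tx_lock);
    }

    /* Placeholder: buffer 'pkts' into the intermediate queue
     * of 'mapped_qid'. */
    vhost_tx_enqueue(dev, mapped_qid, pkts, cnt);

    if (qid != mapped_qid) {
        rte_spinlock_unlock(&dev->tx_q[mapped_qid].tx_lock);
    }
    rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
}
----------------------------------------------------------------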

P.S. Sorry for talking to myself. I just want to share my thoughts.

> 
>>> @@ -3471,7 +3574,8 @@ static const struct netdev_class dpdk_vhost_class =
>>>          NULL,
>>>          netdev_dpdk_vhost_reconfigure,
>>>          netdev_dpdk_vhost_rxq_recv,
>>> -        NULL);
>>> +        netdev_dpdk_vhost_txq_flush);
>>> +
>>>  static const struct netdev_class dpdk_vhost_client_class =
>>>      NETDEV_DPDK_CLASS(
>>>          "dpdkvhostuserclient",
>>> @@ -3487,7 +3591,7 @@ static const struct netdev_class dpdk_vhost_client_class =
>>>          NULL,
>>>          netdev_dpdk_vhost_client_reconfigure,
>>>          netdev_dpdk_vhost_rxq_recv,
>>> -        NULL);
>>> +        netdev_dpdk_vhost_txq_flush);
>>>  
>>>  void
>>>  netdev_dpdk_register(void)
>>> -- 
>>> 2.4.11

