[ovs-dev] [PATCH] netdev-dpdk: Add vHost User PMD

Traynor, Kevin kevin.traynor at intel.com
Thu Apr 28 14:16:00 UTC 2016


On 21/04/2016 13:20, Ciara Loftus wrote:
> DPDK 16.04 introduces the vHost PMD which allows 'dpdkvhostuser' ports
> to be controlled by the librte_ether API, like physical 'dpdk' ports.
> The commit integrates this functionality into OVS, and refactors some
> of the existing vhost code such that it is vhost-cuse specific.
> Similarly, there is now some overlap between dpdk and vhost-user port
> code.
>
> Signed-off-by: Ciara Loftus <ciara.loftus at intel.com>
> ---
>   INSTALL.DPDK.md   |  12 ++
>   NEWS              |   2 +
>   lib/netdev-dpdk.c | 515 +++++++++++++++++++++++++-----------------------------

Hi Ciara, there's a lot of churn in this file. It might be worth 
considering whether it could be split across a few commits to help 
reviewers, e.g. new features like adding get_features and get_status 
for vhost could at least be a separate patch.

>   3 files changed, 254 insertions(+), 275 deletions(-)
>   mode change 100644 => 100755 lib/netdev-dpdk.c

File permission change - this looks unintentional.

>
> diff --git a/INSTALL.DPDK.md b/INSTALL.DPDK.md
> index 7f76df8..5006812 100644
> --- a/INSTALL.DPDK.md
> +++ b/INSTALL.DPDK.md
> @@ -945,6 +945,18 @@ Restrictions:
>       increased to the desired number of queues. Both DPDK and OVS must be
>       recompiled for this change to take effect.
>
> +  DPDK 'eth' type ports:
> +  - dpdk, dpdkr and dpdkvhostuser ports are 'eth' type ports in the context of
> +    DPDK as they are all managed by the rte_ether API. This means that they
> +    adhere to the DPDK configuration option CONFIG_RTE_MAX_ETHPORTS which by
> +    default is set to 32. This means by default the combined total number of
> +    dpdk, dpdkr and dpdkvhostuser ports allowable in OVS with DPDK is 32. This
> +    value can be changed if desired by modifying the configuration file in
> +    DPDK, or by overriding the default value on the command line when building
> +    DPDK. eg.
> +
> +        `make install CONFIG_RTE_MAX_ETHPORTS=64`

The formatting isn't rendering correctly for this in my md viewer.

> +
>   Bug Reporting:
>   --------------
>
> diff --git a/NEWS b/NEWS
> index ea7f3a1..4dc0201 100644
> --- a/NEWS
> +++ b/NEWS
> @@ -26,6 +26,8 @@ Post-v2.5.0
>          assignment.
>        * Type of log messages from PMD threads changed from INFO to DBG.
>        * QoS functionality with sample egress-policer implementation.
> +     * vHost PMD integration brings vhost-user ports under control of the
> +       rte_ether DPDK API.
>      - ovs-benchmark: This utility has been removed due to lack of use and
>        bitrot.
>      - ovs-appctl:
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> old mode 100644
> new mode 100755
> index 208c5f5..4fccd63
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -56,6 +56,7 @@
>   #include "rte_mbuf.h"
>   #include "rte_meter.h"
>   #include "rte_virtio_net.h"
> +#include "rte_eth_vhost.h"

nit: generally these go in alphabetical order.

>
>   VLOG_DEFINE_THIS_MODULE(dpdk);
>   static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
> @@ -109,6 +110,8 @@ BUILD_ASSERT_DECL((MAX_NB_MBUF / ROUND_DOWN_POW2(MAX_NB_MBUF/MIN_NB_MBUF))
>
>   static char *cuse_dev_name = NULL;    /* Character device cuse_dev_name. */
>   static char *vhost_sock_dir = NULL;   /* Location of vhost-user sockets */
> +/* Array that tracks the used & unused vHost user driver IDs */
> +static unsigned int vhost_user_drv_ids[RTE_MAX_ETHPORTS];
>
>   /*
>    * Maximum amount of time in micro seconds to try and enqueue to vhost.
> @@ -143,7 +146,8 @@ enum { DRAIN_TSC = 200000ULL };
>
>   enum dpdk_dev_type {
>       DPDK_DEV_ETH = 0,
> -    DPDK_DEV_VHOST = 1,
> +    DPDK_DEV_VHOST_USER = 1,
> +    DPDK_DEV_VHOST_CUSE = 2,
>   };
>
>   static int rte_eal_init_ret = ENODEV;
> @@ -275,8 +279,6 @@ struct dpdk_tx_queue {
>                                       * from concurrent access.  It is used only
>                                       * if the queue is shared among different
>                                       * pmd threads (see 'txq_needs_locking'). */
> -    int map;                       /* Mapping of configured vhost-user queues
> -                                    * to enabled by guest. */
>       uint64_t tsc;
>       struct rte_mbuf *burst_pkts[MAX_TX_QUEUE_LEN];
>   };
> @@ -329,12 +331,22 @@ struct netdev_dpdk {
>       int real_n_rxq;
>       bool txq_needs_locking;
>
> -    /* virtio-net structure for vhost device */
> +    /* Spinlock for vhost cuse transmission. Other DPDK devices use spinlocks
> +     * in dpdk_tx_queue */
> +    rte_spinlock_t vhost_cuse_tx_lock;

Why can't we continue to use the lock in dpdk_tx_queue, rather than 
adding a cuse-specific lock?
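For example, a rough sketch of reusing the existing per-queue lock for the 
single cuse queue (assuming the tx_lock init stays in 
netdev_dpdk_alloc_txq()):

    rte_spinlock_lock(&dev->tx_q[0].tx_lock);
    /* enqueue to the guest with rte_vhost_enqueue_burst() as before */
    rte_spinlock_unlock(&dev->tx_q[0].tx_lock);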

> +
> +    /* virtio-net structure for vhost cuse device */
>       OVSRCU_TYPE(struct virtio_net *) virtio_dev;
>
>       /* Identifier used to distinguish vhost devices from each other */
>       char vhost_id[PATH_MAX];
>
> +    /* ID of vhost user port given to the PMD driver */
> +    unsigned int vhost_pmd_id;
> +
> +    /* Number of virtqueue pairs reported by the guest */
> +    uint32_t reported_queues;

Is this useful? We could just use the real_n_*xq values directly.

> +
>       /* In dpdk_list. */
>       struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
>
> @@ -352,13 +364,20 @@ struct netdev_rxq_dpdk {
>   static bool dpdk_thread_is_pmd(void);
>
>   static int netdev_dpdk_construct(struct netdev *);
> +static int netdev_dpdk_vhost_user_construct(struct netdev *);
>
>   struct virtio_net * netdev_dpdk_get_virtio(const struct netdev_dpdk *dev);
>
> +void vring_state_changed_callback(uint8_t port_id,
> +        enum rte_eth_event_type type OVS_UNUSED, void *param OVS_UNUSED);
> +void device_state_changed_callback(uint8_t port_id,
> +        enum rte_eth_event_type type OVS_UNUSED, void *param OVS_UNUSED);
> +
>   static bool
>   is_dpdk_class(const struct netdev_class *class)
>   {
> -    return class->construct == netdev_dpdk_construct;
> +    return ((class->construct == netdev_dpdk_construct) ||
> +            (class->construct == netdev_dpdk_vhost_user_construct));

You should probably change the name of the function to something more 
intuitive now that it's not just for a "dpdk" netdev class.

>   }
>
>   /* DPDK NIC drivers allocate RX buffers at a particular granularity, typically
> @@ -581,7 +600,9 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int n_txq)
>           }
>
>           dev->up.n_rxq = n_rxq;
> -        dev->real_n_txq = n_txq;
> +        if (dev->type == DPDK_DEV_ETH) {
> +            dev->real_n_txq = n_txq;
> +        }

This needs a comment to explain why you've added this check.

>
>           return 0;
>       }
> @@ -672,11 +693,72 @@ netdev_dpdk_alloc_txq(struct netdev_dpdk *dev, unsigned int n_txqs)
>               /* Queues are shared among CPUs. Always flush */
>               dev->tx_q[i].flush_tx = true;
>           }
> +    }
> +}
>
> -        /* Initialize map for vhost devices. */
> -        dev->tx_q[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
> -        rte_spinlock_init(&dev->tx_q[i].tx_lock);
> +void
> +vring_state_changed_callback(uint8_t port_id,
> +                             enum rte_eth_event_type type OVS_UNUSED,
> +                             void *param OVS_UNUSED)
> +{
> +    struct netdev_dpdk *dev;
> +    struct rte_eth_vhost_queue_event event;
> +    int err = 0;
> +
> +    err = rte_eth_vhost_get_queue_event(port_id, &event);
> +    if (err || (event.rx == 1)) {
> +        return;
>       }
> +
> +    ovs_mutex_lock(&dpdk_mutex);
> +    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
> +        if (port_id == dev->port_id) {
> +            ovs_mutex_lock(&dev->mutex);
> +            if (event.enable) {
> +                dev->reported_queues++;
> +            } else {
> +                dev->reported_queues--;
> +            }
> +            dev->real_n_rxq = dev->reported_queues;
> +            dev->real_n_txq = dev->reported_queues;
> +            dev->txq_needs_locking = true;

How do you know locking is needed at this point? Oh, I just realised this 
is in the existing code. It seems like we should put in a check against 
netdev->n_txq as elsewhere in the code.
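Something like this (rough sketch, using dev->up to reach the netdev from 
the callback):

    dev->real_n_rxq = dev->reported_queues;
    dev->real_n_txq = dev->reported_queues;
    /* Only fall back to locking when the configured txqs outnumber the
     * queues the guest has actually enabled. */
    dev->txq_needs_locking = dev->real_n_txq != dev->up.n_txq;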

> +            netdev_dpdk_alloc_txq(dev, dev->real_n_txq);
> +            ovs_mutex_unlock(&dev->mutex);
> +            break;
> +        }
> +    }
> +    ovs_mutex_unlock(&dpdk_mutex);
> +
> +    return;
> +}
> +
> +void
> +device_state_changed_callback(uint8_t port_id,
> +                              enum rte_eth_event_type type OVS_UNUSED,
> +                              void *param OVS_UNUSED)
> +{
> +    struct netdev_dpdk *dev;
> +
> +    ovs_mutex_lock(&dpdk_mutex);
> +    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
> +        if (port_id == dev->port_id) {
> +            ovs_mutex_lock(&dev->mutex);
> +            check_link_status(dev);
> +            if (dev->link.link_status == ETH_LINK_UP) {
> +                /* new device */
> +                VLOG_INFO("vHost Device '%s' has been added", dev->vhost_id);
> +
> +            } else {
> +                /* destroy device */
> +                VLOG_INFO("vHost Device '%s' has been removed", dev->vhost_id);
> +            }
> +            ovs_mutex_unlock(&dev->mutex);
> +            break;
> +        }
> +    }
> +    ovs_mutex_unlock(&dpdk_mutex);
> +
> +    return;
>   }
>
>   static int
> @@ -688,6 +770,7 @@ netdev_dpdk_init(struct netdev *netdev, unsigned int port_no,
>       int sid;
>       int err = 0;
>       uint32_t buf_size;
> +    unsigned int n_queues = 0;
>
>       ovs_mutex_init(&dev->mutex);
>       ovs_mutex_lock(&dev->mutex);
> @@ -697,7 +780,7 @@ netdev_dpdk_init(struct netdev *netdev, unsigned int port_no,
>       /* If the 'sid' is negative, it means that the kernel fails
>        * to obtain the pci numa info.  In that situation, always
>        * use 'SOCKET0'. */
> -    if (type == DPDK_DEV_ETH) {
> +    if (type != DPDK_DEV_VHOST_CUSE) {
>           sid = rte_eth_dev_socket_id(port_no);

Are you trying to sneak in your NUMA changes for vhostuser ;-)

>       } else {
>           sid = rte_lcore_to_socket_id(rte_get_master_lcore());
> @@ -709,6 +792,8 @@ netdev_dpdk_init(struct netdev *netdev, unsigned int port_no,
>       dev->flags = 0;
>       dev->mtu = ETHER_MTU;
>       dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
> +    dev->vhost_pmd_id = 0;

0 is a valid vhost_pmd_id number. Seeing as it's used in the destruct, it 
might be better to set it to an invalid value like MAX and check for that 
in the destruct; otherwise you're relying on a higher layer to ensure that 
destruct is never called before construct. (Edit: actually I've another 
comment wrt vhost_pmd_id below.)
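Something along these lines, as a rough sketch (the define name is made up 
for illustration):

    #define VHOST_PMD_ID_UNSET UINT_MAX  /* invalid marker; vhost_pmd_id is unsigned */

    /* in netdev_dpdk_init() */
    dev->vhost_pmd_id = VHOST_PMD_ID_UNSET;

    /* in the vhost-user destruct path */
    if (dev->vhost_pmd_id != VHOST_PMD_ID_UNSET) {
        vhost_user_drv_ids[dev->vhost_pmd_id] = 0;
    }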

> +    dev->reported_queues = 0;
>
>       buf_size = dpdk_buf_size(dev->mtu);
>       dev->dpdk_mp = dpdk_mp_get(dev->socket_id, FRAME_LEN_TO_MTU(buf_size));
> @@ -726,14 +811,23 @@ netdev_dpdk_init(struct netdev *netdev, unsigned int port_no,
>       netdev->requested_n_rxq = NR_QUEUE;
>       dev->real_n_txq = NR_QUEUE;
>
> -    if (type == DPDK_DEV_ETH) {
> -        netdev_dpdk_alloc_txq(dev, NR_QUEUE);
> +    n_queues = (type == DPDK_DEV_VHOST_CUSE) ? OVS_VHOST_MAX_QUEUE_NUM :
> +                                               NR_QUEUE;

No need for the special cuse case above.

> +    netdev_dpdk_alloc_txq(dev, n_queues);
> +    if (type != DPDK_DEV_VHOST_CUSE) {
>           err = dpdk_eth_dev_init(dev);
>           if (err) {
>               goto unlock;
>           }
> -    } else {
> -        netdev_dpdk_alloc_txq(dev, OVS_VHOST_MAX_QUEUE_NUM);
> +    }
> +
> +    if (type == DPDK_DEV_VHOST_USER) {
> +        rte_eth_dev_callback_register(port_no, RTE_ETH_EVENT_QUEUE_STATE,
> +                                     (void*)vring_state_changed_callback,
> +                                      NULL);
> +        rte_eth_dev_callback_register(port_no, RTE_ETH_EVENT_INTR_LSC,
> +                                     (void*)device_state_changed_callback,

It seems like using this callback is a different way to do the same 
thing as the watchdog timer - could we consolidate them and simplify?

> +                                      NULL);
>       }
>
>       ovs_list_push_back(&dpdk_list, &dev->list_node);
> @@ -768,26 +862,37 @@ dpdk_dev_parse_name(const char dev_name[], const char prefix[],
>   }
>
>   static int
> -vhost_construct_helper(struct netdev *netdev) OVS_REQUIRES(dpdk_mutex)
> +netdev_dpdk_vhost_cuse_construct(struct netdev *netdev)
>   {
> +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> +    int err;
> +
> +    ovs_mutex_lock(&dpdk_mutex);
> +    strncpy(dev->vhost_id, netdev->name, sizeof(dev->vhost_id));
>       if (rte_eal_init_ret) {
>           return rte_eal_init_ret;
>       }
>
> -    return netdev_dpdk_init(netdev, -1, DPDK_DEV_VHOST);
> +    rte_spinlock_init(&dev->vhost_cuse_tx_lock);
> +
> +    err = netdev_dpdk_init(netdev, -1, DPDK_DEV_VHOST_CUSE);
> +
> +    ovs_mutex_unlock(&dpdk_mutex);
> +    return err;
>   }
>
>   static int
> -netdev_dpdk_vhost_cuse_construct(struct netdev *netdev)
> +get_vhost_user_drv_id(void)
>   {
> -    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> -    int err;
> +    int i = 0;
>
> -    ovs_mutex_lock(&dpdk_mutex);
> -    strncpy(dev->vhost_id, netdev->name, sizeof(dev->vhost_id));
> -    err = vhost_construct_helper(netdev);
> -    ovs_mutex_unlock(&dpdk_mutex);
> -    return err;
> +    for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
> +        if (vhost_user_drv_ids[i] == 0) {

This is treating RTE_MAX_ETHPORTS as the maximum number of vhost ports. I 
didn't check, but it sounds like that limit applies to all DPDK eth ports 
and not just vhost. If true, it means the code would allow more than 
RTE_MAX_ETHPORTS ports in total.

> +            return i;
> +        }
> +    }
> +
> +    return -1;
>   }
>
>   static int
> @@ -796,6 +901,9 @@ netdev_dpdk_vhost_user_construct(struct netdev *netdev)
>       struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>       const char *name = netdev->name;
>       int err;
> +    uint8_t port_no = 0;
> +    char devargs[4096];

magic number - any define we can use?

> +    int driver_id = 0;
>
>       /* 'name' is appended to 'vhost_sock_dir' and used to create a socket in
>        * the file system. '/' or '\' would traverse directories, so they're not
> @@ -807,22 +915,35 @@ netdev_dpdk_vhost_user_construct(struct netdev *netdev)
>           return EINVAL;
>       }
>
> +    if (rte_eal_init_ret) {
> +        return rte_eal_init_ret;
> +    }
> +

It's more consistent to check this before validating the name.

>       ovs_mutex_lock(&dpdk_mutex);
>       /* Take the name of the vhost-user port and append it to the location where
>        * the socket is to be created, then register the socket.
>        */
>       snprintf(dev->vhost_id, sizeof(dev->vhost_id), "%s/%s",
>                vhost_sock_dir, name);
> +    driver_id = get_vhost_user_drv_id();
> +    if (driver_id == -1) {
> +        VLOG_ERR("Too many vhost-user devices registered with PMD");
> +        err = ENODEV;
> +    } else {
> +        snprintf(devargs, sizeof(devargs), "eth_vhost%u,iface=%s,queues=%i",
> +                 driver_id, dev->vhost_id, RTE_MAX_QUEUES_PER_PORT);
> +        err = rte_eth_dev_attach(devargs, &port_no);
> +    }
>
> -    err = rte_vhost_driver_register(dev->vhost_id);
>       if (err) {
> -        VLOG_ERR("vhost-user socket device setup failure for socket %s\n",
> -                 dev->vhost_id);
> +        VLOG_ERR("Failed to attach vhost-user device to DPDK");

It might be a bit confusing to get this error message both in the case 
where the attach was tried and failed, and in the case where it was never 
tried because there were too many vhost-user devices. Also, including the 
socket name would be handy.
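For the failed-attach case, something like this (sketch) would carry the 
socket name:

    VLOG_ERR("Failed to attach vhost-user device for socket '%s' to DPDK",
             dev->vhost_id);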

>       } else {
>           fatal_signal_add_file_to_unlink(dev->vhost_id);
>           VLOG_INFO("Socket %s created for vhost-user port %s\n",
>                     dev->vhost_id, name);
> -        err = vhost_construct_helper(netdev);
> +        dev->vhost_pmd_id = driver_id;

You're setting this, but then it gets reset when you call 
netdev_dpdk_init(), so when you destruct you'll clear the wrong 
index in vhost_user_drv_ids[]?
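One option (rough sketch) would be to record the id only after 
netdev_dpdk_init() has run, since that's what resets it to 0:

    err = netdev_dpdk_init(netdev, port_no, DPDK_DEV_VHOST_USER);
    if (!err) {
        dev->vhost_pmd_id = driver_id;
        vhost_user_drv_ids[driver_id] = 1;
    }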

> +        vhost_user_drv_ids[dev->vhost_pmd_id] = 1;
> +        err = netdev_dpdk_init(netdev, port_no, DPDK_DEV_VHOST_USER);
>       }
>
>       ovs_mutex_unlock(&dpdk_mutex);
> @@ -857,6 +978,12 @@ netdev_dpdk_destruct(struct netdev *netdev)
>       struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>
>       ovs_mutex_lock(&dev->mutex);
> +
> +    if (dev->type == DPDK_DEV_VHOST_USER) {
> +        rte_eth_dev_detach(dev->port_id, dev->vhost_id);
> +        vhost_user_drv_ids[dev->vhost_pmd_id] = 0;
> +    }
> +
>       rte_eth_dev_stop(dev->port_id);
>       ovs_mutex_unlock(&dev->mutex);
>
> @@ -868,7 +995,7 @@ netdev_dpdk_destruct(struct netdev *netdev)
>   }
>
>   static void
> -netdev_dpdk_vhost_destruct(struct netdev *netdev)
> +netdev_dpdk_vhost_cuse_destruct(struct netdev *netdev)
>   {
>       struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>
> @@ -876,15 +1003,8 @@ netdev_dpdk_vhost_destruct(struct netdev *netdev)
>       if (netdev_dpdk_get_virtio(dev) != NULL) {
>           VLOG_ERR("Removing port '%s' while vhost device still attached.",
>                    netdev->name);
> -        VLOG_ERR("To restore connectivity after re-adding of port, VM on socket"
> -                 " '%s' must be restarted.",
> -                 dev->vhost_id);
> -    }
> -
> -    if (rte_vhost_driver_unregister(dev->vhost_id)) {
> -        VLOG_ERR("Unable to remove vhost-user socket %s", dev->vhost_id);
> -    } else {
> -        fatal_signal_remove_file_to_unlink(dev->vhost_id);
> +        VLOG_ERR("To restore connectivity after re-adding of port, VM with"
> +                 "port '%s' must be restarted.", dev->vhost_id);
>       }
>
>       ovs_mutex_lock(&dpdk_mutex);
> @@ -1000,30 +1120,6 @@ netdev_dpdk_vhost_cuse_set_multiq(struct netdev *netdev, unsigned int n_txq,
>       netdev->n_txq = n_txq;
>       dev->real_n_txq = 1;
>       netdev->n_rxq = 1;
> -    dev->txq_needs_locking = dev->real_n_txq != netdev->n_txq;
> -
> -    ovs_mutex_unlock(&dev->mutex);
> -    ovs_mutex_unlock(&dpdk_mutex);
> -
> -    return err;
> -}
> -
> -static int
> -netdev_dpdk_vhost_set_multiq(struct netdev *netdev, unsigned int n_txq,
> -                             unsigned int n_rxq)
> -{
> -    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> -    int err = 0;
> -
> -    if (netdev->n_txq == n_txq && netdev->n_rxq == n_rxq) {
> -        return err;
> -    }
> -
> -    ovs_mutex_lock(&dpdk_mutex);
> -    ovs_mutex_lock(&dev->mutex);
> -
> -    netdev->n_txq = n_txq;
> -    netdev->n_rxq = n_rxq;
>
>       ovs_mutex_unlock(&dev->mutex);
>       ovs_mutex_unlock(&dpdk_mutex);
> @@ -1118,13 +1214,13 @@ dpdk_queue_flush(struct netdev_dpdk *dev, int qid)
>   }
>
>   static bool
> -is_vhost_running(struct virtio_net *virtio_dev)
> +is_vhost_cuse_running(struct virtio_net *virtio_dev)
>   {
>       return (virtio_dev != NULL && (virtio_dev->flags & VIRTIO_DEV_RUNNING));
>   }
>
>   static inline void
> -netdev_dpdk_vhost_update_rx_counters(struct netdev_stats *stats,
> +netdev_dpdk_vhost_cuse_update_rx_counters(struct netdev_stats *stats,
>                                        struct dp_packet **packets, int count)
>   {
>       int i;
> @@ -1153,26 +1249,22 @@ netdev_dpdk_vhost_update_rx_counters(struct netdev_stats *stats,
>   }
>
>   /*
> - * The receive path for the vhost port is the TX path out from guest.
> + * The receive path for the vhost cuse port is the TX path out from guest.
>    */
>   static int
> -netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
> +netdev_dpdk_vhost_cuse_rxq_recv(struct netdev_rxq *rxq,
>                              struct dp_packet **packets, int *c)
>   {
>       struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
>       struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev);
> -    int qid = rxq->queue_id;
> +    int qid = 1;
>       uint16_t nb_rx = 0;
>
> -    if (OVS_UNLIKELY(!is_vhost_running(virtio_dev))) {
> +    if (OVS_UNLIKELY(!is_vhost_cuse_running(virtio_dev))) {
>           return EAGAIN;
>       }
>
> -    if (rxq->queue_id >= dev->real_n_rxq) {
> -        return EOPNOTSUPP;
> -    }
> -
> -    nb_rx = rte_vhost_dequeue_burst(virtio_dev, qid * VIRTIO_QNUM + VIRTIO_TXQ,
> +    nb_rx = rte_vhost_dequeue_burst(virtio_dev, qid,
>                                       dev->dpdk_mp->mp,
>                                       (struct rte_mbuf **)packets,
>                                       NETDEV_MAX_BURST);
> @@ -1181,7 +1273,7 @@ netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
>       }
>
>       rte_spinlock_lock(&dev->stats_lock);
> -    netdev_dpdk_vhost_update_rx_counters(&dev->stats, packets, nb_rx);
> +    netdev_dpdk_vhost_cuse_update_rx_counters(&dev->stats, packets, nb_rx);
>       rte_spinlock_unlock(&dev->stats_lock);
>
>       *c = (int) nb_rx;
> @@ -1217,6 +1309,18 @@ netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet **packets,
>       return 0;
>   }
>
> +static int
> +netdev_dpdk_vhost_user_rxq_recv(struct netdev_rxq *rxq,
> +                                struct dp_packet **packets, int *c)
> +{
> +    struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
> +
> +    if (rxq->queue_id >= dev->real_n_rxq) {
> +        return EOPNOTSUPP;
> +    }
> +
> +    return netdev_dpdk_rxq_recv(rxq, packets, c);
> +}
>   static inline int
>   netdev_dpdk_qos_run__(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
>                           int cnt)
> @@ -1235,7 +1339,7 @@ netdev_dpdk_qos_run__(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
>   }
>
>   static inline void
> -netdev_dpdk_vhost_update_tx_counters(struct netdev_stats *stats,
> +netdev_dpdk_vhost_cuse_update_tx_counters(struct netdev_stats *stats,
>                                        struct dp_packet **packets,
>                                        int attempted,
>                                        int dropped)
> @@ -1252,9 +1356,8 @@ netdev_dpdk_vhost_update_tx_counters(struct netdev_stats *stats,
>   }
>
>   static void
> -__netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
> -                         struct dp_packet **pkts, int cnt,
> -                         bool may_steal)
> +__netdev_dpdk_vhost_cuse_send(struct netdev *netdev, struct dp_packet **pkts,
> +                         int cnt, bool may_steal)
>   {
>       struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>       struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev);
> @@ -1263,26 +1366,24 @@ __netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
>       unsigned int qos_pkts = cnt;
>       uint64_t start = 0;
>
> -    qid = dev->tx_q[qid % dev->real_n_txq].map;
> -
> -    if (OVS_UNLIKELY(!is_vhost_running(virtio_dev) || qid < 0)) {
> +    if (OVS_UNLIKELY(!is_vhost_cuse_running(virtio_dev))) {
>           rte_spinlock_lock(&dev->stats_lock);
>           dev->stats.tx_dropped+= cnt;
>           rte_spinlock_unlock(&dev->stats_lock);
>           goto out;
>       }
>
> -    rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
> +    /* There is a single vhost-cuse TX queue, So we need to lock it for TX. */
> +    rte_spinlock_lock(&dev->vhost_cuse_tx_lock);
>
>       /* Check has QoS has been configured for the netdev */
>       cnt = netdev_dpdk_qos_run__(dev, cur_pkts, cnt);
>       qos_pkts -= cnt;
>
>       do {
> -        int vhost_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ;
>           unsigned int tx_pkts;
>
> -        tx_pkts = rte_vhost_enqueue_burst(virtio_dev, vhost_qid,
> +        tx_pkts = rte_vhost_enqueue_burst(virtio_dev, VIRTIO_RXQ,
>                                             cur_pkts, cnt);
>           if (OVS_LIKELY(tx_pkts)) {
>               /* Packets have been sent.*/
> @@ -1301,7 +1402,7 @@ __netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
>                * Unable to enqueue packets to vhost interface.
>                * Check available entries before retrying.
>                */
> -            while (!rte_vring_available_entries(virtio_dev, vhost_qid)) {
> +            while (!rte_vring_available_entries(virtio_dev, VIRTIO_RXQ)) {
>                   if (OVS_UNLIKELY((rte_get_timer_cycles() - start) > timeout)) {
>                       expired = 1;
>                       break;
> @@ -1313,12 +1414,12 @@ __netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
>               }
>           }
>       } while (cnt);
> -
> -    rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
> +    rte_spinlock_unlock(&dev->vhost_cuse_tx_lock);
>
>       rte_spinlock_lock(&dev->stats_lock);
>       cnt += qos_pkts;
> -    netdev_dpdk_vhost_update_tx_counters(&dev->stats, pkts, total_pkts, cnt);
> +    netdev_dpdk_vhost_cuse_update_tx_counters(&dev->stats, pkts, total_pkts,
> +                                              cnt);
>       rte_spinlock_unlock(&dev->stats_lock);
>
>   out:
> @@ -1412,8 +1513,9 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet **pkts,
>           newcnt++;
>       }
>
> -    if (dev->type == DPDK_DEV_VHOST) {
> -        __netdev_dpdk_vhost_send(netdev, qid, (struct dp_packet **) mbufs, newcnt, true);
> +    if (dev->type == DPDK_DEV_VHOST_CUSE) {
> +        __netdev_dpdk_vhost_cuse_send(netdev, (struct dp_packet **) mbufs,
> +                                      newcnt, true);
>       } else {
>           unsigned int qos_pkts = newcnt;
>
> @@ -1437,8 +1539,8 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet **pkts,
>   }
>
>   static int
> -netdev_dpdk_vhost_send(struct netdev *netdev, int qid, struct dp_packet **pkts,
> -                 int cnt, bool may_steal)
> +netdev_dpdk_vhost_cuse_send(struct netdev *netdev, int qid OVS_UNUSED,
> +                            struct dp_packet **pkts, int cnt, bool may_steal)
>   {
>       if (OVS_UNLIKELY(pkts[0]->source != DPBUF_DPDK)) {
>           int i;
> @@ -1450,7 +1552,7 @@ netdev_dpdk_vhost_send(struct netdev *netdev, int qid, struct dp_packet **pkts,
>               }
>           }
>       } else {
> -        __netdev_dpdk_vhost_send(netdev, qid, pkts, cnt, may_steal);
> +        __netdev_dpdk_vhost_cuse_send(netdev, pkts, cnt, may_steal);
>       }
>       return 0;
>   }
> @@ -1461,11 +1563,6 @@ netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
>   {

Now that this handles buffers for vhost, how is may_steal on/off catered 
for?

>       int i;
>
> -    if (OVS_UNLIKELY(dev->txq_needs_locking)) {
> -        qid = qid % dev->real_n_txq;
> -        rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
> -    }
> -

Why is the possible locking removed?

>       if (OVS_UNLIKELY(!may_steal ||
>                        pkts[0]->source != DPBUF_DPDK)) {
>           struct netdev *netdev = &dev->up;
> @@ -1541,6 +1638,18 @@ netdev_dpdk_eth_send(struct netdev *netdev, int qid,
>   }
>
>   static int
> +netdev_dpdk_vhost_user_send(struct netdev *netdev, int qid,
> +        struct dp_packet **pkts, int cnt, bool may_steal)
> +{
> +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> +
> +    qid = qid % dev->real_n_txq;

Due to the modulo, two cores can end up trying to send on the same queue, 
which makes for lock contention. The code was previously changed to avoid 
that, but now both that avoidance and the locking seem to be gone, so it 
could cause corruption.
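For reference, this is roughly the protection that was there before (the 
lines removed from netdev_dpdk_send__() above):

    if (OVS_UNLIKELY(dev->txq_needs_locking)) {
        qid = qid % dev->real_n_txq;
        rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
    }
    /* ... transmit ... */
    if (OVS_UNLIKELY(dev->txq_needs_locking)) {
        rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
    }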

> +
> +    netdev_dpdk_send__(dev, qid, pkts, cnt, may_steal);
> +    return 0;
> +}
> +
> +static int
>   netdev_dpdk_set_etheraddr(struct netdev *netdev, const struct eth_addr mac)
>   {
>       struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> @@ -1634,7 +1743,7 @@ static int
>   netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier);
>
>   static int
> -netdev_dpdk_vhost_get_stats(const struct netdev *netdev,
> +netdev_dpdk_vhost_cuse_get_stats(const struct netdev *netdev,
>                               struct netdev_stats *stats)
>   {
>       struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> @@ -1796,14 +1905,14 @@ netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier)
>   }
>
>   static int
> -netdev_dpdk_vhost_get_carrier(const struct netdev *netdev, bool *carrier)
> +netdev_dpdk_vhost_cuse_get_carrier(const struct netdev *netdev, bool *carrier)
>   {
>       struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>       struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev);
>
>       ovs_mutex_lock(&dev->mutex);
>
> -    if (is_vhost_running(virtio_dev)) {
> +    if (is_vhost_cuse_running(virtio_dev)) {
>           *carrier = 1;
>       } else {
>           *carrier = 0;
> @@ -1853,7 +1962,7 @@ netdev_dpdk_update_flags__(struct netdev_dpdk *dev,
>           return 0;
>       }
>
> -    if (dev->type == DPDK_DEV_ETH) {
> +    if (dev->type != DPDK_DEV_VHOST_CUSE) {
>           if (dev->flags & NETDEV_UP) {
>               err = rte_eth_dev_start(dev->port_id);
>               if (err)
> @@ -1998,75 +2107,7 @@ set_irq_status(struct virtio_net *virtio_dev)
>   }
>
>   /*
> - * Fixes mapping for vhost-user tx queues. Must be called after each
> - * enabling/disabling of queues and real_n_txq modifications.
> - */
> -static void
> -netdev_dpdk_remap_txqs(struct netdev_dpdk *dev)
> -    OVS_REQUIRES(dev->mutex)
> -{
> -    int *enabled_queues, n_enabled = 0;
> -    int i, k, total_txqs = dev->real_n_txq;
> -
> -    enabled_queues = dpdk_rte_mzalloc(total_txqs * sizeof *enabled_queues);
> -
> -    for (i = 0; i < total_txqs; i++) {
> -        /* Enabled queues always mapped to themselves. */
> -        if (dev->tx_q[i].map == i) {
> -            enabled_queues[n_enabled++] = i;
> -        }
> -    }
> -
> -    if (n_enabled == 0 && total_txqs != 0) {
> -        enabled_queues[0] = OVS_VHOST_QUEUE_DISABLED;
> -        n_enabled = 1;
> -    }
> -
> -    k = 0;
> -    for (i = 0; i < total_txqs; i++) {
> -        if (dev->tx_q[i].map != i) {
> -            dev->tx_q[i].map = enabled_queues[k];
> -            k = (k + 1) % n_enabled;
> -        }
> -    }
> -
> -    VLOG_DBG("TX queue mapping for %s\n", dev->vhost_id);
> -    for (i = 0; i < total_txqs; i++) {
> -        VLOG_DBG("%2d --> %2d", i, dev->tx_q[i].map);
> -    }
> -
> -    rte_free(enabled_queues);
> -}
> -
> -static int
> -netdev_dpdk_vhost_set_queues(struct netdev_dpdk *dev, struct virtio_net *virtio_dev)
> -    OVS_REQUIRES(dev->mutex)
> -{
> -    uint32_t qp_num;
> -
> -    qp_num = virtio_dev->virt_qp_nb;
> -    if (qp_num > dev->up.n_rxq) {
> -        VLOG_ERR("vHost Device '%s' %"PRIu64" can't be added - "
> -                 "too many queues %d > %d", virtio_dev->ifname, virtio_dev->device_fh,
> -                 qp_num, dev->up.n_rxq);
> -        return -1;
> -    }
> -
> -    dev->real_n_rxq = qp_num;
> -    dev->real_n_txq = qp_num;
> -    dev->txq_needs_locking = true;
> -    /* Enable TX queue 0 by default if it wasn't disabled. */
> -    if (dev->tx_q[0].map == OVS_VHOST_QUEUE_MAP_UNKNOWN) {
> -        dev->tx_q[0].map = 0;
> -    }
> -
> -    netdev_dpdk_remap_txqs(dev);
> -
> -    return 0;
> -}
> -
> -/*
> - * A new virtio-net device is added to a vhost port.
> + * A new virtio-net device is added to a vhost cuse port.
>    */
>   static int
>   new_device(struct virtio_net *virtio_dev)
> @@ -2079,11 +2120,6 @@ new_device(struct virtio_net *virtio_dev)
>       LIST_FOR_EACH(dev, list_node, &dpdk_list) {
>           if (strncmp(virtio_dev->ifname, dev->vhost_id, IF_NAME_SZ) == 0) {
>               ovs_mutex_lock(&dev->mutex);
> -            if (netdev_dpdk_vhost_set_queues(dev, virtio_dev)) {
> -                ovs_mutex_unlock(&dev->mutex);
> -                ovs_mutex_unlock(&dpdk_mutex);
> -                return -1;
> -            }
>               ovsrcu_set(&dev->virtio_dev, virtio_dev);
>               exists = true;
>               virtio_dev->flags |= VIRTIO_DEV_RUNNING;
> @@ -2107,23 +2143,11 @@ new_device(struct virtio_net *virtio_dev)
>       return 0;
>   }
>
> -/* Clears mapping for all available queues of vhost interface. */
> -static void
> -netdev_dpdk_txq_map_clear(struct netdev_dpdk *dev)
> -    OVS_REQUIRES(dev->mutex)
> -{
> -    int i;
> -
> -    for (i = 0; i < dev->real_n_txq; i++) {
> -        dev->tx_q[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
> -    }
> -}
> -
>   /*
> - * Remove a virtio-net device from the specific vhost port.  Use dev->remove
> - * flag to stop any more packets from being sent or received to/from a VM and
> - * ensure all currently queued packets have been sent/received before removing
> - *  the device.
> + * Remove a virtio-net device from the specific vhost cuse port. Use
> + * dev->remove flag to stop any more packets from being sent or received
> + * to/from a VM and ensure all currently queued packets have been sent/received
> + * before removing the device.
>    */
>   static void
>   destroy_device(volatile struct virtio_net *virtio_dev)
> @@ -2138,7 +2162,6 @@ destroy_device(volatile struct virtio_net *virtio_dev)
>               ovs_mutex_lock(&dev->mutex);
>               virtio_dev->flags &= ~VIRTIO_DEV_RUNNING;
>               ovsrcu_set(&dev->virtio_dev, NULL);
> -            netdev_dpdk_txq_map_clear(dev);
>               exists = true;
>               ovs_mutex_unlock(&dev->mutex);
>               break;
> @@ -2166,49 +2189,6 @@ destroy_device(volatile struct virtio_net *virtio_dev)
>       }
>   }
>
> -static int
> -vring_state_changed(struct virtio_net *virtio_dev, uint16_t queue_id,
> -                    int enable)
> -{
> -    struct netdev_dpdk *dev;
> -    bool exists = false;
> -    int qid = queue_id / VIRTIO_QNUM;
> -
> -    if (queue_id % VIRTIO_QNUM == VIRTIO_TXQ) {
> -        return 0;
> -    }
> -
> -    ovs_mutex_lock(&dpdk_mutex);
> -    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
> -        if (strncmp(virtio_dev->ifname, dev->vhost_id, IF_NAME_SZ) == 0) {
> -            ovs_mutex_lock(&dev->mutex);
> -            if (enable) {
> -                dev->tx_q[qid].map = qid;
> -            } else {
> -                dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED;
> -            }
> -            netdev_dpdk_remap_txqs(dev);
> -            exists = true;
> -            ovs_mutex_unlock(&dev->mutex);
> -            break;
> -        }
> -    }
> -    ovs_mutex_unlock(&dpdk_mutex);
> -
> -    if (exists) {
> -        VLOG_INFO("State of queue %d ( tx_qid %d ) of vhost device '%s' %"
> -                  PRIu64" changed to \'%s\'", queue_id, qid,
> -                  virtio_dev->ifname, virtio_dev->device_fh,
> -                  (enable == 1) ? "enabled" : "disabled");
> -    } else {
> -        VLOG_INFO("vHost Device '%s' %"PRIu64" not found", virtio_dev->ifname,
> -                  virtio_dev->device_fh);
> -        return -1;
> -    }
> -
> -    return 0;
> -}
> -
>   struct virtio_net *
>   netdev_dpdk_get_virtio(const struct netdev_dpdk *dev)
>   {
> @@ -2216,18 +2196,17 @@ netdev_dpdk_get_virtio(const struct netdev_dpdk *dev)
>   }
>
>   /*
> - * These callbacks allow virtio-net devices to be added to vhost ports when
> - * configuration has been fully complete.
> + * These callbacks allow virtio-net devices to be added to vhost cuse ports
> + * when configuration has been fully complete.
>    */
>   static const struct virtio_net_device_ops virtio_net_device_ops =
>   {
>       .new_device =  new_device,
>       .destroy_device = destroy_device,
> -    .vring_state_changed = vring_state_changed
>   };
>
>   static void *
> -start_vhost_loop(void *dummy OVS_UNUSED)
> +start_vhost_cuse_loop(void *dummy OVS_UNUSED)
>   {
>        pthread_detach(pthread_self());
>        /* Put the cuse thread into quiescent state. */
> @@ -2237,19 +2216,7 @@ start_vhost_loop(void *dummy OVS_UNUSED)
>   }
>
>   static int
> -dpdk_vhost_class_init(void)
> -{
> -    rte_vhost_driver_callback_register(&virtio_net_device_ops);
> -    rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4
> -                            | 1ULL << VIRTIO_NET_F_HOST_TSO6
> -                            | 1ULL << VIRTIO_NET_F_CSUM);

I think you'll still want to set these for vhostuser.

> -
> -    ovs_thread_create("vhost_thread", start_vhost_loop, NULL);
> -    return 0;
> -}
> -
> -static int
> -dpdk_vhost_cuse_class_init(void)
> +netdev_dpdk_vhost_cuse_class_init(void)
>   {
>       int err = -1;
>
> @@ -2265,14 +2232,12 @@ dpdk_vhost_cuse_class_init(void)
>           return -1;
>       }
>
> -    dpdk_vhost_class_init();
> -    return 0;
> -}
> +    rte_vhost_driver_callback_register(&virtio_net_device_ops);
> +    rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4
> +                            | 1ULL << VIRTIO_NET_F_HOST_TSO6
> +                            | 1ULL << VIRTIO_NET_F_CSUM);
> +    ovs_thread_create("vhost_cuse_thread", start_vhost_cuse_loop, NULL);
>
> -static int
> -dpdk_vhost_user_class_init(void)
> -{
> -    dpdk_vhost_class_init();
>       return 0;
>   }
>
> @@ -2859,30 +2824,30 @@ static const struct netdev_class dpdk_ring_class =
>   static const struct netdev_class OVS_UNUSED dpdk_vhost_cuse_class =
>       NETDEV_DPDK_CLASS(
>           "dpdkvhostcuse",
> -        dpdk_vhost_cuse_class_init,
> +        netdev_dpdk_vhost_cuse_class_init,
>           netdev_dpdk_vhost_cuse_construct,
> -        netdev_dpdk_vhost_destruct,
> +        netdev_dpdk_vhost_cuse_destruct,
>           netdev_dpdk_vhost_cuse_set_multiq,
> -        netdev_dpdk_vhost_send,
> -        netdev_dpdk_vhost_get_carrier,
> -        netdev_dpdk_vhost_get_stats,
> +        netdev_dpdk_vhost_cuse_send,
> +        netdev_dpdk_vhost_cuse_get_carrier,
> +        netdev_dpdk_vhost_cuse_get_stats,
>           NULL,
>           NULL,
> -        netdev_dpdk_vhost_rxq_recv);
> +        netdev_dpdk_vhost_cuse_rxq_recv);
>
>   static const struct netdev_class OVS_UNUSED dpdk_vhost_user_class =
>       NETDEV_DPDK_CLASS(
>           "dpdkvhostuser",
> -        dpdk_vhost_user_class_init,
> -        netdev_dpdk_vhost_user_construct,
> -        netdev_dpdk_vhost_destruct,
> -        netdev_dpdk_vhost_set_multiq,
> -        netdev_dpdk_vhost_send,
> -        netdev_dpdk_vhost_get_carrier,
> -        netdev_dpdk_vhost_get_stats,
> -        NULL,
>           NULL,
> -        netdev_dpdk_vhost_rxq_recv);
> +        netdev_dpdk_vhost_user_construct,
> +        netdev_dpdk_destruct,
> +        netdev_dpdk_set_multiq,
> +        netdev_dpdk_vhost_user_send,
> +        netdev_dpdk_get_carrier,
> +        netdev_dpdk_get_stats,

Are all the phy-type stats available for vhost, or do you need to check 
the netdev type in that function and report vhost-specific stats? I didn't 
think DPDK would report things like rx_dropped for vhost, but I'd be 
glad to be wrong.

> +        netdev_dpdk_get_features,
> +        netdev_dpdk_get_status,
> +        netdev_dpdk_vhost_user_rxq_recv);
>
>   void
>   netdev_dpdk_register(void)
>



