[ovs-dev] [PATCH RFC v6 2/2] netdev-dpdk: Add vHost User PMD

Ilya Maximets i.maximets at samsung.com
Fri Oct 28 13:52:30 UTC 2016


Still not a full review.
Comments inline.

Bet regards, Ilya Maximets.

On 28.10.2016 16:03, Ciara Loftus wrote:
> The vHost PMD allows vHost User ports to be controlled by the
> librte_ether API, like physical 'dpdk' ports and IVSHM 'dpdkr' ports.
> This commit integrates this PMD into OVS and removes direct calls to the
> librte_vhost DPDK library.
> 
> This commit requires DPDK v16.11 functionality that isn't available in
> previous releases, and thus breaks compatibility with such releases.
> 
> Signed-off-by: Ciara Loftus <ciara.loftus at intel.com>
> 
> ---
> v6:
> * Unregister callbacks before detach
> 
> v5:
> * change vhost_pmd_id to signed int and use -1 value to indicate an
>   ID from the pool hasn't been alloced for it yet.
> * free pool ID if rte_eth_dev_attach fails.
> * check link status on destruct and print warning if the port is live.
> * only free ID during destruct if it has been allocated.
> 
> v4:
> * Use id-pool implementation for allocating vHost PMD IDs
> 
> v3:
> * Added DPDK_DEV_VHOST_CLIENT netdev_dpdk "type"
> * Reintroduced socket-id logic to correctly set client type sid
> * Removed magic number & used var instead
> * Remove race condition in netdev_dpdk_init by reordering code.
> * Detach vHost PMD if netdev_dpdk_init fails
> * Introduce "out" in client_reconfigure for when txq alloc fails
> * Remove set_tx_multiq fn for vHost ports
> ---
>  NEWS              |   2 +
>  lib/dpdk.c        |  10 +
>  lib/dpdk.h        |   1 +
>  lib/netdev-dpdk.c | 977 +++++++++++++++++++++---------------------------------
>  4 files changed, 388 insertions(+), 602 deletions(-)
> 
> diff --git a/NEWS b/NEWS
> index 037bcda..5f84b0b 100644
> --- a/NEWS
> +++ b/NEWS
> @@ -137,6 +137,8 @@ v2.6.0 - 27 Sep 2016
>       * Remove dpdkvhostcuse port type.
>       * OVS client mode for vHost and vHost reconnect (Requires QEMU 2.7)
>       * 'dpdkvhostuserclient' port type.
> +     * vHost PMD integration brings vhost-user ports under control of the
> +       rte_ether DPDK API.
>     - Increase number of registers to 16.
>     - ovs-benchmark: This utility has been removed due to lack of use and
>       bitrot.
> diff --git a/lib/dpdk.c b/lib/dpdk.c
> index 49a589a..831257a 100644
> --- a/lib/dpdk.c
> +++ b/lib/dpdk.c
> @@ -29,6 +29,7 @@
>  
>  #include "dirs.h"
>  #include "fatal-signal.h"
> +#include "id-pool.h"
>  #include "netdev-dpdk.h"
>  #include "openvswitch/dynamic-string.h"
>  #include "openvswitch/vlog.h"
> @@ -37,6 +38,7 @@
>  VLOG_DEFINE_THIS_MODULE(dpdk);
>  
>  static char *vhost_sock_dir = NULL;   /* Location of vhost-user sockets */
> +static struct id_pool *vhost_driver_ids;  /* Pool of IDs for vHost PMDs */
>  
>  static int
>  process_vhost_flags(char *flag, char *default_val, int size,
> @@ -407,6 +409,8 @@ dpdk_init__(const struct smap *ovs_other_config)
>      }
>  #endif
>  
> +    vhost_driver_ids = id_pool_create(0, RTE_MAX_ETHPORTS - 1);
> +
>      /* Finally, register the dpdk classes */
>      netdev_dpdk_register();
>  }
> @@ -428,6 +432,12 @@ dpdk_get_vhost_sock_dir(void)
>      return vhost_sock_dir;
>  }
>  
> +struct id_pool *
> +dpdk_get_vhost_id_pool(void)
> +{
> +    return vhost_driver_ids;
> +}
> +
>  void
>  dpdk_set_lcore_id(unsigned cpu)
>  {
> diff --git a/lib/dpdk.h b/lib/dpdk.h
> index 673a1f1..fb85c06 100644
> --- a/lib/dpdk.h
> +++ b/lib/dpdk.h
> @@ -35,5 +35,6 @@ struct smap;
>  void dpdk_init(const struct smap *ovs_other_config);
>  void dpdk_set_lcore_id(unsigned cpu);
>  const char *dpdk_get_vhost_sock_dir(void);
> +struct id_pool *dpdk_get_vhost_id_pool(void);
>  
>  #endif /* dpdk.h */
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index 7c1523e..0b1af1d 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -27,6 +27,7 @@
>  #include <rte_cycles.h>
>  #include <rte_errno.h>
>  #include <rte_eth_ring.h>
> +#include <rte_eth_vhost.h>
>  #include <rte_ethdev.h>
>  #include <rte_malloc.h>
>  #include <rte_mbuf.h>
> @@ -38,6 +39,7 @@
>  #include "dpdk.h"
>  #include "dpif-netdev.h"
>  #include "fatal-signal.h"
> +#include "id-pool.h"
>  #include "netdev-provider.h"
>  #include "netdev-vport.h"
>  #include "odp-util.h"
> @@ -122,6 +124,7 @@ BUILD_ASSERT_DECL((MAX_NB_MBUF / ROUND_DOWN_POW2(MAX_NB_MBUF/MIN_NB_MBUF))
>  #define XSTAT_RX_BROADCAST_PACKETS       "rx_broadcast_packets"
>  #define XSTAT_TX_BROADCAST_PACKETS       "tx_broadcast_packets"
>  #define XSTAT_RX_UNDERSIZED_ERRORS       "rx_undersized_errors"
> +#define XSTAT_RX_UNDERSIZE_PACKETS       "rx_undersize_packets"
>  #define XSTAT_RX_OVERSIZE_ERRORS         "rx_oversize_errors"
>  #define XSTAT_RX_FRAGMENTED_ERRORS       "rx_fragmented_errors"
>  #define XSTAT_RX_JABBER_ERRORS           "rx_jabber_errors"
> @@ -171,6 +174,7 @@ enum { DRAIN_TSC = 200000ULL };
>  enum dpdk_dev_type {
>      DPDK_DEV_ETH = 0,
>      DPDK_DEV_VHOST = 1,
> +    DPDK_DEV_VHOST_CLIENT = 2,
>  };
>  
>  /* Quality of Service */
> @@ -343,15 +347,12 @@ struct netdev_dpdk {
>      struct rte_eth_link link;
>      int link_reset_cnt;
>  
> -    /* virtio identifier for vhost devices */
> -    ovsrcu_index vid;
> -
> -    /* True if vHost device is 'up' and has been reconfigured at least once */
> -    bool vhost_reconfigured;
> -
>      /* Identifier used to distinguish vhost devices from each other. */
>      char vhost_id[PATH_MAX];
>  
> +    /* ID of vhost user port given to the PMD driver */
> +    int32_t vhost_pmd_id;
> +
>      /* In dpdk_list. */
>      struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
>  
> @@ -392,16 +393,25 @@ struct netdev_rxq_dpdk {
>  };
>  
>  static int netdev_dpdk_construct(struct netdev *);
> -
> -int netdev_dpdk_get_vid(const struct netdev_dpdk *dev);
> +static int netdev_dpdk_vhost_construct(struct netdev *);
> +static int netdev_dpdk_vhost_client_construct(struct netdev *);
>  
>  struct ingress_policer *
>  netdev_dpdk_get_ingress_policer(const struct netdev_dpdk *dev);
>  
> +static void link_status_changed_callback(uint8_t port_id,
> +        enum rte_eth_event_type type, void *param);
> +static void vring_state_changed_callback(uint8_t port_id,
> +        enum rte_eth_event_type type, void *param);
> +static void netdev_dpdk_remap_txqs(struct netdev_dpdk *dev);
> +static void netdev_dpdk_txq_map_clear(struct netdev_dpdk *dev);
> +
>  static bool
> -is_dpdk_class(const struct netdev_class *class)
> +is_dpdk_eth_class(const struct netdev_class *class)
>  {
> -    return class->construct == netdev_dpdk_construct;
> +    return ((class->construct == netdev_dpdk_construct) ||
> +            (class->construct == netdev_dpdk_vhost_construct) ||
> +            (class->construct == netdev_dpdk_vhost_client_construct));
>  }
>  
>  /* DPDK NIC drivers allocate RX buffers at a particular granularity, typically
> @@ -681,8 +691,13 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int n_txq)
>              continue;
>          }
>  
> -        dev->up.n_rxq = n_rxq;
> -        dev->up.n_txq = n_txq;
> +        /* Only set n_*xq for physical devices. vHost User devices will set
> +         * this value correctly using info from the virtio backend.
> +         */
> +        if (dev->type == DPDK_DEV_ETH) {
> +            dev->up.n_rxq = n_rxq;
> +            dev->up.n_txq = n_txq;
> +        }
>  
>          return 0;
>      }
> @@ -714,8 +729,16 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
>  
>      rte_eth_dev_info_get(dev->port_id, &info);
>  
> -    n_rxq = MIN(info.max_rx_queues, dev->up.n_rxq);
> -    n_txq = MIN(info.max_tx_queues, dev->up.n_txq);
> +    if (dev->type != DPDK_DEV_ETH) {
> +        /* We don't know how many queues QEMU will use so we need to set up
> +         * the max. If we configure less than what QEMU configures, those
> +         * additional queues will not be available to us. */
> +        n_rxq = MIN(OVS_VHOST_MAX_QUEUE_NUM, RTE_MAX_QUEUES_PER_PORT);
> +        n_txq = MIN(OVS_VHOST_MAX_QUEUE_NUM, RTE_MAX_QUEUES_PER_PORT);
> +    } else {
> +        n_rxq = MIN(info.max_rx_queues, dev->up.n_rxq);
> +        n_txq = MIN(info.max_tx_queues, dev->up.n_txq);
> +    }
>  
>      diag = dpdk_eth_dev_queue_setup(dev, n_rxq, n_txq);
>      if (diag) {
> @@ -794,6 +817,93 @@ netdev_dpdk_alloc_txq(unsigned int n_txqs)
>      return txqs;
>  }
>  
> +void
> +link_status_changed_callback(uint8_t port_id,
> +                             enum rte_eth_event_type type OVS_UNUSED,
> +                             void *param OVS_UNUSED)
> +{
> +    struct netdev_dpdk *dev;
> +    int newnode = -1;
> +
> +    ovs_mutex_lock(&dpdk_mutex);
> +    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
> +        if (port_id == dev->port_id) {
> +            ovs_mutex_lock(&dev->mutex);
> +            check_link_status(dev);
> +            if (dev->link.link_status == ETH_LINK_UP) {
> +                /* Device brought up */
> +                /* Get queue information */
> +                int vid = rte_eth_vhost_get_vid_from_port_id(dev->port_id);
> +                uint32_t qp_num = rte_vhost_get_queue_num(vid);
> +                if (qp_num <= 0) {
> +                    qp_num = dev->requested_n_rxq;
> +                }
> +                /* Get NUMA information */
> +                newnode = rte_eth_dev_socket_id(dev->port_id);
> +                if (newnode == -1) {
> +                    newnode = dev->socket_id;
> +                }
> +                if (dev->requested_n_txq != qp_num
> +                                || dev->requested_n_rxq != qp_num
> +                                || dev->requested_socket_id != newnode) {
> +                    dev->requested_socket_id = newnode;
> +                    dev->requested_n_rxq = qp_num;
> +                    dev->requested_n_txq = qp_num;
> +                    netdev_request_reconfigure(&dev->up);
> +                }
> +                netdev_change_seq_changed(&dev->up);
> +                VLOG_INFO("vHost Device '%s' has been added on numa node %i",
> +                          dev->vhost_id, newnode);
> +            } else {
> +                /* Device brought down */
> +                /* Clear tx/rx queue settings. */
> +                netdev_dpdk_txq_map_clear(dev);
> +                netdev_change_seq_changed(&dev->up);
> +                VLOG_INFO("vHost Device '%s' has been removed", dev->vhost_id);
> +            }
> +            ovs_mutex_unlock(&dev->mutex);
> +            break;
> +        }
> +    }
> +
> +    ovs_mutex_unlock(&dpdk_mutex);
> +
> +    return;
> +}
> +
> +void
> +vring_state_changed_callback(uint8_t port_id,
> +                             enum rte_eth_event_type type OVS_UNUSED,
> +                             void *param OVS_UNUSED)
> +{
> +    struct netdev_dpdk *dev;
> +    struct rte_eth_vhost_queue_event event;
> +    int err = 0;
> +
> +    err = rte_eth_vhost_get_queue_event(port_id, &event);
> +    if (err || event.rx) {
> +        return;
> +    }
> +
> +    ovs_mutex_lock(&dpdk_mutex);
> +    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
> +        if (port_id == dev->port_id) {
> +            ovs_mutex_lock(&dev->mutex);
> +            if (event.enable) {
> +                dev->tx_q[event.queue_id].map = event.queue_id;
> +            } else {
> +                dev->tx_q[event.queue_id].map = OVS_VHOST_QUEUE_DISABLED;
> +            }
> +            netdev_dpdk_remap_txqs(dev);
> +            ovs_mutex_unlock(&dev->mutex);
> +            break;
> +        }
> +    }
> +    ovs_mutex_unlock(&dpdk_mutex);
> +
> +    return;
> +}
> +
>  static int
>  netdev_dpdk_init(struct netdev *netdev, unsigned int port_no,
>                   enum dpdk_dev_type type)
> @@ -802,6 +912,7 @@ netdev_dpdk_init(struct netdev *netdev, unsigned int port_no,
>      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>      int sid;
>      int err = 0;
> +    unsigned int nr_q = 0;
>  
>      ovs_mutex_init(&dev->mutex);
>      ovs_mutex_lock(&dev->mutex);
> @@ -811,7 +922,7 @@ netdev_dpdk_init(struct netdev *netdev, unsigned int port_no,
>      /* If the 'sid' is negative, it means that the kernel fails
>       * to obtain the pci numa info.  In that situation, always
>       * use 'SOCKET0'. */
> -    if (type == DPDK_DEV_ETH) {
> +    if (type != DPDK_DEV_VHOST_CLIENT) {
>          sid = rte_eth_dev_socket_id(port_no);
>      } else {
>          sid = rte_lcore_to_socket_id(rte_get_master_lcore());
> @@ -824,8 +935,8 @@ netdev_dpdk_init(struct netdev *netdev, unsigned int port_no,
>      dev->flags = 0;
>      dev->requested_mtu = dev->mtu = ETHER_MTU;
>      dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
> -    ovsrcu_index_init(&dev->vid, -1);
> -    dev->vhost_reconfigured = false;
> +    dev->vhost_driver_flags &= ~RTE_VHOST_USER_CLIENT;
> +    dev->vhost_pmd_id = -1;
>  
>      err = netdev_dpdk_mempool_configure(dev);
>      if (err) {
> @@ -849,23 +960,28 @@ netdev_dpdk_init(struct netdev *netdev, unsigned int port_no,
>  
>      /* Initialize the flow control to NULL */
>      memset(&dev->fc_conf, 0, sizeof dev->fc_conf);
> -    if (type == DPDK_DEV_ETH) {
> +    if (type != DPDK_DEV_VHOST_CLIENT) {
>          err = dpdk_eth_dev_init(dev);
>          if (err) {
>              goto unlock;
>          }
> -        dev->tx_q = netdev_dpdk_alloc_txq(netdev->n_txq);
> -    } else {
> -        dev->tx_q = netdev_dpdk_alloc_txq(OVS_VHOST_MAX_QUEUE_NUM);
> -        /* Enable DPDK_DEV_VHOST device and set promiscuous mode flag. */
> -        dev->flags = NETDEV_UP | NETDEV_PROMISC;
>      }
> +    nr_q = (type == DPDK_DEV_ETH ?
> +        netdev->n_txq : MIN(OVS_VHOST_MAX_QUEUE_NUM, RTE_MAX_QUEUES_PER_PORT));
> +    dev->tx_q = netdev_dpdk_alloc_txq(nr_q);
>  
>      if (!dev->tx_q) {
>          err = ENOMEM;
>          goto unlock;
>      }
>  
> +    if (type == DPDK_DEV_VHOST) {
> +        rte_eth_dev_callback_register(port_no, RTE_ETH_EVENT_QUEUE_STATE,
> +                                      vring_state_changed_callback, NULL);
> +        rte_eth_dev_callback_register(port_no, RTE_ETH_EVENT_INTR_LSC,
> +                                      link_status_changed_callback, NULL);
> +    }
> +
>      ovs_list_push_back(&dpdk_list, &dev->list_node);
>  
>  unlock:
> @@ -895,6 +1011,37 @@ dpdk_dev_parse_name(const char dev_name[], const char prefix[],
>  }
>  
>  static int
> +dpdk_attach_vhost_pmd(struct netdev_dpdk *dev, int mode)
> +{
> +    char *devargs;
> +    int err = 0;
> +    uint8_t port_no = 0;
> +    uint32_t driver_id = 0;
> +
> +    if (id_pool_alloc_id(dpdk_get_vhost_id_pool(), &driver_id)) {
> +        devargs = xasprintf("net_vhost%u,iface=%s,queues=%i,client=%i",
> +                 driver_id, dev->vhost_id,
> +                 MIN(OVS_VHOST_MAX_QUEUE_NUM, RTE_MAX_QUEUES_PER_PORT),
> +                 mode);
> +        err = rte_eth_dev_attach(devargs, &port_no);
> +        if (!err) {
> +            dev->port_id = port_no;
> +            dev->vhost_pmd_id = driver_id;
> +        } else {
> +            id_pool_free_id(dpdk_get_vhost_id_pool(), driver_id);
> +            VLOG_ERR("Failed to attach vhost-user device %s to DPDK",
> +                     dev->vhost_id);
> +        }
> +    } else {
> +        VLOG_ERR("Unable to create vhost-user device %s - too many vhost-user"
> +                 "devices registered with PMD", dev->vhost_id);
> +        err = ENODEV;
> +    }
> +
> +    return err;
> +}
> +
> +static int
>  netdev_dpdk_vhost_construct(struct netdev *netdev)
>  {
>      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> @@ -904,7 +1051,7 @@ netdev_dpdk_vhost_construct(struct netdev *netdev)
>      /* 'name' is appended to 'vhost_sock_dir' and used to create a socket in
>       * the file system. '/' or '\' would traverse directories, so they're not
>       * acceptable in 'name'. */
> -    if (strchr(name, '/') || strchr(name, '\\')) {
> +    if (strchr(name, '/') || strchr(name, '\\') || strchr(name, ',')) {
>          VLOG_ERR("\"%s\" is not a valid name for a vhost-user port. "
>                   "A valid name must not include '/' or '\\'",
>                   name);
> @@ -917,18 +1064,26 @@ netdev_dpdk_vhost_construct(struct netdev *netdev)
>       */
>      snprintf(dev->vhost_id, sizeof dev->vhost_id, "%s/%s",
>               dpdk_get_vhost_sock_dir(), name);
> +    dev->port_id = -1;
>  
> -    dev->vhost_driver_flags &= ~RTE_VHOST_USER_CLIENT;
> -    err = rte_vhost_driver_register(dev->vhost_id, dev->vhost_driver_flags);
> -    if (err) {
> -        VLOG_ERR("vhost-user socket device setup failure for socket %s\n",
> -                 dev->vhost_id);
> -    } else {
> +    err = dpdk_attach_vhost_pmd(dev, 0);
> +
> +    if (!err) {
>          fatal_signal_add_file_to_unlink(dev->vhost_id);
>          VLOG_INFO("Socket %s created for vhost-user port %s\n",
>                    dev->vhost_id, name);
>      }
> -    err = netdev_dpdk_init(netdev, -1, DPDK_DEV_VHOST);
> +    err = netdev_dpdk_init(netdev, dev->port_id, DPDK_DEV_VHOST);
> +
> +    if (err) {

I think, that callbacks are not registered at this point in case of failure.
Anyway, IMHO, 'netdev_dpdk_init' should be responsible for unregistering.

> +        rte_eth_dev_callback_unregister(dev->port_id,
> +                                        RTE_ETH_EVENT_QUEUE_STATE,
> +                                        vring_state_changed_callback, NULL);
> +        rte_eth_dev_callback_unregister(dev->port_id,
> +                                        RTE_ETH_EVENT_INTR_LSC,
> +                                        link_status_changed_callback, NULL);
> +        rte_eth_dev_detach(dev->port_id, dev->vhost_id);
> +    }
>  
>      ovs_mutex_unlock(&dpdk_mutex);
>      return err;
> @@ -940,7 +1095,7 @@ netdev_dpdk_vhost_client_construct(struct netdev *netdev)
>      int err;
>  
>      ovs_mutex_lock(&dpdk_mutex);
> -    err = netdev_dpdk_init(netdev, -1, DPDK_DEV_VHOST);
> +    err = netdev_dpdk_init(netdev, -1, DPDK_DEV_VHOST_CLIENT);
>      ovs_mutex_unlock(&dpdk_mutex);
>      return err;
>  }
> @@ -964,13 +1119,10 @@ netdev_dpdk_construct(struct netdev *netdev)
>  }
>  
>  static void
> -netdev_dpdk_destruct(struct netdev *netdev)
> +dpdk_destruct_helper(struct netdev_dpdk *dev)
> +    OVS_REQUIRES(dpdk_mutex)
> +    OVS_REQUIRES(dev->mutex)
>  {
> -    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> -
> -    ovs_mutex_lock(&dpdk_mutex);
> -    ovs_mutex_lock(&dev->mutex);
> -
>      rte_eth_dev_stop(dev->port_id);
>      free(ovsrcu_get_protected(struct ingress_policer *,
>                                &dev->ingress_policer));
> @@ -978,61 +1130,59 @@ netdev_dpdk_destruct(struct netdev *netdev)
>      rte_free(dev->tx_q);
>      ovs_list_remove(&dev->list_node);
>      dpdk_mp_put(dev->dpdk_mp);
> -
> -    ovs_mutex_unlock(&dev->mutex);
> -    ovs_mutex_unlock(&dpdk_mutex);
>  }
>  
> -/* rte_vhost_driver_unregister() can call back destroy_device(), which will
> - * try to acquire 'dpdk_mutex' and possibly 'dev->mutex'.  To avoid a
> - * deadlock, none of the mutexes must be held while calling this function. */
> -static int
> -dpdk_vhost_driver_unregister(struct netdev_dpdk *dev OVS_UNUSED,
> -                             char *vhost_id)
> -    OVS_EXCLUDED(dpdk_mutex)
> -    OVS_EXCLUDED(dev->mutex)
> +static void
> +netdev_dpdk_destruct(struct netdev *netdev)
>  {
> -    return rte_vhost_driver_unregister(vhost_id);
> +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> +
> +    ovs_mutex_lock(&dpdk_mutex);
> +    ovs_mutex_lock(&dev->mutex);
> +
> +    dpdk_destruct_helper(dev);
> +
> +    ovs_mutex_unlock(&dev->mutex);
> +    ovs_mutex_unlock(&dpdk_mutex);
>  }
>  
>  static void
>  netdev_dpdk_vhost_destruct(struct netdev *netdev)
>  {
>      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> -    char *vhost_id;
>  
>      ovs_mutex_lock(&dpdk_mutex);
>      ovs_mutex_lock(&dev->mutex);
>  
> -    /* Guest becomes an orphan if still attached. */
> -    if (netdev_dpdk_get_vid(dev) >= 0
> -        && !(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)) {
> +    check_link_status(dev);
> +    if (dev->link.link_status == ETH_LINK_UP) {
>          VLOG_ERR("Removing port '%s' while vhost device still attached.",
>                   netdev->name);
>          VLOG_ERR("To restore connectivity after re-adding of port, VM on "
>                   "socket '%s' must be restarted.", dev->vhost_id);
>      }
>  
> -    free(ovsrcu_get_protected(struct ingress_policer *,
> -                              &dev->ingress_policer));
> +    rte_eth_dev_callback_unregister(dev->port_id,
> +                                    RTE_ETH_EVENT_QUEUE_STATE,
> +                                    vring_state_changed_callback, NULL);
> +    rte_eth_dev_callback_unregister(dev->port_id,
> +                                    RTE_ETH_EVENT_INTR_LSC,
> +                                    link_status_changed_callback, NULL);

We should check for EAGAIN here to avoid leaks cased by races.

>  
> -    rte_free(dev->tx_q);
> -    ovs_list_remove(&dev->list_node);
> -    dpdk_mp_put(dev->dpdk_mp);
> +    if (rte_eth_dev_detach(dev->port_id, dev->vhost_id)) {
> +        VLOG_ERR("Error removing vhost device %s", dev->vhost_id);
> +    } else if (dev->type == DPDK_DEV_VHOST) {
> +        fatal_signal_remove_file_to_unlink(dev->vhost_id);
> +    }
>  
> -    vhost_id = xstrdup(dev->vhost_id);
> +    if (dev->vhost_pmd_id >= 0) {
> +        id_pool_free_id(dpdk_get_vhost_id_pool(), dev->vhost_pmd_id);
> +    }
> +
> +    dpdk_destruct_helper(dev);
>  
>      ovs_mutex_unlock(&dev->mutex);
>      ovs_mutex_unlock(&dpdk_mutex);
> -
> -    if (dpdk_vhost_driver_unregister(dev, vhost_id)) {
> -        VLOG_ERR("%s: Unable to unregister vhost driver for socket '%s'.\n",
> -                 netdev->name, vhost_id);
> -    } else if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)) {
> -        /* OVS server mode - remove this socket from list for deletion */
> -        fatal_signal_remove_file_to_unlink(vhost_id);
> -    }
> -    free(vhost_id);
>  }
>  
>  static void
> @@ -1054,12 +1204,16 @@ netdev_dpdk_get_config(const struct netdev *netdev, struct smap *args)
>      smap_add_format(args, "configured_rx_queues", "%d", netdev->n_rxq);
>      smap_add_format(args, "requested_tx_queues", "%d", dev->requested_n_txq);
>      smap_add_format(args, "configured_tx_queues", "%d", netdev->n_txq);
> -    smap_add_format(args, "requested_rxq_descriptors", "%d",
> -                    dev->requested_rxq_size);
> -    smap_add_format(args, "configured_rxq_descriptors", "%d", dev->rxq_size);
> -    smap_add_format(args, "requested_txq_descriptors", "%d",
> -                    dev->requested_txq_size);
> -    smap_add_format(args, "configured_txq_descriptors", "%d", dev->txq_size);
> +    if (dev->type == DPDK_DEV_ETH) {
> +        smap_add_format(args, "requested_rxq_descriptors", "%d",
> +                        dev->requested_rxq_size);
> +        smap_add_format(args, "configured_rxq_descriptors", "%d",
> +                        dev->rxq_size);
> +        smap_add_format(args, "requested_txq_descriptors", "%d",
> +                        dev->requested_txq_size);
> +        smap_add_format(args, "configured_txq_descriptors", "%d",
> +                        dev->txq_size);
> +    }
>      smap_add_format(args, "mtu", "%d", dev->mtu);
>      ovs_mutex_unlock(&dev->mutex);
>  
> @@ -1320,119 +1474,8 @@ ingress_policer_run(struct ingress_policer *policer, struct rte_mbuf **pkts,
>      return cnt;
>  }
>  
> -static bool
> -is_vhost_running(struct netdev_dpdk *dev)
> -{
> -    return (netdev_dpdk_get_vid(dev) >= 0 && dev->vhost_reconfigured);
> -}
> -
> -static inline void
> -netdev_dpdk_vhost_update_rx_size_counters(struct netdev_stats *stats,
> -                                          unsigned int packet_size)
> -{
> -    /* Hard-coded search for the size bucket. */
> -    if (packet_size < 256) {
> -        if (packet_size >= 128) {
> -            stats->rx_128_to_255_packets++;
> -        } else if (packet_size <= 64) {
> -            stats->rx_1_to_64_packets++;
> -        } else {
> -            stats->rx_65_to_127_packets++;
> -        }
> -    } else {
> -        if (packet_size >= 1523) {
> -            stats->rx_1523_to_max_packets++;
> -        } else if (packet_size >= 1024) {
> -            stats->rx_1024_to_1522_packets++;
> -        } else if (packet_size < 512) {
> -            stats->rx_256_to_511_packets++;
> -        } else {
> -            stats->rx_512_to_1023_packets++;
> -        }
> -    }
> -}
> -
> -static inline void
> -netdev_dpdk_vhost_update_rx_counters(struct netdev_stats *stats,
> -                                     struct dp_packet **packets, int count,
> -                                     int dropped)
> -{
> -    int i;
> -    unsigned int packet_size;
> -    struct dp_packet *packet;
> -
> -    stats->rx_packets += count;
> -    stats->rx_dropped += dropped;
> -    for (i = 0; i < count; i++) {
> -        packet = packets[i];
> -        packet_size = dp_packet_size(packet);
> -
> -        if (OVS_UNLIKELY(packet_size < ETH_HEADER_LEN)) {
> -            /* This only protects the following multicast counting from
> -             * too short packets, but it does not stop the packet from
> -             * further processing. */
> -            stats->rx_errors++;
> -            stats->rx_length_errors++;
> -            continue;
> -        }
> -
> -        netdev_dpdk_vhost_update_rx_size_counters(stats, packet_size);
> -
> -        struct eth_header *eh = (struct eth_header *) dp_packet_data(packet);
> -        if (OVS_UNLIKELY(eth_addr_is_multicast(eh->eth_dst))) {
> -            stats->multicast++;
> -        }
> -
> -        stats->rx_bytes += packet_size;
> -    }
> -}
> -
> -/*
> - * The receive path for the vhost port is the TX path out from guest.
> - */
> -static int
> -netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
> -                           struct dp_packet_batch *batch)
> -{
> -    struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
> -    int qid = rxq->queue_id;
> -    struct ingress_policer *policer = netdev_dpdk_get_ingress_policer(dev);
> -    uint16_t nb_rx = 0;
> -    uint16_t dropped = 0;
> -
> -    if (OVS_UNLIKELY(!is_vhost_running(dev)
> -                     || !(dev->flags & NETDEV_UP))) {
> -        return EAGAIN;
> -    }
> -
> -    nb_rx = rte_vhost_dequeue_burst(netdev_dpdk_get_vid(dev),
> -                                    qid * VIRTIO_QNUM + VIRTIO_TXQ,
> -                                    dev->dpdk_mp->mp,
> -                                    (struct rte_mbuf **) batch->packets,
> -                                    NETDEV_MAX_BURST);
> -    if (!nb_rx) {
> -        return EAGAIN;
> -    }
> -
> -    if (policer) {
> -        dropped = nb_rx;
> -        nb_rx = ingress_policer_run(policer,
> -                                    (struct rte_mbuf **) batch->packets,
> -                                    nb_rx);
> -        dropped -= nb_rx;
> -    }
> -
> -    rte_spinlock_lock(&dev->stats_lock);
> -    netdev_dpdk_vhost_update_rx_counters(&dev->stats, batch->packets,
> -                                         nb_rx, dropped);
> -    rte_spinlock_unlock(&dev->stats_lock);
> -
> -    batch->count = (int) nb_rx;
> -    return 0;
> -}
> -
> -static int
> -netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet_batch *batch)
> +static inline int
> +dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet_batch *batch)
>  {
>      struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
>      struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
> @@ -1467,6 +1510,30 @@ netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet_batch *batch)
>      return 0;
>  }
>  
> +static int
> +netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet_batch *batch)
> +{
> +    return dpdk_rxq_recv(rxq, batch);
> +}
> +
> +static int
> +netdev_dpdk_vhost_client_rxq_recv(struct netdev_rxq *rxq,
> +                                  struct dp_packet_batch *batch)
> +{
> +    struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
> +
> +    if (rte_eth_dev_is_valid_port(rx->port_id)) {
> +        return dpdk_rxq_recv(rxq, batch);
> +    } else {
> +        struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
> +
> +        rte_spinlock_lock(&dev->stats_lock);
> +        dev->stats.rx_dropped += batch->count;
> +        rte_spinlock_unlock(&dev->stats_lock);
> +        return 0;
> +    }
> +}
> +
>  static inline int
>  netdev_dpdk_qos_run(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
>                      int cnt)
> @@ -1508,80 +1575,6 @@ netdev_dpdk_filter_packet_len(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
>      return cnt;
>  }
>  
> -static inline void
> -netdev_dpdk_vhost_update_tx_counters(struct netdev_stats *stats,
> -                                     struct dp_packet **packets,
> -                                     int attempted,
> -                                     int dropped)
> -{
> -    int i;
> -    int sent = attempted - dropped;
> -
> -    stats->tx_packets += sent;
> -    stats->tx_dropped += dropped;
> -
> -    for (i = 0; i < sent; i++) {
> -        stats->tx_bytes += dp_packet_size(packets[i]);
> -    }
> -}
> -
> -static void
> -__netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
> -                         struct dp_packet **pkts, int cnt)
> -{
> -    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> -    struct rte_mbuf **cur_pkts = (struct rte_mbuf **) pkts;
> -    unsigned int total_pkts = cnt;
> -    unsigned int dropped = 0;
> -    int i, retries = 0;
> -
> -    qid = dev->tx_q[qid % netdev->n_txq].map;
> -
> -    if (OVS_UNLIKELY(!is_vhost_running(dev) || qid < 0
> -                     || !(dev->flags & NETDEV_UP))) {
> -        rte_spinlock_lock(&dev->stats_lock);
> -        dev->stats.tx_dropped+= cnt;
> -        rte_spinlock_unlock(&dev->stats_lock);
> -        goto out;
> -    }
> -
> -    rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
> -
> -    cnt = netdev_dpdk_filter_packet_len(dev, cur_pkts, cnt);
> -    /* Check has QoS has been configured for the netdev */
> -    cnt = netdev_dpdk_qos_run(dev, cur_pkts, cnt);
> -    dropped = total_pkts - cnt;
> -
> -    do {
> -        int vhost_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ;
> -        unsigned int tx_pkts;
> -
> -        tx_pkts = rte_vhost_enqueue_burst(netdev_dpdk_get_vid(dev),
> -                                          vhost_qid, cur_pkts, cnt);
> -        if (OVS_LIKELY(tx_pkts)) {
> -            /* Packets have been sent.*/
> -            cnt -= tx_pkts;
> -            /* Prepare for possible retry.*/
> -            cur_pkts = &cur_pkts[tx_pkts];
> -        } else {
> -            /* No packets sent - do not retry.*/
> -            break;
> -        }
> -    } while (cnt && (retries++ <= VHOST_ENQ_RETRY_NUM));
> -
> -    rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
> -
> -    rte_spinlock_lock(&dev->stats_lock);
> -    netdev_dpdk_vhost_update_tx_counters(&dev->stats, pkts, total_pkts,
> -                                         cnt + dropped);
> -    rte_spinlock_unlock(&dev->stats_lock);
> -
> -out:
> -    for (i = 0; i < total_pkts - dropped; i++) {
> -        dp_packet_delete(pkts[i]);
> -    }
> -}
> -
>  /* Tx function. Transmit packets indefinitely */
>  static void
>  dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
> @@ -1629,18 +1622,13 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
>          newcnt++;
>      }
>  
> -    if (dev->type == DPDK_DEV_VHOST) {
> -        __netdev_dpdk_vhost_send(netdev, qid, (struct dp_packet **) pkts,
> -                                 newcnt);
> -    } else {
> -        unsigned int qos_pkts = newcnt;
> +    unsigned int qos_pkts = newcnt;
>  
> -        /* Check if QoS has been configured for this netdev. */
> -        newcnt = netdev_dpdk_qos_run(dev, pkts, newcnt);
> +    /* Check if QoS has been configured for this netdev. */
> +    newcnt = netdev_dpdk_qos_run(dev, pkts, newcnt);
>  
> -        dropped += qos_pkts - newcnt;
> -        dropped += netdev_dpdk_eth_tx_burst(dev, qid, pkts, newcnt);
> -    }
> +    dropped += qos_pkts - newcnt;
> +    dropped += netdev_dpdk_eth_tx_burst(dev, qid, pkts, newcnt);
>  
>      if (OVS_UNLIKELY(dropped)) {
>          rte_spinlock_lock(&dev->stats_lock);
> @@ -1649,32 +1637,10 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
>      }
>  }
>  
> -static int
> -netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
> -                       struct dp_packet_batch *batch,
> -                       bool may_steal, bool concurrent_txq OVS_UNUSED)
> -{
> -
> -    if (OVS_UNLIKELY(!may_steal || batch->packets[0]->source != DPBUF_DPDK)) {
> -        dpdk_do_tx_copy(netdev, qid, batch);
> -        dp_packet_delete_batch(batch, may_steal);
> -    } else {
> -        dp_packet_batch_apply_cutlen(batch);
> -        __netdev_dpdk_vhost_send(netdev, qid, batch->packets, batch->count);
> -    }
> -    return 0;
> -}
> -
>  static inline void
>  netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
> -                   struct dp_packet_batch *batch, bool may_steal,
> -                   bool concurrent_txq)
> +                   struct dp_packet_batch *batch, bool may_steal)
>  {
> -    if (OVS_UNLIKELY(concurrent_txq)) {
> -        qid = qid % dev->up.n_txq;
> -        rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
> -    }
> -
>      if (OVS_UNLIKELY(!may_steal ||
>                       batch->packets[0]->source != DPBUF_DPDK)) {
>          struct netdev *netdev = &dev->up;
> @@ -1700,20 +1666,50 @@ netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
>              rte_spinlock_unlock(&dev->stats_lock);
>          }
>      }
> +}
> +
> +static int
> +netdev_dpdk_eth_send(struct netdev *netdev, int qid,
> +                     struct dp_packet_batch *batch, bool may_steal,
> +                     bool concurrent_txq)
> +{
> +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> +
> +    if (OVS_UNLIKELY(concurrent_txq)) {
> +        qid = qid % dev->up.n_txq;
> +        rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
> +    }
> +
> +    netdev_dpdk_send__(dev, qid, batch, may_steal);
>  
>      if (OVS_UNLIKELY(concurrent_txq)) {
>          rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
>      }
> +
> +    return 0;
>  }
>  
>  static int
> -netdev_dpdk_eth_send(struct netdev *netdev, int qid,
> -                     struct dp_packet_batch *batch, bool may_steal,
> -                     bool concurrent_txq)
> +netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
> +                       struct dp_packet_batch *batch, bool may_steal,
> +                       bool concurrent_txq OVS_UNUSED)
>  {
>      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>  
> -    netdev_dpdk_send__(dev, qid, batch, may_steal, concurrent_txq);
> +    qid = dev->tx_q[qid % netdev->n_txq].map;
> +    if (qid == -1) {
> +        rte_spinlock_lock(&dev->stats_lock);
> +        dev->stats.tx_dropped+= batch->count;
> +        rte_spinlock_unlock(&dev->stats_lock);
> +        if (may_steal) {
> +            dp_packet_delete_batch(batch, may_steal);
> +        }
> +    } else {
> +        rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
> +        netdev_dpdk_send__(dev, qid, batch, may_steal);
> +        rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
> +    }
> +
>      return 0;
>  }
>  
> @@ -1780,41 +1776,6 @@ netdev_dpdk_set_mtu(struct netdev *netdev, int mtu)
>  static int
>  netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier);
>  
> -static int
> -netdev_dpdk_vhost_get_stats(const struct netdev *netdev,
> -                            struct netdev_stats *stats)
> -{
> -    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> -
> -    ovs_mutex_lock(&dev->mutex);
> -
> -    rte_spinlock_lock(&dev->stats_lock);
> -    /* Supported Stats */
> -    stats->rx_packets += dev->stats.rx_packets;
> -    stats->tx_packets += dev->stats.tx_packets;
> -    stats->rx_dropped = dev->stats.rx_dropped;
> -    stats->tx_dropped += dev->stats.tx_dropped;
> -    stats->multicast = dev->stats.multicast;
> -    stats->rx_bytes = dev->stats.rx_bytes;
> -    stats->tx_bytes = dev->stats.tx_bytes;
> -    stats->rx_errors = dev->stats.rx_errors;
> -    stats->rx_length_errors = dev->stats.rx_length_errors;
> -
> -    stats->rx_1_to_64_packets = dev->stats.rx_1_to_64_packets;
> -    stats->rx_65_to_127_packets = dev->stats.rx_65_to_127_packets;
> -    stats->rx_128_to_255_packets = dev->stats.rx_128_to_255_packets;
> -    stats->rx_256_to_511_packets = dev->stats.rx_256_to_511_packets;
> -    stats->rx_512_to_1023_packets = dev->stats.rx_512_to_1023_packets;
> -    stats->rx_1024_to_1522_packets = dev->stats.rx_1024_to_1522_packets;
> -    stats->rx_1523_to_max_packets = dev->stats.rx_1523_to_max_packets;
> -
> -    rte_spinlock_unlock(&dev->stats_lock);
> -
> -    ovs_mutex_unlock(&dev->mutex);
> -
> -    return 0;
> -}
> -
>  static void
>  netdev_dpdk_convert_xstats(struct netdev_stats *stats,
>                             const struct rte_eth_xstat *xstats,
> @@ -1858,6 +1819,8 @@ netdev_dpdk_convert_xstats(struct netdev_stats *stats,
>              stats->tx_broadcast_packets = xstats[i].value;
>          } else if (strcmp(XSTAT_RX_UNDERSIZED_ERRORS, names[i].name) == 0) {
>              stats->rx_undersized_errors = xstats[i].value;
> +        } else if (strcmp(XSTAT_RX_UNDERSIZE_PACKETS, names[i].name) == 0) {
> +            stats->rx_undersized_errors = xstats[i].value;
>          } else if (strcmp(XSTAT_RX_FRAGMENTED_ERRORS, names[i].name) == 0) {
>              stats->rx_fragmented_errors = xstats[i].value;
>          } else if (strcmp(XSTAT_RX_JABBER_ERRORS, names[i].name) == 0) {
> @@ -1880,6 +1843,11 @@ netdev_dpdk_get_stats(const struct netdev *netdev, struct netdev_stats *stats)
>      struct rte_eth_xstat_name *rte_xstats_names = NULL;
>      int rte_xstats_len, rte_xstats_new_len, rte_xstats_ret;
>  
> +    if (rte_eth_dev_is_valid_port(dev->port_id)) {
> +        ovs_mutex_unlock(&dev->mutex);
> +        return EPROTO;
> +    }
> +
>      if (rte_eth_stats_get(dev->port_id, &rte_stats)) {
>          VLOG_ERR("Can't get ETH statistics for port: %i.", dev->port_id);
>          ovs_mutex_unlock(&dev->mutex);
> @@ -2088,24 +2056,6 @@ netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier)
>      return 0;
>  }
>  
> -static int
> -netdev_dpdk_vhost_get_carrier(const struct netdev *netdev, bool *carrier)
> -{
> -    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> -
> -    ovs_mutex_lock(&dev->mutex);
> -
> -    if (is_vhost_running(dev)) {
> -        *carrier = 1;
> -    } else {
> -        *carrier = 0;
> -    }
> -
> -    ovs_mutex_unlock(&dev->mutex);
> -
> -    return 0;
> -}
> -
>  static long long int
>  netdev_dpdk_get_carrier_resets(const struct netdev *netdev)
>  {
> @@ -2161,12 +2111,10 @@ netdev_dpdk_update_flags__(struct netdev_dpdk *dev,
>              rte_eth_dev_stop(dev->port_id);
>          }
>      } else {
> -        /* If DPDK_DEV_VHOST device's NETDEV_UP flag was changed and vhost is
> -         * running then change netdev's change_seq to trigger link state
> -         * update. */
> +        /* If the vhost device's NETDEV_UP flag was changed then change
> +         * netdev's change_seq to trigger link state update. */
>  
> -        if ((NETDEV_UP & ((*old_flagsp ^ on) | (*old_flagsp ^ off)))
> -            && is_vhost_running(dev)) {
> +        if ((NETDEV_UP & ((*old_flagsp ^ on) | (*old_flagsp ^ off)))) {
>              netdev_change_seq_changed(&dev->up);
>  
>              /* Clear statistics if device is getting up. */
> @@ -2224,11 +2172,17 @@ netdev_dpdk_get_status(const struct netdev *netdev, struct smap *args)
>      smap_add_format(args, "max_vfs", "%u", dev_info.max_vfs);
>      smap_add_format(args, "max_vmdq_pools", "%u", dev_info.max_vmdq_pools);
>  
> -    if (dev_info.pci_dev) {
> -        smap_add_format(args, "pci-vendor_id", "0x%u",
> -                        dev_info.pci_dev->id.vendor_id);
> -        smap_add_format(args, "pci-device_id", "0x%x",
> -                        dev_info.pci_dev->id.device_id);
> +    if (dev->type == DPDK_DEV_ETH) {
> +        smap_add_format(args, "max_hash_mac_addrs", "%u",
> +                        dev_info.max_hash_mac_addrs);
> +        smap_add_format(args, "max_vfs", "%u", dev_info.max_vfs);
> +        smap_add_format(args, "max_vmdq_pools", "%u", dev_info.max_vmdq_pools);
> +        if (dev_info.pci_dev) {
> +            smap_add_format(args, "pci-vendor_id", "0x%u",
> +                            dev_info.pci_dev->id.vendor_id);
> +            smap_add_format(args, "pci-device_id", "0x%x",
> +                            dev_info.pci_dev->id.device_id);
> +        }
>      }
>  
>      return 0;
> @@ -2264,7 +2218,7 @@ netdev_dpdk_set_admin_state(struct unixctl_conn *conn, int argc,
>  
>      if (argc > 2) {
>          struct netdev *netdev = netdev_from_name(argv[1]);
> -        if (netdev && is_dpdk_class(netdev->netdev_class)) {
> +        if (netdev && is_dpdk_eth_class(netdev->netdev_class)) {
>              struct netdev_dpdk *dpdk_dev = netdev_dpdk_cast(netdev);
>  
>              ovs_mutex_lock(&dpdk_dev->mutex);
> @@ -2292,22 +2246,6 @@ netdev_dpdk_set_admin_state(struct unixctl_conn *conn, int argc,
>  }
>  
>  /*
> - * Set virtqueue flags so that we do not receive interrupts.
> - */
> -static void
> -set_irq_status(int vid)
> -{
> -    uint32_t i;
> -    uint64_t idx;
> -
> -    for (i = 0; i < rte_vhost_get_queue_num(vid); i++) {
> -        idx = i * VIRTIO_QNUM;
> -        rte_vhost_enable_guest_notification(vid, idx + VIRTIO_RXQ, 0);
> -        rte_vhost_enable_guest_notification(vid, idx + VIRTIO_TXQ, 0);
> -    }
> -}
> -
> -/*
>   * Fixes mapping for vhost-user tx queues. Must be called after each
>   * enabling/disabling of queues and n_txq modifications.
>   */
> @@ -2348,73 +2286,6 @@ netdev_dpdk_remap_txqs(struct netdev_dpdk *dev)
>      free(enabled_queues);
>  }
>  
> -/*
> - * A new virtio-net device is added to a vhost port.
> - */
> -static int
> -new_device(int vid)
> -{
> -    struct netdev_dpdk *dev;
> -    bool exists = false;
> -    int newnode = 0;
> -    char ifname[IF_NAME_SZ];
> -
> -    rte_vhost_get_ifname(vid, ifname, sizeof ifname);
> -
> -    ovs_mutex_lock(&dpdk_mutex);
> -    /* Add device to the vhost port with the same name as that passed down. */
> -    LIST_FOR_EACH(dev, list_node, &dpdk_list) {
> -        ovs_mutex_lock(&dev->mutex);
> -        if (strncmp(ifname, dev->vhost_id, IF_NAME_SZ) == 0) {
> -            uint32_t qp_num = rte_vhost_get_queue_num(vid);
> -
> -            /* Get NUMA information */
> -            newnode = rte_vhost_get_numa_node(vid);
> -            if (newnode == -1) {
> -#ifdef VHOST_NUMA
> -                VLOG_INFO("Error getting NUMA info for vHost Device '%s'",
> -                          ifname);
> -#endif
> -                newnode = dev->socket_id;
> -            }
> -
> -            if (dev->requested_n_txq != qp_num
> -                || dev->requested_n_rxq != qp_num
> -                || dev->requested_socket_id != newnode) {
> -                dev->requested_socket_id = newnode;
> -                dev->requested_n_rxq = qp_num;
> -                dev->requested_n_txq = qp_num;
> -                netdev_request_reconfigure(&dev->up);
> -            } else {
> -                /* Reconfiguration not required. */
> -                dev->vhost_reconfigured = true;
> -            }
> -
> -            ovsrcu_index_set(&dev->vid, vid);
> -            exists = true;
> -
> -            /* Disable notifications. */
> -            set_irq_status(vid);
> -            netdev_change_seq_changed(&dev->up);
> -            ovs_mutex_unlock(&dev->mutex);
> -            break;
> -        }
> -        ovs_mutex_unlock(&dev->mutex);
> -    }
> -    ovs_mutex_unlock(&dpdk_mutex);
> -
> -    if (!exists) {
> -        VLOG_INFO("vHost Device '%s' can't be added - name not found", ifname);
> -
> -        return -1;
> -    }
> -
> -    VLOG_INFO("vHost Device '%s' has been added on numa node %i",
> -              ifname, newnode);
> -
> -    return 0;
> -}
> -
>  /* Clears mapping for all available queues of vhost interface. */
>  static void
>  netdev_dpdk_txq_map_clear(struct netdev_dpdk *dev)
> @@ -2427,133 +2298,12 @@ netdev_dpdk_txq_map_clear(struct netdev_dpdk *dev)
>      }
>  }
>  
> -/*
> - * Remove a virtio-net device from the specific vhost port.  Use dev->remove
> - * flag to stop any more packets from being sent or received to/from a VM and
> - * ensure all currently queued packets have been sent/received before removing
> - *  the device.
> - */
> -static void
> -destroy_device(int vid)
> -{
> -    struct netdev_dpdk *dev;
> -    bool exists = false;
> -    char ifname[IF_NAME_SZ];
> -
> -    rte_vhost_get_ifname(vid, ifname, sizeof ifname);
> -
> -    ovs_mutex_lock(&dpdk_mutex);
> -    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
> -        if (netdev_dpdk_get_vid(dev) == vid) {
> -
> -            ovs_mutex_lock(&dev->mutex);
> -            dev->vhost_reconfigured = false;
> -            ovsrcu_index_set(&dev->vid, -1);
> -            netdev_dpdk_txq_map_clear(dev);
> -
> -            netdev_change_seq_changed(&dev->up);
> -            ovs_mutex_unlock(&dev->mutex);
> -            exists = true;
> -            break;
> -        }
> -    }
> -
> -    ovs_mutex_unlock(&dpdk_mutex);
> -
> -    if (exists) {
> -        /*
> -         * Wait for other threads to quiesce after setting the 'virtio_dev'
> -         * to NULL, before returning.
> -         */
> -        ovsrcu_synchronize();
> -        /*
> -         * As call to ovsrcu_synchronize() will end the quiescent state,
> -         * put thread back into quiescent state before returning.
> -         */
> -        ovsrcu_quiesce_start();
> -        VLOG_INFO("vHost Device '%s' has been removed", ifname);
> -    } else {
> -        VLOG_INFO("vHost Device '%s' not found", ifname);
> -    }
> -}
> -
> -static int
> -vring_state_changed(int vid, uint16_t queue_id, int enable)
> -{
> -    struct netdev_dpdk *dev;
> -    bool exists = false;
> -    int qid = queue_id / VIRTIO_QNUM;
> -    char ifname[IF_NAME_SZ];
> -
> -    rte_vhost_get_ifname(vid, ifname, sizeof ifname);
> -
> -    if (queue_id % VIRTIO_QNUM == VIRTIO_TXQ) {
> -        return 0;
> -    }
> -
> -    ovs_mutex_lock(&dpdk_mutex);
> -    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
> -        ovs_mutex_lock(&dev->mutex);
> -        if (strncmp(ifname, dev->vhost_id, IF_NAME_SZ) == 0) {
> -            if (enable) {
> -                dev->tx_q[qid].map = qid;
> -            } else {
> -                dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED;
> -            }
> -            netdev_dpdk_remap_txqs(dev);
> -            exists = true;
> -            ovs_mutex_unlock(&dev->mutex);
> -            break;
> -        }
> -        ovs_mutex_unlock(&dev->mutex);
> -    }
> -    ovs_mutex_unlock(&dpdk_mutex);
> -
> -    if (exists) {
> -        VLOG_INFO("State of queue %d ( tx_qid %d ) of vhost device '%s'"
> -                  "changed to \'%s\'", queue_id, qid, ifname,
> -                  (enable == 1) ? "enabled" : "disabled");
> -    } else {
> -        VLOG_INFO("vHost Device '%s' not found", ifname);
> -        return -1;
> -    }
> -
> -    return 0;
> -}
> -
> -int
> -netdev_dpdk_get_vid(const struct netdev_dpdk *dev)
> -{
> -    return ovsrcu_index_get(&dev->vid);
> -}
> -
>  struct ingress_policer *
>  netdev_dpdk_get_ingress_policer(const struct netdev_dpdk *dev)
>  {
>      return ovsrcu_get(struct ingress_policer *, &dev->ingress_policer);
>  }
>  
> -/*
> - * These callbacks allow virtio-net devices to be added to vhost ports when
> - * configuration has been fully complete.
> - */
> -static const struct virtio_net_device_ops virtio_net_device_ops =
> -{
> -    .new_device =  new_device,
> -    .destroy_device = destroy_device,
> -    .vring_state_changed = vring_state_changed
> -};
> -
> -static void *
> -start_vhost_loop(void *dummy OVS_UNUSED)
> -{
> -     pthread_detach(pthread_self());
> -     /* Put the vhost thread into quiescent state. */
> -     ovsrcu_quiesce_start();
> -     rte_vhost_driver_session_start();
> -     return NULL;
> -}
> -
>  static int
>  netdev_dpdk_class_init(void)
>  {
> @@ -2576,19 +2326,9 @@ netdev_dpdk_class_init(void)
>  static int
>  netdev_dpdk_vhost_class_init(void)
>  {
> -    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
> -
> -    /* This function can be called for different classes.  The initialization
> -     * needs to be done only once */
> -    if (ovsthread_once_start(&once)) {
> -        rte_vhost_driver_callback_register(&virtio_net_device_ops);
> -        rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4
> -                                  | 1ULL << VIRTIO_NET_F_HOST_TSO6
> -                                  | 1ULL << VIRTIO_NET_F_CSUM);
> -        ovs_thread_create("vhost_thread", start_vhost_loop, NULL);
> -
> -        ovsthread_once_done(&once);
> -    }
> +    rte_eth_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4
> +                                | 1ULL << VIRTIO_NET_F_HOST_TSO6
> +                                | 1ULL << VIRTIO_NET_F_CSUM);

About features:
How about to disable all new features in this patch and make another
one for that? I think, that it's better to split infrastructure
changes with the new features.

>  
>      return 0;
>  }
> @@ -2690,7 +2430,17 @@ netdev_dpdk_ring_send(struct netdev *netdev, int qid,
>          dp_packet_rss_invalidate(batch->packets[i]);
>      }
>  
> -    netdev_dpdk_send__(dev, qid, batch, may_steal, concurrent_txq);
> +    if (OVS_UNLIKELY(concurrent_txq)) {
> +        qid = qid % dev->up.n_txq;
> +        rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
> +    }
> +
> +    netdev_dpdk_send__(dev, qid, batch, may_steal);
> +
> +    if (OVS_UNLIKELY(concurrent_txq)) {
> +        rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
> +    }
> +
>      return 0;
>  }
>  
> @@ -2985,10 +2735,6 @@ dpdk_vhost_reconfigure_helper(struct netdev_dpdk *dev)
>              netdev_change_seq_changed(&dev->up);
>          }
>      }
> -
> -    if (netdev_dpdk_get_vid(dev) >= 0) {
> -        dev->vhost_reconfigured = true;
> -    }
>  }
>  
>  static int
> @@ -3007,6 +2753,7 @@ netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev)
>  {
>      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>      int err = 0;
> +    int sid = -1;
>  
>      ovs_mutex_lock(&dev->mutex);
>  
> @@ -3020,20 +2767,45 @@ netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev)
>      if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)
>              && strlen(dev->vhost_id)) {
>          /* Register client-mode device */
> -        err = rte_vhost_driver_register(dev->vhost_id,
> -                                        RTE_VHOST_USER_CLIENT);
> -        if (err) {
> -            VLOG_ERR("vhost-user device setup failure for device %s\n",
> -                     dev->vhost_id);
> -        } else {
> -            /* Configuration successful */
> +        err = dpdk_attach_vhost_pmd(dev, 1);
> +
> +        if (!err) {
> +            sid = rte_eth_dev_socket_id(dev->port_id);
> +            dev->socket_id = sid < 0 ? SOCKET0 : sid;
>              dev->vhost_driver_flags |= RTE_VHOST_USER_CLIENT;
> +
> +            rte_eth_dev_callback_register(dev->port_id,
> +                                          RTE_ETH_EVENT_QUEUE_STATE,
> +                                          vring_state_changed_callback, NULL);
> +            rte_eth_dev_callback_register(dev->port_id,
> +                                          RTE_ETH_EVENT_INTR_LSC,
> +                                          link_status_changed_callback, NULL);
> +
> +            rte_eth_dev_stop(dev->port_id);
> +            rte_free(dev->tx_q);
> +            err = dpdk_eth_dev_init(dev);
> +            dev->tx_q = netdev_dpdk_alloc_txq(netdev->n_txq);
> +            if (!dev->tx_q) {

callback unregistering required. And detach + id free also required if
we are returning 0. (I'm suggesting to register callbacks after
allocation to avoid unregistering on failure.)

> +                err = ENOMEM;

Useless assignment right now, but it's unclear what should we return
in case of memory allocation failure.
Should this port be deleted from the datapath?
If we're here, it means that port configured properly and there will
be no more reconfiguration requests to try to reconfigure it again.

What do you think?

> +                goto out;
> +            }
> +
> +            /* Enable TX queue 0 by default if it wasn't disabled. */
> +            if (dev->tx_q[0].map == OVS_VHOST_QUEUE_MAP_UNKNOWN) {
> +                dev->tx_q[0].map = 0;
> +            }
> +
> +            netdev_dpdk_remap_txqs(dev);
> +
> +            netdev_change_seq_changed(netdev);
> +
>              VLOG_INFO("vHost User device '%s' created in 'client' mode, "
>                        "using client socket '%s'",
>                        dev->up.name, dev->vhost_id);
>          }
>      }
>  
> +out:
>      ovs_mutex_unlock(&dev->mutex);
>  
>      return 0;
> @@ -3153,12 +2925,13 @@ static const struct netdev_class dpdk_vhost_class =
>          NULL,
>          NULL,
>          netdev_dpdk_vhost_send,
> -        netdev_dpdk_vhost_get_carrier,
> -        netdev_dpdk_vhost_get_stats,
> -        NULL,
> -        NULL,
> +        netdev_dpdk_get_carrier,
> +        netdev_dpdk_get_stats,
> +        netdev_dpdk_get_features,
> +        netdev_dpdk_get_status,
>          netdev_dpdk_vhost_reconfigure,
> -        netdev_dpdk_vhost_rxq_recv);
> +        netdev_dpdk_rxq_recv);
> +
>  static const struct netdev_class dpdk_vhost_client_class =
>      NETDEV_DPDK_CLASS(
>          "dpdkvhostuserclient",
> @@ -3168,12 +2941,12 @@ static const struct netdev_class dpdk_vhost_client_class =
>          netdev_dpdk_vhost_client_set_config,
>          NULL,
>          netdev_dpdk_vhost_send,
> -        netdev_dpdk_vhost_get_carrier,
> -        netdev_dpdk_vhost_get_stats,
> -        NULL,
> -        NULL,
> +        netdev_dpdk_get_carrier,
> +        netdev_dpdk_get_stats,
> +        netdev_dpdk_get_features,
> +        netdev_dpdk_get_status,
>          netdev_dpdk_vhost_client_reconfigure,
> -        netdev_dpdk_vhost_rxq_recv);
> +        netdev_dpdk_vhost_client_rxq_recv);
>  
>  void
>  netdev_dpdk_register(void)
> 



More information about the dev mailing list