[ovs-dev] [RFC dpdk-latest 1/1] netdev-dpdk: integrate dpdk vhost pmd

Flavio Leitner fbl at sysclose.org
Wed Jun 10 21:06:01 UTC 2020


Hi,

First of all thanks for the patch.

I found some issues that I would like to discuss while I continue
reviewing the patch. I haven't tested yet.

See my comment in line.

Thanks!
fbl

On Tue, May 19, 2020 at 05:19:12PM +0530, Sivaprasad Tummala wrote:
> The vHost PMD brings vHost User port types ('dpdkvhostuser' and
> 'dpdkvhostuserclient') under control of DPDK's librte_ether API, like
> other DPDK netdev types ('dpdk'). In doing so, direct
> calls to DPDK's librte_vhost library are removed and replaced with
> librte_ether API calls, for which most of the infrastructure is already
> in place.
> 
> To enable TSO, specific changes were required in the vhost PMD. The
> patch which enables these is  available on dpdk-master and here:
> https://patches.dpdk.org/patch/66052/
> 
> Signed-off-by: Ciara Loftus <Ciara.Loftus at intel.com>
> Signed-off-by: Sivaprasad Tummala <Sivaprasad.Tummala at intel.com>
> 
> Tested-by: Sunil Pai G <sunil.pai.g at intel.com>
> ---
>  Documentation/topics/dpdk/vhost-user.rst |    3 +
>  NEWS                                     |    3 +
>  acinclude.m4                             |    4 +
>  include/openvswitch/netdev.h             |    1 +
>  lib/dpdk.c                               |   11 +
>  lib/dpdk.h                               |    2 +
>  lib/netdev-dpdk.c                        | 1384 ++++++++--------------
>  7 files changed, 535 insertions(+), 873 deletions(-)
> 
> diff --git a/Documentation/topics/dpdk/vhost-user.rst b/Documentation/topics/dpdk/vhost-user.rst
> index c6c6fd8bd..644598f79 100644
> --- a/Documentation/topics/dpdk/vhost-user.rst
> +++ b/Documentation/topics/dpdk/vhost-user.rst
> @@ -34,6 +34,9 @@ User, refer to the `QEMU documentation`_ on same.
>     To use any DPDK-backed interface, you must ensure your bridge is configured
>     correctly. For more information, refer to :doc:`bridge`.
>  
> +   Maximum number of vHost ports should be less than RTE_MAX_ETHPORTS as
> +   defined in the DPDK configuration.
> +

This needs more clarification saying that now the vhost-user ports
are counted and the maximum is RTE_MAX_ETHPORTS.  This also misses
the name restriction change disallowing commas.



>  Quick Example
>  -------------
>  
> diff --git a/NEWS b/NEWS
> index 3dbd8ec0e..0071916fb 100644
> --- a/NEWS
> +++ b/NEWS
> @@ -9,6 +9,9 @@ Post-v2.13.0
>     - DPDK:
>       * Deprecated DPDK pdump packet capture support removed.
>       * Deprecated DPDK ring ports (dpdkr) are no longer supported.
> +     * Use DPDK's vHost PMD instead of direct library calls. This means the
> +       maximum number of vHost ports is equal to RTE_MAX_ETHPORTS as defined
> +       in the DPDK configuration.


Same here.


>     - Linux datapath:
>       * Support for kernel versions up to 5.5.x.
>     - AF_XDP:
> diff --git a/acinclude.m4 b/acinclude.m4
> index dabbffd01..80d09f61c 100644
> --- a/acinclude.m4
> +++ b/acinclude.m4
> @@ -367,6 +367,10 @@ AC_DEFUN([OVS_CHECK_DPDK], [
>        AC_DEFINE([VHOST_NUMA], [1], [NUMA Aware vHost support detected in DPDK.])
>      ], [], [[#include <rte_config.h>]])
>  
> +    AC_CHECK_DECL([RTE_LIBRTE_PMD_VHOST], [], [
> +      AC_MSG_ERROR([RTE_LIBRTE_PMD_VHOST is not defined in rte_config.h])
> +    ], [[#include <rte_config.h>]])
> +
>      AC_CHECK_DECL([RTE_LIBRTE_MLX5_PMD], [dnl found
>        OVS_FIND_DEPENDENCY([mnl_attr_put], [mnl], [libmnl])
>        AC_CHECK_DECL([RTE_IBVERBS_LINK_DLOPEN], [], [dnl not found
> diff --git a/include/openvswitch/netdev.h b/include/openvswitch/netdev.h
> index 0c10f7b48..09027e15d 100644
> --- a/include/openvswitch/netdev.h
> +++ b/include/openvswitch/netdev.h
> @@ -84,6 +84,7 @@ struct netdev_stats {
>      uint64_t tx_broadcast_packets;
>  
>      uint64_t rx_undersized_errors;
> +    uint64_t rx_undersize_packets;

This doesn't seem to be required for this patchset, correct?. I know
it will improve things but it could be done on a separate patch
to reduce the patch's size and complexity.


>      uint64_t rx_oversize_errors;
>      uint64_t rx_fragmented_errors;
>      uint64_t rx_jabber_errors;
> diff --git a/lib/dpdk.c b/lib/dpdk.c
> index 31450d470..202965b2a 100644
> --- a/lib/dpdk.c
> +++ b/lib/dpdk.c
> @@ -23,12 +23,14 @@
>  #include <getopt.h>
>  
>  #include <rte_errno.h>
> +#include <rte_ethdev.h>


Why this is needed?


>  #include <rte_log.h>
>  #include <rte_memzone.h>
>  #include <rte_version.h>
>  
>  #include "dirs.h"
>  #include "fatal-signal.h"
> +#include "id-pool.h"
>  #include "netdev-dpdk.h"
>  #include "netdev-offload-provider.h"
>  #include "openvswitch/dynamic-string.h"
> @@ -50,6 +52,7 @@ static bool vhost_postcopy_enabled = false; /* Status of vHost POSTCOPY
>  static bool dpdk_initialized = false; /* Indicates successful initialization
>                                         * of DPDK. */
>  static bool per_port_memory = false; /* Status of per port memory support */
> +static struct id_pool *vhost_driver_ids;  /* Pool of IDs for vHost PMDs. */
>  
>  static int
>  process_vhost_flags(char *flag, const char *default_val, int size,
> @@ -428,6 +431,8 @@ dpdk_init__(const struct smap *ovs_other_config)
>      /* We are called from the main thread here */
>      RTE_PER_LCORE(_lcore_id) = NON_PMD_CORE_ID;
>  
> +    vhost_driver_ids = id_pool_create(0, RTE_MAX_ETHPORTS);
> +
>      /* Finally, register the dpdk classes */
>      netdev_dpdk_register();
>      netdev_register_flow_api_provider(&netdev_offload_dpdk);
> @@ -499,6 +504,12 @@ dpdk_available(void)
>      return dpdk_initialized;
>  }
>  
> +struct id_pool *
> +dpdk_get_vhost_id_pool(void)
> +{
> +    return vhost_driver_ids;
> +}
> +
>  void
>  dpdk_set_lcore_id(unsigned cpu)
>  {
> diff --git a/lib/dpdk.h b/lib/dpdk.h
> index 736a64279..2dea08f84 100644
> --- a/lib/dpdk.h
> +++ b/lib/dpdk.h
> @@ -44,4 +44,6 @@ bool dpdk_per_port_memory(void);
>  bool dpdk_available(void);
>  void print_dpdk_version(void);
>  void dpdk_status(const struct ovsrec_open_vswitch *);
> +struct id_pool *dpdk_get_vhost_id_pool(void);
> +
>  #endif /* dpdk.h */
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index 9d2b4104c..e02929174 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -30,6 +30,7 @@
>  #include <rte_config.h>
>  #include <rte_cycles.h>
>  #include <rte_errno.h>
> +#include <rte_eth_vhost.h>
>  #include <rte_ethdev.h>
>  #include <rte_flow.h>
>  #include <rte_malloc.h>
> @@ -47,6 +48,7 @@
>  #include "dpif-netdev.h"
>  #include "fatal-signal.h"
>  #include "if-notifier.h"
> +#include "id-pool.h"
>  #include "netdev-provider.h"
>  #include "netdev-vport.h"
>  #include "odp-util.h"
> @@ -70,6 +72,7 @@
>  #include "uuid.h"
>  
>  enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
> +enum {VHOST_SERVER_MODE, VHOST_CLIENT_MODE};


Why do we need this since we have dev->vhost_driver_flags?


>  VLOG_DEFINE_THIS_MODULE(netdev_dpdk);
>  static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
> @@ -125,7 +128,7 @@ BUILD_ASSERT_DECL((MAX_NB_MBUF / ROUND_DOWN_POW2(MAX_NB_MBUF / MIN_NB_MBUF))
>  /* Maximum size of Physical NIC Queues */
>  #define NIC_PORT_MAX_Q_SIZE 4096
>  
> -#define OVS_VHOST_MAX_QUEUE_NUM 1024  /* Maximum number of vHost TX queues. */
> +#define OVS_VHOST_MAX_QUEUE_NUM RTE_MAX_QUEUES_PER_PORT /* Max vHost TXQs. */


Since we want to use a PMD with eth API, then I'd suggest to remove
that define and use RTE_MAX_QUEUES_PER_PORT instead to reduce the
code differences.


>  #define OVS_VHOST_QUEUE_MAP_UNKNOWN (-1) /* Mapping not initialized. */
>  #define OVS_VHOST_QUEUE_DISABLED    (-2) /* Queue was disabled by guest and not
>                                            * yet mapped to another queue. */
> @@ -172,27 +175,6 @@ static const struct rte_eth_conf port_conf = {
>      },
>  };
>  
> -/*
> - * These callbacks allow virtio-net devices to be added to vhost ports when
> - * configuration has been fully completed.
> - */
> -static int new_device(int vid);
> -static void destroy_device(int vid);
> -static int vring_state_changed(int vid, uint16_t queue_id, int enable);
> -static void destroy_connection(int vid);
> -static void vhost_guest_notified(int vid);
> -
> -static const struct vhost_device_ops virtio_net_device_ops =
> -{
> -    .new_device =  new_device,
> -    .destroy_device = destroy_device,
> -    .vring_state_changed = vring_state_changed,
> -    .features_changed = NULL,
> -    .new_connection = NULL,
> -    .destroy_connection = destroy_connection,
> -    .guest_notified = vhost_guest_notified,
> -};
> -


This reverts commit 3d56e4ac445d17e69484a95b319ac578e3580b65
from Eelco and it is not noted anywhere.


>  /* Custom software stats for dpdk ports */
>  struct netdev_dpdk_sw_stats {
>      /* No. of retries when unable to transmit. */
> @@ -444,6 +426,8 @@ struct netdev_dpdk {
>          };
>          struct dpdk_tx_queue *tx_q;
>          struct rte_eth_link link;
> +        /* ID of vhost user port given to the PMD driver. */
> +        int32_t vhost_pmd_id;
>      );
>  
>      PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline1,
> @@ -540,7 +524,12 @@ static int netdev_dpdk_get_sw_custom_stats(const struct netdev *,
>                                             struct netdev_custom_stats *);
>  static void netdev_dpdk_clear_xstats(struct netdev_dpdk *dev);
>  
> -int netdev_dpdk_get_vid(const struct netdev_dpdk *dev);
> +static int link_status_changed_callback(dpdk_port_t port_id,
> +        enum rte_eth_event_type type, void *param, void *ret_param);
> +static int vring_state_changed_callback(dpdk_port_t port_id,
> +        enum rte_eth_event_type type, void *param, void *ret_param);
> +static void netdev_dpdk_remap_txqs(struct netdev_dpdk *dev);
> +static void netdev_dpdk_txq_map_clear(struct netdev_dpdk *dev);
>  
>  struct ingress_policer *
>  netdev_dpdk_get_ingress_policer(const struct netdev_dpdk *dev);
> @@ -1008,20 +997,22 @@ dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq)
>              break;
>          }
>  
> -        diag = rte_eth_dev_set_mtu(dev->port_id, dev->mtu);
> -        if (diag) {
> -            /* A device may not support rte_eth_dev_set_mtu, in this case
> -             * flag a warning to the user and include the devices configured
> -             * MTU value that will be used instead. */
> -            if (-ENOTSUP == diag) {
> -                rte_eth_dev_get_mtu(dev->port_id, &conf_mtu);
> -                VLOG_WARN("Interface %s does not support MTU configuration, "
> -                          "max packet size supported is %"PRIu16".",
> -                          dev->up.name, conf_mtu);
> -            } else {
> -                VLOG_ERR("Interface %s MTU (%d) setup error: %s",
> -                         dev->up.name, dev->mtu, rte_strerror(-diag));
> -                break;
> +        if (dev->type == DPDK_DEV_ETH) {
> +            diag = rte_eth_dev_set_mtu(dev->port_id, dev->mtu);
> +            if (diag) {
> +                /* A device may not support rte_eth_dev_set_mtu, in this
> +                 * case flag a warning to the user and include the devices
> +                 * configured MTU value that will be used instead. */
> +                if (-ENOTSUP == diag) {
> +                    rte_eth_dev_get_mtu(dev->port_id, &conf_mtu);
> +                    VLOG_WARN("Interface %s does not support MTU configuration"
> +                              ", max packet size supported is %"PRIu16".",
> +                              dev->up.name, conf_mtu);
> +                } else {
> +                    VLOG_ERR("Interface %s MTU (%d) setup error: %s",
> +                             dev->up.name, dev->mtu, rte_strerror(-diag));
> +                    break;
> +                }
>              }
>          }
>  
> @@ -1058,8 +1049,13 @@ dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq)
>              continue;
>          }
>  
> -        dev->up.n_rxq = n_rxq;
> -        dev->up.n_txq = n_txq;
> +        /* Only set n_*xq for physical devices. vHost User devices will set
> +         * this value correctly using info from the virtio frontend.
> +         */
> +        if (dev->type == DPDK_DEV_ETH) {
> +            dev->up.n_rxq = n_rxq;
> +            dev->up.n_txq = n_txq;
> +        }
>  
>          return 0;
>      }
> @@ -1133,8 +1129,17 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
>          }
>      }
>  
> -    n_rxq = MIN(info.max_rx_queues, dev->up.n_rxq);
> -    n_txq = MIN(info.max_tx_queues, dev->up.n_txq);
> +    if (dev->type == DPDK_DEV_VHOST) {
> +        /* We don't know how many queues the Virtio frontend will use, so we
> +         * need to provision for the maximum, as if we configure less up front
> +         * than what Virtio configures later, those surplus queues will never
> +         * be available to the vHost backend. */
> +        n_rxq = OVS_VHOST_MAX_QUEUE_NUM;
> +        n_txq = OVS_VHOST_MAX_QUEUE_NUM;
> +    } else {
> +        n_rxq = MIN(info.max_rx_queues, dev->up.n_rxq);
> +        n_txq = MIN(info.max_tx_queues, dev->up.n_txq);
> +    }
>  
>      diag = dpdk_eth_dev_port_config(dev, n_rxq, n_txq);
>      if (diag) {
> @@ -1229,7 +1234,7 @@ common_construct(struct netdev *netdev, dpdk_port_t port_no,
>      dev->requested_mtu = RTE_ETHER_MTU;
>      dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
>      dev->requested_lsc_interrupt_mode = 0;
> -    ovsrcu_index_init(&dev->vid, -1);
> +    dev->vhost_pmd_id = -1;
>      dev->vhost_reconfigured = false;
>      dev->attached = false;
>      dev->started = false;
> @@ -1293,27 +1298,83 @@ netdev_dpdk_get_num_ports(struct rte_device *device)
>  }
>  
>  static int
> -vhost_common_construct(struct netdev *netdev)
> -    OVS_REQUIRES(dpdk_mutex)
> +dpdk_attach_vhost_pmd(struct netdev_dpdk *dev, int mode)
>  {
> -    int socket_id = rte_lcore_to_socket_id(rte_get_master_lcore());
> -    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> +    char *devargs;
> +    int err = 0;
> +    dpdk_port_t port_no = 0;
> +    uint32_t driver_id = 0;
> +    int iommu_enabled = 0;
> +    int zc_enabled = 0;
> +    int postcopy_enabled = 0;
> +    int linear_buf = 1;
> +    int ext_buf = 0;
> +    int tso = 0;
> +    char *driver_name;
>  
> -    dev->vhost_rxq_enabled = dpdk_rte_mzalloc(OVS_VHOST_MAX_QUEUE_NUM *
> -                                              sizeof *dev->vhost_rxq_enabled);
> -    if (!dev->vhost_rxq_enabled) {
> -        return ENOMEM;
> +    atomic_init(&dev->vhost_tx_retries_max, VHOST_ENQ_RETRY_DEF);
> +
> +    if (dev->vhost_driver_flags & RTE_VHOST_USER_DEQUEUE_ZERO_COPY) {
> +        zc_enabled = 1;
> +        /*
> +         * DPDK vHost library doesn't allow zero-copy with linear buffers
> +         * currently. Hence disabling the Linear buffer check until the
> +         * issue is fixed in DPDK.
> +         */
> +        linear_buf = 0;
> +        VLOG_WARN("Zero copy enabled and hence disabling linear buffer"
> +                         " check  for vHost port %s", dev->up.name);


Please see Maxime's comment in another reply.


>      }
> -    dev->tx_q = netdev_dpdk_alloc_txq(OVS_VHOST_MAX_QUEUE_NUM);
> -    if (!dev->tx_q) {
> -        rte_free(dev->vhost_rxq_enabled);
> -        return ENOMEM;
> +
> +    if (dpdk_vhost_postcopy_enabled()) {
> +        postcopy_enabled = 1;
>      }
>  
> -    atomic_init(&dev->vhost_tx_retries_max, VHOST_ENQ_RETRY_DEF);
> +    if (dpdk_vhost_iommu_enabled()) {
> +        iommu_enabled = 1;
> +    }
> +
> +    if (userspace_tso_enabled()) {
> +        ext_buf = 1;
> +        tso = 1;
> +    }
> +
> +    if (id_pool_alloc_id(dpdk_get_vhost_id_pool(), &driver_id)) {
> +        driver_name = xasprintf("net_vhost%u", driver_id);
> +        devargs = xasprintf("%s,iface=%s,queues=%i,client=%i,"
> +                            "dequeue-zero-copy=%i,postcopy-support=%i,"
> +                            "iommu-support=%i,tso=%i,"
> +                            "linear-buffer=%i,ext-buffer=%i",
> +                 driver_name, dev->vhost_id, OVS_VHOST_MAX_QUEUE_NUM, mode,
> +                 zc_enabled, postcopy_enabled, iommu_enabled, tso,
> +                 linear_buf, ext_buf);
> +        if (!rte_dev_probe(devargs) &&
> +                 !rte_eth_dev_get_port_by_name(driver_name, &port_no)) {
> +            dev->attached = true;
> +            dev->port_id = port_no;
> +            dev->vhost_pmd_id = driver_id;
> +
> +            rte_eth_dev_callback_register(dev->port_id,
> +                                          RTE_ETH_EVENT_QUEUE_STATE,
> +                                          vring_state_changed_callback,
> +                                          NULL);
> +            rte_eth_dev_callback_register(dev->port_id,
> +                                          RTE_ETH_EVENT_INTR_LSC,
> +                                          link_status_changed_callback,
> +                                          NULL);
> +        } else {
> +            id_pool_free_id(dpdk_get_vhost_id_pool(), driver_id);
> +            VLOG_ERR("Failed to attach vhost-user device %s to DPDK",
> +                     dev->vhost_id);
> +            err = ENODEV;
> +        }
> +    } else {
> +        VLOG_ERR("Unable to create vhost-user device %s - too many vhost-user "
> +                 "devices registered with PMD", dev->vhost_id);
> +        err = ENODEV;
> +    }
>  
> -    return common_construct(netdev, DPDK_ETH_PORT_ID_INVALID,
> -                            DPDK_DEV_VHOST, socket_id);
> +    return err;
>  }
>  
>  static int
> @@ -1322,11 +1383,12 @@ netdev_dpdk_vhost_construct(struct netdev *netdev)
>      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>      const char *name = netdev->name;
>      int err;
> +    struct rte_eth_dev_info dev_info;
>  
>      /* 'name' is appended to 'vhost_sock_dir' and used to create a socket in
>       * the file system. '/' or '\' would traverse directories, so they're not
>       * acceptable in 'name'. */
> -    if (strchr(name, '/') || strchr(name, '\\')) {
> +    if (strchr(name, '/') || strchr(name, '\\') || strchr(name, ',')) {
>          VLOG_ERR("\"%s\" is not a valid name for a vhost-user port. "
>                   "A valid name must not include '/' or '\\'",
>                   name);
> @@ -1341,50 +1403,32 @@ netdev_dpdk_vhost_construct(struct netdev *netdev)
>  
>      dev->vhost_driver_flags &= ~RTE_VHOST_USER_CLIENT;
>  
> -    /* There is no support for multi-segments buffers. */
> -    dev->vhost_driver_flags |= RTE_VHOST_USER_LINEARBUF_SUPPORT;
> -    err = rte_vhost_driver_register(dev->vhost_id, dev->vhost_driver_flags);
> -    if (err) {
> -        VLOG_ERR("vhost-user socket device setup failure for socket %s\n",
> -                 dev->vhost_id);
> -        goto out;
> -    } else {
> +    err = dpdk_attach_vhost_pmd(dev, VHOST_SERVER_MODE);
> +    if (!err) {
>          fatal_signal_add_file_to_unlink(dev->vhost_id);
>          VLOG_INFO("Socket %s created for vhost-user port %s\n",
>                    dev->vhost_id, name);
> -    }
> -
> -    err = rte_vhost_driver_callback_register(dev->vhost_id,
> -                                                &virtio_net_device_ops);
> -    if (err) {
> -        VLOG_ERR("rte_vhost_driver_callback_register failed for vhost user "
> -                 "port: %s\n", name);
> +    } else {
>          goto out;
>      }
>  
> -    if (!userspace_tso_enabled()) {
> -        err = rte_vhost_driver_disable_features(dev->vhost_id,
> -                                    1ULL << VIRTIO_NET_F_HOST_TSO4
> -                                    | 1ULL << VIRTIO_NET_F_HOST_TSO6
> -                                    | 1ULL << VIRTIO_NET_F_CSUM);
> -        if (err) {
> -            VLOG_ERR("rte_vhost_driver_disable_features failed for vhost user "
> -                     "port: %s\n", name);
> -            goto out;
> -        }
> -    }
> -
> -    err = rte_vhost_driver_start(dev->vhost_id);
> -    if (err) {
> -        VLOG_ERR("rte_vhost_driver_start failed for vhost user "
> -                 "port: %s\n", name);
> -        goto out;
> +    dev->vhost_rxq_enabled = dpdk_rte_mzalloc(OVS_VHOST_MAX_QUEUE_NUM *
> +                                              sizeof *dev->vhost_rxq_enabled);
> +    if (!dev->vhost_rxq_enabled) {
> +        return ENOMEM;
>      }
>  
> -    err = vhost_common_construct(netdev);
> +    err = common_construct(&dev->up, dev->port_id, DPDK_DEV_VHOST,
> +                           rte_lcore_to_socket_id(rte_get_master_lcore()));
>      if (err) {
> -        VLOG_ERR("vhost_common_construct failed for vhost user "
> -                 "port: %s\n", name);
> +        VLOG_ERR("common_construct failed for vhost user port: %s\n", name);
> +        rte_eth_dev_info_get(dev->port_id, &dev_info);
> +        if (!dev_info.device || rte_dev_remove(dev_info.device)) {
> +            VLOG_ERR("Error detatching device\n");
> +        }
> +        if (dev->vhost_pmd_id >= 0) {
> +            id_pool_free_id(dpdk_get_vhost_id_pool(), dev->vhost_pmd_id);
> +        }
>      }
>  
>  out:
> @@ -1402,12 +1446,14 @@ out:
>  static int
>  netdev_dpdk_vhost_client_construct(struct netdev *netdev)
>  {
> +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>      int err;
>  
>      ovs_mutex_lock(&dpdk_mutex);
> -    err = vhost_common_construct(netdev);
> +    err = common_construct(&dev->up, DPDK_ETH_PORT_ID_INVALID, DPDK_DEV_VHOST,
> +                           rte_lcore_to_socket_id(rte_get_master_lcore()));
>      if (err) {
> -        VLOG_ERR("vhost_common_construct failed for vhost user client"
> +        VLOG_ERR("common_construct failed for vhost user client"
>                   "port: %s\n", netdev->name);
>      }
>      ovs_mutex_unlock(&dpdk_mutex);
> @@ -1442,110 +1488,114 @@ common_destruct(struct netdev_dpdk *dev)
>  }
>  
>  static void
> -netdev_dpdk_destruct(struct netdev *netdev)
> +dpdk_dev_remove_helper(struct netdev_dpdk *dev)
>  {
> -    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>      struct rte_device *rte_dev;
>      struct rte_eth_dev *eth_dev;
>      bool remove_on_close;
>  
> -    ovs_mutex_lock(&dpdk_mutex);
> +    /* Retrieve eth device data before closing it.
> +     * FIXME: avoid direct access to DPDK internal array rte_eth_devices.
> +     */
> +    eth_dev = &rte_eth_devices[dev->port_id];
>  
> -    rte_eth_dev_stop(dev->port_id);
> -    dev->started = false;
> +    remove_on_close =
> +        eth_dev->data &&
> +            (eth_dev->data->dev_flags & RTE_ETH_DEV_CLOSE_REMOVE);
> +    rte_dev = eth_dev->device;
>  
> -    if (dev->attached) {
> -        /* Retrieve eth device data before closing it.
> -         * FIXME: avoid direct access to DPDK internal array rte_eth_devices.
> -         */
> -        eth_dev = &rte_eth_devices[dev->port_id];
> -        remove_on_close =
> -            eth_dev->data &&
> -                (eth_dev->data->dev_flags & RTE_ETH_DEV_CLOSE_REMOVE);
> -        rte_dev = eth_dev->device;
> -
> -        /* Remove the eth device. */
> -        rte_eth_dev_close(dev->port_id);
> -
> -        /* Remove this rte device and all its eth devices if flag
> -         * RTE_ETH_DEV_CLOSE_REMOVE is not supported (which means representors
> -         * are not supported), or if all the eth devices belonging to the rte
> -         * device are closed.
> -         */
> -        if (!remove_on_close || !netdev_dpdk_get_num_ports(rte_dev)) {
> -            int ret = rte_dev_remove(rte_dev);
> +    /* Remove the eth device. */
> +    rte_eth_dev_close(dev->port_id);
>  
> -            if (ret < 0) {
> -                VLOG_ERR("Device '%s' can not be detached: %s.",
> -                         dev->devargs, rte_strerror(-ret));
> -            } else {
> -                /* Device was closed and detached. */
> -                VLOG_INFO("Device '%s' has been removed and detached",
> -                    dev->devargs);
> -            }
> +    /* Remove this rte device and all its eth devices if flag
> +     * RTE_ETH_DEV_CLOSE_REMOVE is not supported (which means representors
> +     * are not supported), or if all the eth devices belonging to the rte
> +     * device are closed.
> +     */
> +    if (!remove_on_close || !netdev_dpdk_get_num_ports(rte_dev)) {
> +        int ret = rte_dev_remove(rte_dev);
> +
> +        if (ret < 0) {
> +            VLOG_ERR("Device '%s' can not be detached: %s.",
> +                     dev->devargs, rte_strerror(-ret));
>          } else {
> -            /* Device was only closed. rte_dev_remove() was not called. */
> -            VLOG_INFO("Device '%s' has been removed", dev->devargs);
> +            /* Device was closed and detached. */
> +            VLOG_INFO("Device '%s' has been removed and detached",
> +                dev->devargs);
>          }
> +    } else {
> +        /* Device was only closed. rte_dev_remove() was not called. */
> +        VLOG_INFO("Device '%s' has been removed", dev->devargs);
>      }
>  
> -    netdev_dpdk_clear_xstats(dev);
> -    free(dev->devargs);
> -    common_destruct(dev);
> -
> -    ovs_mutex_unlock(&dpdk_mutex);
>  }
>  
> -/* rte_vhost_driver_unregister() can call back destroy_device(), which will
> - * try to acquire 'dpdk_mutex' and possibly 'dev->mutex'.  To avoid a
> - * deadlock, none of the mutexes must be held while calling this function. */
> -static int
> -dpdk_vhost_driver_unregister(struct netdev_dpdk *dev OVS_UNUSED,
> -                             char *vhost_id)
> -    OVS_EXCLUDED(dpdk_mutex)
> -    OVS_EXCLUDED(dev->mutex)
> +static void
> +netdev_dpdk_destruct(struct netdev *netdev)
>  {
> -    return rte_vhost_driver_unregister(vhost_id);
> +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> +
> +    rte_eth_dev_stop(dev->port_id);
> +    dev->started = false;
> +
> +    if (dev->attached) {
> +        dpdk_dev_remove_helper(dev);
> +    }
> +
> +    netdev_dpdk_clear_xstats(dev);
> +    rte_free(dev->tx_q);
> +    dpdk_mp_put(dev->dpdk_mp);
> +    ovs_list_remove(&dev->list_node);
> +    free(ovsrcu_get_protected(struct ingress_policer *,
> +                              &dev->ingress_policer));
> +    ovs_mutex_destroy(&dev->mutex);
>  }
>  
>  static void
>  netdev_dpdk_vhost_destruct(struct netdev *netdev)
>  {
>      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> -    char *vhost_id;
>  
>      ovs_mutex_lock(&dpdk_mutex);
>  
>      /* Guest becomes an orphan if still attached. */
> -    if (netdev_dpdk_get_vid(dev) >= 0
> -        && !(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)) {
> +    check_link_status(dev);
> +    if (dev->link.link_status == ETH_LINK_UP) {
>          VLOG_ERR("Removing port '%s' while vhost device still attached.",
>                   netdev->name);
>          VLOG_ERR("To restore connectivity after re-adding of port, VM on "
>                   "socket '%s' must be restarted.", dev->vhost_id);
>      }
>  
> -    vhost_id = dev->vhost_id;
> -    dev->vhost_id = NULL;
> -    rte_free(dev->vhost_rxq_enabled);
> +    rte_eth_dev_stop(dev->port_id);
> +    dev->started = false;
>  
> -    common_destruct(dev);
> +    rte_eth_dev_callback_unregister(dev->port_id,
> +                                    RTE_ETH_EVENT_QUEUE_STATE,
> +                                    vring_state_changed_callback, NULL);
> +    rte_eth_dev_callback_unregister(dev->port_id,
> +                                    RTE_ETH_EVENT_INTR_LSC,
> +                                    link_status_changed_callback, NULL);
>  
> -    ovs_mutex_unlock(&dpdk_mutex);
> +    if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)) {
> +           /* OVS server mode - remove this socket from list for deletion. */
> +           fatal_signal_remove_file_to_unlink(dev->vhost_id);
> +    }
>  
> -    if (!vhost_id) {
> -        goto out;
> +    if (dev->attached) {
> +        dpdk_dev_remove_helper(dev);
>      }
>  
> -    if (dpdk_vhost_driver_unregister(dev, vhost_id)) {
> -        VLOG_ERR("%s: Unable to unregister vhost driver for socket '%s'.\n",
> -                 netdev->name, vhost_id);
> -    } else if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)) {
> -        /* OVS server mode - remove this socket from list for deletion */
> -        fatal_signal_remove_file_to_unlink(vhost_id);
> +    if (dev->vhost_pmd_id >= 0) {
> +        id_pool_free_id(dpdk_get_vhost_id_pool(),
> +                dev->vhost_pmd_id);
>      }
> -out:
> -    free(vhost_id);
> +
> +    rte_free(dev->vhost_rxq_enabled);
> +
> +    common_destruct(dev);
> +
> +    ovs_mutex_unlock(&dpdk_mutex);
>  }
>  
>  static void
> @@ -2192,12 +2242,15 @@ netdev_dpdk_prep_hwol_batch(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
>   * Returns the number of packets that weren't transmitted. */
>  static inline int
>  netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
> -                         struct rte_mbuf **pkts, int cnt)
> +                         struct rte_mbuf **pkts, int cnt, bool vhost)
>  {
>      uint32_t nb_tx = 0;
>      uint16_t nb_tx_prep = cnt;
> +    int retries = 0;
>  
> -    if (userspace_tso_enabled()) {
> +    if (vhost) {
> +        atomic_read_relaxed(&dev->vhost_tx_retries_max, &retries);
> +    } else if (userspace_tso_enabled()) {

This is ignoring TSO support for vhost-user which is its main use
case today.


>          nb_tx_prep = rte_eth_tx_prepare(dev->port_id, qid, pkts, cnt);
>          if (nb_tx_prep != cnt) {
>              VLOG_WARN_RL(&rl, "%s: Output batch contains invalid packets. "
> @@ -2216,6 +2269,10 @@ netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
>          }
>  
>          nb_tx += ret;
> +
> +        if (OVS_UNLIKELY(vhost && (--retries < 0))) {
> +            break;
> +        }
>      }
>  
>      if (OVS_UNLIKELY(nb_tx != cnt)) {
> @@ -2287,12 +2344,6 @@ ingress_policer_run(struct ingress_policer *policer, struct rte_mbuf **pkts,
>      return cnt;
>  }
>  
> -static bool
> -is_vhost_running(struct netdev_dpdk *dev)
> -{
> -    return (netdev_dpdk_get_vid(dev) >= 0 && dev->vhost_reconfigured);
> -}
> -
>  static inline void
>  netdev_dpdk_vhost_update_rx_size_counters(struct netdev_stats *stats,
>                                            unsigned int packet_size)
> @@ -2359,72 +2410,9 @@ netdev_dpdk_vhost_update_rx_counters(struct netdev_dpdk *dev,
>      }
>  }
>  
> -/*
> - * The receive path for the vhost port is the TX path out from guest.
> - */
> -static int
> -netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
> -                           struct dp_packet_batch *batch, int *qfill)
> -{
> -    struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
> -    struct ingress_policer *policer = netdev_dpdk_get_ingress_policer(dev);
> -    uint16_t nb_rx = 0;
> -    uint16_t qos_drops = 0;
> -    int qid = rxq->queue_id * VIRTIO_QNUM + VIRTIO_TXQ;
> -    int vid = netdev_dpdk_get_vid(dev);
> -
> -    if (OVS_UNLIKELY(vid < 0 || !dev->vhost_reconfigured
> -                     || !(dev->flags & NETDEV_UP))) {
> -        return EAGAIN;
> -    }
> -
> -    nb_rx = rte_vhost_dequeue_burst(vid, qid, dev->dpdk_mp->mp,
> -                                    (struct rte_mbuf **) batch->packets,
> -                                    NETDEV_MAX_BURST);
> -    if (!nb_rx) {
> -        return EAGAIN;
> -    }
> -
> -    if (qfill) {
> -        if (nb_rx == NETDEV_MAX_BURST) {
> -            /* The DPDK API returns a uint32_t which often has invalid bits in
> -             * the upper 16-bits. Need to restrict the value to uint16_t. */
> -            *qfill = rte_vhost_rx_queue_count(vid, qid) & UINT16_MAX;
> -        } else {
> -            *qfill = 0;
> -        }
> -    }
> -
> -    if (policer) {
> -        qos_drops = nb_rx;
> -        nb_rx = ingress_policer_run(policer,
> -                                    (struct rte_mbuf **) batch->packets,
> -                                    nb_rx, true);
> -        qos_drops -= nb_rx;
> -    }
> -
> -    rte_spinlock_lock(&dev->stats_lock);
> -    netdev_dpdk_vhost_update_rx_counters(dev, batch->packets,
> -                                         nb_rx, qos_drops);


This removes the only caller for netdev_dpdk_vhost_update_rx_counters
but left the function behind.


> -    rte_spinlock_unlock(&dev->stats_lock);
> -
> -    batch->count = nb_rx;
> -    dp_packet_batch_init_packet_fields(batch);
> -
> -    return 0;
> -}
> -
> -static bool
> -netdev_dpdk_vhost_rxq_enabled(struct netdev_rxq *rxq)
> -{
> -    struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
> -
> -    return dev->vhost_rxq_enabled[rxq->queue_id];
> -}
> -
>  static int
> -netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet_batch *batch,
> -                     int *qfill)
> +common_recv(struct netdev_rxq *rxq, struct dp_packet_batch *batch,
> +            int *qfill)
>  {
>      struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
>      struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
> @@ -2473,6 +2461,38 @@ netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet_batch *batch,
>      return 0;
>  }
>  
> +/*
> + * The receive path for the vhost port is the TX path out from guest.
> + */
> +static int
> +netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
> +                           struct dp_packet_batch *batch,
> +                           int *qfill)
> +{
> +    struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
> +
> +    if (dev->vhost_reconfigured) {
> +        return common_recv(rxq, batch, qfill);
> +    }
> +
> +    return EAGAIN;
> +}
> +
> +static bool
> +netdev_dpdk_vhost_rxq_enabled(struct netdev_rxq *rxq)
> +{
> +    struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
> +
> +    return dev->vhost_rxq_enabled[rxq->queue_id];
> +}
> +
> +static int
> +netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet_batch *batch,
> +                     int *qfill)
> +{
> +    return common_recv(rxq, batch, qfill);
> +}
> +
>  static inline int
>  netdev_dpdk_qos_run(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
>                      int cnt, bool should_steal)
> @@ -2517,295 +2537,66 @@ netdev_dpdk_filter_packet_len(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
>      return cnt;
>  }
>  
> -static inline void
> -netdev_dpdk_vhost_update_tx_counters(struct netdev_dpdk *dev,
> -                                     struct dp_packet **packets,
> -                                     int attempted,
> -                                     struct netdev_dpdk_sw_stats *sw_stats_add)
> -{
> -    int dropped = sw_stats_add->tx_mtu_exceeded_drops +
> -                  sw_stats_add->tx_qos_drops +
> -                  sw_stats_add->tx_failure_drops +
> -                  sw_stats_add->tx_invalid_hwol_drops;
> -    struct netdev_stats *stats = &dev->stats;
> -    int sent = attempted - dropped;
> -    int i;
> -
> -    stats->tx_packets += sent;
> -    stats->tx_dropped += dropped;
> -
> -    for (i = 0; i < sent; i++) {
> -        stats->tx_bytes += dp_packet_size(packets[i]);
> -    }
> -
> -    if (OVS_UNLIKELY(dropped || sw_stats_add->tx_retries)) {
> -        struct netdev_dpdk_sw_stats *sw_stats = dev->sw_stats;
> -
> -        sw_stats->tx_retries            += sw_stats_add->tx_retries;
> -        sw_stats->tx_failure_drops      += sw_stats_add->tx_failure_drops;
> -        sw_stats->tx_mtu_exceeded_drops += sw_stats_add->tx_mtu_exceeded_drops;
> -        sw_stats->tx_qos_drops          += sw_stats_add->tx_qos_drops;
> -        sw_stats->tx_invalid_hwol_drops += sw_stats_add->tx_invalid_hwol_drops;
> -    }
> -}
>  
> +/* Tx function. Transmit packets indefinitely. */
>  static void
> -__netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
> -                         struct dp_packet **pkts, int cnt)
> +dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
> +    OVS_NO_THREAD_SAFETY_ANALYSIS
>  {
> +    const size_t batch_cnt = dp_packet_batch_size(batch);
> +#if !defined(__CHECKER__) && !defined(_WIN32)
> +    const size_t PKT_ARRAY_SIZE = batch_cnt;
> +#else
> +    /* Sparse or MSVC doesn't like variable length array. */
> +    enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
> +#endif
>      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> -    struct rte_mbuf **cur_pkts = (struct rte_mbuf **) pkts;
> -    struct netdev_dpdk_sw_stats sw_stats_add;
> -    unsigned int n_packets_to_free = cnt;
> -    unsigned int total_packets = cnt;
> -    int i, retries = 0;
> -    int max_retries = VHOST_ENQ_RETRY_MIN;
> -    int vid = netdev_dpdk_get_vid(dev);
> -
> -    qid = dev->tx_q[qid % netdev->n_txq].map;
> -
> -    if (OVS_UNLIKELY(vid < 0 || !dev->vhost_reconfigured || qid < 0
> -                     || !(dev->flags & NETDEV_UP))) {
> -        rte_spinlock_lock(&dev->stats_lock);
> -        dev->stats.tx_dropped+= cnt;
> -        rte_spinlock_unlock(&dev->stats_lock);
> -        goto out;
> -    }
> -
> -    if (OVS_UNLIKELY(!rte_spinlock_trylock(&dev->tx_q[qid].tx_lock))) {
> -        COVERAGE_INC(vhost_tx_contention);
> -        rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
> -    }
> +    struct rte_mbuf *pkts[PKT_ARRAY_SIZE];
> +    struct netdev_dpdk_sw_stats *sw_stats = dev->sw_stats;
> +    uint32_t cnt = batch_cnt;
> +    uint32_t dropped = 0;
> +    uint32_t tx_failure = 0;
> +    uint32_t mtu_drops = 0;
> +    uint32_t qos_drops = 0;
> +    bool vhost = true;
>  
> -    sw_stats_add.tx_invalid_hwol_drops = cnt;
> -    if (userspace_tso_enabled()) {
> -        cnt = netdev_dpdk_prep_hwol_batch(dev, cur_pkts, cnt);
> +    if (dev->type != DPDK_DEV_VHOST) {
> +        vhost = false;
> +        /* Check if QoS has been configured for this netdev. */
> +        cnt = netdev_dpdk_qos_run(dev, (struct rte_mbuf **) batch->packets,
> +                                  batch_cnt, false);
> +        qos_drops = batch_cnt - cnt;
>      }
>  
> -    sw_stats_add.tx_invalid_hwol_drops -= cnt;
> -    sw_stats_add.tx_mtu_exceeded_drops = cnt;
> -    cnt = netdev_dpdk_filter_packet_len(dev, cur_pkts, cnt);
> -    sw_stats_add.tx_mtu_exceeded_drops -= cnt;
> +    uint32_t txcnt = 0;
>  
> -    /* Check has QoS has been configured for the netdev */
> -    sw_stats_add.tx_qos_drops = cnt;
> -    cnt = netdev_dpdk_qos_run(dev, cur_pkts, cnt, true);
> -    sw_stats_add.tx_qos_drops -= cnt;
> +    for (uint32_t i = 0; i < cnt; i++) {
> +        struct dp_packet *packet = batch->packets[i];
> +        uint32_t size = dp_packet_size(packet);
>  
> -    n_packets_to_free = cnt;
> +        if (OVS_UNLIKELY(size > dev->max_packet_len)) {
> +            VLOG_WARN_RL(&rl, "Too big size %u max_packet_len %d",
> +                         size, dev->max_packet_len);
> +            mtu_drops++;


I think this will drop all TSO packet.


> +            continue;
> +        }
>  
> -    do {
> -        int vhost_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ;
> -        unsigned int tx_pkts;
> -
> -        tx_pkts = rte_vhost_enqueue_burst(vid, vhost_qid, cur_pkts, cnt);
> -        if (OVS_LIKELY(tx_pkts)) {
> -            /* Packets have been sent.*/
> -            cnt -= tx_pkts;
> -            /* Prepare for possible retry.*/
> -            cur_pkts = &cur_pkts[tx_pkts];
> -            if (OVS_UNLIKELY(cnt && !retries)) {
> -                /*
> -                 * Read max retries as there are packets not sent
> -                 * and no retries have already occurred.
> -                 */
> -                atomic_read_relaxed(&dev->vhost_tx_retries_max, &max_retries);
> -            }
> -        } else {
> -            /* No packets sent - do not retry.*/
> +        pkts[txcnt] = rte_pktmbuf_alloc(dev->dpdk_mp->mp);
> +        if (OVS_UNLIKELY(!pkts[txcnt])) {
> +            dropped = cnt - i;
>              break;
>          }
> -    } while (cnt && (retries++ < max_retries));
> -
> -    rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
>  
> -    sw_stats_add.tx_failure_drops = cnt;
> -    sw_stats_add.tx_retries = MIN(retries, max_retries);
> +        /* We have to do a copy for now. */
> +        memcpy(rte_pktmbuf_mtod(pkts[txcnt], void *),
> +               dp_packet_data(packet), size);
> +        dp_packet_set_size((struct dp_packet *)pkts[txcnt], size);


This also doesn't support TSO because we need to allocate
buffers using rte_malloc() and attach it as an external buffer.

>  
> -    rte_spinlock_lock(&dev->stats_lock);
> -    netdev_dpdk_vhost_update_tx_counters(dev, pkts, total_packets,
> -                                         &sw_stats_add);
> -    rte_spinlock_unlock(&dev->stats_lock);
> -
> -out:
> -    for (i = 0; i < n_packets_to_free; i++) {
> -        dp_packet_delete(pkts[i]);
> -    }
> -}
> -
> -static void
> -netdev_dpdk_extbuf_free(void *addr OVS_UNUSED, void *opaque)
> -{
> -    rte_free(opaque);
> -}
> -
> -static struct rte_mbuf *
> -dpdk_pktmbuf_attach_extbuf(struct rte_mbuf *pkt, uint32_t data_len)
> -{
> -    uint32_t total_len = RTE_PKTMBUF_HEADROOM + data_len;
> -    struct rte_mbuf_ext_shared_info *shinfo = NULL;
> -    uint16_t buf_len;
> -    void *buf;
> -
> -    if (rte_pktmbuf_tailroom(pkt) >= sizeof *shinfo) {
> -        shinfo = rte_pktmbuf_mtod(pkt, struct rte_mbuf_ext_shared_info *);
> -    } else {
> -        total_len += sizeof *shinfo + sizeof(uintptr_t);
> -        total_len = RTE_ALIGN_CEIL(total_len, sizeof(uintptr_t));
> -    }
> -
> -    if (OVS_UNLIKELY(total_len > UINT16_MAX)) {
> -        VLOG_ERR("Can't copy packet: too big %u", total_len);
> -        return NULL;
> -    }
> -
> -    buf_len = total_len;
> -    buf = rte_malloc(NULL, buf_len, RTE_CACHE_LINE_SIZE);
> -    if (OVS_UNLIKELY(buf == NULL)) {
> -        VLOG_ERR("Failed to allocate memory using rte_malloc: %u", buf_len);
> -        return NULL;
> -    }
> -
> -    /* Initialize shinfo. */
> -    if (shinfo) {
> -        shinfo->free_cb = netdev_dpdk_extbuf_free;
> -        shinfo->fcb_opaque = buf;
> -        rte_mbuf_ext_refcnt_set(shinfo, 1);
> -    } else {
> -        shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf, &buf_len,
> -                                                    netdev_dpdk_extbuf_free,
> -                                                    buf);
> -        if (OVS_UNLIKELY(shinfo == NULL)) {
> -            rte_free(buf);
> -            VLOG_ERR("Failed to initialize shared info for mbuf while "
> -                     "attempting to attach an external buffer.");
> -            return NULL;
> -        }
> -    }
> -
> -    rte_pktmbuf_attach_extbuf(pkt, buf, rte_malloc_virt2iova(buf), buf_len,
> -                              shinfo);
> -    rte_pktmbuf_reset_headroom(pkt);
> -
> -    return pkt;
> -}
> -
> -static struct rte_mbuf *
> -dpdk_pktmbuf_alloc(struct rte_mempool *mp, uint32_t data_len)
> -{
> -    struct rte_mbuf *pkt = rte_pktmbuf_alloc(mp);
> -
> -    if (OVS_UNLIKELY(!pkt)) {
> -        return NULL;
> -    }
> -
> -    if (rte_pktmbuf_tailroom(pkt) >= data_len) {
> -        return pkt;
> -    }
> -
> -    if (dpdk_pktmbuf_attach_extbuf(pkt, data_len)) {
> -        return pkt;
> -    }
> -
> -    rte_pktmbuf_free(pkt);
> -
> -    return NULL;
> -}
> -
> -static struct dp_packet *
> -dpdk_copy_dp_packet_to_mbuf(struct rte_mempool *mp, struct dp_packet *pkt_orig)
> -{
> -    struct rte_mbuf *mbuf_dest;
> -    struct dp_packet *pkt_dest;
> -    uint32_t pkt_len;
> -
> -    pkt_len = dp_packet_size(pkt_orig);
> -    mbuf_dest = dpdk_pktmbuf_alloc(mp, pkt_len);
> -    if (OVS_UNLIKELY(mbuf_dest == NULL)) {
> -            return NULL;
> -    }
> -
> -    pkt_dest = CONTAINER_OF(mbuf_dest, struct dp_packet, mbuf);
> -    memcpy(dp_packet_data(pkt_dest), dp_packet_data(pkt_orig), pkt_len);
> -    dp_packet_set_size(pkt_dest, pkt_len);
> -
> -    mbuf_dest->tx_offload = pkt_orig->mbuf.tx_offload;
> -    mbuf_dest->packet_type = pkt_orig->mbuf.packet_type;
> -    mbuf_dest->ol_flags |= (pkt_orig->mbuf.ol_flags &
> -                            ~(EXT_ATTACHED_MBUF | IND_ATTACHED_MBUF));
> -
> -    memcpy(&pkt_dest->l2_pad_size, &pkt_orig->l2_pad_size,
> -           sizeof(struct dp_packet) - offsetof(struct dp_packet, l2_pad_size));
> -
> -    if (mbuf_dest->ol_flags & PKT_TX_L4_MASK) {
> -        mbuf_dest->l2_len = (char *)dp_packet_l3(pkt_dest)
> -                                - (char *)dp_packet_eth(pkt_dest);
> -        mbuf_dest->l3_len = (char *)dp_packet_l4(pkt_dest)
> -                                - (char *) dp_packet_l3(pkt_dest);
> -    }
> -
> -    return pkt_dest;
> -}
> -
> -/* Tx function. Transmit packets indefinitely */
> -static void
> -dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
> -    OVS_NO_THREAD_SAFETY_ANALYSIS
> -{
> -    const size_t batch_cnt = dp_packet_batch_size(batch);
> -#if !defined(__CHECKER__) && !defined(_WIN32)
> -    const size_t PKT_ARRAY_SIZE = batch_cnt;
> -#else
> -    /* Sparse or MSVC doesn't like variable length array. */
> -    enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
> -#endif
> -    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> -    struct dp_packet *pkts[PKT_ARRAY_SIZE];
> -    struct netdev_dpdk_sw_stats *sw_stats = dev->sw_stats;
> -    uint32_t cnt = batch_cnt;
> -    uint32_t dropped = 0;
> -    uint32_t tx_failure = 0;
> -    uint32_t mtu_drops = 0;
> -    uint32_t qos_drops = 0;
> -
> -    if (dev->type != DPDK_DEV_VHOST) {
> -        /* Check if QoS has been configured for this netdev. */
> -        cnt = netdev_dpdk_qos_run(dev, (struct rte_mbuf **) batch->packets,
> -                                  batch_cnt, false);
> -        qos_drops = batch_cnt - cnt;
> -    }
> -
> -    uint32_t txcnt = 0;
> -
> -    for (uint32_t i = 0; i < cnt; i++) {
> -        struct dp_packet *packet = batch->packets[i];
> -        uint32_t size = dp_packet_size(packet);
> -
> -        if (size > dev->max_packet_len
> -            && !(packet->mbuf.ol_flags & PKT_TX_TCP_SEG)) {
> -            VLOG_WARN_RL(&rl, "Too big size %u max_packet_len %d", size,
> -                         dev->max_packet_len);
> -            mtu_drops++;
> -            continue;
> -        }
> -
> -        pkts[txcnt] = dpdk_copy_dp_packet_to_mbuf(dev->dpdk_mp->mp, packet);
> -        if (OVS_UNLIKELY(!pkts[txcnt])) {
> -            dropped = cnt - i;
> -            break;
> -        }
> -
> -        txcnt++;
> -    }
> +        txcnt++;
> +    }
>  
>      if (OVS_LIKELY(txcnt)) {
> -        if (dev->type == DPDK_DEV_VHOST) {
> -            __netdev_dpdk_vhost_send(netdev, qid, pkts, txcnt);
> -        } else {
> -            tx_failure += netdev_dpdk_eth_tx_burst(dev, qid,
> -                                                   (struct rte_mbuf **)pkts,
> -                                                   txcnt);
> -        }
> +        dropped += netdev_dpdk_eth_tx_burst(dev, qid, pkts, txcnt, vhost);
>      }
>  
>      dropped += qos_drops + mtu_drops + tx_failure;
> @@ -2819,26 +2610,10 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
>      }
>  }
>  
> -static int
> -netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
> -                       struct dp_packet_batch *batch,
> -                       bool concurrent_txq OVS_UNUSED)
> -{
> -
> -    if (OVS_UNLIKELY(batch->packets[0]->source != DPBUF_DPDK)) {
> -        dpdk_do_tx_copy(netdev, qid, batch);
> -        dp_packet_delete_batch(batch, true);
> -    } else {
> -        __netdev_dpdk_vhost_send(netdev, qid, batch->packets,
> -                                 dp_packet_batch_size(batch));
> -    }
> -    return 0;
> -}
> -
>  static inline void
>  netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
>                     struct dp_packet_batch *batch,
> -                   bool concurrent_txq)
> +                   bool concurrent_txq, bool vhost)
>  {
>      if (OVS_UNLIKELY(!(dev->flags & NETDEV_UP))) {
>          dp_packet_delete_batch(batch, true);
> @@ -2846,7 +2621,6 @@ netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
>      }
>  
>      if (OVS_UNLIKELY(concurrent_txq)) {
> -        qid = qid % dev->up.n_txq;
>          rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
>      }
>  
> @@ -2874,7 +2648,8 @@ netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
>          batch_cnt = netdev_dpdk_qos_run(dev, pkts, batch_cnt, true);
>          qos_drops -= batch_cnt;
>  
> -        tx_failure = netdev_dpdk_eth_tx_burst(dev, qid, pkts, batch_cnt);
> +        tx_failure = netdev_dpdk_eth_tx_burst(dev, qid, pkts, batch_cnt,
> +                                              vhost);
>  
>          dropped = tx_failure + mtu_drops + qos_drops + hwol_drops;
>          if (OVS_UNLIKELY(dropped)) {
> @@ -2899,7 +2674,31 @@ netdev_dpdk_eth_send(struct netdev *netdev, int qid,
>  {
>      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>  
> -    netdev_dpdk_send__(dev, qid, batch, concurrent_txq);
> +    if (concurrent_txq) {
> +        qid = qid % dev->up.n_txq;
> +    }
> +
> +    netdev_dpdk_send__(dev, qid, batch, concurrent_txq, false);
> +    return 0;
> +}
> +
> +static int
> +netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
> +                       struct dp_packet_batch *batch,
> +                       bool concurrent_txq OVS_UNUSED)
> +{
> +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> +
> +    qid = dev->tx_q[qid % netdev->n_txq].map;
> +    if (qid == -1 || !dev->vhost_reconfigured) {
> +        rte_spinlock_lock(&dev->stats_lock);
> +        dev->stats.tx_dropped+= batch->count;
> +        rte_spinlock_unlock(&dev->stats_lock);
> +        dp_packet_delete_batch(batch, true);
> +    } else {
> +        netdev_dpdk_send__(dev, qid, batch, false, true);

Why concurrent_txq is disabled? It won't lock the TX queue
which it does before the patch and I don't see any other
protection.

Also this change obsoleted vhost_tx_contention coverage inc
reverting 9ff24b9c93 from David without a note.


> +    }
> +
>      return 0;
>  }
>  
> @@ -2977,41 +2776,6 @@ netdev_dpdk_set_mtu(struct netdev *netdev, int mtu)
>  static int
>  netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier);
>  
> -static int
> -netdev_dpdk_vhost_get_stats(const struct netdev *netdev,
> -                            struct netdev_stats *stats)
> -{
> -    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> -
> -    ovs_mutex_lock(&dev->mutex);
> -
> -    rte_spinlock_lock(&dev->stats_lock);
> -    /* Supported Stats */
> -    stats->rx_packets = dev->stats.rx_packets;
> -    stats->tx_packets = dev->stats.tx_packets;
> -    stats->rx_dropped = dev->stats.rx_dropped;
> -    stats->tx_dropped = dev->stats.tx_dropped;
> -    stats->multicast = dev->stats.multicast;
> -    stats->rx_bytes = dev->stats.rx_bytes;
> -    stats->tx_bytes = dev->stats.tx_bytes;
> -    stats->rx_errors = dev->stats.rx_errors;
> -    stats->rx_length_errors = dev->stats.rx_length_errors;
> -
> -    stats->rx_1_to_64_packets = dev->stats.rx_1_to_64_packets;
> -    stats->rx_65_to_127_packets = dev->stats.rx_65_to_127_packets;
> -    stats->rx_128_to_255_packets = dev->stats.rx_128_to_255_packets;
> -    stats->rx_256_to_511_packets = dev->stats.rx_256_to_511_packets;
> -    stats->rx_512_to_1023_packets = dev->stats.rx_512_to_1023_packets;
> -    stats->rx_1024_to_1522_packets = dev->stats.rx_1024_to_1522_packets;
> -    stats->rx_1523_to_max_packets = dev->stats.rx_1523_to_max_packets;
> -
> -    rte_spinlock_unlock(&dev->stats_lock);
> -
> -    ovs_mutex_unlock(&dev->mutex);
> -
> -    return 0;
> -}
> -
>  static void
>  netdev_dpdk_convert_xstats(struct netdev_stats *stats,
>                             const struct rte_eth_xstat *xstats,
> @@ -3025,6 +2789,7 @@ netdev_dpdk_convert_xstats(struct netdev_stats *stats,
>      DPDK_XSTAT(rx_broadcast_packets,    "rx_broadcast_packets"            ) \
>      DPDK_XSTAT(tx_broadcast_packets,    "tx_broadcast_packets"            ) \
>      DPDK_XSTAT(rx_undersized_errors,    "rx_undersized_errors"            ) \
> +    DPDK_XSTAT(rx_undersize_packets,    "rx_undersize_packets"            ) \
>      DPDK_XSTAT(rx_oversize_errors,      "rx_oversize_errors"              ) \
>      DPDK_XSTAT(rx_fragmented_errors,    "rx_fragmented_errors"            ) \
>      DPDK_XSTAT(rx_jabber_errors,        "rx_jabber_errors"                ) \
> @@ -3069,6 +2834,11 @@ netdev_dpdk_get_stats(const struct netdev *netdev, struct netdev_stats *stats)
>      struct rte_eth_xstat_name *rte_xstats_names = NULL;
>      int rte_xstats_len, rte_xstats_new_len, rte_xstats_ret;
>  
> +    if (!rte_eth_dev_is_valid_port(dev->port_id)) {
> +        ovs_mutex_unlock(&dev->mutex);
> +        return EPROTO;
> +    }
> +
>      if (rte_eth_stats_get(dev->port_id, &rte_stats)) {
>          VLOG_ERR("Can't get ETH statistics for port: "DPDK_PORT_ID_FMT,
>                   dev->port_id);
> @@ -3147,6 +2917,10 @@ netdev_dpdk_get_custom_stats(const struct netdev *netdev,
>  
>      ovs_mutex_lock(&dev->mutex);
>  
> +    if (rte_eth_dev_is_valid_port(dev->port_id)) {
> +        goto out;
> +    }
> +
>      if (netdev_dpdk_configure_xstats(dev)) {
>          uint64_t *values = xcalloc(dev->rte_xstats_ids_size,
>                                     sizeof(uint64_t));
> @@ -3182,6 +2956,7 @@ netdev_dpdk_get_custom_stats(const struct netdev *netdev,
>          free(values);
>      }
>  
> +out:
>      ovs_mutex_unlock(&dev->mutex);
>  
>      return 0;
> @@ -3409,24 +3184,6 @@ netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier)
>      return 0;
>  }
>  
> -static int
> -netdev_dpdk_vhost_get_carrier(const struct netdev *netdev, bool *carrier)
> -{
> -    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> -
> -    ovs_mutex_lock(&dev->mutex);
> -
> -    if (is_vhost_running(dev)) {
> -        *carrier = 1;
> -    } else {
> -        *carrier = 0;
> -    }
> -
> -    ovs_mutex_unlock(&dev->mutex);
> -
> -    return 0;
> -}
> -
>  static long long int
>  netdev_dpdk_get_carrier_resets(const struct netdev *netdev)
>  {
> @@ -3496,8 +3253,7 @@ netdev_dpdk_update_flags__(struct netdev_dpdk *dev,
>           * running then change netdev's change_seq to trigger link state
>           * update. */
>  
> -        if ((NETDEV_UP & ((*old_flagsp ^ on) | (*old_flagsp ^ off)))
> -            && is_vhost_running(dev)) {
> +        if ((NETDEV_UP & ((*old_flagsp ^ on) | (*old_flagsp ^ off)))) {
>              netdev_change_seq_changed(&dev->up);
>  
>              /* Clear statistics if device is getting up. */
> @@ -3527,18 +3283,41 @@ netdev_dpdk_update_flags(struct netdev *netdev,
>      return error;
>  }
>  
> +static void
> +common_get_status(struct smap *args, struct netdev_dpdk *dev,
> +                  struct rte_eth_dev_info *dev_info)
> +{
> +    smap_add_format(args, "port_no", DPDK_PORT_ID_FMT, dev->port_id);
> +    smap_add_format(args, "numa_id", "%d",
> +                           rte_eth_dev_socket_id(dev->port_id));
> +    smap_add_format(args, "driver_name", "%s", dev_info->driver_name);
> +    smap_add_format(args, "min_rx_bufsize", "%u", dev_info->min_rx_bufsize);
> +    smap_add_format(args, "max_rx_pktlen", "%u", dev->max_packet_len);
> +    smap_add_format(args, "max_rx_queues", "%u", dev_info->max_rx_queues);
> +    smap_add_format(args, "max_tx_queues", "%u", dev_info->max_tx_queues);
> +    smap_add_format(args, "max_mac_addrs", "%u", dev_info->max_mac_addrs);
> +}
> +
>  static int
>  netdev_dpdk_vhost_user_get_status(const struct netdev *netdev,
>                                    struct smap *args)
>  {
>      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> +    struct rte_eth_dev_info dev_info;
> +
> +    if (!rte_eth_dev_is_valid_port(dev->port_id)) {
> +        return ENODEV;
> +    }
>  
>      ovs_mutex_lock(&dev->mutex);
> +    rte_eth_dev_info_get(dev->port_id, &dev_info);
> +
> +    common_get_status(args, dev, &dev_info);
>  
>      bool client_mode = dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT;
>      smap_add_format(args, "mode", "%s", client_mode ? "client" : "server");
>  
> -    int vid = netdev_dpdk_get_vid(dev);
> +    int vid = rte_eth_vhost_get_vid_from_port_id(dev->port_id);;
>      if (vid < 0) {
>          smap_add_format(args, "status", "disconnected");
>          ovs_mutex_unlock(&dev->mutex);
> @@ -3638,15 +3417,8 @@ netdev_dpdk_get_status(const struct netdev *netdev, struct smap *args)
>      }
>      ovs_mutex_unlock(&dpdk_mutex);
>  
> -    smap_add_format(args, "port_no", DPDK_PORT_ID_FMT, dev->port_id);
> -    smap_add_format(args, "numa_id", "%d",
> -                           rte_eth_dev_socket_id(dev->port_id));
> -    smap_add_format(args, "driver_name", "%s", dev_info.driver_name);
> -    smap_add_format(args, "min_rx_bufsize", "%u", dev_info.min_rx_bufsize);
> -    smap_add_format(args, "max_rx_pktlen", "%u", dev->max_packet_len);
> -    smap_add_format(args, "max_rx_queues", "%u", dev_info.max_rx_queues);
> -    smap_add_format(args, "max_tx_queues", "%u", dev_info.max_tx_queues);
> -    smap_add_format(args, "max_mac_addrs", "%u", dev_info.max_mac_addrs);
> +    common_get_status(args, dev, &dev_info);
> +
>      smap_add_format(args, "max_hash_mac_addrs", "%u",
>                             dev_info.max_hash_mac_addrs);
>      smap_add_format(args, "max_vfs", "%u", dev_info.max_vfs);
> @@ -3843,19 +3615,6 @@ out:
>      netdev_close(netdev);
>  }
>  
> -/*
> - * Set virtqueue flags so that we do not receive interrupts.
> - */
> -static void
> -set_irq_status(int vid)
> -{
> -    uint32_t i;
> -
> -    for (i = 0; i < rte_vhost_get_vring_num(vid); i++) {
> -        rte_vhost_enable_guest_notification(vid, i, 0);
> -    }
> -}
> -
>  /*
>   * Fixes mapping for vhost-user tx queues. Must be called after each
>   * enabling/disabling of queues and n_txq modifications.
> @@ -3905,53 +3664,62 @@ netdev_dpdk_remap_txqs(struct netdev_dpdk *dev)
>      free(enabled_queues);
>  }
>  
> -/*
> - * A new virtio-net device is added to a vhost port.
> - */
>  static int
> -new_device(int vid)
> +link_status_changed_callback(dpdk_port_t port_id,
> +                             enum rte_eth_event_type type OVS_UNUSED,
> +                             void *param OVS_UNUSED,
> +                             void *ret_param OVS_UNUSED)
>  {
>      struct netdev_dpdk *dev;
>      bool exists = false;
>      int newnode = 0;
> -    char ifname[IF_NAME_SZ];
> -
> -    rte_vhost_get_ifname(vid, ifname, sizeof ifname);
>  
>      ovs_mutex_lock(&dpdk_mutex);
>      /* Add device to the vhost port with the same name as that passed down. */
>      LIST_FOR_EACH(dev, list_node, &dpdk_list) {
>          ovs_mutex_lock(&dev->mutex);
> -        if (nullable_string_is_equal(ifname, dev->vhost_id)) {
> -            uint32_t qp_num = rte_vhost_get_vring_num(vid) / VIRTIO_QNUM;
> -
> -            /* Get NUMA information */
> -            newnode = rte_vhost_get_numa_node(vid);
> -            if (newnode == -1) {
> +        if (port_id == dev->port_id) {
> +            check_link_status(dev);
> +            if (dev->link.link_status == ETH_LINK_UP) {
> +                /* Device brought up. */
> +                /* Get queue information. */
> +                int vid = rte_eth_vhost_get_vid_from_port_id(dev->port_id);
> +                uint32_t qp_num = rte_vhost_get_vring_num(vid) / VIRTIO_QNUM;
> +                if (qp_num <= 0) {
> +                    qp_num = dev->requested_n_rxq;
> +                }
> +                /* Get NUMA information. */
> +                newnode = rte_eth_dev_socket_id(dev->port_id);
> +                if (newnode == -1) {
>  #ifdef VHOST_NUMA
> -                VLOG_INFO("Error getting NUMA info for vHost Device '%s'",
> -                          ifname);
> +                    VLOG_INFO("Error getting NUMA info for vHost Device '%s'",
> +                              dev->vhost_id);
>  #endif
> -                newnode = dev->socket_id;
> -            }
> +                    newnode = dev->socket_id;
> +                }
> +                if (dev->requested_n_txq != qp_num
> +                                || dev->requested_n_rxq != qp_num
> +                                || dev->requested_socket_id != newnode) {
> +                    dev->requested_socket_id = newnode;
> +                    dev->requested_n_rxq = qp_num;
> +                    dev->requested_n_txq = qp_num;
> +                    netdev_request_reconfigure(&dev->up);
> +                } else {
> +                    /* Reconfiguration not required. */
> +                    dev->vhost_reconfigured = true;
> +                }
>  
> -            if (dev->requested_n_txq < qp_num
> -                || dev->requested_n_rxq < qp_num
> -                || dev->requested_socket_id != newnode) {
> -                dev->requested_socket_id = newnode;
> -                dev->requested_n_rxq = qp_num;
> -                dev->requested_n_txq = qp_num;
> -                netdev_request_reconfigure(&dev->up);
> +                VLOG_INFO("vHost Device '%s' has been added on numa node %i",
> +                          dev->vhost_id, newnode);
>              } else {
> -                /* Reconfiguration not required. */
> -                dev->vhost_reconfigured = true;
> +                /* Device brought down. */
> +                dev->vhost_reconfigured = false;
> +                memset(dev->vhost_rxq_enabled, 0,
> +                       dev->up.n_rxq * sizeof *dev->vhost_rxq_enabled);
> +                netdev_dpdk_txq_map_clear(dev);
> +                VLOG_INFO("vHost Device '%s' has been removed", dev->vhost_id);
>              }
> -
> -            ovsrcu_index_set(&dev->vid, vid);
>              exists = true;
> -
> -            /* Disable notifications. */
> -            set_irq_status(vid);
>              netdev_change_seq_changed(&dev->up);
>              ovs_mutex_unlock(&dev->mutex);
>              break;
> @@ -3961,14 +3729,11 @@ new_device(int vid)
>      ovs_mutex_unlock(&dpdk_mutex);
>  
>      if (!exists) {
> -        VLOG_INFO("vHost Device '%s' can't be added - name not found", ifname);
> +        VLOG_INFO("vHost Device with port id %i not found", port_id);
>  
>          return -1;
>      }
>  
> -    VLOG_INFO("vHost Device '%s' has been added on numa node %i",
> -              ifname, newnode);
> -
>      return 0;
>  }
>  
> @@ -3984,175 +3749,63 @@ netdev_dpdk_txq_map_clear(struct netdev_dpdk *dev)
>      }
>  }
>  
> -/*
> - * Remove a virtio-net device from the specific vhost port.  Use dev->remove
> - * flag to stop any more packets from being sent or received to/from a VM and
> - * ensure all currently queued packets have been sent/received before removing
> - *  the device.
> - */
> -static void
> -destroy_device(int vid)
> +static int
> +vring_state_changed_callback(dpdk_port_t port_id,
> +                             enum rte_eth_event_type type OVS_UNUSED,
> +                             void *param OVS_UNUSED,
> +                             void *ret_param OVS_UNUSED)
>  {
>      struct netdev_dpdk *dev;
>      bool exists = false;
> -    char ifname[IF_NAME_SZ];
> -
> -    rte_vhost_get_ifname(vid, ifname, sizeof ifname);
> -
> -    ovs_mutex_lock(&dpdk_mutex);
> -    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
> -        if (netdev_dpdk_get_vid(dev) == vid) {
> -
> -            ovs_mutex_lock(&dev->mutex);
> -            dev->vhost_reconfigured = false;
> -            ovsrcu_index_set(&dev->vid, -1);
> -            memset(dev->vhost_rxq_enabled, 0,
> -                   dev->up.n_rxq * sizeof *dev->vhost_rxq_enabled);
> -            netdev_dpdk_txq_map_clear(dev);
> -
> -            netdev_change_seq_changed(&dev->up);
> -            ovs_mutex_unlock(&dev->mutex);
> -            exists = true;
> -            break;
> -        }
> -    }
> -
> -    ovs_mutex_unlock(&dpdk_mutex);
> +    bool is_rx;
> +    uint16_t qid;
> +    struct rte_eth_vhost_queue_event event;
>  
> -    if (exists) {
> -        /*
> -         * Wait for other threads to quiesce after setting the 'virtio_dev'
> -         * to NULL, before returning.
> -         */
> -        ovsrcu_synchronize();
> -        /*
> -         * As call to ovsrcu_synchronize() will end the quiescent state,
> -         * put thread back into quiescent state before returning.
> -         */
> -        ovsrcu_quiesce_start();
> -        VLOG_INFO("vHost Device '%s' has been removed", ifname);
> -    } else {
> -        VLOG_INFO("vHost Device '%s' not found", ifname);
> +    if (rte_eth_vhost_get_queue_event(port_id, &event)) {
> +        return 0;
>      }
> -}
>  
> -static int
> -vring_state_changed(int vid, uint16_t queue_id, int enable)
> -{
> -    struct netdev_dpdk *dev;
> -    bool exists = false;
> -    int qid = queue_id / VIRTIO_QNUM;
> -    bool is_rx = (queue_id % VIRTIO_QNUM) == VIRTIO_TXQ;
> -    char ifname[IF_NAME_SZ];
> -
> -    rte_vhost_get_ifname(vid, ifname, sizeof ifname);
> +    is_rx = event.rx;
> +    qid = event.queue_id;
>  
>      ovs_mutex_lock(&dpdk_mutex);
>      LIST_FOR_EACH (dev, list_node, &dpdk_list) {
>          ovs_mutex_lock(&dev->mutex);
> -        if (nullable_string_is_equal(ifname, dev->vhost_id)) {
> +        if (port_id == dev->port_id) {
>              if (is_rx) {
>                  bool old_state = dev->vhost_rxq_enabled[qid];
>  
> -                dev->vhost_rxq_enabled[qid] = enable != 0;
> +                dev->vhost_rxq_enabled[qid] = event.enable != 0;
>                  if (old_state != dev->vhost_rxq_enabled[qid]) {
>                      netdev_change_seq_changed(&dev->up);
>                  }
>              } else {
> -                if (enable) {
> -                    dev->tx_q[qid].map = qid;
> +                if (event.enable) {
> +                    dev->tx_q[event.queue_id].map = event.queue_id;
>                  } else {
> -                    dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED;
> +                    dev->tx_q[event.queue_id].map = OVS_VHOST_QUEUE_DISABLED;
>                  }
> -                netdev_dpdk_remap_txqs(dev);
> -            }
> -            exists = true;
> -            ovs_mutex_unlock(&dev->mutex);
> -            break;
> +                exists = true;
> +                ovs_mutex_unlock(&dev->mutex);
> +                break;
> +           }
>          }
>          ovs_mutex_unlock(&dev->mutex);
>      }
>      ovs_mutex_unlock(&dpdk_mutex);
>  
>      if (exists) {
> -        VLOG_INFO("State of queue %d ( %s_qid %d ) of vhost device '%s' "
> -                  "changed to \'%s\'", queue_id, is_rx == true ? "rx" : "tx",
> -                  qid, ifname, (enable == 1) ? "enabled" : "disabled");
> +        VLOG_INFO("State of queue %d ( %s_qid %d ) of vhost device "
> +                  "changed to \'%s\'", qid, is_rx == true ? "rx" : "tx",
> +                  qid, (event.enable == 1) ? "enabled" : "disabled");
>      } else {
> -        VLOG_INFO("vHost Device '%s' not found", ifname);
> +        VLOG_INFO("vHost Device not found");
>          return -1;
>      }
>  
>      return 0;
>  }
>  
> -static void
> -destroy_connection(int vid)
> -{
> -    struct netdev_dpdk *dev;
> -    char ifname[IF_NAME_SZ];
> -    bool exists = false;
> -
> -    rte_vhost_get_ifname(vid, ifname, sizeof ifname);
> -
> -    ovs_mutex_lock(&dpdk_mutex);
> -    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
> -        ovs_mutex_lock(&dev->mutex);
> -        if (nullable_string_is_equal(ifname, dev->vhost_id)) {
> -            uint32_t qp_num = NR_QUEUE;
> -
> -            if (netdev_dpdk_get_vid(dev) >= 0) {
> -                VLOG_ERR("Connection on socket '%s' destroyed while vhost "
> -                         "device still attached.", dev->vhost_id);
> -            }
> -
> -            /* Restore the number of queue pairs to default. */
> -            if (dev->requested_n_txq != qp_num
> -                || dev->requested_n_rxq != qp_num) {
> -                dev->requested_n_rxq = qp_num;
> -                dev->requested_n_txq = qp_num;
> -                netdev_request_reconfigure(&dev->up);
> -            }
> -            ovs_mutex_unlock(&dev->mutex);
> -            exists = true;
> -            break;
> -        }
> -        ovs_mutex_unlock(&dev->mutex);
> -    }
> -    ovs_mutex_unlock(&dpdk_mutex);
> -
> -    if (exists) {
> -        VLOG_INFO("vHost Device '%s' connection has been destroyed", ifname);
> -    } else {
> -        VLOG_INFO("vHost Device '%s' not found", ifname);
> -    }
> -}
> -
> -static
> -void vhost_guest_notified(int vid OVS_UNUSED)
> -{
> -    COVERAGE_INC(vhost_notification);
> -}
> -
> -/*
> - * Retrieve the DPDK virtio device ID (vid) associated with a vhostuser
> - * or vhostuserclient netdev.
> - *
> - * Returns a value greater or equal to zero for a valid vid or '-1' if
> - * there is no valid vid associated. A vid of '-1' must not be used in
> - * rte_vhost_ APi calls.
> - *
> - * Once obtained and validated, a vid can be used by a PMD for multiple
> - * subsequent rte_vhost API calls until the PMD quiesces. A PMD should
> - * not fetch the vid again for each of a series of API calls.
> - */
> -
> -int
> -netdev_dpdk_get_vid(const struct netdev_dpdk *dev)
> -{
> -    return ovsrcu_index_get(&dev->vid);
> -}
> -
>  struct ingress_policer *
>  netdev_dpdk_get_ingress_policer(const struct netdev_dpdk *dev)
>  {
> @@ -4893,13 +4546,12 @@ static const struct dpdk_qos_ops trtcm_policer_ops = {
>  };
>  
>  static int
> -netdev_dpdk_reconfigure(struct netdev *netdev)
> +common_reconfigure(struct netdev *netdev)
> +    OVS_REQUIRES(dev->mutex)
>  {
>      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>      int err = 0;
>  
> -    ovs_mutex_lock(&dev->mutex);
> -
>      if (netdev->n_txq == dev->requested_n_txq
>          && netdev->n_rxq == dev->requested_n_rxq
>          && dev->mtu == dev->requested_mtu
> @@ -4956,17 +4608,36 @@ netdev_dpdk_reconfigure(struct netdev *netdev)
>      netdev_change_seq_changed(netdev);
>  
>  out:
> +    return err;
> +}
> +
> +static int
> +netdev_dpdk_reconfigure(struct netdev *netdev)
> +{
> +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> +    int err = 0;
> +
> +    ovs_mutex_lock(&dev->mutex);
> +    err = common_reconfigure(netdev);
>      ovs_mutex_unlock(&dev->mutex);
> +
>      return err;
>  }
>  
>  static int
> -dpdk_vhost_reconfigure_helper(struct netdev_dpdk *dev)
> +dpdk_vhost_reconfigure_helper(struct netdev *netdev)
>      OVS_REQUIRES(dev->mutex)
>  {
> +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> +    int err;
> +
>      dev->up.n_txq = dev->requested_n_txq;
>      dev->up.n_rxq = dev->requested_n_rxq;
> -    int err;
> +
> +    err = common_reconfigure(netdev);
> +    if (err) {
> +        return err;
> +    }
>  
>      /* Always keep RX queue 0 enabled for implementations that won't
>       * report vring states. */
> @@ -4984,14 +4655,7 @@ dpdk_vhost_reconfigure_helper(struct netdev_dpdk *dev)
>  
>      netdev_dpdk_remap_txqs(dev);
>  
> -    err = netdev_dpdk_mempool_configure(dev);
> -    if (!err) {
> -        /* A new mempool was created or re-used. */
> -        netdev_change_seq_changed(&dev->up);
> -    } else if (err != EEXIST) {
> -        return err;
> -    }
> -    if (netdev_dpdk_get_vid(dev) >= 0) {
> +    if (rte_eth_vhost_get_vid_from_port_id(dev->port_id) >= 0) {
>          if (dev->vhost_reconfigured == false) {
>              dev->vhost_reconfigured = true;
>              /* Carrier status may need updating. */
> @@ -5009,7 +4673,7 @@ netdev_dpdk_vhost_reconfigure(struct netdev *netdev)
>      int err;
>  
>      ovs_mutex_lock(&dev->mutex);
> -    err = dpdk_vhost_reconfigure_helper(dev);
> +    err = dpdk_vhost_reconfigure_helper(netdev);
>      ovs_mutex_unlock(&dev->mutex);
>  
>      return err;
> @@ -5020,9 +4684,8 @@ netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev)
>  {
>      struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>      int err;
> -    uint64_t vhost_flags = 0;
> -    uint64_t vhost_unsup_flags;
> -    bool zc_enabled;
> +    int sid = -1;
> +    struct rte_eth_dev_info dev_info;
>  
>      ovs_mutex_lock(&dev->mutex);
>  
> @@ -5032,90 +4695,65 @@ netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev)
>       *  2. A path has been specified.
>       */
>      if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT) && dev->vhost_id) {
> -        /* Register client-mode device. */
> -        vhost_flags |= RTE_VHOST_USER_CLIENT;
> -
> -        /* There is no support for multi-segments buffers. */
> -        vhost_flags |= RTE_VHOST_USER_LINEARBUF_SUPPORT;
> +        /* First time once-only configuration. */
> +        err = dpdk_attach_vhost_pmd(dev, VHOST_CLIENT_MODE);
> +
> +        if (!err) {
> +            sid = rte_eth_dev_socket_id(dev->port_id);
> +            dev->socket_id = sid < 0 ? SOCKET0 : sid;
> +            dev->vhost_driver_flags |= RTE_VHOST_USER_CLIENT;
> +
> +            if (dev->requested_socket_id != dev->socket_id
> +                || dev->requested_mtu != dev->mtu) {
> +                err = netdev_dpdk_mempool_configure(dev);
> +                if (err && err != EEXIST) {
> +                    goto unlock;
> +                }
> +            }
>  
> -        /* Enable IOMMU support, if explicitly requested. */
> -        if (dpdk_vhost_iommu_enabled()) {
> -            vhost_flags |= RTE_VHOST_USER_IOMMU_SUPPORT;
> -        }
> +            netdev->n_txq = dev->requested_n_txq;
> +            netdev->n_rxq = dev->requested_n_rxq;
>  
> -        /* Enable POSTCOPY support, if explicitly requested. */
> -        if (dpdk_vhost_postcopy_enabled()) {
> -            vhost_flags |= RTE_VHOST_USER_POSTCOPY_SUPPORT;
> -        }
> +            rte_free(dev->vhost_rxq_enabled);
> +            dev->vhost_rxq_enabled = dpdk_rte_mzalloc(
> +                                        OVS_VHOST_MAX_QUEUE_NUM *
> +                                        sizeof *dev->vhost_rxq_enabled);
> +            if (!dev->vhost_rxq_enabled) {
> +                err = ENOMEM;
> +                goto err_unlock;
> +            }
>  
> -        zc_enabled = dev->vhost_driver_flags
> -                     & RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
> -        /* Enable zero copy flag, if requested */
> -        if (zc_enabled) {
> -            vhost_flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
> -        }
> +            rte_free(dev->tx_q);
> +            err = dpdk_eth_dev_init(dev);
> +            dev->tx_q = netdev_dpdk_alloc_txq(netdev->n_txq);
> +            if (!dev->tx_q) {
> +                err = ENOMEM;
> +                goto err_unlock;
> +            }
>  
> -        /* Enable External Buffers if TCP Segmentation Offload is enabled. */
> -        if (userspace_tso_enabled()) {
> -            vhost_flags |= RTE_VHOST_USER_EXTBUF_SUPPORT;
> -        }
> +            netdev_change_seq_changed(netdev);
>  
> -        err = rte_vhost_driver_register(dev->vhost_id, vhost_flags);
> -        if (err) {
> -            VLOG_ERR("vhost-user device setup failure for device %s\n",
> -                     dev->vhost_id);
> -            goto unlock;
> -        } else {
> -            /* Configuration successful */
> -            dev->vhost_driver_flags |= vhost_flags;
>              VLOG_INFO("vHost User device '%s' created in 'client' mode, "
>                        "using client socket '%s'",
>                        dev->up.name, dev->vhost_id);
> -            if (zc_enabled) {
> -                VLOG_INFO("Zero copy enabled for vHost port %s", dev->up.name);
> -            }
> -        }
> -
> -        err = rte_vhost_driver_callback_register(dev->vhost_id,
> -                                                 &virtio_net_device_ops);
> -        if (err) {
> -            VLOG_ERR("rte_vhost_driver_callback_register failed for "
> -                     "vhost user client port: %s\n", dev->up.name);
> -            goto unlock;
> -        }
> -
> -        if (userspace_tso_enabled()) {
> -            netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_TSO;
> -            netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_CKSUM;
> -            netdev->ol_flags |= NETDEV_TX_OFFLOAD_UDP_CKSUM;
> -            netdev->ol_flags |= NETDEV_TX_OFFLOAD_SCTP_CKSUM;
> -            netdev->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM;
> -            vhost_unsup_flags = 1ULL << VIRTIO_NET_F_HOST_ECN
> -                                | 1ULL << VIRTIO_NET_F_HOST_UFO;
> -        } else {
> -            /* This disables checksum offloading and all the features
> -             * that depends on it (TSO, UFO, ECN) according to virtio
> -             * specification. */
> -            vhost_unsup_flags = 1ULL << VIRTIO_NET_F_CSUM;
> -        }
> -
> -        err = rte_vhost_driver_disable_features(dev->vhost_id,
> -                                                vhost_unsup_flags);
> -        if (err) {
> -            VLOG_ERR("rte_vhost_driver_disable_features failed for "
> -                     "vhost user client port: %s\n", dev->up.name);
> -            goto unlock;
>          }
> +        goto unlock;
> +    }
>  
> -        err = rte_vhost_driver_start(dev->vhost_id);
> -        if (err) {
> -            VLOG_ERR("rte_vhost_driver_start failed for vhost user "
> -                     "client port: %s\n", dev->up.name);
> -            goto unlock;
> -        }
> +    if (rte_eth_dev_is_valid_port(dev->port_id)) {
> +        err = dpdk_vhost_reconfigure_helper(netdev);
> +        goto unlock;
>      }
>  
> -    err = dpdk_vhost_reconfigure_helper(dev);
> +err_unlock:
> +    rte_eth_dev_info_get(dev->port_id, &dev_info);
> +    if (!dev_info.device || rte_dev_remove(dev_info.device)) {
> +        VLOG_ERR("Error detatching device\n");
> +    }
> +    if (dev->vhost_pmd_id >= 0) {
> +        id_pool_free_id(dpdk_get_vhost_id_pool(),
> +                dev->vhost_pmd_id);
> +    }
>  
>  unlock:
>      ovs_mutex_unlock(&dev->mutex);
> @@ -5279,9 +4917,9 @@ static const struct netdev_class dpdk_vhost_class = {
>      .construct = netdev_dpdk_vhost_construct,
>      .destruct = netdev_dpdk_vhost_destruct,
>      .send = netdev_dpdk_vhost_send,
> -    .get_carrier = netdev_dpdk_vhost_get_carrier,
> -    .get_stats = netdev_dpdk_vhost_get_stats,
> -    .get_custom_stats = netdev_dpdk_get_sw_custom_stats,
> +    .get_carrier = netdev_dpdk_get_carrier,
> +    .get_stats = netdev_dpdk_get_stats,
> +    .get_custom_stats = netdev_dpdk_get_custom_stats,
>      .get_status = netdev_dpdk_vhost_user_get_status,
>      .reconfigure = netdev_dpdk_vhost_reconfigure,
>      .rxq_recv = netdev_dpdk_vhost_rxq_recv,
> @@ -5295,9 +4933,9 @@ static const struct netdev_class dpdk_vhost_client_class = {
>      .destruct = netdev_dpdk_vhost_destruct,
>      .set_config = netdev_dpdk_vhost_client_set_config,
>      .send = netdev_dpdk_vhost_send,
> -    .get_carrier = netdev_dpdk_vhost_get_carrier,
> -    .get_stats = netdev_dpdk_vhost_get_stats,
> -    .get_custom_stats = netdev_dpdk_get_sw_custom_stats,
> +    .get_carrier = netdev_dpdk_get_carrier,
> +    .get_stats = netdev_dpdk_get_stats,
> +    .get_custom_stats = netdev_dpdk_get_custom_stats,
>      .get_status = netdev_dpdk_vhost_user_get_status,
>      .reconfigure = netdev_dpdk_vhost_client_reconfigure,
>      .rxq_recv = netdev_dpdk_vhost_rxq_recv,


I would suggest to move get_stats, get_carrier and get_custom_stats
to the dpdk class base since now all classes use the same functions.

It will make the benefits of this patch more visible. :-)

Thanks,
-- 
fbl


More information about the dev mailing list