[ovs-discuss] OVS 2.10.1 build --with-dpdk 18.11 fails in compiling a structure

Debug Netiq One debugnetiq1 at yahoo.ca
Sun Jan 27 00:08:18 UTC 2019


Hi

I am not knowledgeable on gcc advanced syntax but I worked in C/C++ 
before and gcc 4.8.5 on CentOS 4.6 croaks when seeing the below in 
"lib/netdev-dpdk.c" and then many other errors

Build first dpdk 18.11 w/o error then attempted to build ovs based on 
http://docs.openvswitch.org/en/latest/intro/install/dpdk/ and 
http://docs.openvswitch.org/en/latest/intro/install/general/

My gcc is 4.8.5, libtool 2.4.6 and doing the make after a ./configure 
--with-dpdk=<DPDK build folder>

What is missing please??? There is a mismatch between DPDK 18.11 
structures and what OVS expects -below and

Here is the culprit (or one of them)

static const struct rte_eth_conf port_conf = {
     .rxmode = {
         .mq_mode = ETH_MQ_RX_RSS,
         .split_hdr_size = 0,
*.header_split   = 0, /* Header Split disabled */                     /* 
<<= Line 171 */*
         .hw_ip_checksum = 0, /* IP checksum offload disabled */
         .hw_vlan_filter = 0, /* VLAN filtering disabled */
         .jumbo_frame    = 0, /* Jumbo Frame Support disabled */
         .hw_strip_crc   = 0,
     },
     .rx_adv_conf = {
         .rss_conf = {
             .rss_key = NULL,
             .rss_hf = ETH_RSS_IP | ETH_RSS_UDP | ETH_RSS_TCP,
         },
     },
     .txmode = {
         .mq_mode = ETH_MQ_TX_NONE,
     },
};

And why is failing as shown below - because "struct rte_eth_conf" and 
its element struct rte_eth_rxmode rxmode DO NOT HAVE these elements

At least not in DPDK 18.11 I am compiling against. Attaching the DPDK 
18.11 header file at stake dpdk-18.11/lib/librte_ethdev attached 
together with openvswitch-2.10.1/lib/netdev-dpdk.c

So - what version of OpenVswitch with what version of DPDK please???



make

... all fine till below ...

/bin/sh ./libtool  --tag=CC   --mode=compile gcc -std=gnu99 
-DHAVE_CONFIG_H -I.    -I ./include -I ./include -I ./lib -I ./lib    
-Wstrict-prototypes -Wall -Wextra -Wno-sign-compare -Wpointer-arith 
-Wformat -Wformat-security -Wswitch-enum -Wunused-parameter 
-Wbad-function-cast -Wcast-align -Wstrict-prototypes 
-Wold-style-definition -Wmissing-prototypes -Wmissing-field-initializers 
-fno-strict-aliasing -Wshadow -Wno-null-pointer-arithmetic -mssse3 
-I/opt/dpdk_install/dpdk-18.11/x86_64-native-linuxapp-gcc/include 
-D_FILE_OFFSET_BITS=64  -g -O2 -MT lib/netdev-dpdk.lo -MD -MP -MF 
$depbase.Tpo -c -o lib/netdev-dpdk.lo lib/netdev-dpdk.c &&\
mv -f $depbase.Tpo $depbase.Plo
libtool: compile:  gcc -std=gnu99 -DHAVE_CONFIG_H -I. -I ./include -I 
./include -I ./lib -I ./lib -Wstrict-prototypes -Wall -Wextra 
-Wno-sign-compare -Wpointer-arith -Wformat -Wformat-security 
-Wswitch-enum -Wunused-parameter -Wbad-function-cast -Wcast-align 
-Wstrict-prototypes -Wold-style-definition -Wmissing-prototypes 
-Wmissing-field-initializers -fno-strict-aliasing -Wshadow 
-Wno-null-pointer-arithmetic -mssse3 
-I/opt/dpdk_install/dpdk-18.11/x86_64-native-linuxapp-gcc/include 
-D_FILE_OFFSET_BITS=64 -g -O2 -MT lib/netdev-dpdk.lo -MD -MP -MF 
lib/.deps/netdev-dpdk.Tpo -c lib/netdev-dpdk.c -o lib/netdev-dpdk.o
lib/netdev-dpdk.c:171:9: error: unknown field âheader_splitâ specified 
in initializer
          .header_split   = 0, /* Header Split disabled */
          ^
lib/netdev-dpdk.c:172:9: error: unknown field âhw_ip_checksumâ specified 
in initializer
          .hw_ip_checksum = 0, /* IP checksum offload disabled */
          ^
lib/netdev-dpdk.c:172:9: warning: excess elements in struct initializer 
[enabled by default]
lib/netdev-dpdk.c:172:9: warning: (near initialization for 
âport_conf.rxmodeâ) [enabled by default]
lib/netdev-dpdk.c:173:9: error: unknown field âhw_vlan_filterâ specified 
in initializer
          .hw_vlan_filter = 0, /* VLAN filtering disabled */
          ^
lib/netdev-dpdk.c:173:9: warning: excess elements in struct initializer 
[enabled by default]
lib/netdev-dpdk.c:173:9: warning: (near initialization for 
âport_conf.rxmodeâ) [enabled by default]
lib/netdev-dpdk.c:174:9: error: unknown field âjumbo_frameâ specified in 
initializer
          .jumbo_frame    = 0, /* Jumbo Frame Support disabled */
          ^
lib/netdev-dpdk.c:174:9: warning: excess elements in struct initializer 
[enabled by default]
lib/netdev-dpdk.c:174:9: warning: (near initialization for 
âport_conf.rxmodeâ) [enabled by default]
lib/netdev-dpdk.c:175:9: error: unknown field âhw_strip_crcâ specified 
in initializer
          .hw_strip_crc   = 0,
          ^
lib/netdev-dpdk.c:175:9: warning: excess elements in struct initializer 
[enabled by default]
lib/netdev-dpdk.c:175:9: warning: (near initialization for 
âport_conf.rxmodeâ) [enabled by default]
lib/netdev-dpdk.c: In function âdpdk_eth_dev_port_configâ:
lib/netdev-dpdk.c:906:24: error: âstruct rte_eth_rxmodeâ has no member 
named âenable_scatterâ
              conf.rxmode.enable_scatter = 1;
                         ^
lib/netdev-dpdk.c:911:16: error: âstruct rte_eth_rxmodeâ has no member 
named âhw_ip_checksumâ
      conf.rxmode.hw_ip_checksum = (dev->hw_ol_features &
                 ^
lib/netdev-dpdk.c:915:20: error: âstruct rte_eth_rxmodeâ has no member 
named âhw_strip_crcâ
          conf.rxmode.hw_strip_crc = 1;







---
This email has been checked for viruses by Avast antivirus software.
https://www.avast.com/antivirus
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.openvswitch.org/pipermail/ovs-discuss/attachments/20190126/3ac25d3d/attachment-0001.html>
-------------- next part --------------
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2017 Intel Corporation
 */

#ifndef _RTE_ETHDEV_H_
#define _RTE_ETHDEV_H_

/**
 * @file
 *
 * RTE Ethernet Device API
 *
 * The Ethernet Device API is composed of two parts:
 *
 * - The application-oriented Ethernet API that includes functions to setup
 *   an Ethernet device (configure it, setup its RX and TX queues and start it),
 *   to get its MAC address, the speed and the status of its physical link,
 *   to receive and to transmit packets, and so on.
 *
 * - The driver-oriented Ethernet API that exports functions allowing
 *   an Ethernet Poll Mode Driver (PMD) to allocate an Ethernet device instance,
 *   create memzone for HW rings and process registered callbacks, and so on.
 *   PMDs should include rte_ethdev_driver.h instead of this header.
 *
 * By default, all the functions of the Ethernet Device API exported by a PMD
 * are lock-free functions which assume to not be invoked in parallel on
 * different logical cores to work on the same target object.  For instance,
 * the receive function of a PMD cannot be invoked in parallel on two logical
 * cores to poll the same RX queue [of the same port]. Of course, this function
 * can be invoked in parallel by different logical cores on different RX queues.
 * It is the responsibility of the upper level application to enforce this rule.
 *
 * If needed, parallel accesses by multiple logical cores to shared queues
 * shall be explicitly protected by dedicated inline lock-aware functions
 * built on top of their corresponding lock-free functions of the PMD API.
 *
 * In all functions of the Ethernet API, the Ethernet device is
 * designated by an integer >= 0 named the device port identifier.
 *
 * At the Ethernet driver level, Ethernet devices are represented by a generic
 * data structure of type *rte_eth_dev*.
 *
 * Ethernet devices are dynamically registered during the PCI probing phase
 * performed at EAL initialization time.
 * When an Ethernet device is being probed, an *rte_eth_dev* structure and
 * a new port identifier are allocated for that device. Then, the eth_dev_init()
 * function supplied by the Ethernet driver matching the probed PCI
 * device is invoked to properly initialize the device.
 *
 * The role of the device init function consists of resetting the hardware,
 * checking access to Non-volatile Memory (NVM), reading the MAC address
 * from NVM etc.
 *
 * If the device init operation is successful, the correspondence between
 * the port identifier assigned to the new device and its associated
 * *rte_eth_dev* structure is effectively registered.
 * Otherwise, both the *rte_eth_dev* structure and the port identifier are
 * freed.
 *
 * The functions exported by the application Ethernet API to setup a device
 * designated by its port identifier must be invoked in the following order:
 *     - rte_eth_dev_configure()
 *     - rte_eth_tx_queue_setup()
 *     - rte_eth_rx_queue_setup()
 *     - rte_eth_dev_start()
 *
 * Then, the network application can invoke, in any order, the functions
 * exported by the Ethernet API to get the MAC address of a given device, to
 * get the speed and the status of a device physical link, to receive/transmit
 * [burst of] packets, and so on.
 *
 * If the application wants to change the configuration (i.e. call
 * rte_eth_dev_configure(), rte_eth_tx_queue_setup(), or
 * rte_eth_rx_queue_setup()), it must call rte_eth_dev_stop() first to stop the
 * device and then do the reconfiguration before calling rte_eth_dev_start()
 * again. The transmit and receive functions should not be invoked when the
 * device is stopped.
 *
 * Please note that some configuration is not stored between calls to
 * rte_eth_dev_stop()/rte_eth_dev_start(). The following configuration will
 * be retained:
 *
 *     - flow control settings
 *     - receive mode configuration (promiscuous mode, hardware checksum mode,
 *       RSS/VMDQ settings etc.)
 *     - VLAN filtering configuration
 *     - MAC addresses supplied to MAC address array
 *     - flow director filtering mode (but not filtering rules)
 *     - NIC queue statistics mappings
 *
 * Any other configuration will not be stored and will need to be re-entered
 * before a call to rte_eth_dev_start().
 *
 * Finally, a network application can close an Ethernet device by invoking the
 * rte_eth_dev_close() function.
 *
 * Each function of the application Ethernet API invokes a specific function
 * of the PMD that controls the target device designated by its port
 * identifier.
 * For this purpose, all device-specific functions of an Ethernet driver are
 * supplied through a set of pointers contained in a generic structure of type
 * *eth_dev_ops*.
 * The address of the *eth_dev_ops* structure is stored in the *rte_eth_dev*
 * structure by the device init function of the Ethernet driver, which is
 * invoked during the PCI probing phase, as explained earlier.
 *
 * In other words, each function of the Ethernet API simply retrieves the
 * *rte_eth_dev* structure associated with the device port identifier and
 * performs an indirect invocation of the corresponding driver function
 * supplied in the *eth_dev_ops* structure of the *rte_eth_dev* structure.
 *
 * For performance reasons, the address of the burst-oriented RX and TX
 * functions of the Ethernet driver are not contained in the *eth_dev_ops*
 * structure. Instead, they are directly stored at the beginning of the
 * *rte_eth_dev* structure to avoid an extra indirect memory access during
 * their invocation.
 *
 * RTE ethernet device drivers do not use interrupts for transmitting or
 * receiving. Instead, Ethernet drivers export Poll-Mode receive and transmit
 * functions to applications.
 * Both receive and transmit functions are packet-burst oriented to minimize
 * their cost per packet through the following optimizations:
 *
 * - Sharing among multiple packets the incompressible cost of the
 *   invocation of receive/transmit functions.
 *
 * - Enabling receive/transmit functions to take advantage of burst-oriented
 *   hardware features (L1 cache, prefetch instructions, NIC head/tail
 *   registers) to minimize the number of CPU cycles per packet, for instance,
 *   by avoiding useless read memory accesses to ring descriptors, or by
 *   systematically using arrays of pointers that exactly fit L1 cache line
 *   boundaries and sizes.
 *
 * The burst-oriented receive function does not provide any error notification,
 * to avoid the corresponding overhead. As a hint, the upper-level application
 * might check the status of the device link once being systematically returned
 * a 0 value by the receive function of the driver for a given number of tries.
 */

#ifdef __cplusplus
extern "C" {
#endif

#include <stdint.h>

/* Use this macro to check if LRO API is supported */
#define RTE_ETHDEV_HAS_LRO_SUPPORT

#include <rte_compat.h>
#include <rte_log.h>
#include <rte_interrupts.h>
#include <rte_dev.h>
#include <rte_devargs.h>
#include <rte_errno.h>
#include <rte_common.h>
#include <rte_config.h>

#include "rte_ether.h"
#include "rte_eth_ctrl.h"
#include "rte_dev_info.h"

extern int rte_eth_dev_logtype;

#define RTE_ETHDEV_LOG(level, ...) \
	rte_log(RTE_LOG_ ## level, rte_eth_dev_logtype, "" __VA_ARGS__)

struct rte_mbuf;

/**
 * Initializes a device iterator.
 *
 * This iterator allows accessing a list of devices matching some devargs.
 *
 * @param iter
 *   Device iterator handle initialized by the function.
 *   The fields bus_str and cls_str might be dynamically allocated,
 *   and could be freed by calling rte_eth_iterator_cleanup().
 *
 * @param devargs
 *   Device description string.
 *
 * @return
 *   0 on successful initialization, negative otherwise.
 */
int rte_eth_iterator_init(struct rte_dev_iterator *iter, const char *devargs);

/**
 * Iterates on devices with devargs filter.
 * The ownership is not checked.
 *
 * The next port id is returned, and the iterator is updated.
 *
 * @param iter
 *   Device iterator handle initialized by rte_eth_iterator_init().
 *   Some fields bus_str and cls_str might be freed when no more port is found,
 *   by calling rte_eth_iterator_cleanup().
 *
 * @return
 *   A port id if found, RTE_MAX_ETHPORTS otherwise.
 */
uint16_t rte_eth_iterator_next(struct rte_dev_iterator *iter);

/**
 * Free some allocated fields of the iterator.
 *
 * This function is automatically called by rte_eth_iterator_next()
 * on the last iteration (i.e. when no more matching port is found).
 *
 * It is safe to call this function twice; it will do nothing more.
 *
 * @param iter
 *   Device iterator handle initialized by rte_eth_iterator_init().
 *   The fields bus_str and cls_str are freed if needed.
 */
void rte_eth_iterator_cleanup(struct rte_dev_iterator *iter);

/**
 * Macro to iterate over all ethdev ports matching some devargs.
 *
 * If a break is done before the end of the loop,
 * the function rte_eth_iterator_cleanup() must be called.
 *
 * @param id
 *   Iterated port id of type uint16_t.
 * @param devargs
 *   Device parameters input as string of type char*.
 * @param iter
 *   Iterator handle of type struct rte_dev_iterator, used internally.
 */
#define RTE_ETH_FOREACH_MATCHING_DEV(id, devargs, iter) \
	for (rte_eth_iterator_init(iter, devargs), \
	     id = rte_eth_iterator_next(iter); \
	     id != RTE_MAX_ETHPORTS; \
	     id = rte_eth_iterator_next(iter))

/**
 * A structure used to retrieve statistics for an Ethernet port.
 * Not all statistics fields in struct rte_eth_stats are supported
 * by any type of network interface card (NIC). If any statistics
 * field is not supported, its value is 0.
 */
struct rte_eth_stats {
	uint64_t ipackets;  /**< Total number of successfully received packets. */
	uint64_t opackets;  /**< Total number of successfully transmitted packets.*/
	uint64_t ibytes;    /**< Total number of successfully received bytes. */
	uint64_t obytes;    /**< Total number of successfully transmitted bytes. */
	uint64_t imissed;
	/**< Total of RX packets dropped by the HW,
	 * because there are no available buffer (i.e. RX queues are full).
	 */
	uint64_t ierrors;   /**< Total number of erroneous received packets. */
	uint64_t oerrors;   /**< Total number of failed transmitted packets. */
	uint64_t rx_nombuf; /**< Total number of RX mbuf allocation failures. */
	uint64_t q_ipackets[RTE_ETHDEV_QUEUE_STAT_CNTRS];
	/**< Total number of queue RX packets. */
	uint64_t q_opackets[RTE_ETHDEV_QUEUE_STAT_CNTRS];
	/**< Total number of queue TX packets. */
	uint64_t q_ibytes[RTE_ETHDEV_QUEUE_STAT_CNTRS];
	/**< Total number of successfully received queue bytes. */
	uint64_t q_obytes[RTE_ETHDEV_QUEUE_STAT_CNTRS];
	/**< Total number of successfully transmitted queue bytes. */
	uint64_t q_errors[RTE_ETHDEV_QUEUE_STAT_CNTRS];
	/**< Total number of queue packets received that are dropped. */
};

/**
 * Device supported speeds bitmap flags
 */
#define ETH_LINK_SPEED_AUTONEG  (0 <<  0)  /**< Autonegotiate (all speeds) */
#define ETH_LINK_SPEED_FIXED    (1 <<  0)  /**< Disable autoneg (fixed speed) */
#define ETH_LINK_SPEED_10M_HD   (1 <<  1)  /**<  10 Mbps half-duplex */
#define ETH_LINK_SPEED_10M      (1 <<  2)  /**<  10 Mbps full-duplex */
#define ETH_LINK_SPEED_100M_HD  (1 <<  3)  /**< 100 Mbps half-duplex */
#define ETH_LINK_SPEED_100M     (1 <<  4)  /**< 100 Mbps full-duplex */
#define ETH_LINK_SPEED_1G       (1 <<  5)  /**<   1 Gbps */
#define ETH_LINK_SPEED_2_5G     (1 <<  6)  /**< 2.5 Gbps */
#define ETH_LINK_SPEED_5G       (1 <<  7)  /**<   5 Gbps */
#define ETH_LINK_SPEED_10G      (1 <<  8)  /**<  10 Gbps */
#define ETH_LINK_SPEED_20G      (1 <<  9)  /**<  20 Gbps */
#define ETH_LINK_SPEED_25G      (1 << 10)  /**<  25 Gbps */
#define ETH_LINK_SPEED_40G      (1 << 11)  /**<  40 Gbps */
#define ETH_LINK_SPEED_50G      (1 << 12)  /**<  50 Gbps */
#define ETH_LINK_SPEED_56G      (1 << 13)  /**<  56 Gbps */
#define ETH_LINK_SPEED_100G     (1 << 14)  /**< 100 Gbps */

/**
 * Ethernet numeric link speeds in Mbps
 */
#define ETH_SPEED_NUM_NONE         0 /**< Not defined */
#define ETH_SPEED_NUM_10M         10 /**<  10 Mbps */
#define ETH_SPEED_NUM_100M       100 /**< 100 Mbps */
#define ETH_SPEED_NUM_1G        1000 /**<   1 Gbps */
#define ETH_SPEED_NUM_2_5G      2500 /**< 2.5 Gbps */
#define ETH_SPEED_NUM_5G        5000 /**<   5 Gbps */
#define ETH_SPEED_NUM_10G      10000 /**<  10 Gbps */
#define ETH_SPEED_NUM_20G      20000 /**<  20 Gbps */
#define ETH_SPEED_NUM_25G      25000 /**<  25 Gbps */
#define ETH_SPEED_NUM_40G      40000 /**<  40 Gbps */
#define ETH_SPEED_NUM_50G      50000 /**<  50 Gbps */
#define ETH_SPEED_NUM_56G      56000 /**<  56 Gbps */
#define ETH_SPEED_NUM_100G    100000 /**< 100 Gbps */

/**
 * A structure used to retrieve link-level information of an Ethernet port.
 */
__extension__
struct rte_eth_link {
	uint32_t link_speed;        /**< ETH_SPEED_NUM_ */
	uint16_t link_duplex  : 1;  /**< ETH_LINK_[HALF/FULL]_DUPLEX */
	uint16_t link_autoneg : 1;  /**< ETH_LINK_[AUTONEG/FIXED] */
	uint16_t link_status  : 1;  /**< ETH_LINK_[DOWN/UP] */
} __attribute__((aligned(8)));      /**< aligned for atomic64 read/write */

/* Utility constants */
#define ETH_LINK_HALF_DUPLEX 0 /**< Half-duplex connection (see link_duplex). */
#define ETH_LINK_FULL_DUPLEX 1 /**< Full-duplex connection (see link_duplex). */
#define ETH_LINK_DOWN        0 /**< Link is down (see link_status). */
#define ETH_LINK_UP          1 /**< Link is up (see link_status). */
#define ETH_LINK_FIXED       0 /**< No autonegotiation (see link_autoneg). */
#define ETH_LINK_AUTONEG     1 /**< Autonegotiated (see link_autoneg). */

/**
 * A structure used to configure the ring threshold registers of an RX/TX
 * queue for an Ethernet port.
 */
struct rte_eth_thresh {
	uint8_t pthresh; /**< Ring prefetch threshold. */
	uint8_t hthresh; /**< Ring host threshold. */
	uint8_t wthresh; /**< Ring writeback threshold. */
};

/**
 *  Simple flags are used for rte_eth_conf.rxmode.mq_mode.
 */
#define ETH_MQ_RX_RSS_FLAG  0x1
#define ETH_MQ_RX_DCB_FLAG  0x2
#define ETH_MQ_RX_VMDQ_FLAG 0x4

/**
 *  A set of values to identify what method is to be used to route
 *  packets to multiple queues.
 */
enum rte_eth_rx_mq_mode {
	/** None of DCB,RSS or VMDQ mode */
	ETH_MQ_RX_NONE = 0,

	/** For RX side, only RSS is on */
	ETH_MQ_RX_RSS = ETH_MQ_RX_RSS_FLAG,
	/** For RX side,only DCB is on. */
	ETH_MQ_RX_DCB = ETH_MQ_RX_DCB_FLAG,
	/** Both DCB and RSS enable */
	ETH_MQ_RX_DCB_RSS = ETH_MQ_RX_RSS_FLAG | ETH_MQ_RX_DCB_FLAG,

	/** Only VMDQ, no RSS nor DCB */
	ETH_MQ_RX_VMDQ_ONLY = ETH_MQ_RX_VMDQ_FLAG,
	/** RSS mode with VMDQ */
	ETH_MQ_RX_VMDQ_RSS = ETH_MQ_RX_RSS_FLAG | ETH_MQ_RX_VMDQ_FLAG,
	/** Use VMDQ+DCB to route traffic to queues */
	ETH_MQ_RX_VMDQ_DCB = ETH_MQ_RX_VMDQ_FLAG | ETH_MQ_RX_DCB_FLAG,
	/** Enable both VMDQ and DCB in VMDq */
	ETH_MQ_RX_VMDQ_DCB_RSS = ETH_MQ_RX_RSS_FLAG | ETH_MQ_RX_DCB_FLAG |
				 ETH_MQ_RX_VMDQ_FLAG,
};

/**
 * for rx mq mode backward compatible
 */
#define ETH_RSS                       ETH_MQ_RX_RSS
#define VMDQ_DCB                      ETH_MQ_RX_VMDQ_DCB
#define ETH_DCB_RX                    ETH_MQ_RX_DCB

/**
 * A set of values to identify what method is to be used to transmit
 * packets using multi-TCs.
 */
enum rte_eth_tx_mq_mode {
	ETH_MQ_TX_NONE    = 0,  /**< It is in neither DCB nor VT mode. */
	ETH_MQ_TX_DCB,          /**< For TX side,only DCB is on. */
	ETH_MQ_TX_VMDQ_DCB,	/**< For TX side,both DCB and VT is on. */
	ETH_MQ_TX_VMDQ_ONLY,    /**< Only VT on, no DCB */
};

/**
 * for tx mq mode backward compatible
 */
#define ETH_DCB_NONE                ETH_MQ_TX_NONE
#define ETH_VMDQ_DCB_TX             ETH_MQ_TX_VMDQ_DCB
#define ETH_DCB_TX                  ETH_MQ_TX_DCB

/**
 * A structure used to configure the RX features of an Ethernet port.
 */
struct rte_eth_rxmode {
	/** The multi-queue packet distribution mode to be used, e.g. RSS. */
	enum rte_eth_rx_mq_mode mq_mode;
	uint32_t max_rx_pkt_len;  /**< Only used if JUMBO_FRAME enabled. */
	uint16_t split_hdr_size;  /**< hdr buf size (header_split enabled).*/
	/**
	 * Per-port Rx offloads to be set using DEV_RX_OFFLOAD_* flags.
	 * Only offloads set on rx_offload_capa field on rte_eth_dev_info
	 * structure are allowed to be set.
	 */
	uint64_t offloads;
};

/**
 * VLAN types to indicate if it is for single VLAN, inner VLAN or outer VLAN.
 * Note that single VLAN is treated the same as inner VLAN.
 */
enum rte_vlan_type {
	ETH_VLAN_TYPE_UNKNOWN = 0,
	ETH_VLAN_TYPE_INNER, /**< Inner VLAN. */
	ETH_VLAN_TYPE_OUTER, /**< Single VLAN, or outer VLAN. */
	ETH_VLAN_TYPE_MAX,
};

/**
 * A structure used to describe a vlan filter.
 * If the bit corresponding to a VID is set, such VID is on.
 */
struct rte_vlan_filter_conf {
	uint64_t ids[64];
};

/**
 * A structure used to configure the Receive Side Scaling (RSS) feature
 * of an Ethernet port.
 * If not NULL, the *rss_key* pointer of the *rss_conf* structure points
 * to an array holding the RSS key to use for hashing specific header
 * fields of received packets. The length of this array should be indicated
 * by *rss_key_len* below. Otherwise, a default random hash key is used by
 * the device driver.
 *
 * The *rss_key_len* field of the *rss_conf* structure indicates the length
 * in bytes of the array pointed by *rss_key*. To be compatible, this length
 * will be checked in i40e only. Others assume 40 bytes to be used as before.
 *
 * The *rss_hf* field of the *rss_conf* structure indicates the different
 * types of IPv4/IPv6 packets to which the RSS hashing must be applied.
 * Supplying an *rss_hf* equal to zero disables the RSS feature.
 */
struct rte_eth_rss_conf {
	uint8_t *rss_key;    /**< If not NULL, 40-byte hash key. */
	uint8_t rss_key_len; /**< hash key length in bytes. */
	uint64_t rss_hf;     /**< Hash functions to apply - see below. */
};

/*
 * The RSS offload types are defined based on flow types which are defined
 * in rte_eth_ctrl.h. Different NIC hardwares may support different RSS offload
 * types. The supported flow types or RSS offload types can be queried by
 * rte_eth_dev_info_get().
 */
#define ETH_RSS_IPV4               (1ULL << RTE_ETH_FLOW_IPV4)
#define ETH_RSS_FRAG_IPV4          (1ULL << RTE_ETH_FLOW_FRAG_IPV4)
#define ETH_RSS_NONFRAG_IPV4_TCP   (1ULL << RTE_ETH_FLOW_NONFRAG_IPV4_TCP)
#define ETH_RSS_NONFRAG_IPV4_UDP   (1ULL << RTE_ETH_FLOW_NONFRAG_IPV4_UDP)
#define ETH_RSS_NONFRAG_IPV4_SCTP  (1ULL << RTE_ETH_FLOW_NONFRAG_IPV4_SCTP)
#define ETH_RSS_NONFRAG_IPV4_OTHER (1ULL << RTE_ETH_FLOW_NONFRAG_IPV4_OTHER)
#define ETH_RSS_IPV6               (1ULL << RTE_ETH_FLOW_IPV6)
#define ETH_RSS_FRAG_IPV6          (1ULL << RTE_ETH_FLOW_FRAG_IPV6)
#define ETH_RSS_NONFRAG_IPV6_TCP   (1ULL << RTE_ETH_FLOW_NONFRAG_IPV6_TCP)
#define ETH_RSS_NONFRAG_IPV6_UDP   (1ULL << RTE_ETH_FLOW_NONFRAG_IPV6_UDP)
#define ETH_RSS_NONFRAG_IPV6_SCTP  (1ULL << RTE_ETH_FLOW_NONFRAG_IPV6_SCTP)
#define ETH_RSS_NONFRAG_IPV6_OTHER (1ULL << RTE_ETH_FLOW_NONFRAG_IPV6_OTHER)
#define ETH_RSS_L2_PAYLOAD         (1ULL << RTE_ETH_FLOW_L2_PAYLOAD)
#define ETH_RSS_IPV6_EX            (1ULL << RTE_ETH_FLOW_IPV6_EX)
#define ETH_RSS_IPV6_TCP_EX        (1ULL << RTE_ETH_FLOW_IPV6_TCP_EX)
#define ETH_RSS_IPV6_UDP_EX        (1ULL << RTE_ETH_FLOW_IPV6_UDP_EX)
#define ETH_RSS_PORT               (1ULL << RTE_ETH_FLOW_PORT)
#define ETH_RSS_VXLAN              (1ULL << RTE_ETH_FLOW_VXLAN)
#define ETH_RSS_GENEVE             (1ULL << RTE_ETH_FLOW_GENEVE)
#define ETH_RSS_NVGRE              (1ULL << RTE_ETH_FLOW_NVGRE)

#define ETH_RSS_IP ( \
	ETH_RSS_IPV4 | \
	ETH_RSS_FRAG_IPV4 | \
	ETH_RSS_NONFRAG_IPV4_OTHER | \
	ETH_RSS_IPV6 | \
	ETH_RSS_FRAG_IPV6 | \
	ETH_RSS_NONFRAG_IPV6_OTHER | \
	ETH_RSS_IPV6_EX)

#define ETH_RSS_UDP ( \
	ETH_RSS_NONFRAG_IPV4_UDP | \
	ETH_RSS_NONFRAG_IPV6_UDP | \
	ETH_RSS_IPV6_UDP_EX)

#define ETH_RSS_TCP ( \
	ETH_RSS_NONFRAG_IPV4_TCP | \
	ETH_RSS_NONFRAG_IPV6_TCP | \
	ETH_RSS_IPV6_TCP_EX)

#define ETH_RSS_SCTP ( \
	ETH_RSS_NONFRAG_IPV4_SCTP | \
	ETH_RSS_NONFRAG_IPV6_SCTP)

#define ETH_RSS_TUNNEL ( \
	ETH_RSS_VXLAN  | \
	ETH_RSS_GENEVE | \
	ETH_RSS_NVGRE)

/**< Mask of valid RSS hash protocols */
#define ETH_RSS_PROTO_MASK ( \
	ETH_RSS_IPV4 | \
	ETH_RSS_FRAG_IPV4 | \
	ETH_RSS_NONFRAG_IPV4_TCP | \
	ETH_RSS_NONFRAG_IPV4_UDP | \
	ETH_RSS_NONFRAG_IPV4_SCTP | \
	ETH_RSS_NONFRAG_IPV4_OTHER | \
	ETH_RSS_IPV6 | \
	ETH_RSS_FRAG_IPV6 | \
	ETH_RSS_NONFRAG_IPV6_TCP | \
	ETH_RSS_NONFRAG_IPV6_UDP | \
	ETH_RSS_NONFRAG_IPV6_SCTP | \
	ETH_RSS_NONFRAG_IPV6_OTHER | \
	ETH_RSS_L2_PAYLOAD | \
	ETH_RSS_IPV6_EX | \
	ETH_RSS_IPV6_TCP_EX | \
	ETH_RSS_IPV6_UDP_EX | \
	ETH_RSS_PORT  | \
	ETH_RSS_VXLAN | \
	ETH_RSS_GENEVE | \
	ETH_RSS_NVGRE)

/*
 * Definitions used for redirection table entry size.
 * Some RSS RETA sizes may not be supported by some drivers, check the
 * documentation or the description of relevant functions for more details.
 */
#define ETH_RSS_RETA_SIZE_64  64
#define ETH_RSS_RETA_SIZE_128 128
#define ETH_RSS_RETA_SIZE_256 256
#define ETH_RSS_RETA_SIZE_512 512
#define RTE_RETA_GROUP_SIZE   64

/* Definitions used for VMDQ and DCB functionality */
#define ETH_VMDQ_MAX_VLAN_FILTERS   64 /**< Maximum nb. of VMDQ vlan filters. */
#define ETH_DCB_NUM_USER_PRIORITIES 8  /**< Maximum nb. of DCB priorities. */
#define ETH_VMDQ_DCB_NUM_QUEUES     128 /**< Maximum nb. of VMDQ DCB queues. */
#define ETH_DCB_NUM_QUEUES          128 /**< Maximum nb. of DCB queues. */

/* DCB capability defines */
#define ETH_DCB_PG_SUPPORT      0x00000001 /**< Priority Group(ETS) support. */
#define ETH_DCB_PFC_SUPPORT     0x00000002 /**< Priority Flow Control support. */

/* Definitions used for VLAN Offload functionality */
#define ETH_VLAN_STRIP_OFFLOAD   0x0001 /**< VLAN Strip  On/Off */
#define ETH_VLAN_FILTER_OFFLOAD  0x0002 /**< VLAN Filter On/Off */
#define ETH_VLAN_EXTEND_OFFLOAD  0x0004 /**< VLAN Extend On/Off */

/* Definitions used for mask VLAN setting */
#define ETH_VLAN_STRIP_MASK   0x0001 /**< VLAN Strip  setting mask */
#define ETH_VLAN_FILTER_MASK  0x0002 /**< VLAN Filter  setting mask*/
#define ETH_VLAN_EXTEND_MASK  0x0004 /**< VLAN Extend  setting mask*/
#define ETH_VLAN_ID_MAX       0x0FFF /**< VLAN ID is in lower 12 bits*/

/* Definitions used for receive MAC address   */
#define ETH_NUM_RECEIVE_MAC_ADDR  128 /**< Maximum nb. of receive mac addr. */

/* Definitions used for unicast hash  */
#define ETH_VMDQ_NUM_UC_HASH_ARRAY  128 /**< Maximum nb. of UC hash array. */

/* Definitions used for VMDQ pool rx mode setting */
#define ETH_VMDQ_ACCEPT_UNTAG   0x0001 /**< accept untagged packets. */
#define ETH_VMDQ_ACCEPT_HASH_MC 0x0002 /**< accept packets in multicast table . */
#define ETH_VMDQ_ACCEPT_HASH_UC 0x0004 /**< accept packets in unicast table. */
#define ETH_VMDQ_ACCEPT_BROADCAST   0x0008 /**< accept broadcast packets. */
#define ETH_VMDQ_ACCEPT_MULTICAST   0x0010 /**< multicast promiscuous. */

/** Maximum nb. of vlan per mirror rule */
#define ETH_MIRROR_MAX_VLANS       64

#define ETH_MIRROR_VIRTUAL_POOL_UP     0x01  /**< Virtual Pool uplink Mirroring. */
#define ETH_MIRROR_UPLINK_PORT         0x02  /**< Uplink Port Mirroring. */
#define ETH_MIRROR_DOWNLINK_PORT       0x04  /**< Downlink Port Mirroring. */
#define ETH_MIRROR_VLAN                0x08  /**< VLAN Mirroring. */
#define ETH_MIRROR_VIRTUAL_POOL_DOWN   0x10  /**< Virtual Pool downlink Mirroring. */

/**
 * A structure used to configure VLAN traffic mirror of an Ethernet port.
 */
struct rte_eth_vlan_mirror {
	uint64_t vlan_mask; /**< mask for valid VLAN ID. */
	/** VLAN ID list for vlan mirroring. */
	uint16_t vlan_id[ETH_MIRROR_MAX_VLANS];
};

/**
 * A structure used to configure traffic mirror of an Ethernet port.
 */
struct rte_eth_mirror_conf {
	uint8_t rule_type; /**< Mirroring rule type */
	uint8_t dst_pool;  /**< Destination pool for this mirror rule. */
	uint64_t pool_mask; /**< Bitmap of pool for pool mirroring */
	/** VLAN ID setting for VLAN mirroring. */
	struct rte_eth_vlan_mirror vlan;
};

/**
 * A structure used to configure 64 entries of Redirection Table of the
 * Receive Side Scaling (RSS) feature of an Ethernet port. To configure
 * more than 64 entries supported by hardware, an array of this structure
 * is needed.
 */
struct rte_eth_rss_reta_entry64 {
	uint64_t mask;
	/**< Mask bits indicate which entries need to be updated/queried. */
	uint16_t reta[RTE_RETA_GROUP_SIZE];
	/**< Group of 64 redirection table entries. */
};

/**
 * This enum indicates the possible number of traffic classes
 * in DCB configurations
 */
enum rte_eth_nb_tcs {
	ETH_4_TCS = 4, /**< 4 TCs with DCB. */
	ETH_8_TCS = 8  /**< 8 TCs with DCB. */
};

/**
 * This enum indicates the possible number of queue pools
 * in VMDQ configurations.
 */
enum rte_eth_nb_pools {
	ETH_8_POOLS = 8,    /**< 8 VMDq pools. */
	ETH_16_POOLS = 16,  /**< 16 VMDq pools. */
	ETH_32_POOLS = 32,  /**< 32 VMDq pools. */
	ETH_64_POOLS = 64   /**< 64 VMDq pools. */
};

/* This structure may be extended in future. */
struct rte_eth_dcb_rx_conf {
	enum rte_eth_nb_tcs nb_tcs; /**< Possible DCB TCs, 4 or 8 TCs */
	/** Traffic class each UP mapped to. */
	uint8_t dcb_tc[ETH_DCB_NUM_USER_PRIORITIES];
};

struct rte_eth_vmdq_dcb_tx_conf {
	enum rte_eth_nb_pools nb_queue_pools; /**< With DCB, 16 or 32 pools. */
	/** Traffic class each UP mapped to. */
	uint8_t dcb_tc[ETH_DCB_NUM_USER_PRIORITIES];
};

struct rte_eth_dcb_tx_conf {
	enum rte_eth_nb_tcs nb_tcs; /**< Possible DCB TCs, 4 or 8 TCs. */
	/** Traffic class each UP mapped to. */
	uint8_t dcb_tc[ETH_DCB_NUM_USER_PRIORITIES];
};

struct rte_eth_vmdq_tx_conf {
	enum rte_eth_nb_pools nb_queue_pools; /**< VMDq mode, 64 pools. */
};

/**
 * A structure used to configure the VMDQ+DCB feature
 * of an Ethernet port.
 *
 * Using this feature, packets are routed to a pool of queues, based
 * on the vlan id in the vlan tag, and then to a specific queue within
 * that pool, using the user priority vlan tag field.
 *
 * A default pool may be used, if desired, to route all traffic which
 * does not match the vlan filter rules.
 */
struct rte_eth_vmdq_dcb_conf {
	enum rte_eth_nb_pools nb_queue_pools; /**< With DCB, 16 or 32 pools */
	uint8_t enable_default_pool; /**< If non-zero, use a default pool */
	uint8_t default_pool; /**< The default pool, if applicable */
	uint8_t nb_pool_maps; /**< We can have up to 64 filters/mappings */
	struct {
		uint16_t vlan_id; /**< The vlan id of the received frame */
		uint64_t pools;   /**< Bitmask of pools for packet rx */
	} pool_map[ETH_VMDQ_MAX_VLAN_FILTERS]; /**< VMDq vlan pool maps. */
	uint8_t dcb_tc[ETH_DCB_NUM_USER_PRIORITIES];
	/**< Selects a queue in a pool */
};

/**
 * A structure used to configure the VMDQ feature of an Ethernet port when
 * not combined with the DCB feature.
 *
 * Using this feature, packets are routed to a pool of queues. By default,
 * the pool selection is based on the MAC address, the vlan id in the
 * vlan tag as specified in the pool_map array.
 * Passing the ETH_VMDQ_ACCEPT_UNTAG in the rx_mode field allows pool
 * selection using only the MAC address. MAC address to pool mapping is done
 * using the rte_eth_dev_mac_addr_add function, with the pool parameter
 * corresponding to the pool id.
 *
 * Queue selection within the selected pool will be done using RSS when
 * it is enabled or revert to the first queue of the pool if not.
 *
 * A default pool may be used, if desired, to route all traffic which
 * does not match the vlan filter rules or any pool MAC address.
 */
struct rte_eth_vmdq_rx_conf {
	enum rte_eth_nb_pools nb_queue_pools; /**< VMDq only mode, 8 or 64 pools */
	uint8_t enable_default_pool; /**< If non-zero, use a default pool */
	uint8_t default_pool; /**< The default pool, if applicable */
	uint8_t enable_loop_back; /**< Enable VT loop back */
	uint8_t nb_pool_maps; /**< We can have up to 64 filters/mappings */
	uint32_t rx_mode; /**< Flags from ETH_VMDQ_ACCEPT_* */
	struct {
		uint16_t vlan_id; /**< The vlan id of the received frame */
		uint64_t pools;   /**< Bitmask of pools for packet rx */
	} pool_map[ETH_VMDQ_MAX_VLAN_FILTERS]; /**< VMDq vlan pool maps. */
};

/**
 * A structure used to configure the TX features of an Ethernet port.
 */
struct rte_eth_txmode {
	enum rte_eth_tx_mq_mode mq_mode; /**< TX multi-queues mode. */
	/**
	 * Per-port Tx offloads to be set using DEV_TX_OFFLOAD_* flags.
	 * Only offloads set on tx_offload_capa field on rte_eth_dev_info
	 * structure are allowed to be set.
	 */
	uint64_t offloads;

	/* For i40e specifically */
	uint16_t pvid;
	__extension__
	uint8_t hw_vlan_reject_tagged : 1,
		/**< If set, reject sending out tagged pkts */
		hw_vlan_reject_untagged : 1,
		/**< If set, reject sending out untagged pkts */
		hw_vlan_insert_pvid : 1;
		/**< If set, enable port based VLAN insertion */
};

/**
 * A structure used to configure an RX ring of an Ethernet port.
 */
struct rte_eth_rxconf {
	struct rte_eth_thresh rx_thresh; /**< RX ring threshold registers. */
	uint16_t rx_free_thresh; /**< Drives the freeing of RX descriptors. */
	uint8_t rx_drop_en; /**< Drop packets if no descriptors are available. */
	uint8_t rx_deferred_start; /**< Do not start queue with rte_eth_dev_start(). */
	/**
	 * Per-queue Rx offloads to be set using DEV_RX_OFFLOAD_* flags.
	 * Only offloads set on rx_queue_offload_capa or rx_offload_capa
	 * fields on rte_eth_dev_info structure are allowed to be set.
	 */
	uint64_t offloads;
};

/**
 * A structure used to configure a TX ring of an Ethernet port.
 */
struct rte_eth_txconf {
	struct rte_eth_thresh tx_thresh; /**< TX ring threshold registers. */
	uint16_t tx_rs_thresh; /**< Drives the setting of RS bit on TXDs. */
	uint16_t tx_free_thresh; /**< Start freeing TX buffers if there are
				      less free descriptors than this value. */

	uint8_t tx_deferred_start; /**< Do not start queue with rte_eth_dev_start(). */
	/**
	 * Per-queue Tx offloads to be set  using DEV_TX_OFFLOAD_* flags.
	 * Only offloads set on tx_queue_offload_capa or tx_offload_capa
	 * fields on rte_eth_dev_info structure are allowed to be set.
	 */
	uint64_t offloads;
};

/**
 * A structure contains information about HW descriptor ring limitations.
 */
struct rte_eth_desc_lim {
	uint16_t nb_max;   /**< Max allowed number of descriptors. */
	uint16_t nb_min;   /**< Min allowed number of descriptors. */
	uint16_t nb_align; /**< Number of descriptors should be aligned to. */

	/**
	 * Max allowed number of segments per whole packet.
	 *
	 * - For TSO packet this is the total number of data descriptors allowed
	 *   by device.
	 *
	 * @see nb_mtu_seg_max
	 */
	uint16_t nb_seg_max;

	/**
	 * Max number of segments per one MTU.
	 *
	 * - For non-TSO packet, this is the maximum allowed number of segments
	 *   in a single transmit packet.
	 *
	 * - For TSO packet each segment within the TSO may span up to this
	 *   value.
	 *
	 * @see nb_seg_max
	 */
	uint16_t nb_mtu_seg_max;
};

/**
 * This enum indicates the flow control mode
 */
enum rte_eth_fc_mode {
	RTE_FC_NONE = 0, /**< Disable flow control. */
	RTE_FC_RX_PAUSE, /**< RX pause frame, enable flowctrl on TX side. */
	RTE_FC_TX_PAUSE, /**< TX pause frame, enable flowctrl on RX side. */
	RTE_FC_FULL      /**< Enable flow control on both side. */
};

/**
 * A structure used to configure Ethernet flow control parameter.
 * These parameters will be configured into the register of the NIC.
 * Please refer to the corresponding data sheet for proper value.
 */
struct rte_eth_fc_conf {
	uint32_t high_water;  /**< High threshold value to trigger XOFF */
	uint32_t low_water;   /**< Low threshold value to trigger XON */
	uint16_t pause_time;  /**< Pause quota in the Pause frame */
	uint16_t send_xon;    /**< Is XON frame need be sent */
	enum rte_eth_fc_mode mode;  /**< Link flow control mode */
	uint8_t mac_ctrl_frame_fwd; /**< Forward MAC control frames */
	uint8_t autoneg;      /**< Use Pause autoneg */
};

/**
 * A structure used to configure Ethernet priority flow control parameter.
 * These parameters will be configured into the register of the NIC.
 * Please refer to the corresponding data sheet for proper value.
 */
struct rte_eth_pfc_conf {
	struct rte_eth_fc_conf fc; /**< General flow control parameter. */
	uint8_t priority;          /**< VLAN User Priority. */
};

/**
 *  Memory space that can be configured to store Flow Director filters
 *  in the board memory.
 */
enum rte_fdir_pballoc_type {
	RTE_FDIR_PBALLOC_64K = 0,  /**< 64k. */
	RTE_FDIR_PBALLOC_128K,     /**< 128k. */
	RTE_FDIR_PBALLOC_256K,     /**< 256k. */
};

/**
 *  Select report mode of FDIR hash information in RX descriptors.
 */
enum rte_fdir_status_mode {
	RTE_FDIR_NO_REPORT_STATUS = 0, /**< Never report FDIR hash. */
	RTE_FDIR_REPORT_STATUS, /**< Only report FDIR hash for matching pkts. */
	RTE_FDIR_REPORT_STATUS_ALWAYS, /**< Always report FDIR hash. */
};

/**
 * A structure used to configure the Flow Director (FDIR) feature
 * of an Ethernet port.
 *
 * If mode is RTE_FDIR_DISABLE, the pballoc value is ignored.
 */
struct rte_fdir_conf {
	enum rte_fdir_mode mode; /**< Flow Director mode. */
	enum rte_fdir_pballoc_type pballoc; /**< Space for FDIR filters. */
	enum rte_fdir_status_mode status;  /**< How to report FDIR hash. */
	/** RX queue of packets matching a "drop" filter in perfect mode. */
	uint8_t drop_queue;
	struct rte_eth_fdir_masks mask;
	struct rte_eth_fdir_flex_conf flex_conf;
	/**< Flex payload configuration. */
};

/**
 * UDP tunneling configuration.
 * Used to config the UDP port for a type of tunnel.
 * NICs need the UDP port to identify the tunnel type.
 * Normally a type of tunnel has a default UDP port, this structure can be used
 * in case if the users want to change or support more UDP port.
 */
struct rte_eth_udp_tunnel {
	uint16_t udp_port; /**< UDP port used for the tunnel. */
	uint8_t prot_type; /**< Tunnel type. Defined in rte_eth_tunnel_type. */
};

/**
 * A structure used to enable/disable specific device interrupts.
 */
struct rte_intr_conf {
	/** enable/disable lsc interrupt. 0 (default) - disable, 1 enable */
	uint32_t lsc:1;
	/** enable/disable rxq interrupt. 0 (default) - disable, 1 enable */
	uint32_t rxq:1;
	/** enable/disable rmv interrupt. 0 (default) - disable, 1 enable */
	uint32_t rmv:1;
};

/**
 * A structure used to configure an Ethernet port.
 * Depending upon the RX multi-queue mode, extra advanced
 * configuration settings may be needed.
 */
struct rte_eth_conf {
	uint32_t link_speeds; /**< bitmap of ETH_LINK_SPEED_XXX of speeds to be
				used. ETH_LINK_SPEED_FIXED disables link
				autonegotiation, and a unique speed shall be
				set. Otherwise, the bitmap defines the set of
				speeds to be advertised. If the special value
				ETH_LINK_SPEED_AUTONEG (0) is used, all speeds
				supported are advertised. */
	struct rte_eth_rxmode rxmode; /**< Port RX configuration. */
	struct rte_eth_txmode txmode; /**< Port TX configuration. */
	uint32_t lpbk_mode; /**< Loopback operation mode. By default the value
			         is 0, meaning the loopback mode is disabled.
				 Read the datasheet of given ethernet controller
				 for details. The possible values of this field
				 are defined in implementation of each driver. */
	struct {
		struct rte_eth_rss_conf rss_conf; /**< Port RSS configuration */
		struct rte_eth_vmdq_dcb_conf vmdq_dcb_conf;
		/**< Port vmdq+dcb configuration. */
		struct rte_eth_dcb_rx_conf dcb_rx_conf;
		/**< Port dcb RX configuration. */
		struct rte_eth_vmdq_rx_conf vmdq_rx_conf;
		/**< Port vmdq RX configuration. */
	} rx_adv_conf; /**< Port RX filtering configuration. */
	union {
		struct rte_eth_vmdq_dcb_tx_conf vmdq_dcb_tx_conf;
		/**< Port vmdq+dcb TX configuration. */
		struct rte_eth_dcb_tx_conf dcb_tx_conf;
		/**< Port dcb TX configuration. */
		struct rte_eth_vmdq_tx_conf vmdq_tx_conf;
		/**< Port vmdq TX configuration. */
	} tx_adv_conf; /**< Port TX DCB configuration (union). */
	/** Currently,Priority Flow Control(PFC) are supported,if DCB with PFC
	    is needed,and the variable must be set ETH_DCB_PFC_SUPPORT. */
	uint32_t dcb_capability_en;
	struct rte_fdir_conf fdir_conf; /**< FDIR configuration. */
	struct rte_intr_conf intr_conf; /**< Interrupt mode configuration. */
};

/**
 * RX offload capabilities of a device.
 */
#define DEV_RX_OFFLOAD_VLAN_STRIP  0x00000001
#define DEV_RX_OFFLOAD_IPV4_CKSUM  0x00000002
#define DEV_RX_OFFLOAD_UDP_CKSUM   0x00000004
#define DEV_RX_OFFLOAD_TCP_CKSUM   0x00000008
#define DEV_RX_OFFLOAD_TCP_LRO     0x00000010
#define DEV_RX_OFFLOAD_QINQ_STRIP  0x00000020
#define DEV_RX_OFFLOAD_OUTER_IPV4_CKSUM 0x00000040
#define DEV_RX_OFFLOAD_MACSEC_STRIP     0x00000080
#define DEV_RX_OFFLOAD_HEADER_SPLIT	0x00000100
#define DEV_RX_OFFLOAD_VLAN_FILTER	0x00000200
#define DEV_RX_OFFLOAD_VLAN_EXTEND	0x00000400
#define DEV_RX_OFFLOAD_JUMBO_FRAME	0x00000800
#define DEV_RX_OFFLOAD_SCATTER		0x00002000
#define DEV_RX_OFFLOAD_TIMESTAMP	0x00004000
#define DEV_RX_OFFLOAD_SECURITY         0x00008000
#define DEV_RX_OFFLOAD_KEEP_CRC		0x00010000
#define DEV_RX_OFFLOAD_SCTP_CKSUM	0x00020000
#define DEV_RX_OFFLOAD_OUTER_UDP_CKSUM  0x00040000

#define DEV_RX_OFFLOAD_CHECKSUM (DEV_RX_OFFLOAD_IPV4_CKSUM | \
				 DEV_RX_OFFLOAD_UDP_CKSUM | \
				 DEV_RX_OFFLOAD_TCP_CKSUM)
#define DEV_RX_OFFLOAD_VLAN (DEV_RX_OFFLOAD_VLAN_STRIP | \
			     DEV_RX_OFFLOAD_VLAN_FILTER | \
			     DEV_RX_OFFLOAD_VLAN_EXTEND)

/*
 * If new Rx offload capabilities are defined, they also must be
 * mentioned in rte_rx_offload_names in rte_ethdev.c file.
 */

/**
 * TX offload capabilities of a device.
 */
#define DEV_TX_OFFLOAD_VLAN_INSERT 0x00000001
#define DEV_TX_OFFLOAD_IPV4_CKSUM  0x00000002
#define DEV_TX_OFFLOAD_UDP_CKSUM   0x00000004
#define DEV_TX_OFFLOAD_TCP_CKSUM   0x00000008
#define DEV_TX_OFFLOAD_SCTP_CKSUM  0x00000010
#define DEV_TX_OFFLOAD_TCP_TSO     0x00000020
#define DEV_TX_OFFLOAD_UDP_TSO     0x00000040
#define DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM 0x00000080 /**< Used for tunneling packet. */
#define DEV_TX_OFFLOAD_QINQ_INSERT 0x00000100
#define DEV_TX_OFFLOAD_VXLAN_TNL_TSO    0x00000200    /**< Used for tunneling packet. */
#define DEV_TX_OFFLOAD_GRE_TNL_TSO      0x00000400    /**< Used for tunneling packet. */
#define DEV_TX_OFFLOAD_IPIP_TNL_TSO     0x00000800    /**< Used for tunneling packet. */
#define DEV_TX_OFFLOAD_GENEVE_TNL_TSO   0x00001000    /**< Used for tunneling packet. */
#define DEV_TX_OFFLOAD_MACSEC_INSERT    0x00002000
#define DEV_TX_OFFLOAD_MT_LOCKFREE      0x00004000
/**< Multiple threads can invoke rte_eth_tx_burst() concurrently on the same
 * tx queue without SW lock.
 */
#define DEV_TX_OFFLOAD_MULTI_SEGS	0x00008000
/**< Device supports multi segment send. */
#define DEV_TX_OFFLOAD_MBUF_FAST_FREE	0x00010000
/**< Device supports optimization for fast release of mbufs.
 *   When set application must guarantee that per-queue all mbufs comes from
 *   the same mempool and has refcnt = 1.
 */
#define DEV_TX_OFFLOAD_SECURITY         0x00020000
/**
 * Device supports generic UDP tunneled packet TSO.
 * Application must set PKT_TX_TUNNEL_UDP and other mbuf fields required
 * for tunnel TSO.
 */
#define DEV_TX_OFFLOAD_UDP_TNL_TSO      0x00040000
/**
 * Device supports generic IP tunneled packet TSO.
 * Application must set PKT_TX_TUNNEL_IP and other mbuf fields required
 * for tunnel TSO.
 */
#define DEV_TX_OFFLOAD_IP_TNL_TSO       0x00080000
/** Device supports outer UDP checksum */
#define DEV_TX_OFFLOAD_OUTER_UDP_CKSUM  0x00100000
/**
 * Device supports match on metadata Tx offload..
 * Application must set PKT_TX_METADATA and mbuf metadata field.
 */
#define DEV_TX_OFFLOAD_MATCH_METADATA   0x00200000

#define RTE_ETH_DEV_CAPA_RUNTIME_RX_QUEUE_SETUP 0x00000001
/**< Device supports Rx queue setup after device started*/
#define RTE_ETH_DEV_CAPA_RUNTIME_TX_QUEUE_SETUP 0x00000002
/**< Device supports Tx queue setup after device started*/

/*
 * If new Tx offload capabilities are defined, they also must be
 * mentioned in rte_tx_offload_names in rte_ethdev.c file.
 */

/*
 * Fallback default preferred Rx/Tx port parameters.
 * These are used if an application requests default parameters
 * but the PMD does not provide preferred values.
 */
#define RTE_ETH_DEV_FALLBACK_RX_RINGSIZE 512
#define RTE_ETH_DEV_FALLBACK_TX_RINGSIZE 512
#define RTE_ETH_DEV_FALLBACK_RX_NBQUEUES 1
#define RTE_ETH_DEV_FALLBACK_TX_NBQUEUES 1

/**
 * Preferred Rx/Tx port parameters.
 * There are separate instances of this structure for transmission
 * and reception respectively.
 */
struct rte_eth_dev_portconf {
	uint16_t burst_size; /**< Device-preferred burst size */
	uint16_t ring_size; /**< Device-preferred size of queue rings */
	uint16_t nb_queues; /**< Device-preferred number of queues */
};

/**
 * Default values for switch domain id when ethdev does not support switch
 * domain definitions.
 */
#define RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID	(0)

/**
 * Ethernet device associated switch information
 */
struct rte_eth_switch_info {
	const char *name;	/**< switch name */
	uint16_t domain_id;	/**< switch domain id */
	uint16_t port_id;
	/**<
	 * mapping to the devices physical switch port as enumerated from the
	 * perspective of the embedded interconnect/switch. For SR-IOV enabled
	 * device this may correspond to the VF_ID of each virtual function,
	 * but each driver should explicitly define the mapping of switch
	 * port identifier to that physical interconnect/switch
	 */
};

/**
 * Ethernet device information
 */

/**
 * A structure used to retrieve the contextual information of
 * an Ethernet device, such as the controlling driver of the
 * device, etc...
 */
struct rte_eth_dev_info {
	struct rte_device *device; /** Generic device information */
	const char *driver_name; /**< Device Driver name. */
	unsigned int if_index; /**< Index to bound host interface, or 0 if none.
		Use if_indextoname() to translate into an interface name. */
	const uint32_t *dev_flags; /**< Device flags */
	uint32_t min_rx_bufsize; /**< Minimum size of RX buffer. */
	uint32_t max_rx_pktlen; /**< Maximum configurable length of RX pkt. */
	uint16_t max_rx_queues; /**< Maximum number of RX queues. */
	uint16_t max_tx_queues; /**< Maximum number of TX queues. */
	uint32_t max_mac_addrs; /**< Maximum number of MAC addresses. */
	uint32_t max_hash_mac_addrs;
	/** Maximum number of hash MAC addresses for MTA and UTA. */
	uint16_t max_vfs; /**< Maximum number of VFs. */
	uint16_t max_vmdq_pools; /**< Maximum number of VMDq pools. */
	uint64_t rx_offload_capa;
	/**< All RX offload capabilities including all per-queue ones */
	uint64_t tx_offload_capa;
	/**< All TX offload capabilities including all per-queue ones */
	uint64_t rx_queue_offload_capa;
	/**< Device per-queue RX offload capabilities. */
	uint64_t tx_queue_offload_capa;
	/**< Device per-queue TX offload capabilities. */
	uint16_t reta_size;
	/**< Device redirection table size, the total number of entries. */
	uint8_t hash_key_size; /**< Hash key size in bytes */
	/** Bit mask of RSS offloads, the bit offset also means flow type */
	uint64_t flow_type_rss_offloads;
	struct rte_eth_rxconf default_rxconf; /**< Default RX configuration */
	struct rte_eth_txconf default_txconf; /**< Default TX configuration */
	uint16_t vmdq_queue_base; /**< First queue ID for VMDQ pools. */
	uint16_t vmdq_queue_num;  /**< Queue number for VMDQ pools. */
	uint16_t vmdq_pool_base;  /**< First ID of VMDQ pools. */
	struct rte_eth_desc_lim rx_desc_lim;  /**< RX descriptors limits */
	struct rte_eth_desc_lim tx_desc_lim;  /**< TX descriptors limits */
	uint32_t speed_capa;  /**< Supported speeds bitmap (ETH_LINK_SPEED_). */
	/** Configured number of rx/tx queues */
	uint16_t nb_rx_queues; /**< Number of RX queues. */
	uint16_t nb_tx_queues; /**< Number of TX queues. */
	/** Rx parameter recommendations */
	struct rte_eth_dev_portconf default_rxportconf;
	/** Tx parameter recommendations */
	struct rte_eth_dev_portconf default_txportconf;
	/** Generic device capabilities (RTE_ETH_DEV_CAPA_). */
	uint64_t dev_capa;
	/**
	 * Switching information for ports on a device with a
	 * embedded managed interconnect/switch.
	 */
	struct rte_eth_switch_info switch_info;
};

/**
 * Ethernet device RX queue information structure.
 * Used to retieve information about configured queue.
 */
struct rte_eth_rxq_info {
	struct rte_mempool *mp;     /**< mempool used by that queue. */
	struct rte_eth_rxconf conf; /**< queue config parameters. */
	uint8_t scattered_rx;       /**< scattered packets RX supported. */
	uint16_t nb_desc;           /**< configured number of RXDs. */
} __rte_cache_min_aligned;

/**
 * Ethernet device TX queue information structure.
 * Used to retrieve information about configured queue.
 */
struct rte_eth_txq_info {
	struct rte_eth_txconf conf; /**< queue config parameters. */
	uint16_t nb_desc;           /**< configured number of TXDs. */
} __rte_cache_min_aligned;

/** Maximum name length for extended statistics counters */
#define RTE_ETH_XSTATS_NAME_SIZE 64

/**
 * An Ethernet device extended statistic structure
 *
 * This structure is used by rte_eth_xstats_get() to provide
 * statistics that are not provided in the generic *rte_eth_stats*
 * structure.
 * It maps a name id, corresponding to an index in the array returned
 * by rte_eth_xstats_get_names(), to a statistic value.
 */
struct rte_eth_xstat {
	uint64_t id;        /**< The index in xstats name array. */
	uint64_t value;     /**< The statistic counter value. */
};

/**
 * A name element for extended statistics.
 *
 * An array of this structure is returned by rte_eth_xstats_get_names().
 * It lists the names of extended statistics for a PMD. The *rte_eth_xstat*
 * structure references these names by their array index.
 */
struct rte_eth_xstat_name {
	char name[RTE_ETH_XSTATS_NAME_SIZE]; /**< The statistic name. */
};

#define ETH_DCB_NUM_TCS    8
#define ETH_MAX_VMDQ_POOL  64

/**
 * A structure used to get the information of queue and
 * TC mapping on both TX and RX paths.
 */
struct rte_eth_dcb_tc_queue_mapping {
	/** rx queues assigned to tc per Pool */
	struct {
		uint8_t base;
		uint8_t nb_queue;
	} tc_rxq[ETH_MAX_VMDQ_POOL][ETH_DCB_NUM_TCS];
	/** rx queues assigned to tc per Pool */
	struct {
		uint8_t base;
		uint8_t nb_queue;
	} tc_txq[ETH_MAX_VMDQ_POOL][ETH_DCB_NUM_TCS];
};

/**
 * A structure used to get the information of DCB.
 * It includes TC UP mapping and queue TC mapping.
 */
struct rte_eth_dcb_info {
	uint8_t nb_tcs;        /**< number of TCs */
	uint8_t prio_tc[ETH_DCB_NUM_USER_PRIORITIES]; /**< Priority to tc */
	uint8_t tc_bws[ETH_DCB_NUM_TCS]; /**< TX BW percentage for each TC */
	/** rx queues assigned to tc */
	struct rte_eth_dcb_tc_queue_mapping tc_queue;
};

/**
 * RX/TX queue states
 */
#define RTE_ETH_QUEUE_STATE_STOPPED 0
#define RTE_ETH_QUEUE_STATE_STARTED 1

#define RTE_ETH_ALL RTE_MAX_ETHPORTS

/* Macros to check for valid port */
#define RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, retval) do { \
	if (!rte_eth_dev_is_valid_port(port_id)) { \
		RTE_ETHDEV_LOG(ERR, "Invalid port_id=%u\n", port_id); \
		return retval; \
	} \
} while (0)

#define RTE_ETH_VALID_PORTID_OR_RET(port_id) do { \
	if (!rte_eth_dev_is_valid_port(port_id)) { \
		RTE_ETHDEV_LOG(ERR, "Invalid port_id=%u\n", port_id); \
		return; \
	} \
} while (0)

/**
 * l2 tunnel configuration.
 */

/**< l2 tunnel enable mask */
#define ETH_L2_TUNNEL_ENABLE_MASK       0x00000001
/**< l2 tunnel insertion mask */
#define ETH_L2_TUNNEL_INSERTION_MASK    0x00000002
/**< l2 tunnel stripping mask */
#define ETH_L2_TUNNEL_STRIPPING_MASK    0x00000004
/**< l2 tunnel forwarding mask */
#define ETH_L2_TUNNEL_FORWARDING_MASK   0x00000008

/**
 * Function type used for RX packet processing packet callbacks.
 *
 * The callback function is called on RX with a burst of packets that have
 * been received on the given port and queue.
 *
 * @param port_id
 *   The Ethernet port on which RX is being performed.
 * @param queue
 *   The queue on the Ethernet port which is being used to receive the packets.
 * @param pkts
 *   The burst of packets that have just been received.
 * @param nb_pkts
 *   The number of packets in the burst pointed to by "pkts".
 * @param max_pkts
 *   The max number of packets that can be stored in the "pkts" array.
 * @param user_param
 *   The arbitrary user parameter passed in by the application when the callback
 *   was originally configured.
 * @return
 *   The number of packets returned to the user.
 */
typedef uint16_t (*rte_rx_callback_fn)(uint16_t port_id, uint16_t queue,
	struct rte_mbuf *pkts[], uint16_t nb_pkts, uint16_t max_pkts,
	void *user_param);

/**
 * Function type used for TX packet processing packet callbacks.
 *
 * The callback function is called on TX with a burst of packets immediately
 * before the packets are put onto the hardware queue for transmission.
 *
 * @param port_id
 *   The Ethernet port on which TX is being performed.
 * @param queue
 *   The queue on the Ethernet port which is being used to transmit the packets.
 * @param pkts
 *   The burst of packets that are about to be transmitted.
 * @param nb_pkts
 *   The number of packets in the burst pointed to by "pkts".
 * @param user_param
 *   The arbitrary user parameter passed in by the application when the callback
 *   was originally configured.
 * @return
 *   The number of packets to be written to the NIC.
 */
typedef uint16_t (*rte_tx_callback_fn)(uint16_t port_id, uint16_t queue,
	struct rte_mbuf *pkts[], uint16_t nb_pkts, void *user_param);

/**
 * Possible states of an ethdev port.
 */
enum rte_eth_dev_state {
	/** Device is unused before being probed. */
	RTE_ETH_DEV_UNUSED = 0,
	/** Device is attached when allocated in probing. */
	RTE_ETH_DEV_ATTACHED,
	/** Device is in removed state when plug-out is detected. */
	RTE_ETH_DEV_REMOVED,
};

struct rte_eth_dev_sriov {
	uint8_t active;               /**< SRIOV is active with 16, 32 or 64 pools */
	uint8_t nb_q_per_pool;        /**< rx queue number per pool */
	uint16_t def_vmdq_idx;        /**< Default pool num used for PF */
	uint16_t def_pool_q_idx;      /**< Default pool queue start reg index */
};
#define RTE_ETH_DEV_SRIOV(dev)         ((dev)->data->sriov)

#define RTE_ETH_NAME_MAX_LEN RTE_DEV_NAME_MAX_LEN

#define RTE_ETH_DEV_NO_OWNER 0

#define RTE_ETH_MAX_OWNER_NAME_LEN 64

struct rte_eth_dev_owner {
	uint64_t id; /**< The owner unique identifier. */
	char name[RTE_ETH_MAX_OWNER_NAME_LEN]; /**< The owner name. */
};

/**
 * Port is released (i.e. totally freed and data erased) on close.
 * Temporary flag for PMD migration to new rte_eth_dev_close() behaviour.
 */
#define RTE_ETH_DEV_CLOSE_REMOVE 0x0001
/** Device supports link state interrupt */
#define RTE_ETH_DEV_INTR_LSC     0x0002
/** Device is a bonded slave */
#define RTE_ETH_DEV_BONDED_SLAVE 0x0004
/** Device supports device removal interrupt */
#define RTE_ETH_DEV_INTR_RMV     0x0008
/** Device is port representor */
#define RTE_ETH_DEV_REPRESENTOR  0x0010
/** Device does not support MAC change after started */
#define RTE_ETH_DEV_NOLIVE_MAC_ADDR  0x0020

/**
 * Iterates over valid ethdev ports owned by a specific owner.
 *
 * @param port_id
 *   The id of the next possible valid owned port.
 * @param	owner_id
 *  The owner identifier.
 *  RTE_ETH_DEV_NO_OWNER means iterate over all valid ownerless ports.
 * @return
 *   Next valid port id owned by owner_id, RTE_MAX_ETHPORTS if there is none.
 */
uint64_t rte_eth_find_next_owned_by(uint16_t port_id,
		const uint64_t owner_id);

/**
 * Macro to iterate over all enabled ethdev ports owned by a specific owner.
 */
#define RTE_ETH_FOREACH_DEV_OWNED_BY(p, o) \
	for (p = rte_eth_find_next_owned_by(0, o); \
	     (unsigned int)p < (unsigned int)RTE_MAX_ETHPORTS; \
	     p = rte_eth_find_next_owned_by(p + 1, o))

/**
 * Iterates over valid ethdev ports.
 *
 * @param port_id
 *   The id of the next possible valid port.
 * @return
 *   Next valid port id, RTE_MAX_ETHPORTS if there is none.
 */
uint16_t rte_eth_find_next(uint16_t port_id);

/**
 * Macro to iterate over all enabled and ownerless ethdev ports.
 */
#define RTE_ETH_FOREACH_DEV(p) \
	RTE_ETH_FOREACH_DEV_OWNED_BY(p, RTE_ETH_DEV_NO_OWNER)


/**
 * @warning
 * @b EXPERIMENTAL: this API may change without prior notice.
 *
 * Get a new unique owner identifier.
 * An owner identifier is used to owns Ethernet devices by only one DPDK entity
 * to avoid multiple management of device by different entities.
 *
 * @param	owner_id
 *   Owner identifier pointer.
 * @return
 *   Negative errno value on error, 0 on success.
 */
int __rte_experimental rte_eth_dev_owner_new(uint64_t *owner_id);

/**
 * @warning
 * @b EXPERIMENTAL: this API may change without prior notice.
 *
 * Set an Ethernet device owner.
 *
 * @param	port_id
 *  The identifier of the port to own.
 * @param	owner
 *  The owner pointer.
 * @return
 *  Negative errno value on error, 0 on success.
 */
int __rte_experimental rte_eth_dev_owner_set(const uint16_t port_id,
		const struct rte_eth_dev_owner *owner);

/**
 * @warning
 * @b EXPERIMENTAL: this API may change without prior notice.
 *
 * Unset Ethernet device owner to make the device ownerless.
 *
 * @param	port_id
 *  The identifier of port to make ownerless.
 * @param	owner_id
 *  The owner identifier.
 * @return
 *  0 on success, negative errno value on error.
 */
int __rte_experimental rte_eth_dev_owner_unset(const uint16_t port_id,
		const uint64_t owner_id);

/**
 * @warning
 * @b EXPERIMENTAL: this API may change without prior notice.
 *
 * Remove owner from all Ethernet devices owned by a specific owner.
 *
 * @param	owner_id
 *  The owner identifier.
 */
void __rte_experimental rte_eth_dev_owner_delete(const uint64_t owner_id);

/**
 * @warning
 * @b EXPERIMENTAL: this API may change without prior notice.
 *
 * Get the owner of an Ethernet device.
 *
 * @param	port_id
 *  The port identifier.
 * @param	owner
 *  The owner structure pointer to fill.
 * @return
 *  0 on success, negative errno value on error..
 */
int __rte_experimental rte_eth_dev_owner_get(const uint16_t port_id,
		struct rte_eth_dev_owner *owner);

/**
 * Get the total number of Ethernet devices that have been successfully
 * initialized by the matching Ethernet driver during the PCI probing phase
 * and that are available for applications to use. These devices must be
 * accessed by using the ``RTE_ETH_FOREACH_DEV()`` macro to deal with
 * non-contiguous ranges of devices.
 * These non-contiguous ranges can be created by calls to hotplug functions or
 * by some PMDs.
 *
 * @return
 *   - The total number of usable Ethernet devices.
 */
__rte_deprecated
uint16_t rte_eth_dev_count(void);

/**
 * Get the number of ports which are usable for the application.
 *
 * These devices must be iterated by using the macro
 * ``RTE_ETH_FOREACH_DEV`` or ``RTE_ETH_FOREACH_DEV_OWNED_BY``
 * to deal with non-contiguous ranges of devices.
 *
 * @return
 *   The count of available Ethernet devices.
 */
uint16_t rte_eth_dev_count_avail(void);

/**
 * Get the total number of ports which are allocated.
 *
 * Some devices may not be available for the application.
 *
 * @return
 *   The total count of Ethernet devices.
 */
uint16_t __rte_experimental rte_eth_dev_count_total(void);

/**
 * Convert a numerical speed in Mbps to a bitmap flag that can be used in
 * the bitmap link_speeds of the struct rte_eth_conf
 *
 * @param speed
 *   Numerical speed value in Mbps
 * @param duplex
 *   ETH_LINK_[HALF/FULL]_DUPLEX (only for 10/100M speeds)
 * @return
 *   0 if the speed cannot be mapped
 */
uint32_t rte_eth_speed_bitflag(uint32_t speed, int duplex);

/**
 * Get DEV_RX_OFFLOAD_* flag name.
 *
 * @param offload
 *   Offload flag.
 * @return
 *   Offload name or 'UNKNOWN' if the flag cannot be recognised.
 */
const char *rte_eth_dev_rx_offload_name(uint64_t offload);

/**
 * Get DEV_TX_OFFLOAD_* flag name.
 *
 * @param offload
 *   Offload flag.
 * @return
 *   Offload name or 'UNKNOWN' if the flag cannot be recognised.
 */
const char *rte_eth_dev_tx_offload_name(uint64_t offload);

/**
 * Configure an Ethernet device.
 * This function must be invoked first before any other function in the
 * Ethernet API. This function can also be re-invoked when a device is in the
 * stopped state.
 *
 * @param port_id
 *   The port identifier of the Ethernet device to configure.
 * @param nb_rx_queue
 *   The number of receive queues to set up for the Ethernet device.
 * @param nb_tx_queue
 *   The number of transmit queues to set up for the Ethernet device.
 * @param eth_conf
 *   The pointer to the configuration data to be used for the Ethernet device.
 *   The *rte_eth_conf* structure includes:
 *     -  the hardware offload features to activate, with dedicated fields for
 *        each statically configurable offload hardware feature provided by
 *        Ethernet devices, such as IP checksum or VLAN tag stripping for
 *        example.
 *        The Rx offload bitfield API is obsolete and will be deprecated.
 *        Applications should set the ignore_bitfield_offloads bit on *rxmode*
 *        structure and use offloads field to set per-port offloads instead.
 *     -  Any offloading set in eth_conf->[rt]xmode.offloads must be within
 *        the [rt]x_offload_capa returned from rte_eth_dev_infos_get().
 *        Any type of device supported offloading set in the input argument
 *        eth_conf->[rt]xmode.offloads to rte_eth_dev_configure() is enabled
 *        on all queues and it can't be disabled in rte_eth_[rt]x_queue_setup()
 *     -  the Receive Side Scaling (RSS) configuration when using multiple RX
 *        queues per port. Any RSS hash function set in eth_conf->rss_conf.rss_hf
 *        must be within the flow_type_rss_offloads provided by drivers via
 *        rte_eth_dev_infos_get() API.
 *
 *   Embedding all configuration information in a single data structure
 *   is the more flexible method that allows the addition of new features
 *   without changing the syntax of the API.
 * @return
 *   - 0: Success, device configured.
 *   - <0: Error code returned by the driver configuration function.
 */
int rte_eth_dev_configure(uint16_t port_id, uint16_t nb_rx_queue,
		uint16_t nb_tx_queue, const struct rte_eth_conf *eth_conf);

/**
 * @warning
 * @b EXPERIMENTAL: this API may change without prior notice.
 *
 * Check if an Ethernet device was physically removed.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @return
 *   1 when the Ethernet device is removed, otherwise 0.
 */
int __rte_experimental
rte_eth_dev_is_removed(uint16_t port_id);

/**
 * Allocate and set up a receive queue for an Ethernet device.
 *
 * The function allocates a contiguous block of memory for *nb_rx_desc*
 * receive descriptors from a memory zone associated with *socket_id*
 * and initializes each receive descriptor with a network buffer allocated
 * from the memory pool *mb_pool*.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param rx_queue_id
 *   The index of the receive queue to set up.
 *   The value must be in the range [0, nb_rx_queue - 1] previously supplied
 *   to rte_eth_dev_configure().
 * @param nb_rx_desc
 *   The number of receive descriptors to allocate for the receive ring.
 * @param socket_id
 *   The *socket_id* argument is the socket identifier in case of NUMA.
 *   The value can be *SOCKET_ID_ANY* if there is no NUMA constraint for
 *   the DMA memory allocated for the receive descriptors of the ring.
 * @param rx_conf
 *   The pointer to the configuration data to be used for the receive queue.
 *   NULL value is allowed, in which case default RX configuration
 *   will be used.
 *   The *rx_conf* structure contains an *rx_thresh* structure with the values
 *   of the Prefetch, Host, and Write-Back threshold registers of the receive
 *   ring.
 *   In addition it contains the hardware offloads features to activate using
 *   the DEV_RX_OFFLOAD_* flags.
 *   If an offloading set in rx_conf->offloads
 *   hasn't been set in the input argument eth_conf->rxmode.offloads
 *   to rte_eth_dev_configure(), it is a new added offloading, it must be
 *   per-queue type and it is enabled for the queue.
 *   No need to repeat any bit in rx_conf->offloads which has already been
 *   enabled in rte_eth_dev_configure() at port level. An offloading enabled
 *   at port level can't be disabled at queue level.
 * @param mb_pool
 *   The pointer to the memory pool from which to allocate *rte_mbuf* network
 *   memory buffers to populate each descriptor of the receive ring.
 * @return
 *   - 0: Success, receive queue correctly set up.
 *   - -EIO: if device is removed.
 *   - -EINVAL: The size of network buffers which can be allocated from the
 *      memory pool does not fit the various buffer sizes allowed by the
 *      device controller.
 *   - -ENOMEM: Unable to allocate the receive ring descriptors or to
 *      allocate network memory buffers from the memory pool when
 *      initializing receive descriptors.
 */
int rte_eth_rx_queue_setup(uint16_t port_id, uint16_t rx_queue_id,
		uint16_t nb_rx_desc, unsigned int socket_id,
		const struct rte_eth_rxconf *rx_conf,
		struct rte_mempool *mb_pool);

/**
 * Allocate and set up a transmit queue for an Ethernet device.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param tx_queue_id
 *   The index of the transmit queue to set up.
 *   The value must be in the range [0, nb_tx_queue - 1] previously supplied
 *   to rte_eth_dev_configure().
 * @param nb_tx_desc
 *   The number of transmit descriptors to allocate for the transmit ring.
 * @param socket_id
 *   The *socket_id* argument is the socket identifier in case of NUMA.
 *   Its value can be *SOCKET_ID_ANY* if there is no NUMA constraint for
 *   the DMA memory allocated for the transmit descriptors of the ring.
 * @param tx_conf
 *   The pointer to the configuration data to be used for the transmit queue.
 *   NULL value is allowed, in which case default TX configuration
 *   will be used.
 *   The *tx_conf* structure contains the following data:
 *   - The *tx_thresh* structure with the values of the Prefetch, Host, and
 *     Write-Back threshold registers of the transmit ring.
 *     When setting Write-Back threshold to the value greater then zero,
 *     *tx_rs_thresh* value should be explicitly set to one.
 *   - The *tx_free_thresh* value indicates the [minimum] number of network
 *     buffers that must be pending in the transmit ring to trigger their
 *     [implicit] freeing by the driver transmit function.
 *   - The *tx_rs_thresh* value indicates the [minimum] number of transmit
 *     descriptors that must be pending in the transmit ring before setting the
 *     RS bit on a descriptor by the driver transmit function.
 *     The *tx_rs_thresh* value should be less or equal then
 *     *tx_free_thresh* value, and both of them should be less then
 *     *nb_tx_desc* - 3.
 *   - The *offloads* member contains Tx offloads to be enabled.
 *     If an offloading set in tx_conf->offloads
 *     hasn't been set in the input argument eth_conf->txmode.offloads
 *     to rte_eth_dev_configure(), it is a new added offloading, it must be
 *     per-queue type and it is enabled for the queue.
 *     No need to repeat any bit in tx_conf->offloads which has already been
 *     enabled in rte_eth_dev_configure() at port level. An offloading enabled
 *     at port level can't be disabled at queue level.
 *
 *     Note that setting *tx_free_thresh* or *tx_rs_thresh* value to 0 forces
 *     the transmit function to use default values.
 * @return
 *   - 0: Success, the transmit queue is correctly set up.
 *   - -ENOMEM: Unable to allocate the transmit ring descriptors.
 */
int rte_eth_tx_queue_setup(uint16_t port_id, uint16_t tx_queue_id,
		uint16_t nb_tx_desc, unsigned int socket_id,
		const struct rte_eth_txconf *tx_conf);

/**
 * Return the NUMA socket to which an Ethernet device is connected
 *
 * @param port_id
 *   The port identifier of the Ethernet device
 * @return
 *   The NUMA socket id to which the Ethernet device is connected or
 *   a default of zero if the socket could not be determined.
 *   -1 is returned is the port_id value is out of range.
 */
int rte_eth_dev_socket_id(uint16_t port_id);

/**
 * Check if port_id of device is attached
 *
 * @param port_id
 *   The port identifier of the Ethernet device
 * @return
 *   - 0 if port is out of range or not attached
 *   - 1 if device is attached
 */
int rte_eth_dev_is_valid_port(uint16_t port_id);

/**
 * Start specified RX queue of a port. It is used when rx_deferred_start
 * flag of the specified queue is true.
 *
 * @param port_id
 *   The port identifier of the Ethernet device
 * @param rx_queue_id
 *   The index of the rx queue to update the ring.
 *   The value must be in the range [0, nb_rx_queue - 1] previously supplied
 *   to rte_eth_dev_configure().
 * @return
 *   - 0: Success, the receive queue is started.
 *   - -EINVAL: The port_id or the queue_id out of range.
 *   - -EIO: if device is removed.
 *   - -ENOTSUP: The function not supported in PMD driver.
 */
int rte_eth_dev_rx_queue_start(uint16_t port_id, uint16_t rx_queue_id);

/**
 * Stop specified RX queue of a port
 *
 * @param port_id
 *   The port identifier of the Ethernet device
 * @param rx_queue_id
 *   The index of the rx queue to update the ring.
 *   The value must be in the range [0, nb_rx_queue - 1] previously supplied
 *   to rte_eth_dev_configure().
 * @return
 *   - 0: Success, the receive queue is stopped.
 *   - -EINVAL: The port_id or the queue_id out of range.
 *   - -EIO: if device is removed.
 *   - -ENOTSUP: The function not supported in PMD driver.
 */
int rte_eth_dev_rx_queue_stop(uint16_t port_id, uint16_t rx_queue_id);

/**
 * Start TX for specified queue of a port. It is used when tx_deferred_start
 * flag of the specified queue is true.
 *
 * @param port_id
 *   The port identifier of the Ethernet device
 * @param tx_queue_id
 *   The index of the tx queue to update the ring.
 *   The value must be in the range [0, nb_tx_queue - 1] previously supplied
 *   to rte_eth_dev_configure().
 * @return
 *   - 0: Success, the transmit queue is started.
 *   - -EINVAL: The port_id or the queue_id out of range.
 *   - -EIO: if device is removed.
 *   - -ENOTSUP: The function not supported in PMD driver.
 */
int rte_eth_dev_tx_queue_start(uint16_t port_id, uint16_t tx_queue_id);

/**
 * Stop specified TX queue of a port
 *
 * @param port_id
 *   The port identifier of the Ethernet device
 * @param tx_queue_id
 *   The index of the tx queue to update the ring.
 *   The value must be in the range [0, nb_tx_queue - 1] previously supplied
 *   to rte_eth_dev_configure().
 * @return
 *   - 0: Success, the transmit queue is stopped.
 *   - -EINVAL: The port_id or the queue_id out of range.
 *   - -EIO: if device is removed.
 *   - -ENOTSUP: The function not supported in PMD driver.
 */
int rte_eth_dev_tx_queue_stop(uint16_t port_id, uint16_t tx_queue_id);

/**
 * Start an Ethernet device.
 *
 * The device start step is the last one and consists of setting the configured
 * offload features and in starting the transmit and the receive units of the
 * device.
 *
 * Device RTE_ETH_DEV_NOLIVE_MAC_ADDR flag causes MAC address to be set before
 * PMD port start callback function is invoked.
 *
 * On success, all basic functions exported by the Ethernet API (link status,
 * receive/transmit, and so on) can be invoked.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @return
 *   - 0: Success, Ethernet device started.
 *   - <0: Error code of the driver device start function.
 */
int rte_eth_dev_start(uint16_t port_id);

/**
 * Stop an Ethernet device. The device can be restarted with a call to
 * rte_eth_dev_start()
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 */
void rte_eth_dev_stop(uint16_t port_id);

/**
 * Link up an Ethernet device.
 *
 * Set device link up will re-enable the device rx/tx
 * functionality after it is previously set device linked down.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @return
 *   - 0: Success, Ethernet device linked up.
 *   - <0: Error code of the driver device link up function.
 */
int rte_eth_dev_set_link_up(uint16_t port_id);

/**
 * Link down an Ethernet device.
 * The device rx/tx functionality will be disabled if success,
 * and it can be re-enabled with a call to
 * rte_eth_dev_set_link_up()
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 */
int rte_eth_dev_set_link_down(uint16_t port_id);

/**
 * Close a stopped Ethernet device. The device cannot be restarted!
 * The function frees all port resources if the driver supports
 * the flag RTE_ETH_DEV_CLOSE_REMOVE.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 */
void rte_eth_dev_close(uint16_t port_id);

/**
 * Reset a Ethernet device and keep its port id.
 *
 * When a port has to be reset passively, the DPDK application can invoke
 * this function. For example when a PF is reset, all its VFs should also
 * be reset. Normally a DPDK application can invoke this function when
 * RTE_ETH_EVENT_INTR_RESET event is detected, but can also use it to start
 * a port reset in other circumstances.
 *
 * When this function is called, it first stops the port and then calls the
 * PMD specific dev_uninit( ) and dev_init( ) to return the port to initial
 * state, in which no Tx and Rx queues are setup, as if the port has been
 * reset and not started. The port keeps the port id it had before the
 * function call.
 *
 * After calling rte_eth_dev_reset( ), the application should use
 * rte_eth_dev_configure( ), rte_eth_rx_queue_setup( ),
 * rte_eth_tx_queue_setup( ), and rte_eth_dev_start( )
 * to reconfigure the device as appropriate.
 *
 * Note: To avoid unexpected behavior, the application should stop calling
 * Tx and Rx functions before calling rte_eth_dev_reset( ). For thread
 * safety, all these controlling functions should be called from the same
 * thread.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 *
 * @return
 *   - (0) if successful.
 *   - (-EINVAL) if port identifier is invalid.
 *   - (-ENOTSUP) if hardware doesn't support this function.
 *   - (-EPERM) if not ran from the primary process.
 *   - (-EIO) if re-initialisation failed or device is removed.
 *   - (-ENOMEM) if the reset failed due to OOM.
 *   - (-EAGAIN) if the reset temporarily failed and should be retried later.
 */
int rte_eth_dev_reset(uint16_t port_id);

/**
 * Enable receipt in promiscuous mode for an Ethernet device.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 */
void rte_eth_promiscuous_enable(uint16_t port_id);

/**
 * Disable receipt in promiscuous mode for an Ethernet device.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 */
void rte_eth_promiscuous_disable(uint16_t port_id);

/**
 * Return the value of promiscuous mode for an Ethernet device.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @return
 *   - (1) if promiscuous is enabled
 *   - (0) if promiscuous is disabled.
 *   - (-1) on error
 */
int rte_eth_promiscuous_get(uint16_t port_id);

/**
 * Enable the receipt of any multicast frame by an Ethernet device.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 */
void rte_eth_allmulticast_enable(uint16_t port_id);

/**
 * Disable the receipt of all multicast frames by an Ethernet device.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 */
void rte_eth_allmulticast_disable(uint16_t port_id);

/**
 * Return the value of allmulticast mode for an Ethernet device.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @return
 *   - (1) if allmulticast is enabled
 *   - (0) if allmulticast is disabled.
 *   - (-1) on error
 */
int rte_eth_allmulticast_get(uint16_t port_id);

/**
 * Retrieve the status (ON/OFF), the speed (in Mbps) and the mode (HALF-DUPLEX
 * or FULL-DUPLEX) of the physical link of an Ethernet device. It might need
 * to wait up to 9 seconds in it.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param link
 *   A pointer to an *rte_eth_link* structure to be filled with
 *   the status, the speed and the mode of the Ethernet device link.
 */
void rte_eth_link_get(uint16_t port_id, struct rte_eth_link *link);

/**
 * Retrieve the status (ON/OFF), the speed (in Mbps) and the mode (HALF-DUPLEX
 * or FULL-DUPLEX) of the physical link of an Ethernet device. It is a no-wait
 * version of rte_eth_link_get().
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param link
 *   A pointer to an *rte_eth_link* structure to be filled with
 *   the status, the speed and the mode of the Ethernet device link.
 */
void rte_eth_link_get_nowait(uint16_t port_id, struct rte_eth_link *link);

/**
 * Retrieve the general I/O statistics of an Ethernet device.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param stats
 *   A pointer to a structure of type *rte_eth_stats* to be filled with
 *   the values of device counters for the following set of statistics:
 *   - *ipackets* with the total of successfully received packets.
 *   - *opackets* with the total of successfully transmitted packets.
 *   - *ibytes*   with the total of successfully received bytes.
 *   - *obytes*   with the total of successfully transmitted bytes.
 *   - *ierrors*  with the total of erroneous received packets.
 *   - *oerrors*  with the total of failed transmitted packets.
 * @return
 *   Zero if successful. Non-zero otherwise.
 */
int rte_eth_stats_get(uint16_t port_id, struct rte_eth_stats *stats);

/**
 * Reset the general I/O statistics of an Ethernet device.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @return
 *   - (0) if device notified to reset stats.
 *   - (-ENOTSUP) if hardware doesn't support.
 *   - (-ENODEV) if *port_id* invalid.
 */
int rte_eth_stats_reset(uint16_t port_id);

/**
 * Retrieve names of extended statistics of an Ethernet device.
 *
 * There is an assumption that 'xstat_names' and 'xstats' arrays are matched
 * by array index:
 *  xstats_names[i].name => xstats[i].value
 *
 * And the array index is same with id field of 'struct rte_eth_xstat':
 *  xstats[i].id == i
 *
 * This assumption makes key-value pair matching less flexible but simpler.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param xstats_names
 *   An rte_eth_xstat_name array of at least *size* elements to
 *   be filled. If set to NULL, the function returns the required number
 *   of elements.
 * @param size
 *   The size of the xstats_names array (number of elements).
 * @return
 *   - A positive value lower or equal to size: success. The return value
 *     is the number of entries filled in the stats table.
 *   - A positive value higher than size: error, the given statistics table
 *     is too small. The return value corresponds to the size that should
 *     be given to succeed. The entries in the table are not valid and
 *     shall not be used by the caller.
 *   - A negative value on error (invalid port id).
 */
int rte_eth_xstats_get_names(uint16_t port_id,
		struct rte_eth_xstat_name *xstats_names,
		unsigned int size);

/**
 * Retrieve extended statistics of an Ethernet device.
 *
 * There is an assumption that 'xstat_names' and 'xstats' arrays are matched
 * by array index:
 *  xstats_names[i].name => xstats[i].value
 *
 * And the array index is same with id field of 'struct rte_eth_xstat':
 *  xstats[i].id == i
 *
 * This assumption makes key-value pair matching less flexible but simpler.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param xstats
 *   A pointer to a table of structure of type *rte_eth_xstat*
 *   to be filled with device statistics ids and values.
 *   This parameter can be set to NULL if n is 0.
 * @param n
 *   The size of the xstats array (number of elements).
 * @return
 *   - A positive value lower or equal to n: success. The return value
 *     is the number of entries filled in the stats table.
 *   - A positive value higher than n: error, the given statistics table
 *     is too small. The return value corresponds to the size that should
 *     be given to succeed. The entries in the table are not valid and
 *     shall not be used by the caller.
 *   - A negative value on error (invalid port id).
 */
int rte_eth_xstats_get(uint16_t port_id, struct rte_eth_xstat *xstats,
		unsigned int n);

/**
 * Retrieve names of extended statistics of an Ethernet device.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param xstats_names
 *   An rte_eth_xstat_name array of at least *size* elements to
 *   be filled. If set to NULL, the function returns the required number
 *   of elements.
 * @param ids
 *   IDs array given by app to retrieve specific statistics
 * @param size
 *   The size of the xstats_names array (number of elements).
 * @return
 *   - A positive value lower or equal to size: success. The return value
 *     is the number of entries filled in the stats table.
 *   - A positive value higher than size: error, the given statistics table
 *     is too small. The return value corresponds to the size that should
 *     be given to succeed. The entries in the table are not valid and
 *     shall not be used by the caller.
 *   - A negative value on error (invalid port id).
 */
int
rte_eth_xstats_get_names_by_id(uint16_t port_id,
	struct rte_eth_xstat_name *xstats_names, unsigned int size,
	uint64_t *ids);

/**
 * Retrieve extended statistics of an Ethernet device.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param ids
 *   A pointer to an ids array passed by application. This tells which
 *   statistics values function should retrieve. This parameter
 *   can be set to NULL if size is 0. In this case function will retrieve
 *   all avalible statistics.
 * @param values
 *   A pointer to a table to be filled with device statistics values.
 * @param size
 *   The size of the ids array (number of elements).
 * @return
 *   - A positive value lower or equal to size: success. The return value
 *     is the number of entries filled in the stats table.
 *   - A positive value higher than size: error, the given statistics table
 *     is too small. The return value corresponds to the size that should
 *     be given to succeed. The entries in the table are not valid and
 *     shall not be used by the caller.
 *   - A negative value on error (invalid port id).
 */
int rte_eth_xstats_get_by_id(uint16_t port_id, const uint64_t *ids,
			     uint64_t *values, unsigned int size);

/**
 * Gets the ID of a statistic from its name.
 *
 * This function searches for the statistics using string compares, and
 * as such should not be used on the fast-path. For fast-path retrieval of
 * specific statistics, store the ID as provided in *id* from this function,
 * and pass the ID to rte_eth_xstats_get()
 *
 * @param port_id The port to look up statistics from
 * @param xstat_name The name of the statistic to return
 * @param[out] id A pointer to an app-supplied uint64_t which should be
 *                set to the ID of the stat if the stat exists.
 * @return
 *    0 on success
 *    -ENODEV for invalid port_id,
 *    -EIO if device is removed,
 *    -EINVAL if the xstat_name doesn't exist in port_id
 */
int rte_eth_xstats_get_id_by_name(uint16_t port_id, const char *xstat_name,
		uint64_t *id);

/**
 * Reset extended statistics of an Ethernet device.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 */
void rte_eth_xstats_reset(uint16_t port_id);

/**
 *  Set a mapping for the specified transmit queue to the specified per-queue
 *  statistics counter.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param tx_queue_id
 *   The index of the transmit queue for which a queue stats mapping is required.
 *   The value must be in the range [0, nb_tx_queue - 1] previously supplied
 *   to rte_eth_dev_configure().
 * @param stat_idx
 *   The per-queue packet statistics functionality number that the transmit
 *   queue is to be assigned.
 *   The value must be in the range [0, RTE_ETHDEV_QUEUE_STAT_CNTRS - 1].
 * @return
 *   Zero if successful. Non-zero otherwise.
 */
int rte_eth_dev_set_tx_queue_stats_mapping(uint16_t port_id,
		uint16_t tx_queue_id, uint8_t stat_idx);

/**
 *  Set a mapping for the specified receive queue to the specified per-queue
 *  statistics counter.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param rx_queue_id
 *   The index of the receive queue for which a queue stats mapping is required.
 *   The value must be in the range [0, nb_rx_queue - 1] previously supplied
 *   to rte_eth_dev_configure().
 * @param stat_idx
 *   The per-queue packet statistics functionality number that the receive
 *   queue is to be assigned.
 *   The value must be in the range [0, RTE_ETHDEV_QUEUE_STAT_CNTRS - 1].
 * @return
 *   Zero if successful. Non-zero otherwise.
 */
int rte_eth_dev_set_rx_queue_stats_mapping(uint16_t port_id,
					   uint16_t rx_queue_id,
					   uint8_t stat_idx);

/**
 * Retrieve the Ethernet address of an Ethernet device.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param mac_addr
 *   A pointer to a structure of type *ether_addr* to be filled with
 *   the Ethernet address of the Ethernet device.
 */
void rte_eth_macaddr_get(uint16_t port_id, struct ether_addr *mac_addr);

/**
 * Retrieve the contextual information of an Ethernet device.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param dev_info
 *   A pointer to a structure of type *rte_eth_dev_info* to be filled with
 *   the contextual information of the Ethernet device.
 */
void rte_eth_dev_info_get(uint16_t port_id, struct rte_eth_dev_info *dev_info);

/**
 * Retrieve the firmware version of a device.
 *
 * @param port_id
 *   The port identifier of the device.
 * @param fw_version
 *   A pointer to a string array storing the firmware version of a device,
 *   the string includes terminating null. This pointer is allocated by caller.
 * @param fw_size
 *   The size of the string array pointed by fw_version, which should be
 *   large enough to store firmware version of the device.
 * @return
 *   - (0) if successful.
 *   - (-ENOTSUP) if operation is not supported.
 *   - (-ENODEV) if *port_id* invalid.
 *   - (-EIO) if device is removed.
 *   - (>0) if *fw_size* is not enough to store firmware version, return
 *          the size of the non truncated string.
 */
int rte_eth_dev_fw_version_get(uint16_t port_id,
			       char *fw_version, size_t fw_size);

/**
 * Retrieve the supported packet types of an Ethernet device.
 *
 * When a packet type is announced as supported, it *must* be recognized by
 * the PMD. For instance, if RTE_PTYPE_L2_ETHER, RTE_PTYPE_L2_ETHER_VLAN
 * and RTE_PTYPE_L3_IPV4 are announced, the PMD must return the following
 * packet types for these packets:
 * - Ether/IPv4              -> RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4
 * - Ether/Vlan/IPv4         -> RTE_PTYPE_L2_ETHER_VLAN | RTE_PTYPE_L3_IPV4
 * - Ether/[anything else]   -> RTE_PTYPE_L2_ETHER
 * - Ether/Vlan/[anything else] -> RTE_PTYPE_L2_ETHER_VLAN
 *
 * When a packet is received by a PMD, the most precise type must be
 * returned among the ones supported. However a PMD is allowed to set
 * packet type that is not in the supported list, at the condition that it
 * is more precise. Therefore, a PMD announcing no supported packet types
 * can still set a matching packet type in a received packet.
 *
 * @note
 *   Better to invoke this API after the device is already started or rx burst
 *   function is decided, to obtain correct supported ptypes.
 * @note
 *   if a given PMD does not report what ptypes it supports, then the supported
 *   ptype count is reported as 0.
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param ptype_mask
 *   A hint of what kind of packet type which the caller is interested in.
 * @param ptypes
 *   An array pointer to store adequate packet types, allocated by caller.
 * @param num
 *  Size of the array pointed by param ptypes.
 * @return
 *   - (>=0) Number of supported ptypes. If the number of types exceeds num,
 *           only num entries will be filled into the ptypes array, but the full
 *           count of supported ptypes will be returned.
 *   - (-ENODEV) if *port_id* invalid.
 */
int rte_eth_dev_get_supported_ptypes(uint16_t port_id, uint32_t ptype_mask,
				     uint32_t *ptypes, int num);

/**
 * Retrieve the MTU of an Ethernet device.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param mtu
 *   A pointer to a uint16_t where the retrieved MTU is to be stored.
 * @return
 *   - (0) if successful.
 *   - (-ENODEV) if *port_id* invalid.
 */
int rte_eth_dev_get_mtu(uint16_t port_id, uint16_t *mtu);

/**
 * Change the MTU of an Ethernet device.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param mtu
 *   A uint16_t for the MTU to be applied.
 * @return
 *   - (0) if successful.
 *   - (-ENOTSUP) if operation is not supported.
 *   - (-ENODEV) if *port_id* invalid.
 *   - (-EIO) if device is removed.
 *   - (-EINVAL) if *mtu* invalid.
 *   - (-EBUSY) if operation is not allowed when the port is running
 */
int rte_eth_dev_set_mtu(uint16_t port_id, uint16_t mtu);

/**
 * Enable/Disable hardware filtering by an Ethernet device of received
 * VLAN packets tagged with a given VLAN Tag Identifier.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param vlan_id
 *   The VLAN Tag Identifier whose filtering must be enabled or disabled.
 * @param on
 *   If > 0, enable VLAN filtering of VLAN packets tagged with *vlan_id*.
 *   Otherwise, disable VLAN filtering of VLAN packets tagged with *vlan_id*.
 * @return
 *   - (0) if successful.
 *   - (-ENOSUP) if hardware-assisted VLAN filtering not configured.
 *   - (-ENODEV) if *port_id* invalid.
 *   - (-EIO) if device is removed.
 *   - (-ENOSYS) if VLAN filtering on *port_id* disabled.
 *   - (-EINVAL) if *vlan_id* > 4095.
 */
int rte_eth_dev_vlan_filter(uint16_t port_id, uint16_t vlan_id, int on);

/**
 * Enable/Disable hardware VLAN Strip by a rx queue of an Ethernet device.
 * 82599/X540/X550 can support VLAN stripping at the rx queue level
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param rx_queue_id
 *   The index of the receive queue for which a queue stats mapping is required.
 *   The value must be in the range [0, nb_rx_queue - 1] previously supplied
 *   to rte_eth_dev_configure().
 * @param on
 *   If 1, Enable VLAN Stripping of the receive queue of the Ethernet port.
 *   If 0, Disable VLAN Stripping of the receive queue of the Ethernet port.
 * @return
 *   - (0) if successful.
 *   - (-ENOSUP) if hardware-assisted VLAN stripping not configured.
 *   - (-ENODEV) if *port_id* invalid.
 *   - (-EINVAL) if *rx_queue_id* invalid.
 */
int rte_eth_dev_set_vlan_strip_on_queue(uint16_t port_id, uint16_t rx_queue_id,
		int on);

/**
 * Set the Outer VLAN Ether Type by an Ethernet device, it can be inserted to
 * the VLAN Header. This is a register setup available on some Intel NIC, not
 * but all, please check the data sheet for availability.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param vlan_type
 *   The vlan type.
 * @param tag_type
 *   The Tag Protocol ID
 * @return
 *   - (0) if successful.
 *   - (-ENOSUP) if hardware-assisted VLAN TPID setup is not supported.
 *   - (-ENODEV) if *port_id* invalid.
 *   - (-EIO) if device is removed.
 */
int rte_eth_dev_set_vlan_ether_type(uint16_t port_id,
				    enum rte_vlan_type vlan_type,
				    uint16_t tag_type);

/**
 * Set VLAN offload configuration on an Ethernet device
 * Enable/Disable Extended VLAN by an Ethernet device, This is a register setup
 * available on some Intel NIC, not but all, please check the data sheet for
 * availability.
 * Enable/Disable VLAN Strip can be done on rx queue for certain NIC, but here
 * the configuration is applied on the port level.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param offload_mask
 *   The VLAN Offload bit mask can be mixed use with "OR"
 *       ETH_VLAN_STRIP_OFFLOAD
 *       ETH_VLAN_FILTER_OFFLOAD
 *       ETH_VLAN_EXTEND_OFFLOAD
 * @return
 *   - (0) if successful.
 *   - (-ENOSUP) if hardware-assisted VLAN filtering not configured.
 *   - (-ENODEV) if *port_id* invalid.
 *   - (-EIO) if device is removed.
 */
int rte_eth_dev_set_vlan_offload(uint16_t port_id, int offload_mask);

/**
 * Read VLAN Offload configuration from an Ethernet device
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @return
 *   - (>0) if successful. Bit mask to indicate
 *       ETH_VLAN_STRIP_OFFLOAD
 *       ETH_VLAN_FILTER_OFFLOAD
 *       ETH_VLAN_EXTEND_OFFLOAD
 *   - (-ENODEV) if *port_id* invalid.
 */
int rte_eth_dev_get_vlan_offload(uint16_t port_id);

/**
 * Set port based TX VLAN insertion on or off.
 *
 * @param port_id
 *  The port identifier of the Ethernet device.
 * @param pvid
 *  Port based TX VLAN identifier together with user priority.
 * @param on
 *  Turn on or off the port based TX VLAN insertion.
 *
 * @return
 *   - (0) if successful.
 *   - negative if failed.
 */
int rte_eth_dev_set_vlan_pvid(uint16_t port_id, uint16_t pvid, int on);

typedef void (*buffer_tx_error_fn)(struct rte_mbuf **unsent, uint16_t count,
		void *userdata);

/**
 * Structure used to buffer packets for future TX
 * Used by APIs rte_eth_tx_buffer and rte_eth_tx_buffer_flush
 */
struct rte_eth_dev_tx_buffer {
	buffer_tx_error_fn error_callback;
	void *error_userdata;
	uint16_t size;           /**< Size of buffer for buffered tx */
	uint16_t length;         /**< Number of packets in the array */
	struct rte_mbuf *pkts[];
	/**< Pending packets to be sent on explicit flush or when full */
};

/**
 * Calculate the size of the tx buffer.
 *
 * @param sz
 *   Number of stored packets.
 */
#define RTE_ETH_TX_BUFFER_SIZE(sz) \
	(sizeof(struct rte_eth_dev_tx_buffer) + (sz) * sizeof(struct rte_mbuf *))

/**
 * Initialize default values for buffered transmitting
 *
 * @param buffer
 *   Tx buffer to be initialized.
 * @param size
 *   Buffer size
 * @return
 *   0 if no error
 */
int
rte_eth_tx_buffer_init(struct rte_eth_dev_tx_buffer *buffer, uint16_t size);

/**
 * Configure a callback for buffered packets which cannot be sent
 *
 * Register a specific callback to be called when an attempt is made to send
 * all packets buffered on an ethernet port, but not all packets can
 * successfully be sent. The callback registered here will be called only
 * from calls to rte_eth_tx_buffer() and rte_eth_tx_buffer_flush() APIs.
 * The default callback configured for each queue by default just frees the
 * packets back to the calling mempool. If additional behaviour is required,
 * for example, to count dropped packets, or to retry transmission of packets
 * which cannot be sent, this function should be used to register a suitable
 * callback function to implement the desired behaviour.
 * The example callback "rte_eth_count_unsent_packet_callback()" is also
 * provided as reference.
 *
 * @param buffer
 *   The port identifier of the Ethernet device.
 * @param callback
 *   The function to be used as the callback.
 * @param userdata
 *   Arbitrary parameter to be passed to the callback function
 * @return
 *   0 on success, or -1 on error with rte_errno set appropriately
 */
int
rte_eth_tx_buffer_set_err_callback(struct rte_eth_dev_tx_buffer *buffer,
		buffer_tx_error_fn callback, void *userdata);

/**
 * Callback function for silently dropping unsent buffered packets.
 *
 * This function can be passed to rte_eth_tx_buffer_set_err_callback() to
 * adjust the default behavior when buffered packets cannot be sent. This
 * function drops any unsent packets silently and is used by tx buffered
 * operations as default behavior.
 *
 * NOTE: this function should not be called directly, instead it should be used
 *       as a callback for packet buffering.
 *
 * NOTE: when configuring this function as a callback with
 *       rte_eth_tx_buffer_set_err_callback(), the final, userdata parameter
 *       should point to an uint64_t value.
 *
 * @param pkts
 *   The previously buffered packets which could not be sent
 * @param unsent
 *   The number of unsent packets in the pkts array
 * @param userdata
 *   Not used
 */
void
rte_eth_tx_buffer_drop_callback(struct rte_mbuf **pkts, uint16_t unsent,
		void *userdata);

/**
 * Callback function for tracking unsent buffered packets.
 *
 * This function can be passed to rte_eth_tx_buffer_set_err_callback() to
 * adjust the default behavior when buffered packets cannot be sent. This
 * function drops any unsent packets, but also updates a user-supplied counter
 * to track the overall number of packets dropped. The counter should be an
 * uint64_t variable.
 *
 * NOTE: this function should not be called directly, instead it should be used
 *       as a callback for packet buffering.
 *
 * NOTE: when configuring this function as a callback with
 *       rte_eth_tx_buffer_set_err_callback(), the final, userdata parameter
 *       should point to an uint64_t value.
 *
 * @param pkts
 *   The previously buffered packets which could not be sent
 * @param unsent
 *   The number of unsent packets in the pkts array
 * @param userdata
 *   Pointer to an uint64_t value, which will be incremented by unsent
 */
void
rte_eth_tx_buffer_count_callback(struct rte_mbuf **pkts, uint16_t unsent,
		void *userdata);

/**
 * Request the driver to free mbufs currently cached by the driver. The
 * driver will only free the mbuf if it is no longer in use. It is the
 * application's responsibity to ensure rte_eth_tx_buffer_flush(..) is
 * called if needed.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param queue_id
 *   The index of the transmit queue through which output packets must be
 *   sent.
 *   The value must be in the range [0, nb_tx_queue - 1] previously supplied
 *   to rte_eth_dev_configure().
 * @param free_cnt
 *   Maximum number of packets to free. Use 0 to indicate all possible packets
 *   should be freed. Note that a packet may be using multiple mbufs.
 * @return
 *   Failure: < 0
 *     -ENODEV: Invalid interface
 *     -EIO: device is removed
 *     -ENOTSUP: Driver does not support function
 *   Success: >= 0
 *     0-n: Number of packets freed. More packets may still remain in ring that
 *     are in use.
 */
int
rte_eth_tx_done_cleanup(uint16_t port_id, uint16_t queue_id, uint32_t free_cnt);

/**
 * Subtypes for IPsec offload event(@ref RTE_ETH_EVENT_IPSEC) raised by
 * eth device.
 */
enum rte_eth_event_ipsec_subtype {
	RTE_ETH_EVENT_IPSEC_UNKNOWN = 0,
			/**< Unknown event type */
	RTE_ETH_EVENT_IPSEC_ESN_OVERFLOW,
			/**< Sequence number overflow */
	RTE_ETH_EVENT_IPSEC_SA_TIME_EXPIRY,
			/**< Soft time expiry of SA */
	RTE_ETH_EVENT_IPSEC_SA_BYTE_EXPIRY,
			/**< Soft byte expiry of SA */
	RTE_ETH_EVENT_IPSEC_MAX
			/**< Max value of this enum */
};

/**
 * Descriptor for @ref RTE_ETH_EVENT_IPSEC event. Used by eth dev to send extra
 * information of the IPsec offload event.
 */
struct rte_eth_event_ipsec_desc {
	enum rte_eth_event_ipsec_subtype subtype;
			/**< Type of RTE_ETH_EVENT_IPSEC_* event */
	uint64_t metadata;
			/**< Event specific metadata
			 *
			 * For the following events, *userdata* registered
			 * with the *rte_security_session* would be returned
			 * as metadata,
			 *
			 * - @ref RTE_ETH_EVENT_IPSEC_ESN_OVERFLOW
			 * - @ref RTE_ETH_EVENT_IPSEC_SA_TIME_EXPIRY
			 * - @ref RTE_ETH_EVENT_IPSEC_SA_BYTE_EXPIRY
			 *
			 * @see struct rte_security_session_conf
			 *
			 */
};

/**
 * The eth device event type for interrupt, and maybe others in the future.
 */
enum rte_eth_event_type {
	RTE_ETH_EVENT_UNKNOWN,  /**< unknown event type */
	RTE_ETH_EVENT_INTR_LSC, /**< lsc interrupt event */
	RTE_ETH_EVENT_QUEUE_STATE,
				/**< queue state event (enabled/disabled) */
	RTE_ETH_EVENT_INTR_RESET,
			/**< reset interrupt event, sent to VF on PF reset */
	RTE_ETH_EVENT_VF_MBOX,  /**< message from the VF received by PF */
	RTE_ETH_EVENT_MACSEC,   /**< MACsec offload related event */
	RTE_ETH_EVENT_INTR_RMV, /**< device removal event */
	RTE_ETH_EVENT_NEW,      /**< port is probed */
	RTE_ETH_EVENT_DESTROY,  /**< port is released */
	RTE_ETH_EVENT_IPSEC,    /**< IPsec offload related event */
	RTE_ETH_EVENT_MAX       /**< max value of this enum */
};

typedef int (*rte_eth_dev_cb_fn)(uint16_t port_id,
		enum rte_eth_event_type event, void *cb_arg, void *ret_param);
/**< user application callback to be registered for interrupts */

/**
 * Register a callback function for port event.
 *
 * @param port_id
 *  Port id.
 *  RTE_ETH_ALL means register the event for all port ids.
 * @param event
 *  Event interested.
 * @param cb_fn
 *  User supplied callback function to be called.
 * @param cb_arg
 *  Pointer to the parameters for the registered callback.
 *
 * @return
 *  - On success, zero.
 *  - On failure, a negative value.
 */
int rte_eth_dev_callback_register(uint16_t port_id,
			enum rte_eth_event_type event,
		rte_eth_dev_cb_fn cb_fn, void *cb_arg);

/**
 * Unregister a callback function for port event.
 *
 * @param port_id
 *  Port id.
 *  RTE_ETH_ALL means unregister the event for all port ids.
 * @param event
 *  Event interested.
 * @param cb_fn
 *  User supplied callback function to be called.
 * @param cb_arg
 *  Pointer to the parameters for the registered callback. -1 means to
 *  remove all for the same callback address and same event.
 *
 * @return
 *  - On success, zero.
 *  - On failure, a negative value.
 */
int rte_eth_dev_callback_unregister(uint16_t port_id,
			enum rte_eth_event_type event,
		rte_eth_dev_cb_fn cb_fn, void *cb_arg);

/**
 * When there is no rx packet coming in Rx Queue for a long time, we can
 * sleep lcore related to RX Queue for power saving, and enable rx interrupt
 * to be triggered when Rx packet arrives.
 *
 * The rte_eth_dev_rx_intr_enable() function enables rx queue
 * interrupt on specific rx queue of a port.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param queue_id
 *   The index of the receive queue from which to retrieve input packets.
 *   The value must be in the range [0, nb_rx_queue - 1] previously supplied
 *   to rte_eth_dev_configure().
 * @return
 *   - (0) if successful.
 *   - (-ENOTSUP) if underlying hardware OR driver doesn't support
 *     that operation.
 *   - (-ENODEV) if *port_id* invalid.
 *   - (-EIO) if device is removed.
 */
int rte_eth_dev_rx_intr_enable(uint16_t port_id, uint16_t queue_id);

/**
 * When lcore wakes up from rx interrupt indicating packet coming, disable rx
 * interrupt and returns to polling mode.
 *
 * The rte_eth_dev_rx_intr_disable() function disables rx queue
 * interrupt on specific rx queue of a port.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param queue_id
 *   The index of the receive queue from which to retrieve input packets.
 *   The value must be in the range [0, nb_rx_queue - 1] previously supplied
 *   to rte_eth_dev_configure().
 * @return
 *   - (0) if successful.
 *   - (-ENOTSUP) if underlying hardware OR driver doesn't support
 *     that operation.
 *   - (-ENODEV) if *port_id* invalid.
 *   - (-EIO) if device is removed.
 */
int rte_eth_dev_rx_intr_disable(uint16_t port_id, uint16_t queue_id);

/**
 * RX Interrupt control per port.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param epfd
 *   Epoll instance fd which the intr vector associated to.
 *   Using RTE_EPOLL_PER_THREAD allows to use per thread epoll instance.
 * @param op
 *   The operation be performed for the vector.
 *   Operation type of {RTE_INTR_EVENT_ADD, RTE_INTR_EVENT_DEL}.
 * @param data
 *   User raw data.
 * @return
 *   - On success, zero.
 *   - On failure, a negative value.
 */
int rte_eth_dev_rx_intr_ctl(uint16_t port_id, int epfd, int op, void *data);

/**
 * RX Interrupt control per queue.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param queue_id
 *   The index of the receive queue from which to retrieve input packets.
 *   The value must be in the range [0, nb_rx_queue - 1] previously supplied
 *   to rte_eth_dev_configure().
 * @param epfd
 *   Epoll instance fd which the intr vector associated to.
 *   Using RTE_EPOLL_PER_THREAD allows to use per thread epoll instance.
 * @param op
 *   The operation be performed for the vector.
 *   Operation type of {RTE_INTR_EVENT_ADD, RTE_INTR_EVENT_DEL}.
 * @param data
 *   User raw data.
 * @return
 *   - On success, zero.
 *   - On failure, a negative value.
 */
int rte_eth_dev_rx_intr_ctl_q(uint16_t port_id, uint16_t queue_id,
			      int epfd, int op, void *data);

/**
 * @warning
 * @b EXPERIMENTAL: this API may change without prior notice.
 *
 * Get interrupt fd per Rx queue.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param queue_id
 *   The index of the receive queue from which to retrieve input packets.
 *   The value must be in the range [0, nb_rx_queue - 1] previously supplied
 *   to rte_eth_dev_configure().
 * @return
 *   - (>=0) the interrupt fd associated to the requested Rx queue if
 *           successful.
 *   - (-1) on error.
 */
int __rte_experimental
rte_eth_dev_rx_intr_ctl_q_get_fd(uint16_t port_id, uint16_t queue_id);

/**
 * Turn on the LED on the Ethernet device.
 * This function turns on the LED on the Ethernet device.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @return
 *   - (0) if successful.
 *   - (-ENOTSUP) if underlying hardware OR driver doesn't support
 *     that operation.
 *   - (-ENODEV) if *port_id* invalid.
 *   - (-EIO) if device is removed.
 */
int  rte_eth_led_on(uint16_t port_id);

/**
 * Turn off the LED on the Ethernet device.
 * This function turns off the LED on the Ethernet device.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @return
 *   - (0) if successful.
 *   - (-ENOTSUP) if underlying hardware OR driver doesn't support
 *     that operation.
 *   - (-ENODEV) if *port_id* invalid.
 *   - (-EIO) if device is removed.
 */
int  rte_eth_led_off(uint16_t port_id);

/**
 * Get current status of the Ethernet link flow control for Ethernet device
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param fc_conf
 *   The pointer to the structure where to store the flow control parameters.
 * @return
 *   - (0) if successful.
 *   - (-ENOTSUP) if hardware doesn't support flow control.
 *   - (-ENODEV)  if *port_id* invalid.
 *   - (-EIO)  if device is removed.
 */
int rte_eth_dev_flow_ctrl_get(uint16_t port_id,
			      struct rte_eth_fc_conf *fc_conf);

/**
 * Configure the Ethernet link flow control for Ethernet device
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param fc_conf
 *   The pointer to the structure of the flow control parameters.
 * @return
 *   - (0) if successful.
 *   - (-ENOTSUP) if hardware doesn't support flow control mode.
 *   - (-ENODEV)  if *port_id* invalid.
 *   - (-EINVAL)  if bad parameter
 *   - (-EIO)     if flow control setup failure or device is removed.
 */
int rte_eth_dev_flow_ctrl_set(uint16_t port_id,
			      struct rte_eth_fc_conf *fc_conf);

/**
 * Configure the Ethernet priority flow control under DCB environment
 * for Ethernet device.
 *
 * @param port_id
 * The port identifier of the Ethernet device.
 * @param pfc_conf
 * The pointer to the structure of the priority flow control parameters.
 * @return
 *   - (0) if successful.
 *   - (-ENOTSUP) if hardware doesn't support priority flow control mode.
 *   - (-ENODEV)  if *port_id* invalid.
 *   - (-EINVAL)  if bad parameter
 *   - (-EIO)     if flow control setup failure or device is removed.
 */
int rte_eth_dev_priority_flow_ctrl_set(uint16_t port_id,
				struct rte_eth_pfc_conf *pfc_conf);

/**
 * Add a MAC address to an internal array of addresses used to enable whitelist
 * filtering to accept packets only if the destination MAC address matches.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param mac_addr
 *   The MAC address to add.
 * @param pool
 *   VMDq pool index to associate address with (if VMDq is enabled). If VMDq is
 *   not enabled, this should be set to 0.
 * @return
 *   - (0) if successfully added or *mac_addr* was already added.
 *   - (-ENOTSUP) if hardware doesn't support this feature.
 *   - (-ENODEV) if *port* is invalid.
 *   - (-EIO) if device is removed.
 *   - (-ENOSPC) if no more MAC addresses can be added.
 *   - (-EINVAL) if MAC address is invalid.
 */
int rte_eth_dev_mac_addr_add(uint16_t port_id, struct ether_addr *mac_addr,
				uint32_t pool);

/**
 * Remove a MAC address from the internal array of addresses.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param mac_addr
 *   MAC address to remove.
 * @return
 *   - (0) if successful, or *mac_addr* didn't exist.
 *   - (-ENOTSUP) if hardware doesn't support.
 *   - (-ENODEV) if *port* invalid.
 *   - (-EADDRINUSE) if attempting to remove the default MAC address
 */
int rte_eth_dev_mac_addr_remove(uint16_t port_id, struct ether_addr *mac_addr);

/**
 * Set the default MAC address.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param mac_addr
 *   New default MAC address.
 * @return
 *   - (0) if successful, or *mac_addr* didn't exist.
 *   - (-ENOTSUP) if hardware doesn't support.
 *   - (-ENODEV) if *port* invalid.
 *   - (-EINVAL) if MAC address is invalid.
 */
int rte_eth_dev_default_mac_addr_set(uint16_t port_id,
		struct ether_addr *mac_addr);

/**
 * Update Redirection Table(RETA) of Receive Side Scaling of Ethernet device.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param reta_conf
 *   RETA to update.
 * @param reta_size
 *   Redirection table size. The table size can be queried by
 *   rte_eth_dev_info_get().
 * @return
 *   - (0) if successful.
 *   - (-ENOTSUP) if hardware doesn't support.
 *   - (-EINVAL) if bad parameter.
 *   - (-EIO) if device is removed.
 */
int rte_eth_dev_rss_reta_update(uint16_t port_id,
				struct rte_eth_rss_reta_entry64 *reta_conf,
				uint16_t reta_size);

 /**
 * Query Redirection Table(RETA) of Receive Side Scaling of Ethernet device.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param reta_conf
 *   RETA to query.
 * @param reta_size
 *   Redirection table size. The table size can be queried by
 *   rte_eth_dev_info_get().
 * @return
 *   - (0) if successful.
 *   - (-ENOTSUP) if hardware doesn't support.
 *   - (-EINVAL) if bad parameter.
 *   - (-EIO) if device is removed.
 */
int rte_eth_dev_rss_reta_query(uint16_t port_id,
			       struct rte_eth_rss_reta_entry64 *reta_conf,
			       uint16_t reta_size);

 /**
 * Updates unicast hash table for receiving packet with the given destination
 * MAC address, and the packet is routed to all VFs for which the RX mode is
 * accept packets that match the unicast hash table.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param addr
 *   Unicast MAC address.
 * @param on
 *    1 - Set an unicast hash bit for receiving packets with the MAC address.
 *    0 - Clear an unicast hash bit.
 * @return
 *   - (0) if successful.
 *   - (-ENOTSUP) if hardware doesn't support.
  *  - (-ENODEV) if *port_id* invalid.
 *   - (-EIO) if device is removed.
 *   - (-EINVAL) if bad parameter.
 */
int rte_eth_dev_uc_hash_table_set(uint16_t port_id, struct ether_addr *addr,
				  uint8_t on);

 /**
 * Updates all unicast hash bitmaps for receiving packet with any Unicast
 * Ethernet MAC addresses,the packet is routed to all VFs for which the RX
 * mode is accept packets that match the unicast hash table.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param on
 *    1 - Set all unicast hash bitmaps for receiving all the Ethernet
 *         MAC addresses
 *    0 - Clear all unicast hash bitmaps
 * @return
 *   - (0) if successful.
 *   - (-ENOTSUP) if hardware doesn't support.
  *  - (-ENODEV) if *port_id* invalid.
 *   - (-EIO) if device is removed.
 *   - (-EINVAL) if bad parameter.
 */
int rte_eth_dev_uc_all_hash_table_set(uint16_t port_id, uint8_t on);

/**
 * Set a traffic mirroring rule on an Ethernet device
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param mirror_conf
 *   The pointer to the traffic mirroring structure describing the mirroring rule.
 *   The *rte_eth_vm_mirror_conf* structure includes the type of mirroring rule,
 *   destination pool and the value of rule if enable vlan or pool mirroring.
 *
 * @param rule_id
 *   The index of traffic mirroring rule, we support four separated rules.
 * @param on
 *   1 - Enable a mirroring rule.
 *   0 - Disable a mirroring rule.
 * @return
 *   - (0) if successful.
 *   - (-ENOTSUP) if hardware doesn't support this feature.
 *   - (-ENODEV) if *port_id* invalid.
 *   - (-EIO) if device is removed.
 *   - (-EINVAL) if the mr_conf information is not correct.
 */
int rte_eth_mirror_rule_set(uint16_t port_id,
			struct rte_eth_mirror_conf *mirror_conf,
			uint8_t rule_id,
			uint8_t on);

/**
 * Reset a traffic mirroring rule on an Ethernet device.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param rule_id
 *   The index of traffic mirroring rule, we support four separated rules.
 * @return
 *   - (0) if successful.
 *   - (-ENOTSUP) if hardware doesn't support this feature.
 *   - (-ENODEV) if *port_id* invalid.
 *   - (-EIO) if device is removed.
 *   - (-EINVAL) if bad parameter.
 */
int rte_eth_mirror_rule_reset(uint16_t port_id,
					 uint8_t rule_id);

/**
 * Set the rate limitation for a queue on an Ethernet device.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param queue_idx
 *   The queue id.
 * @param tx_rate
 *   The tx rate in Mbps. Allocated from the total port link speed.
 * @return
 *   - (0) if successful.
 *   - (-ENOTSUP) if hardware doesn't support this feature.
 *   - (-ENODEV) if *port_id* invalid.
 *   - (-EIO) if device is removed.
 *   - (-EINVAL) if bad parameter.
 */
int rte_eth_set_queue_rate_limit(uint16_t port_id, uint16_t queue_idx,
			uint16_t tx_rate);

 /**
 * Configuration of Receive Side Scaling hash computation of Ethernet device.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param rss_conf
 *   The new configuration to use for RSS hash computation on the port.
 * @return
 *   - (0) if successful.
 *   - (-ENODEV) if port identifier is invalid.
 *   - (-EIO) if device is removed.
 *   - (-ENOTSUP) if hardware doesn't support.
 *   - (-EINVAL) if bad parameter.
 */
int rte_eth_dev_rss_hash_update(uint16_t port_id,
				struct rte_eth_rss_conf *rss_conf);

 /**
 * Retrieve current configuration of Receive Side Scaling hash computation
 * of Ethernet device.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param rss_conf
 *   Where to store the current RSS hash configuration of the Ethernet device.
 * @return
 *   - (0) if successful.
 *   - (-ENODEV) if port identifier is invalid.
 *   - (-EIO) if device is removed.
 *   - (-ENOTSUP) if hardware doesn't support RSS.
 */
int
rte_eth_dev_rss_hash_conf_get(uint16_t port_id,
			      struct rte_eth_rss_conf *rss_conf);

 /**
 * Add UDP tunneling port for a specific type of tunnel.
 * The packets with this UDP port will be identified as this type of tunnel.
 * Before enabling any offloading function for a tunnel, users can call this API
 * to change or add more UDP port for the tunnel. So the offloading function
 * can take effect on the packets with the specific UDP port.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param tunnel_udp
 *   UDP tunneling configuration.
 *
 * @return
 *   - (0) if successful.
 *   - (-ENODEV) if port identifier is invalid.
 *   - (-EIO) if device is removed.
 *   - (-ENOTSUP) if hardware doesn't support tunnel type.
 */
int
rte_eth_dev_udp_tunnel_port_add(uint16_t port_id,
				struct rte_eth_udp_tunnel *tunnel_udp);

 /**
 * Delete UDP tunneling port a specific type of tunnel.
 * The packets with this UDP port will not be identified as this type of tunnel
 * any more.
 * Before enabling any offloading function for a tunnel, users can call this API
 * to delete a UDP port for the tunnel. So the offloading function will not take
 * effect on the packets with the specific UDP port.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param tunnel_udp
 *   UDP tunneling configuration.
 *
 * @return
 *   - (0) if successful.
 *   - (-ENODEV) if port identifier is invalid.
 *   - (-EIO) if device is removed.
 *   - (-ENOTSUP) if hardware doesn't support tunnel type.
 */
int
rte_eth_dev_udp_tunnel_port_delete(uint16_t port_id,
				   struct rte_eth_udp_tunnel *tunnel_udp);

/**
 * Check whether the filter type is supported on an Ethernet device.
 * All the supported filter types are defined in 'rte_eth_ctrl.h'.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param filter_type
 *   Filter type.
 * @return
 *   - (0) if successful.
 *   - (-ENOTSUP) if hardware doesn't support this filter type.
 *   - (-ENODEV) if *port_id* invalid.
 *   - (-EIO) if device is removed.
 */
int rte_eth_dev_filter_supported(uint16_t port_id,
		enum rte_filter_type filter_type);

/**
 * Take operations to assigned filter type on an Ethernet device.
 * All the supported operations and filter types are defined in 'rte_eth_ctrl.h'.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param filter_type
 *   Filter type.
 * @param filter_op
 *   Type of operation.
 * @param arg
 *   A pointer to arguments defined specifically for the operation.
 * @return
 *   - (0) if successful.
 *   - (-ENOTSUP) if hardware doesn't support.
 *   - (-ENODEV) if *port_id* invalid.
 *   - (-EIO) if device is removed.
 *   - others depends on the specific operations implementation.
 */
int rte_eth_dev_filter_ctrl(uint16_t port_id, enum rte_filter_type filter_type,
			enum rte_filter_op filter_op, void *arg);

/**
 * Get DCB information on an Ethernet device.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param dcb_info
 *   dcb information.
 * @return
 *   - (0) if successful.
 *   - (-ENODEV) if port identifier is invalid.
 *   - (-EIO) if device is removed.
 *   - (-ENOTSUP) if hardware doesn't support.
 */
int rte_eth_dev_get_dcb_info(uint16_t port_id,
			     struct rte_eth_dcb_info *dcb_info);

struct rte_eth_rxtx_callback;

/**
 * Add a callback to be called on packet RX on a given port and queue.
 *
 * This API configures a function to be called for each burst of
 * packets received on a given NIC port queue. The return value is a pointer
 * that can be used to later remove the callback using
 * rte_eth_remove_rx_callback().
 *
 * Multiple functions are called in the order that they are added.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param queue_id
 *   The queue on the Ethernet device on which the callback is to be added.
 * @param fn
 *   The callback function
 * @param user_param
 *   A generic pointer parameter which will be passed to each invocation of the
 *   callback function on this port and queue.
 *
 * @return
 *   NULL on error.
 *   On success, a pointer value which can later be used to remove the callback.
 */
const struct rte_eth_rxtx_callback *
rte_eth_add_rx_callback(uint16_t port_id, uint16_t queue_id,
		rte_rx_callback_fn fn, void *user_param);

/**
 * Add a callback that must be called first on packet RX on a given port
 * and queue.
 *
 * This API configures a first function to be called for each burst of
 * packets received on a given NIC port queue. The return value is a pointer
 * that can be used to later remove the callback using
 * rte_eth_remove_rx_callback().
 *
 * Multiple functions are called in the order that they are added.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param queue_id
 *   The queue on the Ethernet device on which the callback is to be added.
 * @param fn
 *   The callback function
 * @param user_param
 *   A generic pointer parameter which will be passed to each invocation of the
 *   callback function on this port and queue.
 *
 * @return
 *   NULL on error.
 *   On success, a pointer value which can later be used to remove the callback.
 */
const struct rte_eth_rxtx_callback *
rte_eth_add_first_rx_callback(uint16_t port_id, uint16_t queue_id,
		rte_rx_callback_fn fn, void *user_param);

/**
 * Add a callback to be called on packet TX on a given port and queue.
 *
 * This API configures a function to be called for each burst of
 * packets sent on a given NIC port queue. The return value is a pointer
 * that can be used to later remove the callback using
 * rte_eth_remove_tx_callback().
 *
 * Multiple functions are called in the order that they are added.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param queue_id
 *   The queue on the Ethernet device on which the callback is to be added.
 * @param fn
 *   The callback function
 * @param user_param
 *   A generic pointer parameter which will be passed to each invocation of the
 *   callback function on this port and queue.
 *
 * @return
 *   NULL on error.
 *   On success, a pointer value which can later be used to remove the callback.
 */
const struct rte_eth_rxtx_callback *
rte_eth_add_tx_callback(uint16_t port_id, uint16_t queue_id,
		rte_tx_callback_fn fn, void *user_param);

/**
 * Remove an RX packet callback from a given port and queue.
 *
 * This function is used to removed callbacks that were added to a NIC port
 * queue using rte_eth_add_rx_callback().
 *
 * Note: the callback is removed from the callback list but it isn't freed
 * since the it may still be in use. The memory for the callback can be
 * subsequently freed back by the application by calling rte_free():
 *
 * - Immediately - if the port is stopped, or the user knows that no
 *   callbacks are in flight e.g. if called from the thread doing RX/TX
 *   on that queue.
 *
 * - After a short delay - where the delay is sufficient to allow any
 *   in-flight callbacks to complete.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param queue_id
 *   The queue on the Ethernet device from which the callback is to be removed.
 * @param user_cb
 *   User supplied callback created via rte_eth_add_rx_callback().
 *
 * @return
 *   - 0: Success. Callback was removed.
 *   - -ENOTSUP: Callback support is not available.
 *   - -EINVAL:  The port_id or the queue_id is out of range, or the callback
 *               is NULL or not found for the port/queue.
 */
int rte_eth_remove_rx_callback(uint16_t port_id, uint16_t queue_id,
		const struct rte_eth_rxtx_callback *user_cb);

/**
 * Remove a TX packet callback from a given port and queue.
 *
 * This function is used to removed callbacks that were added to a NIC port
 * queue using rte_eth_add_tx_callback().
 *
 * Note: the callback is removed from the callback list but it isn't freed
 * since the it may still be in use. The memory for the callback can be
 * subsequently freed back by the application by calling rte_free():
 *
 * - Immediately - if the port is stopped, or the user knows that no
 *   callbacks are in flight e.g. if called from the thread doing RX/TX
 *   on that queue.
 *
 * - After a short delay - where the delay is sufficient to allow any
 *   in-flight callbacks to complete.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param queue_id
 *   The queue on the Ethernet device from which the callback is to be removed.
 * @param user_cb
 *   User supplied callback created via rte_eth_add_tx_callback().
 *
 * @return
 *   - 0: Success. Callback was removed.
 *   - -ENOTSUP: Callback support is not available.
 *   - -EINVAL:  The port_id or the queue_id is out of range, or the callback
 *               is NULL or not found for the port/queue.
 */
int rte_eth_remove_tx_callback(uint16_t port_id, uint16_t queue_id,
		const struct rte_eth_rxtx_callback *user_cb);

/**
 * Retrieve information about given port's RX queue.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param queue_id
 *   The RX queue on the Ethernet device for which information
 *   will be retrieved.
 * @param qinfo
 *   A pointer to a structure of type *rte_eth_rxq_info_info* to be filled with
 *   the information of the Ethernet device.
 *
 * @return
 *   - 0: Success
 *   - -ENOTSUP: routine is not supported by the device PMD.
 *   - -EINVAL:  The port_id or the queue_id is out of range.
 */
int rte_eth_rx_queue_info_get(uint16_t port_id, uint16_t queue_id,
	struct rte_eth_rxq_info *qinfo);

/**
 * Retrieve information about given port's TX queue.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param queue_id
 *   The TX queue on the Ethernet device for which information
 *   will be retrieved.
 * @param qinfo
 *   A pointer to a structure of type *rte_eth_txq_info_info* to be filled with
 *   the information of the Ethernet device.
 *
 * @return
 *   - 0: Success
 *   - -ENOTSUP: routine is not supported by the device PMD.
 *   - -EINVAL:  The port_id or the queue_id is out of range.
 */
int rte_eth_tx_queue_info_get(uint16_t port_id, uint16_t queue_id,
	struct rte_eth_txq_info *qinfo);

/**
 * Retrieve device registers and register attributes (number of registers and
 * register size)
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param info
 *   Pointer to rte_dev_reg_info structure to fill in. If info->data is
 *   NULL the function fills in the width and length fields. If non-NULL
 *   the registers are put into the buffer pointed at by the data field.
 * @return
 *   - (0) if successful.
 *   - (-ENOTSUP) if hardware doesn't support.
 *   - (-ENODEV) if *port_id* invalid.
 *   - (-EIO) if device is removed.
 *   - others depends on the specific operations implementation.
 */
int rte_eth_dev_get_reg_info(uint16_t port_id, struct rte_dev_reg_info *info);

/**
 * Retrieve size of device EEPROM
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @return
 *   - (>=0) EEPROM size if successful.
 *   - (-ENOTSUP) if hardware doesn't support.
 *   - (-ENODEV) if *port_id* invalid.
 *   - (-EIO) if device is removed.
 *   - others depends on the specific operations implementation.
 */
int rte_eth_dev_get_eeprom_length(uint16_t port_id);

/**
 * Retrieve EEPROM and EEPROM attribute
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param info
 *   The template includes buffer for return EEPROM data and
 *   EEPROM attributes to be filled.
 * @return
 *   - (0) if successful.
 *   - (-ENOTSUP) if hardware doesn't support.
 *   - (-ENODEV) if *port_id* invalid.
 *   - (-EIO) if device is removed.
 *   - others depends on the specific operations implementation.
 */
int rte_eth_dev_get_eeprom(uint16_t port_id, struct rte_dev_eeprom_info *info);

/**
 * Program EEPROM with provided data
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param info
 *   The template includes EEPROM data for programming and
 *   EEPROM attributes to be filled
 * @return
 *   - (0) if successful.
 *   - (-ENOTSUP) if hardware doesn't support.
 *   - (-ENODEV) if *port_id* invalid.
 *   - (-EIO) if device is removed.
 *   - others depends on the specific operations implementation.
 */
int rte_eth_dev_set_eeprom(uint16_t port_id, struct rte_dev_eeprom_info *info);

/**
 * @warning
 * @b EXPERIMENTAL: this API may change without prior notice.
 *
 * Retrieve the type and size of plugin module EEPROM
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param modinfo
 *   The type and size of plugin module EEPROM.
 * @return
 *   - (0) if successful.
 *   - (-ENOTSUP) if hardware doesn't support.
 *   - (-ENODEV) if *port_id* invalid.
 *   - (-EIO) if device is removed.
 *   - others depends on the specific operations implementation.
 */
int __rte_experimental
rte_eth_dev_get_module_info(uint16_t port_id,
			    struct rte_eth_dev_module_info *modinfo);

/**
 * @warning
 * @b EXPERIMENTAL: this API may change without prior notice.
 *
 * Retrieve the data of plugin module EEPROM
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param info
 *   The template includes the plugin module EEPROM attributes, and the
 *   buffer for return plugin module EEPROM data.
 * @return
 *   - (0) if successful.
 *   - (-ENOTSUP) if hardware doesn't support.
 *   - (-ENODEV) if *port_id* invalid.
 *   - (-EIO) if device is removed.
 *   - others depends on the specific operations implementation.
 */
int __rte_experimental
rte_eth_dev_get_module_eeprom(uint16_t port_id,
			      struct rte_dev_eeprom_info *info);

/**
 * Set the list of multicast addresses to filter on an Ethernet device.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param mc_addr_set
 *   The array of multicast addresses to set. Equal to NULL when the function
 *   is invoked to flush the set of filtered addresses.
 * @param nb_mc_addr
 *   The number of multicast addresses in the *mc_addr_set* array. Equal to 0
 *   when the function is invoked to flush the set of filtered addresses.
 * @return
 *   - (0) if successful.
 *   - (-ENODEV) if *port_id* invalid.
 *   - (-EIO) if device is removed.
 *   - (-ENOTSUP) if PMD of *port_id* doesn't support multicast filtering.
 *   - (-ENOSPC) if *port_id* has not enough multicast filtering resources.
 */
int rte_eth_dev_set_mc_addr_list(uint16_t port_id,
				 struct ether_addr *mc_addr_set,
				 uint32_t nb_mc_addr);

/**
 * Enable IEEE1588/802.1AS timestamping for an Ethernet device.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 *
 * @return
 *   - 0: Success.
 *   - -ENODEV: The port ID is invalid.
 *   - -EIO: if device is removed.
 *   - -ENOTSUP: The function is not supported by the Ethernet driver.
 */
int rte_eth_timesync_enable(uint16_t port_id);

/**
 * Disable IEEE1588/802.1AS timestamping for an Ethernet device.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 *
 * @return
 *   - 0: Success.
 *   - -ENODEV: The port ID is invalid.
 *   - -EIO: if device is removed.
 *   - -ENOTSUP: The function is not supported by the Ethernet driver.
 */
int rte_eth_timesync_disable(uint16_t port_id);

/**
 * Read an IEEE1588/802.1AS RX timestamp from an Ethernet device.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param timestamp
 *   Pointer to the timestamp struct.
 * @param flags
 *   Device specific flags. Used to pass the RX timesync register index to
 *   i40e. Unused in igb/ixgbe, pass 0 instead.
 *
 * @return
 *   - 0: Success.
 *   - -EINVAL: No timestamp is available.
 *   - -ENODEV: The port ID is invalid.
 *   - -EIO: if device is removed.
 *   - -ENOTSUP: The function is not supported by the Ethernet driver.
 */
int rte_eth_timesync_read_rx_timestamp(uint16_t port_id,
		struct timespec *timestamp, uint32_t flags);

/**
 * Read an IEEE1588/802.1AS TX timestamp from an Ethernet device.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param timestamp
 *   Pointer to the timestamp struct.
 *
 * @return
 *   - 0: Success.
 *   - -EINVAL: No timestamp is available.
 *   - -ENODEV: The port ID is invalid.
 *   - -EIO: if device is removed.
 *   - -ENOTSUP: The function is not supported by the Ethernet driver.
 */
int rte_eth_timesync_read_tx_timestamp(uint16_t port_id,
		struct timespec *timestamp);

/**
 * Adjust the timesync clock on an Ethernet device.
 *
 * This is usually used in conjunction with other Ethdev timesync functions to
 * synchronize the device time using the IEEE1588/802.1AS protocol.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param delta
 *   The adjustment in nanoseconds.
 *
 * @return
 *   - 0: Success.
 *   - -ENODEV: The port ID is invalid.
 *   - -EIO: if device is removed.
 *   - -ENOTSUP: The function is not supported by the Ethernet driver.
 */
int rte_eth_timesync_adjust_time(uint16_t port_id, int64_t delta);

/**
 * Read the time from the timesync clock on an Ethernet device.
 *
 * This is usually used in conjunction with other Ethdev timesync functions to
 * synchronize the device time using the IEEE1588/802.1AS protocol.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param time
 *   Pointer to the timespec struct that holds the time.
 *
 * @return
 *   - 0: Success.
 */
int rte_eth_timesync_read_time(uint16_t port_id, struct timespec *time);

/**
 * Set the time of the timesync clock on an Ethernet device.
 *
 * This is usually used in conjunction with other Ethdev timesync functions to
 * synchronize the device time using the IEEE1588/802.1AS protocol.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param time
 *   Pointer to the timespec struct that holds the time.
 *
 * @return
 *   - 0: Success.
 *   - -EINVAL: No timestamp is available.
 *   - -ENODEV: The port ID is invalid.
 *   - -EIO: if device is removed.
 *   - -ENOTSUP: The function is not supported by the Ethernet driver.
 */
int rte_eth_timesync_write_time(uint16_t port_id, const struct timespec *time);

/**
 * Config l2 tunnel ether type of an Ethernet device for filtering specific
 * tunnel packets by ether type.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param l2_tunnel
 *   l2 tunnel configuration.
 *
 * @return
 *   - (0) if successful.
 *   - (-ENODEV) if port identifier is invalid.
 *   - (-EIO) if device is removed.
 *   - (-ENOTSUP) if hardware doesn't support tunnel type.
 */
int
rte_eth_dev_l2_tunnel_eth_type_conf(uint16_t port_id,
				    struct rte_eth_l2_tunnel_conf *l2_tunnel);

/**
 * Enable/disable l2 tunnel offload functions. Include,
 * 1, The ability of parsing a type of l2 tunnel of an Ethernet device.
 *    Filtering, forwarding and offloading this type of tunnel packets depend on
 *    this ability.
 * 2, Stripping the l2 tunnel tag.
 * 3, Insertion of the l2 tunnel tag.
 * 4, Forwarding the packets based on the l2 tunnel tag.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param l2_tunnel
 *   l2 tunnel parameters.
 * @param mask
 *   Indicate the offload function.
 * @param en
 *   Enable or disable this function.
 *
 * @return
 *   - (0) if successful.
 *   - (-ENODEV) if port identifier is invalid.
 *   - (-EIO) if device is removed.
 *   - (-ENOTSUP) if hardware doesn't support tunnel type.
 */
int
rte_eth_dev_l2_tunnel_offload_set(uint16_t port_id,
				  struct rte_eth_l2_tunnel_conf *l2_tunnel,
				  uint32_t mask,
				  uint8_t en);

/**
* Get the port id from device name. The device name should be specified
* as below:
* - PCIe address (Domain:Bus:Device.Function), for example- 0000:2:00.0
* - SoC device name, for example- fsl-gmac0
* - vdev dpdk name, for example- net_[pcap0|null0|tap0]
*
* @param name
*  pci address or name of the device
* @param port_id
*   pointer to port identifier of the device
* @return
*   - (0) if successful and port_id is filled.
*   - (-ENODEV or -EINVAL) on failure.
*/
int
rte_eth_dev_get_port_by_name(const char *name, uint16_t *port_id);

/**
* Get the device name from port id. The device name is specified as below:
* - PCIe address (Domain:Bus:Device.Function), for example- 0000:02:00.0
* - SoC device name, for example- fsl-gmac0
* - vdev dpdk name, for example- net_[pcap0|null0|tun0|tap0]
*
* @param port_id
*   Port identifier of the device.
* @param name
*   Buffer of size RTE_ETH_NAME_MAX_LEN to store the name.
* @return
*   - (0) if successful.
*   - (-EINVAL) on failure.
*/
int
rte_eth_dev_get_name_by_port(uint16_t port_id, char *name);

/**
 * Check that numbers of Rx and Tx descriptors satisfy descriptors limits from
 * the ethernet device information, otherwise adjust them to boundaries.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param nb_rx_desc
 *   A pointer to a uint16_t where the number of receive
 *   descriptors stored.
 * @param nb_tx_desc
 *   A pointer to a uint16_t where the number of transmit
 *   descriptors stored.
 * @return
 *   - (0) if successful.
 *   - (-ENOTSUP, -ENODEV or -EINVAL) on failure.
 */
int rte_eth_dev_adjust_nb_rx_tx_desc(uint16_t port_id,
				     uint16_t *nb_rx_desc,
				     uint16_t *nb_tx_desc);

/**
 * Test if a port supports specific mempool ops.
 *
 * @param port_id
 *   Port identifier of the Ethernet device.
 * @param [in] pool
 *   The name of the pool operations to test.
 * @return
 *   - 0: best mempool ops choice for this port.
 *   - 1: mempool ops are supported for this port.
 *   - -ENOTSUP: mempool ops not supported for this port.
 *   - -ENODEV: Invalid port Identifier.
 *   - -EINVAL: Pool param is null.
 */
int
rte_eth_dev_pool_ops_supported(uint16_t port_id, const char *pool);

/**
 * Get the security context for the Ethernet device.
 *
 * @param port_id
 *   Port identifier of the Ethernet device
 * @return
 *   - NULL on error.
 *   - pointer to security context on success.
 */
void *
rte_eth_dev_get_sec_ctx(uint16_t port_id);


#include <rte_ethdev_core.h>

/**
 *
 * Retrieve a burst of input packets from a receive queue of an Ethernet
 * device. The retrieved packets are stored in *rte_mbuf* structures whose
 * pointers are supplied in the *rx_pkts* array.
 *
 * The rte_eth_rx_burst() function loops, parsing the RX ring of the
 * receive queue, up to *nb_pkts* packets, and for each completed RX
 * descriptor in the ring, it performs the following operations:
 *
 * - Initialize the *rte_mbuf* data structure associated with the
 *   RX descriptor according to the information provided by the NIC into
 *   that RX descriptor.
 *
 * - Store the *rte_mbuf* data structure into the next entry of the
 *   *rx_pkts* array.
 *
 * - Replenish the RX descriptor with a new *rte_mbuf* buffer
 *   allocated from the memory pool associated with the receive queue at
 *   initialization time.
 *
 * When retrieving an input packet that was scattered by the controller
 * into multiple receive descriptors, the rte_eth_rx_burst() function
 * appends the associated *rte_mbuf* buffers to the first buffer of the
 * packet.
 *
 * The rte_eth_rx_burst() function returns the number of packets
 * actually retrieved, which is the number of *rte_mbuf* data structures
 * effectively supplied into the *rx_pkts* array.
 * A return value equal to *nb_pkts* indicates that the RX queue contained
 * at least *rx_pkts* packets, and this is likely to signify that other
 * received packets remain in the input queue. Applications implementing
 * a "retrieve as much received packets as possible" policy can check this
 * specific case and keep invoking the rte_eth_rx_burst() function until
 * a value less than *nb_pkts* is returned.
 *
 * This receive method has the following advantages:
 *
 * - It allows a run-to-completion network stack engine to retrieve and
 *   to immediately process received packets in a fast burst-oriented
 *   approach, avoiding the overhead of unnecessary intermediate packet
 *   queue/dequeue operations.
 *
 * - Conversely, it also allows an asynchronous-oriented processing
 *   method to retrieve bursts of received packets and to immediately
 *   queue them for further parallel processing by another logical core,
 *   for instance. However, instead of having received packets being
 *   individually queued by the driver, this approach allows the caller
 *   of the rte_eth_rx_burst() function to queue a burst of retrieved
 *   packets at a time and therefore dramatically reduce the cost of
 *   enqueue/dequeue operations per packet.
 *
 * - It allows the rte_eth_rx_burst() function of the driver to take
 *   advantage of burst-oriented hardware features (CPU cache,
 *   prefetch instructions, and so on) to minimize the number of CPU
 *   cycles per packet.
 *
 * To summarize, the proposed receive API enables many
 * burst-oriented optimizations in both synchronous and asynchronous
 * packet processing environments with no overhead in both cases.
 *
 * The rte_eth_rx_burst() function does not provide any error
 * notification to avoid the corresponding overhead. As a hint, the
 * upper-level application might check the status of the device link once
 * being systematically returned a 0 value for a given number of tries.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param queue_id
 *   The index of the receive queue from which to retrieve input packets.
 *   The value must be in the range [0, nb_rx_queue - 1] previously supplied
 *   to rte_eth_dev_configure().
 * @param rx_pkts
 *   The address of an array of pointers to *rte_mbuf* structures that
 *   must be large enough to store *nb_pkts* pointers in it.
 * @param nb_pkts
 *   The maximum number of packets to retrieve.
 * @return
 *   The number of packets actually retrieved, which is the number
 *   of pointers to *rte_mbuf* structures effectively supplied to the
 *   *rx_pkts* array.
 */
static inline uint16_t
rte_eth_rx_burst(uint16_t port_id, uint16_t queue_id,
		 struct rte_mbuf **rx_pkts, const uint16_t nb_pkts)
{
	struct rte_eth_dev *dev = &rte_eth_devices[port_id];
	uint16_t nb_rx;

#ifdef RTE_LIBRTE_ETHDEV_DEBUG
	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, 0);
	RTE_FUNC_PTR_OR_ERR_RET(*dev->rx_pkt_burst, 0);

	if (queue_id >= dev->data->nb_rx_queues) {
		RTE_ETHDEV_LOG(ERR, "Invalid RX queue_id=%u\n", queue_id);
		return 0;
	}
#endif
	nb_rx = (*dev->rx_pkt_burst)(dev->data->rx_queues[queue_id],
				     rx_pkts, nb_pkts);

#ifdef RTE_ETHDEV_RXTX_CALLBACKS
	if (unlikely(dev->post_rx_burst_cbs[queue_id] != NULL)) {
		struct rte_eth_rxtx_callback *cb =
				dev->post_rx_burst_cbs[queue_id];

		do {
			nb_rx = cb->fn.rx(port_id, queue_id, rx_pkts, nb_rx,
						nb_pkts, cb->param);
			cb = cb->next;
		} while (cb != NULL);
	}
#endif

	return nb_rx;
}

/**
 * Get the number of used descriptors of a rx queue
 *
 * @param port_id
 *  The port identifier of the Ethernet device.
 * @param queue_id
 *  The queue id on the specific port.
 * @return
 *  The number of used descriptors in the specific queue, or:
 *     (-EINVAL) if *port_id* or *queue_id* is invalid
 *     (-ENOTSUP) if the device does not support this function
 */
static inline int
rte_eth_rx_queue_count(uint16_t port_id, uint16_t queue_id)
{
	struct rte_eth_dev *dev;

	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
	dev = &rte_eth_devices[port_id];
	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_queue_count, -ENOTSUP);
	if (queue_id >= dev->data->nb_rx_queues)
		return -EINVAL;

	return (int)(*dev->dev_ops->rx_queue_count)(dev, queue_id);
}

/**
 * Check if the DD bit of the specific RX descriptor in the queue has been set
 *
 * @param port_id
 *  The port identifier of the Ethernet device.
 * @param queue_id
 *  The queue id on the specific port.
 * @param offset
 *  The offset of the descriptor ID from tail.
 * @return
 *  - (1) if the specific DD bit is set.
 *  - (0) if the specific DD bit is not set.
 *  - (-ENODEV) if *port_id* invalid.
 *  - (-ENOTSUP) if the device does not support this function
 */
static inline int
rte_eth_rx_descriptor_done(uint16_t port_id, uint16_t queue_id, uint16_t offset)
{
	struct rte_eth_dev *dev = &rte_eth_devices[port_id];
	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV);
	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_descriptor_done, -ENOTSUP);
	return (*dev->dev_ops->rx_descriptor_done)( \
		dev->data->rx_queues[queue_id], offset);
}

#define RTE_ETH_RX_DESC_AVAIL    0 /**< Desc available for hw. */
#define RTE_ETH_RX_DESC_DONE     1 /**< Desc done, filled by hw. */
#define RTE_ETH_RX_DESC_UNAVAIL  2 /**< Desc used by driver or hw. */

/**
 * Check the status of a Rx descriptor in the queue
 *
 * It should be called in a similar context than the Rx function:
 * - on a dataplane core
 * - not concurrently on the same queue
 *
 * Since it's a dataplane function, no check is performed on port_id and
 * queue_id. The caller must therefore ensure that the port is enabled
 * and the queue is configured and running.
 *
 * Note: accessing to a random descriptor in the ring may trigger cache
 * misses and have a performance impact.
 *
 * @param port_id
 *  A valid port identifier of the Ethernet device which.
 * @param queue_id
 *  A valid Rx queue identifier on this port.
 * @param offset
 *  The offset of the descriptor starting from tail (0 is the next
 *  packet to be received by the driver).
 *
 * @return
 *  - (RTE_ETH_RX_DESC_AVAIL): Descriptor is available for the hardware to
 *    receive a packet.
 *  - (RTE_ETH_RX_DESC_DONE): Descriptor is done, it is filled by hw, but
 *    not yet processed by the driver (i.e. in the receive queue).
 *  - (RTE_ETH_RX_DESC_UNAVAIL): Descriptor is unavailable, either hold by
 *    the driver and not yet returned to hw, or reserved by the hw.
 *  - (-EINVAL) bad descriptor offset.
 *  - (-ENOTSUP) if the device does not support this function.
 *  - (-ENODEV) bad port or queue (only if compiled with debug).
 */
static inline int
rte_eth_rx_descriptor_status(uint16_t port_id, uint16_t queue_id,
	uint16_t offset)
{
	struct rte_eth_dev *dev;
	void *rxq;

#ifdef RTE_LIBRTE_ETHDEV_DEBUG
	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV);
#endif
	dev = &rte_eth_devices[port_id];
#ifdef RTE_LIBRTE_ETHDEV_DEBUG
	if (queue_id >= dev->data->nb_rx_queues)
		return -ENODEV;
#endif
	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->rx_descriptor_status, -ENOTSUP);
	rxq = dev->data->rx_queues[queue_id];

	return (*dev->dev_ops->rx_descriptor_status)(rxq, offset);
}

#define RTE_ETH_TX_DESC_FULL    0 /**< Desc filled for hw, waiting xmit. */
#define RTE_ETH_TX_DESC_DONE    1 /**< Desc done, packet is transmitted. */
#define RTE_ETH_TX_DESC_UNAVAIL 2 /**< Desc used by driver or hw. */

/**
 * Check the status of a Tx descriptor in the queue.
 *
 * It should be called in a similar context than the Tx function:
 * - on a dataplane core
 * - not concurrently on the same queue
 *
 * Since it's a dataplane function, no check is performed on port_id and
 * queue_id. The caller must therefore ensure that the port is enabled
 * and the queue is configured and running.
 *
 * Note: accessing to a random descriptor in the ring may trigger cache
 * misses and have a performance impact.
 *
 * @param port_id
 *  A valid port identifier of the Ethernet device which.
 * @param queue_id
 *  A valid Tx queue identifier on this port.
 * @param offset
 *  The offset of the descriptor starting from tail (0 is the place where
 *  the next packet will be send).
 *
 * @return
 *  - (RTE_ETH_TX_DESC_FULL) Descriptor is being processed by the hw, i.e.
 *    in the transmit queue.
 *  - (RTE_ETH_TX_DESC_DONE) Hardware is done with this descriptor, it can
 *    be reused by the driver.
 *  - (RTE_ETH_TX_DESC_UNAVAIL): Descriptor is unavailable, reserved by the
 *    driver or the hardware.
 *  - (-EINVAL) bad descriptor offset.
 *  - (-ENOTSUP) if the device does not support this function.
 *  - (-ENODEV) bad port or queue (only if compiled with debug).
 */
static inline int rte_eth_tx_descriptor_status(uint16_t port_id,
	uint16_t queue_id, uint16_t offset)
{
	struct rte_eth_dev *dev;
	void *txq;

#ifdef RTE_LIBRTE_ETHDEV_DEBUG
	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV);
#endif
	dev = &rte_eth_devices[port_id];
#ifdef RTE_LIBRTE_ETHDEV_DEBUG
	if (queue_id >= dev->data->nb_tx_queues)
		return -ENODEV;
#endif
	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->tx_descriptor_status, -ENOTSUP);
	txq = dev->data->tx_queues[queue_id];

	return (*dev->dev_ops->tx_descriptor_status)(txq, offset);
}

/**
 * Send a burst of output packets on a transmit queue of an Ethernet device.
 *
 * The rte_eth_tx_burst() function is invoked to transmit output packets
 * on the output queue *queue_id* of the Ethernet device designated by its
 * *port_id*.
 * The *nb_pkts* parameter is the number of packets to send which are
 * supplied in the *tx_pkts* array of *rte_mbuf* structures, each of them
 * allocated from a pool created with rte_pktmbuf_pool_create().
 * The rte_eth_tx_burst() function loops, sending *nb_pkts* packets,
 * up to the number of transmit descriptors available in the TX ring of the
 * transmit queue.
 * For each packet to send, the rte_eth_tx_burst() function performs
 * the following operations:
 *
 * - Pick up the next available descriptor in the transmit ring.
 *
 * - Free the network buffer previously sent with that descriptor, if any.
 *
 * - Initialize the transmit descriptor with the information provided
 *   in the *rte_mbuf data structure.
 *
 * In the case of a segmented packet composed of a list of *rte_mbuf* buffers,
 * the rte_eth_tx_burst() function uses several transmit descriptors
 * of the ring.
 *
 * The rte_eth_tx_burst() function returns the number of packets it
 * actually sent. A return value equal to *nb_pkts* means that all packets
 * have been sent, and this is likely to signify that other output packets
 * could be immediately transmitted again. Applications that implement a
 * "send as many packets to transmit as possible" policy can check this
 * specific case and keep invoking the rte_eth_tx_burst() function until
 * a value less than *nb_pkts* is returned.
 *
 * It is the responsibility of the rte_eth_tx_burst() function to
 * transparently free the memory buffers of packets previously sent.
 * This feature is driven by the *tx_free_thresh* value supplied to the
 * rte_eth_dev_configure() function at device configuration time.
 * When the number of free TX descriptors drops below this threshold, the
 * rte_eth_tx_burst() function must [attempt to] free the *rte_mbuf*  buffers
 * of those packets whose transmission was effectively completed.
 *
 * If the PMD is DEV_TX_OFFLOAD_MT_LOCKFREE capable, multiple threads can
 * invoke this function concurrently on the same tx queue without SW lock.
 * @see rte_eth_dev_info_get, struct rte_eth_txconf::offloads
 *
 * @see rte_eth_tx_prepare to perform some prior checks or adjustments
 * for offloads.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param queue_id
 *   The index of the transmit queue through which output packets must be
 *   sent.
 *   The value must be in the range [0, nb_tx_queue - 1] previously supplied
 *   to rte_eth_dev_configure().
 * @param tx_pkts
 *   The address of an array of *nb_pkts* pointers to *rte_mbuf* structures
 *   which contain the output packets.
 * @param nb_pkts
 *   The maximum number of packets to transmit.
 * @return
 *   The number of output packets actually stored in transmit descriptors of
 *   the transmit ring. The return value can be less than the value of the
 *   *tx_pkts* parameter when the transmit ring is full or has been filled up.
 */
static inline uint16_t
rte_eth_tx_burst(uint16_t port_id, uint16_t queue_id,
		 struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
{
	struct rte_eth_dev *dev = &rte_eth_devices[port_id];

#ifdef RTE_LIBRTE_ETHDEV_DEBUG
	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, 0);
	RTE_FUNC_PTR_OR_ERR_RET(*dev->tx_pkt_burst, 0);

	if (queue_id >= dev->data->nb_tx_queues) {
		RTE_ETHDEV_LOG(ERR, "Invalid TX queue_id=%u\n", queue_id);
		return 0;
	}
#endif

#ifdef RTE_ETHDEV_RXTX_CALLBACKS
	struct rte_eth_rxtx_callback *cb = dev->pre_tx_burst_cbs[queue_id];

	if (unlikely(cb != NULL)) {
		do {
			nb_pkts = cb->fn.tx(port_id, queue_id, tx_pkts, nb_pkts,
					cb->param);
			cb = cb->next;
		} while (cb != NULL);
	}
#endif

	return (*dev->tx_pkt_burst)(dev->data->tx_queues[queue_id], tx_pkts, nb_pkts);
}

/**
 * @warning
 * @b EXPERIMENTAL: this API may change without prior notice
 *
 * Process a burst of output packets on a transmit queue of an Ethernet device.
 *
 * The rte_eth_tx_prepare() function is invoked to prepare output packets to be
 * transmitted on the output queue *queue_id* of the Ethernet device designated
 * by its *port_id*.
 * The *nb_pkts* parameter is the number of packets to be prepared which are
 * supplied in the *tx_pkts* array of *rte_mbuf* structures, each of them
 * allocated from a pool created with rte_pktmbuf_pool_create().
 * For each packet to send, the rte_eth_tx_prepare() function performs
 * the following operations:
 *
 * - Check if packet meets devices requirements for tx offloads.
 *
 * - Check limitations about number of segments.
 *
 * - Check additional requirements when debug is enabled.
 *
 * - Update and/or reset required checksums when tx offload is set for packet.
 *
 * Since this function can modify packet data, provided mbufs must be safely
 * writable (e.g. modified data cannot be in shared segment).
 *
 * The rte_eth_tx_prepare() function returns the number of packets ready to be
 * sent. A return value equal to *nb_pkts* means that all packets are valid and
 * ready to be sent, otherwise stops processing on the first invalid packet and
 * leaves the rest packets untouched.
 *
 * When this functionality is not implemented in the driver, all packets are
 * are returned untouched.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 *   The value must be a valid port id.
 * @param queue_id
 *   The index of the transmit queue through which output packets must be
 *   sent.
 *   The value must be in the range [0, nb_tx_queue - 1] previously supplied
 *   to rte_eth_dev_configure().
 * @param tx_pkts
 *   The address of an array of *nb_pkts* pointers to *rte_mbuf* structures
 *   which contain the output packets.
 * @param nb_pkts
 *   The maximum number of packets to process.
 * @return
 *   The number of packets correct and ready to be sent. The return value can be
 *   less than the value of the *tx_pkts* parameter when some packet doesn't
 *   meet devices requirements with rte_errno set appropriately:
 *   - -EINVAL: offload flags are not correctly set
 *   - -ENOTSUP: the offload feature is not supported by the hardware
 *
 */

#ifndef RTE_ETHDEV_TX_PREPARE_NOOP

static inline uint16_t
rte_eth_tx_prepare(uint16_t port_id, uint16_t queue_id,
		struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
{
	struct rte_eth_dev *dev;

#ifdef RTE_LIBRTE_ETHDEV_DEBUG
	if (!rte_eth_dev_is_valid_port(port_id)) {
		RTE_ETHDEV_LOG(ERR, "Invalid TX port_id=%u\n", port_id);
		rte_errno = -EINVAL;
		return 0;
	}
#endif

	dev = &rte_eth_devices[port_id];

#ifdef RTE_LIBRTE_ETHDEV_DEBUG
	if (queue_id >= dev->data->nb_tx_queues) {
		RTE_ETHDEV_LOG(ERR, "Invalid TX queue_id=%u\n", queue_id);
		rte_errno = -EINVAL;
		return 0;
	}
#endif

	if (!dev->tx_pkt_prepare)
		return nb_pkts;

	return (*dev->tx_pkt_prepare)(dev->data->tx_queues[queue_id],
			tx_pkts, nb_pkts);
}

#else

/*
 * Native NOOP operation for compilation targets which doesn't require any
 * preparations steps, and functional NOOP may introduce unnecessary performance
 * drop.
 *
 * Generally this is not a good idea to turn it on globally and didn't should
 * be used if behavior of tx_preparation can change.
 */

static inline uint16_t
rte_eth_tx_prepare(__rte_unused uint16_t port_id,
		__rte_unused uint16_t queue_id,
		__rte_unused struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
{
	return nb_pkts;
}

#endif

/**
 * Send any packets queued up for transmission on a port and HW queue
 *
 * This causes an explicit flush of packets previously buffered via the
 * rte_eth_tx_buffer() function. It returns the number of packets successfully
 * sent to the NIC, and calls the error callback for any unsent packets. Unless
 * explicitly set up otherwise, the default callback simply frees the unsent
 * packets back to the owning mempool.
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param queue_id
 *   The index of the transmit queue through which output packets must be
 *   sent.
 *   The value must be in the range [0, nb_tx_queue - 1] previously supplied
 *   to rte_eth_dev_configure().
 * @param buffer
 *   Buffer of packets to be transmit.
 * @return
 *   The number of packets successfully sent to the Ethernet device. The error
 *   callback is called for any packets which could not be sent.
 */
static inline uint16_t
rte_eth_tx_buffer_flush(uint16_t port_id, uint16_t queue_id,
		struct rte_eth_dev_tx_buffer *buffer)
{
	uint16_t sent;
	uint16_t to_send = buffer->length;

	if (to_send == 0)
		return 0;

	sent = rte_eth_tx_burst(port_id, queue_id, buffer->pkts, to_send);

	buffer->length = 0;

	/* All packets sent, or to be dealt with by callback below */
	if (unlikely(sent != to_send))
		buffer->error_callback(&buffer->pkts[sent],
				       (uint16_t)(to_send - sent),
				       buffer->error_userdata);

	return sent;
}

/**
 * Buffer a single packet for future transmission on a port and queue
 *
 * This function takes a single mbuf/packet and buffers it for later
 * transmission on the particular port and queue specified. Once the buffer is
 * full of packets, an attempt will be made to transmit all the buffered
 * packets. In case of error, where not all packets can be transmitted, a
 * callback is called with the unsent packets as a parameter. If no callback
 * is explicitly set up, the unsent packets are just freed back to the owning
 * mempool. The function returns the number of packets actually sent i.e.
 * 0 if no buffer flush occurred, otherwise the number of packets successfully
 * flushed
 *
 * @param port_id
 *   The port identifier of the Ethernet device.
 * @param queue_id
 *   The index of the transmit queue through which output packets must be
 *   sent.
 *   The value must be in the range [0, nb_tx_queue - 1] previously supplied
 *   to rte_eth_dev_configure().
 * @param buffer
 *   Buffer used to collect packets to be sent.
 * @param tx_pkt
 *   Pointer to the packet mbuf to be sent.
 * @return
 *   0 = packet has been buffered for later transmission
 *   N > 0 = packet has been buffered, and the buffer was subsequently flushed,
 *     causing N packets to be sent, and the error callback to be called for
 *     the rest.
 */
static __rte_always_inline uint16_t
rte_eth_tx_buffer(uint16_t port_id, uint16_t queue_id,
		struct rte_eth_dev_tx_buffer *buffer, struct rte_mbuf *tx_pkt)
{
	buffer->pkts[buffer->length++] = tx_pkt;
	if (buffer->length < buffer->size)
		return 0;

	return rte_eth_tx_buffer_flush(port_id, queue_id, buffer);
}

#ifdef __cplusplus
}
#endif

#endif /* _RTE_ETHDEV_H_ */
-------------- next part --------------
/*
 * Copyright (c) 2014, 2015, 2016, 2017 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <config.h>
#include "netdev-dpdk.h"

#include <string.h>
#include <signal.h>
#include <stdlib.h>
#include <errno.h>
#include <unistd.h>
#include <linux/virtio_net.h>
#include <sys/socket.h>
#include <linux/if.h>

#include <rte_bus_pci.h>
#include <rte_config.h>
#include <rte_cycles.h>
#include <rte_errno.h>
#include <rte_eth_ring.h>
#include <rte_ethdev.h>
#include <rte_malloc.h>
#include <rte_mbuf.h>
#include <rte_meter.h>
#include <rte_pci.h>
#include <rte_vhost.h>
#include <rte_version.h>
#include <rte_flow.h>

#include "cmap.h"
#include "dirs.h"
#include "dp-packet.h"
#include "dpdk.h"
#include "dpif-netdev.h"
#include "fatal-signal.h"
#include "netdev-provider.h"
#include "netdev-vport.h"
#include "odp-util.h"
#include "openvswitch/dynamic-string.h"
#include "openvswitch/list.h"
#include "openvswitch/ofp-print.h"
#include "openvswitch/vlog.h"
#include "openvswitch/match.h"
#include "ovs-numa.h"
#include "ovs-thread.h"
#include "ovs-rcu.h"
#include "packets.h"
#include "openvswitch/shash.h"
#include "smap.h"
#include "sset.h"
#include "unaligned.h"
#include "timeval.h"
#include "uuid.h"
#include "unixctl.h"

enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};

VLOG_DEFINE_THIS_MODULE(netdev_dpdk);
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);

#define DPDK_PORT_WATCHDOG_INTERVAL 5

#define OVS_CACHE_LINE_SIZE CACHE_LINE_SIZE
#define OVS_VPORT_DPDK "ovs_dpdk"

/*
 * need to reserve tons of extra space in the mbufs so we can align the
 * DMA addresses to 4KB.
 * The minimum mbuf size is limited to avoid scatter behaviour and drop in
 * performance for standard Ethernet MTU.
 */
#define ETHER_HDR_MAX_LEN           (ETHER_HDR_LEN + ETHER_CRC_LEN \
                                     + (2 * VLAN_HEADER_LEN))
#define MTU_TO_FRAME_LEN(mtu)       ((mtu) + ETHER_HDR_LEN + ETHER_CRC_LEN)
#define MTU_TO_MAX_FRAME_LEN(mtu)   ((mtu) + ETHER_HDR_MAX_LEN)
#define FRAME_LEN_TO_MTU(frame_len) ((frame_len)                    \
                                     - ETHER_HDR_LEN - ETHER_CRC_LEN)
#define MBUF_SIZE(mtu)              ROUND_UP((MTU_TO_MAX_FRAME_LEN(mtu) \
                                             + sizeof(struct dp_packet) \
                                             + RTE_PKTMBUF_HEADROOM),   \
                                             RTE_CACHE_LINE_SIZE)
#define NETDEV_DPDK_MBUF_ALIGN      1024
#define NETDEV_DPDK_MAX_PKT_LEN     9728

/* Max and min number of packets in the mempool. OVS tries to allocate a
 * mempool with MAX_NB_MBUF: if this fails (because the system doesn't have
 * enough hugepages) we keep halving the number until the allocation succeeds
 * or we reach MIN_NB_MBUF */

#define MAX_NB_MBUF          (4096 * 64)
#define MIN_NB_MBUF          (4096 * 4)
#define MP_CACHE_SZ          RTE_MEMPOOL_CACHE_MAX_SIZE

/* MAX_NB_MBUF can be divided by 2 many times, until MIN_NB_MBUF */
BUILD_ASSERT_DECL(MAX_NB_MBUF % ROUND_DOWN_POW2(MAX_NB_MBUF / MIN_NB_MBUF)
                  == 0);

/* The smallest possible NB_MBUF that we're going to try should be a multiple
 * of MP_CACHE_SZ. This is advised by DPDK documentation. */
BUILD_ASSERT_DECL((MAX_NB_MBUF / ROUND_DOWN_POW2(MAX_NB_MBUF / MIN_NB_MBUF))
                  % MP_CACHE_SZ == 0);

/*
 * DPDK XSTATS Counter names definition
 */
#define XSTAT_RX_64_PACKETS              "rx_size_64_packets"
#define XSTAT_RX_65_TO_127_PACKETS       "rx_size_65_to_127_packets"
#define XSTAT_RX_128_TO_255_PACKETS      "rx_size_128_to_255_packets"
#define XSTAT_RX_256_TO_511_PACKETS      "rx_size_256_to_511_packets"
#define XSTAT_RX_512_TO_1023_PACKETS     "rx_size_512_to_1023_packets"
#define XSTAT_RX_1024_TO_1522_PACKETS    "rx_size_1024_to_1522_packets"
#define XSTAT_RX_1523_TO_MAX_PACKETS     "rx_size_1523_to_max_packets"

#define XSTAT_TX_64_PACKETS              "tx_size_64_packets"
#define XSTAT_TX_65_TO_127_PACKETS       "tx_size_65_to_127_packets"
#define XSTAT_TX_128_TO_255_PACKETS      "tx_size_128_to_255_packets"
#define XSTAT_TX_256_TO_511_PACKETS      "tx_size_256_to_511_packets"
#define XSTAT_TX_512_TO_1023_PACKETS     "tx_size_512_to_1023_packets"
#define XSTAT_TX_1024_TO_1522_PACKETS    "tx_size_1024_to_1522_packets"
#define XSTAT_TX_1523_TO_MAX_PACKETS     "tx_size_1523_to_max_packets"

#define XSTAT_RX_MULTICAST_PACKETS       "rx_multicast_packets"
#define XSTAT_TX_MULTICAST_PACKETS       "tx_multicast_packets"
#define XSTAT_RX_BROADCAST_PACKETS       "rx_broadcast_packets"
#define XSTAT_TX_BROADCAST_PACKETS       "tx_broadcast_packets"
#define XSTAT_RX_UNDERSIZED_ERRORS       "rx_undersized_errors"
#define XSTAT_RX_OVERSIZE_ERRORS         "rx_oversize_errors"
#define XSTAT_RX_FRAGMENTED_ERRORS       "rx_fragmented_errors"
#define XSTAT_RX_JABBER_ERRORS           "rx_jabber_errors"

#define SOCKET0              0

/* Default size of Physical NIC RXQ */
#define NIC_PORT_DEFAULT_RXQ_SIZE 2048
/* Default size of Physical NIC TXQ */
#define NIC_PORT_DEFAULT_TXQ_SIZE 2048
/* Maximum size of Physical NIC Queues */
#define NIC_PORT_MAX_Q_SIZE 4096

#define OVS_VHOST_MAX_QUEUE_NUM 1024  /* Maximum number of vHost TX queues. */
#define OVS_VHOST_QUEUE_MAP_UNKNOWN (-1) /* Mapping not initialized. */
#define OVS_VHOST_QUEUE_DISABLED    (-2) /* Queue was disabled by guest and not
                                          * yet mapped to another queue. */

#define DPDK_ETH_PORT_ID_INVALID    RTE_MAX_ETHPORTS

/* DPDK library uses uint16_t for port_id. */
typedef uint16_t dpdk_port_t;
#define DPDK_PORT_ID_FMT "%"PRIu16

#define VHOST_ENQ_RETRY_NUM 8
#define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)

static const struct rte_eth_conf port_conf = {
    .rxmode = {
        .mq_mode = ETH_MQ_RX_RSS,
        .split_hdr_size = 0,
        .header_split   = 0, /* Header Split disabled */
        .hw_ip_checksum = 0, /* IP checksum offload disabled */
        .hw_vlan_filter = 0, /* VLAN filtering disabled */
        .jumbo_frame    = 0, /* Jumbo Frame Support disabled */
        .hw_strip_crc   = 0,
    },
    .rx_adv_conf = {
        .rss_conf = {
            .rss_key = NULL,
            .rss_hf = ETH_RSS_IP | ETH_RSS_UDP | ETH_RSS_TCP,
        },
    },
    .txmode = {
        .mq_mode = ETH_MQ_TX_NONE,
    },
};

/*
 * A mapping from ufid to dpdk rte_flow.
 */
static struct cmap ufid_to_rte_flow = CMAP_INITIALIZER;

struct ufid_to_rte_flow_data {
    struct cmap_node node;
    ovs_u128 ufid;
    struct rte_flow *rte_flow;
};

/*
 * These callbacks allow virtio-net devices to be added to vhost ports when
 * configuration has been fully completed.
 */
static int new_device(int vid);
static void destroy_device(int vid);
static int vring_state_changed(int vid, uint16_t queue_id, int enable);
static const struct vhost_device_ops virtio_net_device_ops =
{
    .new_device =  new_device,
    .destroy_device = destroy_device,
    .vring_state_changed = vring_state_changed,
    .features_changed = NULL
};

enum { DPDK_RING_SIZE = 256 };
BUILD_ASSERT_DECL(IS_POW2(DPDK_RING_SIZE));
enum { DRAIN_TSC = 200000ULL };

enum dpdk_dev_type {
    DPDK_DEV_ETH = 0,
    DPDK_DEV_VHOST = 1,
};

/* Quality of Service */

/* An instance of a QoS configuration.  Always associated with a particular
 * network device.
 *
 * Each QoS implementation subclasses this with whatever additional data it
 * needs.
 */
struct qos_conf {
    const struct dpdk_qos_ops *ops;
    rte_spinlock_t lock;
};

/* A particular implementation of dpdk QoS operations.
 *
 * The functions below return 0 if successful or a positive errno value on
 * failure, except where otherwise noted. All of them must be provided, except
 * where otherwise noted.
 */
struct dpdk_qos_ops {

    /* Name of the QoS type */
    const char *qos_name;

    /* Called to construct a qos_conf object. The implementation should make
     * the appropriate calls to configure QoS according to 'details'.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function must return 0 if and only if it sets '*conf' to an
     * initialized 'struct qos_conf'.
     *
     * For all QoS implementations it should always be non-null.
     */
    int (*qos_construct)(const struct smap *details, struct qos_conf **conf);

    /* Destroys the data structures allocated by the implementation as part of
     * 'qos_conf'.
     *
     * For all QoS implementations it should always be non-null.
     */
    void (*qos_destruct)(struct qos_conf *conf);

    /* Retrieves details of 'conf' configuration into 'details'.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     */
    int (*qos_get)(const struct qos_conf *conf, struct smap *details);

    /* Returns true if 'conf' is already configured according to 'details'.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * For all QoS implementations it should always be non-null.
     */
    bool (*qos_is_equal)(const struct qos_conf *conf,
                         const struct smap *details);

    /* Modify an array of rte_mbufs. The modification is specific to
     * each qos implementation.
     *
     * The function should take and array of mbufs and an int representing
     * the current number of mbufs present in the array.
     *
     * After the function has performed a qos modification to the array of
     * mbufs it returns an int representing the number of mbufs now present in
     * the array. This value is can then be passed to the port send function
     * along with the modified array for transmission.
     *
     * For all QoS implementations it should always be non-null.
     */
    int (*qos_run)(struct qos_conf *qos_conf, struct rte_mbuf **pkts,
                   int pkt_cnt, bool should_steal);
};

/* dpdk_qos_ops for each type of user space QoS implementation */
static const struct dpdk_qos_ops egress_policer_ops;

/*
 * Array of dpdk_qos_ops, contains pointer to all supported QoS
 * operations.
 */
static const struct dpdk_qos_ops *const qos_confs[] = {
    &egress_policer_ops,
    NULL
};

static struct ovs_mutex dpdk_mutex = OVS_MUTEX_INITIALIZER;

/* Contains all 'struct dpdk_dev's. */
static struct ovs_list dpdk_list OVS_GUARDED_BY(dpdk_mutex)
    = OVS_LIST_INITIALIZER(&dpdk_list);

static struct ovs_mutex dpdk_mp_mutex OVS_ACQ_AFTER(dpdk_mutex)
    = OVS_MUTEX_INITIALIZER;

/* Contains all 'struct dpdk_mp's. */
static struct ovs_list dpdk_mp_list OVS_GUARDED_BY(dpdk_mp_mutex)
    = OVS_LIST_INITIALIZER(&dpdk_mp_list);

struct dpdk_mp {
     struct rte_mempool *mp;
     int mtu;
     int socket_id;
     int refcount;
     struct ovs_list list_node OVS_GUARDED_BY(dpdk_mp_mutex);
 };

/* There should be one 'struct dpdk_tx_queue' created for
 * each cpu core. */
struct dpdk_tx_queue {
    rte_spinlock_t tx_lock;        /* Protects the members and the NIC queue
                                    * from concurrent access.  It is used only
                                    * if the queue is shared among different
                                    * pmd threads (see 'concurrent_txq'). */
    int map;                       /* Mapping of configured vhost-user queues
                                    * to enabled by guest. */
};

/* dpdk has no way to remove dpdk ring ethernet devices
   so we have to keep them around once they've been created
*/

static struct ovs_list dpdk_ring_list OVS_GUARDED_BY(dpdk_mutex)
    = OVS_LIST_INITIALIZER(&dpdk_ring_list);

struct dpdk_ring {
    /* For the client rings */
    struct rte_ring *cring_tx;
    struct rte_ring *cring_rx;
    unsigned int user_port_id; /* User given port no, parsed from port name */
    dpdk_port_t eth_port_id; /* ethernet device port id */
    struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
};

struct ingress_policer {
    struct rte_meter_srtcm_params app_srtcm_params;
    struct rte_meter_srtcm in_policer;
    rte_spinlock_t policer_lock;
};

enum dpdk_hw_ol_features {
    NETDEV_RX_CHECKSUM_OFFLOAD = 1 << 0,
    NETDEV_RX_HW_CRC_STRIP = 1 << 1,
};

/*
 * In order to avoid confusion in variables names, following naming convention
 * should be used, if possible:
 *
 *     'struct netdev'          : 'netdev'
 *     'struct netdev_dpdk'     : 'dev'
 *     'struct netdev_rxq'      : 'rxq'
 *     'struct netdev_rxq_dpdk' : 'rx'
 *
 * Example:
 *     struct netdev *netdev = netdev_from_name(name);
 *     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
 *
 *  Also, 'netdev' should be used instead of 'dev->up', where 'netdev' was
 *  already defined.
 */

struct netdev_dpdk {
    PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline0,
        dpdk_port_t port_id;

        /* If true, device was attached by rte_eth_dev_attach(). */
        bool attached;
        /* If true, rte_eth_dev_start() was successfully called */
        bool started;
        struct eth_addr hwaddr;
        int mtu;
        int socket_id;
        int buf_size;
        int max_packet_len;
        enum dpdk_dev_type type;
        enum netdev_flags flags;
        int link_reset_cnt;
        char *devargs;  /* Device arguments for dpdk ports */
        struct dpdk_tx_queue *tx_q;
        struct rte_eth_link link;
    );

    PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline1,
        struct ovs_mutex mutex OVS_ACQ_AFTER(dpdk_mutex);
        struct dpdk_mp *dpdk_mp;

        /* virtio identifier for vhost devices */
        ovsrcu_index vid;

        /* True if vHost device is 'up' and has been reconfigured at least once */
        bool vhost_reconfigured;
        /* 3 pad bytes here. */
    );

    PADDED_MEMBERS(CACHE_LINE_SIZE,
        /* Identifier used to distinguish vhost devices from each other. */
        char vhost_id[PATH_MAX];
    );

    PADDED_MEMBERS(CACHE_LINE_SIZE,
        struct netdev up;
        /* In dpdk_list. */
        struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);

        /* QoS configuration and lock for the device */
        OVSRCU_TYPE(struct qos_conf *) qos_conf;

        /* Ingress Policer */
        OVSRCU_TYPE(struct ingress_policer *) ingress_policer;
        uint32_t policer_rate;
        uint32_t policer_burst;
    );

    PADDED_MEMBERS(CACHE_LINE_SIZE,
        struct netdev_stats stats;
        /* Protects stats */
        rte_spinlock_t stats_lock;
        /* 44 pad bytes here. */
    );

    PADDED_MEMBERS(CACHE_LINE_SIZE,
        /* The following properties cannot be changed when a device is running,
         * so we remember the request and update them next time
         * netdev_dpdk*_reconfigure() is called */
        int requested_mtu;
        int requested_n_txq;
        int requested_n_rxq;
        int requested_rxq_size;
        int requested_txq_size;

        /* Number of rx/tx descriptors for physical devices */
        int rxq_size;
        int txq_size;

        /* Socket ID detected when vHost device is brought up */
        int requested_socket_id;

        /* Denotes whether vHost port is client/server mode */
        uint64_t vhost_driver_flags;

        /* DPDK-ETH Flow control */
        struct rte_eth_fc_conf fc_conf;

        /* DPDK-ETH hardware offload features,
         * from the enum set 'dpdk_hw_ol_features' */
        uint32_t hw_ol_features;

        /* Properties for link state change detection mode.
         * If lsc_interrupt_mode is set to false, poll mode is used,
         * otherwise interrupt mode is used. */
        bool requested_lsc_interrupt_mode;
        bool lsc_interrupt_mode;
    );

    PADDED_MEMBERS(CACHE_LINE_SIZE,
        /* Names of all XSTATS counters */
        struct rte_eth_xstat_name *rte_xstats_names;
        int rte_xstats_names_size;
        int rte_xstats_ids_size;
        uint64_t *rte_xstats_ids;
    );
};

struct netdev_rxq_dpdk {
    struct netdev_rxq up;
    dpdk_port_t port_id;
};

static void netdev_dpdk_destruct(struct netdev *netdev);
static void netdev_dpdk_vhost_destruct(struct netdev *netdev);

static void netdev_dpdk_clear_xstats(struct netdev_dpdk *dev);

int netdev_dpdk_get_vid(const struct netdev_dpdk *dev);

struct ingress_policer *
netdev_dpdk_get_ingress_policer(const struct netdev_dpdk *dev);

static bool
is_dpdk_class(const struct netdev_class *class)
{
    return class->destruct == netdev_dpdk_destruct
           || class->destruct == netdev_dpdk_vhost_destruct;
}

/* DPDK NIC drivers allocate RX buffers at a particular granularity, typically
 * aligned at 1k or less. If a declared mbuf size is not a multiple of this
 * value, insufficient buffers are allocated to accomodate the packet in its
 * entirety. Furthermore, certain drivers need to ensure that there is also
 * sufficient space in the Rx buffer to accommodate two VLAN tags (for QinQ
 * frames). If the RX buffer is too small, then the driver enables scatter RX
 * behaviour, which reduces performance. To prevent this, use a buffer size
 * that is closest to 'mtu', but which satisfies the aforementioned criteria.
 */
static uint32_t
dpdk_buf_size(int mtu)
{
    return ROUND_UP((MTU_TO_MAX_FRAME_LEN(mtu) + RTE_PKTMBUF_HEADROOM),
                     NETDEV_DPDK_MBUF_ALIGN);
}

/* Allocates an area of 'sz' bytes from DPDK.  The memory is zero'ed.
 *
 * Unlike xmalloc(), this function can return NULL on failure. */
static void *
dpdk_rte_mzalloc(size_t sz)
{
    return rte_zmalloc(OVS_VPORT_DPDK, sz, OVS_CACHE_LINE_SIZE);
}

void
free_dpdk_buf(struct dp_packet *p)
{
    struct rte_mbuf *pkt = (struct rte_mbuf *) p;

    rte_pktmbuf_free(pkt);
}

static void
ovs_rte_pktmbuf_init(struct rte_mempool *mp OVS_UNUSED,
                     void *opaque_arg OVS_UNUSED,
                     void *_p,
                     unsigned i OVS_UNUSED)
{
    struct rte_mbuf *pkt = _p;

    dp_packet_init_dpdk((struct dp_packet *) pkt, pkt->buf_len);
}

static int
dpdk_mp_full(const struct rte_mempool *mp) OVS_REQUIRES(dpdk_mp_mutex)
{
    /* At this point we want to know if all the mbufs are back
     * in the mempool. rte_mempool_full() is not atomic but it's
     * the best available and as we are no longer requesting mbufs
     * from the mempool, it means mbufs will not move from
     * 'mempool ring' --> 'mempool cache'. In rte_mempool_full()
     * the ring is counted before caches, so we won't get false
     * positives in this use case and we handle false negatives.
     *
     * If future implementations of rte_mempool_full() were to change
     * it could be possible for a false positive. Even that would
     * likely be ok, as there are additional checks during mempool
     * freeing but it would make things racey.
     */
    return rte_mempool_full(mp);
}

/* Free unused mempools. */
static void
dpdk_mp_sweep(void) OVS_REQUIRES(dpdk_mp_mutex)
{
    struct dpdk_mp *dmp, *next;

    LIST_FOR_EACH_SAFE (dmp, next, list_node, &dpdk_mp_list) {
        if (!dmp->refcount && dpdk_mp_full(dmp->mp)) {
            VLOG_DBG("Freeing mempool \"%s\"", dmp->mp->name);
            ovs_list_remove(&dmp->list_node);
            rte_mempool_free(dmp->mp);
            rte_free(dmp);
        }
    }
}

/* Calculating the required number of mbufs differs depending on the
 * mempool model being used. Check if per port memory is in use before
 * calculating.
 */
static uint32_t
dpdk_calculate_mbufs(struct netdev_dpdk *dev, int mtu, bool per_port_mp)
{
    uint32_t n_mbufs;

    if (!per_port_mp) {
        /* Shared memory are being used.
         * XXX: this is a really rough method of provisioning memory.
         * It's impossible to determine what the exact memory requirements are
         * when the number of ports and rxqs that utilize a particular mempool
         * can change dynamically at runtime. For now, use this rough
         * heurisitic.
         */
        if (mtu >= ETHER_MTU) {
            n_mbufs = MAX_NB_MBUF;
        } else {
            n_mbufs = MIN_NB_MBUF;
        }
    } else {
        /* Per port memory is being used.
         * XXX: rough estimation of number of mbufs required for this port:
         * <packets required to fill the device rxqs>
         * + <packets that could be stuck on other ports txqs>
         * + <packets in the pmd threads>
         * + <additional memory for corner cases>
         */
        n_mbufs = dev->requested_n_rxq * dev->requested_rxq_size
                  + dev->requested_n_txq * dev->requested_txq_size
                  + MIN(RTE_MAX_LCORE, dev->requested_n_rxq) * NETDEV_MAX_BURST
                  + MIN_NB_MBUF;
    }

    return n_mbufs;
}

static struct dpdk_mp *
dpdk_mp_create(struct netdev_dpdk *dev, int mtu, bool per_port_mp)
{
    char mp_name[RTE_MEMPOOL_NAMESIZE];
    const char *netdev_name = netdev_get_name(&dev->up);
    int socket_id = dev->requested_socket_id;
    uint32_t n_mbufs;
    uint32_t hash = hash_string(netdev_name, 0);
    struct dpdk_mp *dmp = NULL;
    int ret;

    dmp = dpdk_rte_mzalloc(sizeof *dmp);
    if (!dmp) {
        return NULL;
    }
    dmp->socket_id = socket_id;
    dmp->mtu = mtu;
    dmp->refcount = 1;

    n_mbufs = dpdk_calculate_mbufs(dev, mtu, per_port_mp);

    do {
        /* Full DPDK memory pool name must be unique and cannot be
         * longer than RTE_MEMPOOL_NAMESIZE. Note that for the shared
         * mempool case this can result in one device using a mempool
         * which references a different device in it's name. However as
         * mempool names are hashed, the device name will not be readable
         * so this is not an issue for tasks such as debugging.
         */
        ret = snprintf(mp_name, RTE_MEMPOOL_NAMESIZE,
                           "ovs%08x%02d%05d%07u",
                           hash, socket_id, mtu, n_mbufs);
        if (ret < 0 || ret >= RTE_MEMPOOL_NAMESIZE) {
            VLOG_DBG("snprintf returned %d. "
                     "Failed to generate a mempool name for \"%s\". "
                     "Hash:0x%x, socket_id: %d, mtu:%d, mbufs:%u.",
                     ret, netdev_name, hash, socket_id, mtu, n_mbufs);
            break;
        }

        VLOG_DBG("Port %s: Requesting a mempool of %u mbufs "
                  "on socket %d for %d Rx and %d Tx queues.",
                  netdev_name, n_mbufs, socket_id,
                  dev->requested_n_rxq, dev->requested_n_txq);

        dmp->mp = rte_pktmbuf_pool_create(mp_name, n_mbufs,
                                          MP_CACHE_SZ,
                                          sizeof (struct dp_packet)
                                          - sizeof (struct rte_mbuf),
                                          MBUF_SIZE(mtu)
                                          - sizeof(struct dp_packet),
                                          socket_id);

        if (dmp->mp) {
            VLOG_DBG("Allocated \"%s\" mempool with %u mbufs",
                     mp_name, n_mbufs);
            /* rte_pktmbuf_pool_create has done some initialization of the
             * rte_mbuf part of each dp_packet, while ovs_rte_pktmbuf_init
             * initializes some OVS specific fields of dp_packet.
             */
            rte_mempool_obj_iter(dmp->mp, ovs_rte_pktmbuf_init, NULL);
            return dmp;
        } else if (rte_errno == EEXIST) {
            /* A mempool with the same name already exists.  We just
             * retrieve its pointer to be returned to the caller. */
            dmp->mp = rte_mempool_lookup(mp_name);
            /* As the mempool create returned EEXIST we can expect the
             * lookup has returned a valid pointer.  If for some reason
             * that's not the case we keep track of it. */
            VLOG_DBG("A mempool with name \"%s\" already exists at %p.",
                     mp_name, dmp->mp);
            return dmp;
        } else {
            VLOG_DBG("Failed to create mempool \"%s\" with a request of "
                     "%u mbufs, retrying with %u mbufs",
                     mp_name, n_mbufs, n_mbufs / 2);
        }
    } while (!dmp->mp && rte_errno == ENOMEM && (n_mbufs /= 2) >= MIN_NB_MBUF);

    VLOG_ERR("Failed to create mempool \"%s\" with a request of %u mbufs",
             mp_name, n_mbufs);

    rte_free(dmp);
    return NULL;
}

static struct dpdk_mp *
dpdk_mp_get(struct netdev_dpdk *dev, int mtu, bool per_port_mp)
{
    struct dpdk_mp *dmp, *next;
    bool reuse = false;

    ovs_mutex_lock(&dpdk_mp_mutex);
    /* Check if shared memory is being used, if so check existing mempools
     * to see if reuse is possible. */
    if (!per_port_mp) {
        LIST_FOR_EACH (dmp, list_node, &dpdk_mp_list) {
            if (dmp->socket_id == dev->requested_socket_id
                && dmp->mtu == mtu) {
                VLOG_DBG("Reusing mempool \"%s\"", dmp->mp->name);
                dmp->refcount++;
                reuse = true;
                break;
            }
        }
    }
    /* Sweep mempools after reuse or before create. */
    dpdk_mp_sweep();

    if (!reuse) {
        dmp = dpdk_mp_create(dev, mtu, per_port_mp);
        if (dmp) {
            /* Shared memory will hit the reuse case above so will not
             * request a mempool that already exists but we need to check
             * for the EEXIST case for per port memory case. Compare the
             * mempool returned by dmp to each entry in dpdk_mp_list. If a
             * match is found, free dmp as a new entry is not required, set
             * dmp to point to the existing entry and increment the refcount
             * to avoid being freed at a later stage.
             */
            if (per_port_mp && rte_errno == EEXIST) {
                LIST_FOR_EACH (next, list_node, &dpdk_mp_list) {
                    if (dmp->mp == next->mp) {
                        rte_free(dmp);
                        dmp = next;
                        dmp->refcount++;
                    }
                }
            } else {
                ovs_list_push_back(&dpdk_mp_list, &dmp->list_node);
            }
        }
    }


    ovs_mutex_unlock(&dpdk_mp_mutex);

    return dmp;
}

/* Decrement reference to a mempool. */
static void
dpdk_mp_put(struct dpdk_mp *dmp)
{
    if (!dmp) {
        return;
    }

    ovs_mutex_lock(&dpdk_mp_mutex);
    ovs_assert(dmp->refcount);
    dmp->refcount--;
    ovs_mutex_unlock(&dpdk_mp_mutex);
}

/* Depending on the memory model being used this function tries to
 * identify and reuse an existing mempool or tries to allocate a new
 * mempool on requested_socket_id with mbuf size corresponding to the
 * requested_mtu. On success, a new configuration will be applied.
 * On error, device will be left unchanged. */
static int
netdev_dpdk_mempool_configure(struct netdev_dpdk *dev)
    OVS_REQUIRES(dev->mutex)
{
    uint32_t buf_size = dpdk_buf_size(dev->requested_mtu);
    struct dpdk_mp *dmp;
    int ret = 0;
    bool per_port_mp = dpdk_per_port_memory();

    /* With shared memory we do not need to configure a mempool if the MTU
     * and socket ID have not changed, the previous configuration is still
     * valid so return 0 */
    if (!per_port_mp && dev->mtu == dev->requested_mtu
        && dev->socket_id == dev->requested_socket_id) {
        return ret;
    }

    dmp = dpdk_mp_get(dev, FRAME_LEN_TO_MTU(buf_size), per_port_mp);
    if (!dmp) {
        VLOG_ERR("Failed to create memory pool for netdev "
                 "%s, with MTU %d on socket %d: %s\n",
                 dev->up.name, dev->requested_mtu, dev->requested_socket_id,
                 rte_strerror(rte_errno));
        ret = rte_errno;
    } else {
        /* Check for any pre-existing dpdk_mp for the device before accessing
         * the associated mempool.
         */
        if (dev->dpdk_mp != NULL) {
            /* A new MTU was requested, decrement the reference count for the
             * devices current dpdk_mp. This is required even if a pointer to
             * same dpdk_mp is returned by dpdk_mp_get. The refcount for dmp
             * has already been incremented by dpdk_mp_get at this stage so it
             * must be decremented to keep an accurate refcount for the
             * dpdk_mp.
             */
            dpdk_mp_put(dev->dpdk_mp);
        }
        dev->dpdk_mp = dmp;
        dev->mtu = dev->requested_mtu;
        dev->socket_id = dev->requested_socket_id;
        dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
    }

    return ret;
}

static void
check_link_status(struct netdev_dpdk *dev)
{
    struct rte_eth_link link;

    rte_eth_link_get_nowait(dev->port_id, &link);

    if (dev->link.link_status != link.link_status) {
        netdev_change_seq_changed(&dev->up);

        dev->link_reset_cnt++;
        dev->link = link;
        if (dev->link.link_status) {
            VLOG_DBG_RL(&rl,
                        "Port "DPDK_PORT_ID_FMT" Link Up - speed %u Mbps - %s",
                        dev->port_id, (unsigned) dev->link.link_speed,
                        (dev->link.link_duplex == ETH_LINK_FULL_DUPLEX)
                        ? "full-duplex" : "half-duplex");
        } else {
            VLOG_DBG_RL(&rl, "Port "DPDK_PORT_ID_FMT" Link Down",
                        dev->port_id);
        }
    }
}

static void *
dpdk_watchdog(void *dummy OVS_UNUSED)
{
    struct netdev_dpdk *dev;

    pthread_detach(pthread_self());

    for (;;) {
        ovs_mutex_lock(&dpdk_mutex);
        LIST_FOR_EACH (dev, list_node, &dpdk_list) {
            ovs_mutex_lock(&dev->mutex);
            if (dev->type == DPDK_DEV_ETH) {
                check_link_status(dev);
            }
            ovs_mutex_unlock(&dev->mutex);
        }
        ovs_mutex_unlock(&dpdk_mutex);
        xsleep(DPDK_PORT_WATCHDOG_INTERVAL);
    }

    return NULL;
}

static int
dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq)
{
    int diag = 0;
    int i;
    struct rte_eth_conf conf = port_conf;
    struct rte_eth_dev_info info;
    uint16_t conf_mtu;

    /* As of DPDK 17.11.1 a few PMDs require to explicitly enable
     * scatter to support jumbo RX. Checking the offload capabilities
     * is not an option as PMDs are not required yet to report
     * them. The only reliable info is the driver name and knowledge
     * (testing or code review). Listing all such PMDs feels harder
     * than highlighting the one known not to need scatter */
    if (dev->mtu > ETHER_MTU) {
        rte_eth_dev_info_get(dev->port_id, &info);
        if (strncmp(info.driver_name, "net_nfp", 7)) {
            conf.rxmode.enable_scatter = 1;
        }
    }

    conf.intr_conf.lsc = dev->lsc_interrupt_mode;
    conf.rxmode.hw_ip_checksum = (dev->hw_ol_features &
                                  NETDEV_RX_CHECKSUM_OFFLOAD) != 0;

    if (dev->hw_ol_features & NETDEV_RX_HW_CRC_STRIP) {
        conf.rxmode.hw_strip_crc = 1;
    }

    /* A device may report more queues than it makes available (this has
     * been observed for Intel xl710, which reserves some of them for
     * SRIOV):  rte_eth_*_queue_setup will fail if a queue is not
     * available.  When this happens we can retry the configuration
     * and request less queues */
    while (n_rxq && n_txq) {
        if (diag) {
            VLOG_INFO("Retrying setup with (rxq:%d txq:%d)", n_rxq, n_txq);
        }

        diag = rte_eth_dev_configure(dev->port_id, n_rxq, n_txq, &conf);
        if (diag) {
            VLOG_WARN("Interface %s eth_dev setup error %s\n",
                      dev->up.name, rte_strerror(-diag));
            break;
        }

        diag = rte_eth_dev_set_mtu(dev->port_id, dev->mtu);
        if (diag) {
            /* A device may not support rte_eth_dev_set_mtu, in this case
             * flag a warning to the user and include the devices configured
             * MTU value that will be used instead. */
            if (-ENOTSUP == diag) {
                rte_eth_dev_get_mtu(dev->port_id, &conf_mtu);
                VLOG_WARN("Interface %s does not support MTU configuration, "
                          "max packet size supported is %"PRIu16".",
                          dev->up.name, conf_mtu);
            } else {
                VLOG_ERR("Interface %s MTU (%d) setup error: %s",
                         dev->up.name, dev->mtu, rte_strerror(-diag));
                break;
            }
        }

        for (i = 0; i < n_txq; i++) {
            diag = rte_eth_tx_queue_setup(dev->port_id, i, dev->txq_size,
                                          dev->socket_id, NULL);
            if (diag) {
                VLOG_INFO("Interface %s unable to setup txq(%d): %s",
                          dev->up.name, i, rte_strerror(-diag));
                break;
            }
        }

        if (i != n_txq) {
            /* Retry with less tx queues */
            n_txq = i;
            continue;
        }

        for (i = 0; i < n_rxq; i++) {
            diag = rte_eth_rx_queue_setup(dev->port_id, i, dev->rxq_size,
                                          dev->socket_id, NULL,
                                          dev->dpdk_mp->mp);
            if (diag) {
                VLOG_INFO("Interface %s unable to setup rxq(%d): %s",
                          dev->up.name, i, rte_strerror(-diag));
                break;
            }
        }

        if (i != n_rxq) {
            /* Retry with less rx queues */
            n_rxq = i;
            continue;
        }

        dev->up.n_rxq = n_rxq;
        dev->up.n_txq = n_txq;

        return 0;
    }

    return diag;
}

static void
dpdk_eth_flow_ctrl_setup(struct netdev_dpdk *dev) OVS_REQUIRES(dev->mutex)
{
    if (rte_eth_dev_flow_ctrl_set(dev->port_id, &dev->fc_conf)) {
        VLOG_WARN("Failed to enable flow control on device "DPDK_PORT_ID_FMT,
                  dev->port_id);
    }
}

static int
dpdk_eth_dev_init(struct netdev_dpdk *dev)
    OVS_REQUIRES(dev->mutex)
{
    struct rte_pktmbuf_pool_private *mbp_priv;
    struct rte_eth_dev_info info;
    struct ether_addr eth_addr;
    int diag;
    int n_rxq, n_txq;
    uint32_t rx_chksm_offload_capa = DEV_RX_OFFLOAD_UDP_CKSUM |
                                     DEV_RX_OFFLOAD_TCP_CKSUM |
                                     DEV_RX_OFFLOAD_IPV4_CKSUM;

    rte_eth_dev_info_get(dev->port_id, &info);

    if (strstr(info.driver_name, "vf") != NULL) {
        VLOG_INFO("Virtual function detected, HW_CRC_STRIP will be enabled");
        dev->hw_ol_features |= NETDEV_RX_HW_CRC_STRIP;
    } else {
        dev->hw_ol_features &= ~NETDEV_RX_HW_CRC_STRIP;
    }

    if ((info.rx_offload_capa & rx_chksm_offload_capa) !=
            rx_chksm_offload_capa) {
        VLOG_WARN("Rx checksum offload is not supported on port "
                  DPDK_PORT_ID_FMT, dev->port_id);
        dev->hw_ol_features &= ~NETDEV_RX_CHECKSUM_OFFLOAD;
    } else {
        dev->hw_ol_features |= NETDEV_RX_CHECKSUM_OFFLOAD;
    }

    n_rxq = MIN(info.max_rx_queues, dev->up.n_rxq);
    n_txq = MIN(info.max_tx_queues, dev->up.n_txq);

    diag = dpdk_eth_dev_port_config(dev, n_rxq, n_txq);
    if (diag) {
        VLOG_ERR("Interface %s(rxq:%d txq:%d lsc interrupt mode:%s) "
                 "configure error: %s",
                 dev->up.name, n_rxq, n_txq,
                 dev->lsc_interrupt_mode ? "true" : "false",
                 rte_strerror(-diag));
        return -diag;
    }

    diag = rte_eth_dev_start(dev->port_id);
    if (diag) {
        VLOG_ERR("Interface %s start error: %s", dev->up.name,
                 rte_strerror(-diag));
        return -diag;
    }
    dev->started = true;

    rte_eth_promiscuous_enable(dev->port_id);
    rte_eth_allmulticast_enable(dev->port_id);

    memset(&eth_addr, 0x0, sizeof(eth_addr));
    rte_eth_macaddr_get(dev->port_id, &eth_addr);
    VLOG_INFO_RL(&rl, "Port "DPDK_PORT_ID_FMT": "ETH_ADDR_FMT,
                 dev->port_id, ETH_ADDR_BYTES_ARGS(eth_addr.addr_bytes));

    memcpy(dev->hwaddr.ea, eth_addr.addr_bytes, ETH_ADDR_LEN);
    rte_eth_link_get_nowait(dev->port_id, &dev->link);

    mbp_priv = rte_mempool_get_priv(dev->dpdk_mp->mp);
    dev->buf_size = mbp_priv->mbuf_data_room_size - RTE_PKTMBUF_HEADROOM;
    return 0;
}

static struct netdev_dpdk *
netdev_dpdk_cast(const struct netdev *netdev)
{
    return CONTAINER_OF(netdev, struct netdev_dpdk, up);
}

static struct netdev *
netdev_dpdk_alloc(void)
{
    struct netdev_dpdk *dev;

    dev = dpdk_rte_mzalloc(sizeof *dev);
    if (dev) {
        return &dev->up;
    }

    return NULL;
}

static struct dpdk_tx_queue *
netdev_dpdk_alloc_txq(unsigned int n_txqs)
{
    struct dpdk_tx_queue *txqs;
    unsigned i;

    txqs = dpdk_rte_mzalloc(n_txqs * sizeof *txqs);
    if (txqs) {
        for (i = 0; i < n_txqs; i++) {
            /* Initialize map for vhost devices. */
            txqs[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
            rte_spinlock_init(&txqs[i].tx_lock);
        }
    }

    return txqs;
}

static int
common_construct(struct netdev *netdev, dpdk_port_t port_no,
                 enum dpdk_dev_type type, int socket_id)
    OVS_REQUIRES(dpdk_mutex)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_init(&dev->mutex);

    rte_spinlock_init(&dev->stats_lock);

    /* If the 'sid' is negative, it means that the kernel fails
     * to obtain the pci numa info.  In that situation, always
     * use 'SOCKET0'. */
    dev->socket_id = socket_id < 0 ? SOCKET0 : socket_id;
    dev->requested_socket_id = dev->socket_id;
    dev->port_id = port_no;
    dev->type = type;
    dev->flags = 0;
    dev->requested_mtu = ETHER_MTU;
    dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
    dev->requested_lsc_interrupt_mode = 0;
    ovsrcu_index_init(&dev->vid, -1);
    dev->vhost_reconfigured = false;
    dev->attached = false;

    ovsrcu_init(&dev->qos_conf, NULL);

    ovsrcu_init(&dev->ingress_policer, NULL);
    dev->policer_rate = 0;
    dev->policer_burst = 0;

    netdev->n_rxq = 0;
    netdev->n_txq = 0;
    dev->requested_n_rxq = NR_QUEUE;
    dev->requested_n_txq = NR_QUEUE;
    dev->requested_rxq_size = NIC_PORT_DEFAULT_RXQ_SIZE;
    dev->requested_txq_size = NIC_PORT_DEFAULT_TXQ_SIZE;

    /* Initialize the flow control to NULL */
    memset(&dev->fc_conf, 0, sizeof dev->fc_conf);

    /* Initilize the hardware offload flags to 0 */
    dev->hw_ol_features = 0;

    dev->flags = NETDEV_UP | NETDEV_PROMISC;

    ovs_list_push_back(&dpdk_list, &dev->list_node);

    netdev_request_reconfigure(netdev);

    dev->rte_xstats_names = NULL;
    dev->rte_xstats_names_size = 0;

    dev->rte_xstats_ids = NULL;
    dev->rte_xstats_ids_size = 0;

    return 0;
}

/* dev_name must be the prefix followed by a positive decimal number.
 * (no leading + or - signs are allowed) */
static int
dpdk_dev_parse_name(const char dev_name[], const char prefix[],
                    unsigned int *port_no)
{
    const char *cport;

    if (strncmp(dev_name, prefix, strlen(prefix))) {
        return ENODEV;
    }

    cport = dev_name + strlen(prefix);

    if (str_to_uint(cport, 10, port_no)) {
        return 0;
    } else {
        return ENODEV;
    }
}

static int
vhost_common_construct(struct netdev *netdev)
    OVS_REQUIRES(dpdk_mutex)
{
    int socket_id = rte_lcore_to_socket_id(rte_get_master_lcore());
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    dev->tx_q = netdev_dpdk_alloc_txq(OVS_VHOST_MAX_QUEUE_NUM);
    if (!dev->tx_q) {
        return ENOMEM;
    }

    return common_construct(netdev, DPDK_ETH_PORT_ID_INVALID,
                            DPDK_DEV_VHOST, socket_id);
}

static int
netdev_dpdk_vhost_construct(struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    const char *name = netdev->name;
    int err;

    /* 'name' is appended to 'vhost_sock_dir' and used to create a socket in
     * the file system. '/' or '\' would traverse directories, so they're not
     * acceptable in 'name'. */
    if (strchr(name, '/') || strchr(name, '\\')) {
        VLOG_ERR("\"%s\" is not a valid name for a vhost-user port. "
                 "A valid name must not include '/' or '\\'",
                 name);
        return EINVAL;
    }

    ovs_mutex_lock(&dpdk_mutex);
    /* Take the name of the vhost-user port and append it to the location where
     * the socket is to be created, then register the socket.
     */
    snprintf(dev->vhost_id, sizeof dev->vhost_id, "%s/%s",
             dpdk_get_vhost_sock_dir(), name);

    dev->vhost_driver_flags &= ~RTE_VHOST_USER_CLIENT;
    err = rte_vhost_driver_register(dev->vhost_id, dev->vhost_driver_flags);
    if (err) {
        VLOG_ERR("vhost-user socket device setup failure for socket %s\n",
                 dev->vhost_id);
        goto out;
    } else {
        fatal_signal_add_file_to_unlink(dev->vhost_id);
        VLOG_INFO("Socket %s created for vhost-user port %s\n",
                  dev->vhost_id, name);
    }

    err = rte_vhost_driver_callback_register(dev->vhost_id,
                                                &virtio_net_device_ops);
    if (err) {
        VLOG_ERR("rte_vhost_driver_callback_register failed for vhost user "
                 "port: %s\n", name);
        goto out;
    }

    err = rte_vhost_driver_disable_features(dev->vhost_id,
                                1ULL << VIRTIO_NET_F_HOST_TSO4
                                | 1ULL << VIRTIO_NET_F_HOST_TSO6
                                | 1ULL << VIRTIO_NET_F_CSUM);
    if (err) {
        VLOG_ERR("rte_vhost_driver_disable_features failed for vhost user "
                 "port: %s\n", name);
        goto out;
    }

    err = rte_vhost_driver_start(dev->vhost_id);
    if (err) {
        VLOG_ERR("rte_vhost_driver_start failed for vhost user "
                 "port: %s\n", name);
        goto out;
    }

    err = vhost_common_construct(netdev);
    if (err) {
        VLOG_ERR("vhost_common_construct failed for vhost user "
                 "port: %s\n", name);
    }

out:
    ovs_mutex_unlock(&dpdk_mutex);
    VLOG_WARN_ONCE("dpdkvhostuser ports are considered deprecated;  "
                   "please migrate to dpdkvhostuserclient ports.");
    return err;
}

static int
netdev_dpdk_vhost_client_construct(struct netdev *netdev)
{
    int err;

    ovs_mutex_lock(&dpdk_mutex);
    err = vhost_common_construct(netdev);
    if (err) {
        VLOG_ERR("vhost_common_construct failed for vhost user client"
                 "port: %s\n", netdev->name);
    }
    ovs_mutex_unlock(&dpdk_mutex);
    return err;
}

static int
netdev_dpdk_construct(struct netdev *netdev)
{
    int err;

    ovs_mutex_lock(&dpdk_mutex);
    err = common_construct(netdev, DPDK_ETH_PORT_ID_INVALID,
                           DPDK_DEV_ETH, SOCKET0);
    ovs_mutex_unlock(&dpdk_mutex);
    return err;
}

static void
common_destruct(struct netdev_dpdk *dev)
    OVS_REQUIRES(dpdk_mutex)
    OVS_EXCLUDED(dev->mutex)
{
    rte_free(dev->tx_q);
    dpdk_mp_put(dev->dpdk_mp);

    ovs_list_remove(&dev->list_node);
    free(ovsrcu_get_protected(struct ingress_policer *,
                              &dev->ingress_policer));
    ovs_mutex_destroy(&dev->mutex);
}

static void
netdev_dpdk_destruct(struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    char devname[RTE_ETH_NAME_MAX_LEN];

    ovs_mutex_lock(&dpdk_mutex);

    rte_eth_dev_stop(dev->port_id);
    dev->started = false;

    if (dev->attached) {
        rte_eth_dev_close(dev->port_id);
        if (rte_eth_dev_detach(dev->port_id, devname) < 0) {
            VLOG_ERR("Device '%s' can not be detached", dev->devargs);
        } else {
            VLOG_INFO("Device '%s' has been detached", devname);
        }
    }

    netdev_dpdk_clear_xstats(dev);
    free(dev->devargs);
    common_destruct(dev);

    ovs_mutex_unlock(&dpdk_mutex);
}

/* rte_vhost_driver_unregister() can call back destroy_device(), which will
 * try to acquire 'dpdk_mutex' and possibly 'dev->mutex'.  To avoid a
 * deadlock, none of the mutexes must be held while calling this function. */
static int
dpdk_vhost_driver_unregister(struct netdev_dpdk *dev OVS_UNUSED,
                             char *vhost_id)
    OVS_EXCLUDED(dpdk_mutex)
    OVS_EXCLUDED(dev->mutex)
{
    return rte_vhost_driver_unregister(vhost_id);
}

static void
netdev_dpdk_vhost_destruct(struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    char *vhost_id;

    ovs_mutex_lock(&dpdk_mutex);

    /* Guest becomes an orphan if still attached. */
    if (netdev_dpdk_get_vid(dev) >= 0
        && !(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)) {
        VLOG_ERR("Removing port '%s' while vhost device still attached.",
                 netdev->name);
        VLOG_ERR("To restore connectivity after re-adding of port, VM on "
                 "socket '%s' must be restarted.", dev->vhost_id);
    }

    vhost_id = xstrdup(dev->vhost_id);

    common_destruct(dev);

    ovs_mutex_unlock(&dpdk_mutex);

    if (!vhost_id[0]) {
        goto out;
    }

    if (dpdk_vhost_driver_unregister(dev, vhost_id)) {
        VLOG_ERR("%s: Unable to unregister vhost driver for socket '%s'.\n",
                 netdev->name, vhost_id);
    } else if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)) {
        /* OVS server mode - remove this socket from list for deletion */
        fatal_signal_remove_file_to_unlink(vhost_id);
    }
out:
    free(vhost_id);
}

static void
netdev_dpdk_dealloc(struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    rte_free(dev);
}

static void
netdev_dpdk_clear_xstats(struct netdev_dpdk *dev)
{
    /* If statistics are already allocated, we have to
     * reconfigure, as port_id could have been changed. */
    if (dev->rte_xstats_names) {
        free(dev->rte_xstats_names);
        dev->rte_xstats_names = NULL;
        dev->rte_xstats_names_size = 0;
    }
    if (dev->rte_xstats_ids) {
        free(dev->rte_xstats_ids);
        dev->rte_xstats_ids = NULL;
        dev->rte_xstats_ids_size = 0;
    }
}

static const char*
netdev_dpdk_get_xstat_name(struct netdev_dpdk *dev, uint64_t id)
{
    if (id >= dev->rte_xstats_names_size) {
        return "UNKNOWN";
    }
    return dev->rte_xstats_names[id].name;
}

static bool
netdev_dpdk_configure_xstats(struct netdev_dpdk *dev)
    OVS_REQUIRES(dev->mutex)
{
    int rte_xstats_len;
    bool ret;
    struct rte_eth_xstat *rte_xstats;
    uint64_t id;
    int xstats_no;
    const char *name;

    /* Retrieving all XSTATS names. If something will go wrong
     * or amount of counters will be equal 0, rte_xstats_names
     * buffer will be marked as NULL, and any further xstats
     * query won't be performed (e.g. during netdev_dpdk_get_stats
     * execution). */

    ret = false;
    rte_xstats = NULL;

    if (dev->rte_xstats_names == NULL || dev->rte_xstats_ids == NULL) {
        dev->rte_xstats_names_size =
                rte_eth_xstats_get_names(dev->port_id, NULL, 0);

        if (dev->rte_xstats_names_size < 0) {
            VLOG_WARN("Cannot get XSTATS for port: "DPDK_PORT_ID_FMT,
                      dev->port_id);
            dev->rte_xstats_names_size = 0;
        } else {
            /* Reserve memory for xstats names and values */
            dev->rte_xstats_names = xcalloc(dev->rte_xstats_names_size,
                                            sizeof *dev->rte_xstats_names);

            if (dev->rte_xstats_names) {
                /* Retreive xstats names */
                rte_xstats_len =
                        rte_eth_xstats_get_names(dev->port_id,
                                                 dev->rte_xstats_names,
                                                 dev->rte_xstats_names_size);

                if (rte_xstats_len < 0) {
                    VLOG_WARN("Cannot get XSTATS names for port: "
                              DPDK_PORT_ID_FMT, dev->port_id);
                    goto out;
                } else if (rte_xstats_len != dev->rte_xstats_names_size) {
                    VLOG_WARN("XSTATS size doesn't match for port: "
                              DPDK_PORT_ID_FMT, dev->port_id);
                    goto out;
                }

                dev->rte_xstats_ids = xcalloc(dev->rte_xstats_names_size,
                                              sizeof(uint64_t));

                /* We have to calculate number of counters */
                rte_xstats = xmalloc(rte_xstats_len * sizeof *rte_xstats);
                memset(rte_xstats, 0xff, sizeof *rte_xstats * rte_xstats_len);

                /* Retreive xstats values */
                if (rte_eth_xstats_get(dev->port_id, rte_xstats,
                                       rte_xstats_len) > 0) {
                    dev->rte_xstats_ids_size = 0;
                    xstats_no = 0;
                    for (uint32_t i = 0; i < rte_xstats_len; i++) {
                        id = rte_xstats[i].id;
                        name = netdev_dpdk_get_xstat_name(dev, id);
                        /* We need to filter out everything except
                         * dropped, error and management counters */
                        if (string_ends_with(name, "_errors") ||
                            strstr(name, "_management_") ||
                            string_ends_with(name, "_dropped")) {

                            dev->rte_xstats_ids[xstats_no] = id;
                            xstats_no++;
                        }
                    }
                    dev->rte_xstats_ids_size = xstats_no;
                    ret = true;
                } else {
                    VLOG_WARN("Can't get XSTATS IDs for port: "
                              DPDK_PORT_ID_FMT, dev->port_id);
                }

                free(rte_xstats);
            }
        }
    } else {
        /* Already configured */
        ret = true;
    }

out:
    if (!ret) {
        netdev_dpdk_clear_xstats(dev);
    }
    return ret;
}

static int
netdev_dpdk_get_config(const struct netdev *netdev, struct smap *args)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);

    smap_add_format(args, "requested_rx_queues", "%d", dev->requested_n_rxq);
    smap_add_format(args, "configured_rx_queues", "%d", netdev->n_rxq);
    smap_add_format(args, "requested_tx_queues", "%d", dev->requested_n_txq);
    smap_add_format(args, "configured_tx_queues", "%d", netdev->n_txq);
    smap_add_format(args, "mtu", "%d", dev->mtu);

    if (dev->type == DPDK_DEV_ETH) {
        smap_add_format(args, "requested_rxq_descriptors", "%d",
                        dev->requested_rxq_size);
        smap_add_format(args, "configured_rxq_descriptors", "%d",
                        dev->rxq_size);
        smap_add_format(args, "requested_txq_descriptors", "%d",
                        dev->requested_txq_size);
        smap_add_format(args, "configured_txq_descriptors", "%d",
                        dev->txq_size);
        if (dev->hw_ol_features & NETDEV_RX_CHECKSUM_OFFLOAD) {
            smap_add(args, "rx_csum_offload", "true");
        } else {
            smap_add(args, "rx_csum_offload", "false");
        }
        smap_add(args, "lsc_interrupt_mode",
                 dev->lsc_interrupt_mode ? "true" : "false");
    }
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

static struct netdev_dpdk *
netdev_dpdk_lookup_by_port_id(dpdk_port_t port_id)
    OVS_REQUIRES(dpdk_mutex)
{
    struct netdev_dpdk *dev;

    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
        if (dev->port_id == port_id) {
            return dev;
        }
    }

    return NULL;
}

static dpdk_port_t
netdev_dpdk_get_port_by_mac(const char *mac_str)
{
    dpdk_port_t port_id;
    struct eth_addr mac, port_mac;

    if (!eth_addr_from_string(mac_str, &mac)) {
        VLOG_ERR("invalid mac: %s", mac_str);
        return DPDK_ETH_PORT_ID_INVALID;
    }

    RTE_ETH_FOREACH_DEV (port_id) {
        struct ether_addr ea;

        rte_eth_macaddr_get(port_id, &ea);
        memcpy(port_mac.ea, ea.addr_bytes, ETH_ADDR_LEN);
        if (eth_addr_equals(mac, port_mac)) {
            return port_id;
        }
    }

    return DPDK_ETH_PORT_ID_INVALID;
}

/*
 * Normally, a PCI id is enough for identifying a specific DPDK port.
 * However, for some NICs having multiple ports sharing the same PCI
 * id, using PCI id won't work then.
 *
 * To fix that, here one more method is introduced: "class=eth,mac=$MAC".
 *
 * Note that the compatibility is fully kept: user can still use the
 * PCI id for adding ports (when it's enough for them).
 */
static dpdk_port_t
netdev_dpdk_process_devargs(struct netdev_dpdk *dev,
                            const char *devargs, char **errp)
{
    char *name;
    dpdk_port_t new_port_id = DPDK_ETH_PORT_ID_INVALID;

    if (strncmp(devargs, "class=eth,mac=", 14) == 0) {
        new_port_id = netdev_dpdk_get_port_by_mac(&devargs[14]);
    } else {
        name = xmemdup0(devargs, strcspn(devargs, ","));
        if (rte_eth_dev_get_port_by_name(name, &new_port_id)
                || !rte_eth_dev_is_valid_port(new_port_id)) {
            /* Device not found in DPDK, attempt to attach it */
            if (!rte_eth_dev_attach(devargs, &new_port_id)) {
                /* Attach successful */
                dev->attached = true;
                VLOG_INFO("Device '%s' attached to DPDK", devargs);
            } else {
                /* Attach unsuccessful */
                new_port_id = DPDK_ETH_PORT_ID_INVALID;
            }
        }
        free(name);
    }

    if (new_port_id == DPDK_ETH_PORT_ID_INVALID) {
        VLOG_WARN_BUF(errp, "Error attaching device '%s' to DPDK", devargs);
    }

    return new_port_id;
}

static void
dpdk_set_rxq_config(struct netdev_dpdk *dev, const struct smap *args)
    OVS_REQUIRES(dev->mutex)
{
    int new_n_rxq;

    new_n_rxq = MAX(smap_get_int(args, "n_rxq", NR_QUEUE), 1);
    if (new_n_rxq != dev->requested_n_rxq) {
        dev->requested_n_rxq = new_n_rxq;
        netdev_request_reconfigure(&dev->up);
    }
}

static void
dpdk_process_queue_size(struct netdev *netdev, const struct smap *args,
                        const char *flag, int default_size, int *new_size)
{
    int queue_size = smap_get_int(args, flag, default_size);

    if (queue_size <= 0 || queue_size > NIC_PORT_MAX_Q_SIZE
            || !is_pow2(queue_size)) {
        queue_size = default_size;
    }

    if (queue_size != *new_size) {
        *new_size = queue_size;
        netdev_request_reconfigure(netdev);
    }
}

static int
netdev_dpdk_set_config(struct netdev *netdev, const struct smap *args,
                       char **errp)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    bool rx_fc_en, tx_fc_en, autoneg, lsc_interrupt_mode;
    enum rte_eth_fc_mode fc_mode;
    static const enum rte_eth_fc_mode fc_mode_set[2][2] = {
        {RTE_FC_NONE,     RTE_FC_TX_PAUSE},
        {RTE_FC_RX_PAUSE, RTE_FC_FULL    }
    };
    const char *new_devargs;
    int err = 0;

    ovs_mutex_lock(&dpdk_mutex);
    ovs_mutex_lock(&dev->mutex);

    dpdk_set_rxq_config(dev, args);

    dpdk_process_queue_size(netdev, args, "n_rxq_desc",
                            NIC_PORT_DEFAULT_RXQ_SIZE,
                            &dev->requested_rxq_size);
    dpdk_process_queue_size(netdev, args, "n_txq_desc",
                            NIC_PORT_DEFAULT_TXQ_SIZE,
                            &dev->requested_txq_size);

    new_devargs = smap_get(args, "dpdk-devargs");

    if (dev->devargs && strcmp(new_devargs, dev->devargs)) {
        /* The user requested a new device.  If we return error, the caller
         * will delete this netdev and try to recreate it. */
        err = EAGAIN;
        goto out;
    }

    /* dpdk-devargs is required for device configuration */
    if (new_devargs && new_devargs[0]) {
        /* Don't process dpdk-devargs if value is unchanged and port id
         * is valid */
        if (!(dev->devargs && !strcmp(dev->devargs, new_devargs)
               && rte_eth_dev_is_valid_port(dev->port_id))) {
            dpdk_port_t new_port_id = netdev_dpdk_process_devargs(dev,
                                                                  new_devargs,
                                                                  errp);
            if (!rte_eth_dev_is_valid_port(new_port_id)) {
                err = EINVAL;
            } else if (new_port_id == dev->port_id) {
                /* Already configured, do not reconfigure again */
                err = 0;
            } else {
                struct netdev_dpdk *dup_dev;

                dup_dev = netdev_dpdk_lookup_by_port_id(new_port_id);
                if (dup_dev) {
                    VLOG_WARN_BUF(errp, "'%s' is trying to use device '%s' "
                                        "which is already in use by '%s'",
                                  netdev_get_name(netdev), new_devargs,
                                  netdev_get_name(&dup_dev->up));
                    err = EADDRINUSE;
                } else {
                    int sid = rte_eth_dev_socket_id(new_port_id);

                    dev->requested_socket_id = sid < 0 ? SOCKET0 : sid;
                    dev->devargs = xstrdup(new_devargs);
                    dev->port_id = new_port_id;
                    netdev_request_reconfigure(&dev->up);
                    netdev_dpdk_clear_xstats(dev);
                    err = 0;
                }
            }
        }
    } else {
        VLOG_WARN_BUF(errp, "'%s' is missing 'options:dpdk-devargs'. "
                            "The old 'dpdk<port_id>' names are not supported",
                      netdev_get_name(netdev));
        err = EINVAL;
    }

    if (err) {
        goto out;
    }

    lsc_interrupt_mode = smap_get_bool(args, "dpdk-lsc-interrupt", false);
    if (dev->requested_lsc_interrupt_mode != lsc_interrupt_mode) {
        dev->requested_lsc_interrupt_mode = lsc_interrupt_mode;
        netdev_request_reconfigure(netdev);
    }

    rx_fc_en = smap_get_bool(args, "rx-flow-ctrl", false);
    tx_fc_en = smap_get_bool(args, "tx-flow-ctrl", false);
    autoneg = smap_get_bool(args, "flow-ctrl-autoneg", false);

    fc_mode = fc_mode_set[tx_fc_en][rx_fc_en];
    if (dev->fc_conf.mode != fc_mode || autoneg != dev->fc_conf.autoneg) {
        dev->fc_conf.mode = fc_mode;
        dev->fc_conf.autoneg = autoneg;
        /* Get the Flow control configuration for DPDK-ETH */
        err = rte_eth_dev_flow_ctrl_get(dev->port_id, &dev->fc_conf);
        if (err) {
            VLOG_WARN("Cannot get flow control parameters on port "
                DPDK_PORT_ID_FMT", err=%d", dev->port_id, err);
        }
        dpdk_eth_flow_ctrl_setup(dev);
    }

out:
    ovs_mutex_unlock(&dev->mutex);
    ovs_mutex_unlock(&dpdk_mutex);

    return err;
}

static int
netdev_dpdk_ring_set_config(struct netdev *netdev, const struct smap *args,
                            char **errp OVS_UNUSED)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);
    dpdk_set_rxq_config(dev, args);
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

static int
netdev_dpdk_vhost_client_set_config(struct netdev *netdev,
                                    const struct smap *args,
                                    char **errp OVS_UNUSED)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    const char *path;

    ovs_mutex_lock(&dev->mutex);
    if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)) {
        path = smap_get(args, "vhost-server-path");
        if (path && strcmp(path, dev->vhost_id)) {
            strcpy(dev->vhost_id, path);
            /* check zero copy configuration */
            if (smap_get_bool(args, "dq-zero-copy", false)) {
                dev->vhost_driver_flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
            } else {
                dev->vhost_driver_flags &= ~RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
            }
            netdev_request_reconfigure(netdev);
        }
    }
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

static int
netdev_dpdk_get_numa_id(const struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    return dev->socket_id;
}

/* Sets the number of tx queues for the dpdk interface. */
static int
netdev_dpdk_set_tx_multiq(struct netdev *netdev, unsigned int n_txq)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);

    if (dev->requested_n_txq == n_txq) {
        goto out;
    }

    dev->requested_n_txq = n_txq;
    netdev_request_reconfigure(netdev);

out:
    ovs_mutex_unlock(&dev->mutex);
    return 0;
}

static struct netdev_rxq *
netdev_dpdk_rxq_alloc(void)
{
    struct netdev_rxq_dpdk *rx = dpdk_rte_mzalloc(sizeof *rx);

    if (rx) {
        return &rx->up;
    }

    return NULL;
}

static struct netdev_rxq_dpdk *
netdev_rxq_dpdk_cast(const struct netdev_rxq *rxq)
{
    return CONTAINER_OF(rxq, struct netdev_rxq_dpdk, up);
}

static int
netdev_dpdk_rxq_construct(struct netdev_rxq *rxq)
{
    struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
    struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);

    ovs_mutex_lock(&dev->mutex);
    rx->port_id = dev->port_id;
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

static void
netdev_dpdk_rxq_destruct(struct netdev_rxq *rxq OVS_UNUSED)
{
}

static void
netdev_dpdk_rxq_dealloc(struct netdev_rxq *rxq)
{
    struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);

    rte_free(rx);
}

/* Tries to transmit 'pkts' to txq 'qid' of device 'dev'.  Takes ownership of
 * 'pkts', even in case of failure.
 *
 * Returns the number of packets that weren't transmitted. */
static inline int
netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
                         struct rte_mbuf **pkts, int cnt)
{
    uint32_t nb_tx = 0;

    while (nb_tx != cnt) {
        uint32_t ret;

        ret = rte_eth_tx_burst(dev->port_id, qid, pkts + nb_tx, cnt - nb_tx);
        if (!ret) {
            break;
        }

        nb_tx += ret;
    }

    if (OVS_UNLIKELY(nb_tx != cnt)) {
        /* Free buffers, which we couldn't transmit, one at a time (each
         * packet could come from a different mempool) */
        int i;

        for (i = nb_tx; i < cnt; i++) {
            rte_pktmbuf_free(pkts[i]);
        }
    }

    return cnt - nb_tx;
}

static inline bool
netdev_dpdk_policer_pkt_handle(struct rte_meter_srtcm *meter,
                               struct rte_mbuf *pkt, uint64_t time)
{
    uint32_t pkt_len = rte_pktmbuf_pkt_len(pkt) - sizeof(struct ether_hdr);

    return rte_meter_srtcm_color_blind_check(meter, time, pkt_len) ==
                                                e_RTE_METER_GREEN;
}

static int
netdev_dpdk_policer_run(struct rte_meter_srtcm *meter,
                        struct rte_mbuf **pkts, int pkt_cnt,
                        bool should_steal)
{
    int i = 0;
    int cnt = 0;
    struct rte_mbuf *pkt = NULL;
    uint64_t current_time = rte_rdtsc();

    for (i = 0; i < pkt_cnt; i++) {
        pkt = pkts[i];
        /* Handle current packet */
        if (netdev_dpdk_policer_pkt_handle(meter, pkt, current_time)) {
            if (cnt != i) {
                pkts[cnt] = pkt;
            }
            cnt++;
        } else {
            if (should_steal) {
                rte_pktmbuf_free(pkt);
            }
        }
    }

    return cnt;
}

static int
ingress_policer_run(struct ingress_policer *policer, struct rte_mbuf **pkts,
                    int pkt_cnt, bool should_steal)
{
    int cnt = 0;

    rte_spinlock_lock(&policer->policer_lock);
    cnt = netdev_dpdk_policer_run(&policer->in_policer, pkts,
                                  pkt_cnt, should_steal);
    rte_spinlock_unlock(&policer->policer_lock);

    return cnt;
}

static bool
is_vhost_running(struct netdev_dpdk *dev)
{
    return (netdev_dpdk_get_vid(dev) >= 0 && dev->vhost_reconfigured);
}

static inline void
netdev_dpdk_vhost_update_rx_size_counters(struct netdev_stats *stats,
                                          unsigned int packet_size)
{
    /* Hard-coded search for the size bucket. */
    if (packet_size < 256) {
        if (packet_size >= 128) {
            stats->rx_128_to_255_packets++;
        } else if (packet_size <= 64) {
            stats->rx_1_to_64_packets++;
        } else {
            stats->rx_65_to_127_packets++;
        }
    } else {
        if (packet_size >= 1523) {
            stats->rx_1523_to_max_packets++;
        } else if (packet_size >= 1024) {
            stats->rx_1024_to_1522_packets++;
        } else if (packet_size < 512) {
            stats->rx_256_to_511_packets++;
        } else {
            stats->rx_512_to_1023_packets++;
        }
    }
}

static inline void
netdev_dpdk_vhost_update_rx_counters(struct netdev_stats *stats,
                                     struct dp_packet **packets, int count,
                                     int dropped)
{
    int i;
    unsigned int packet_size;
    struct dp_packet *packet;

    stats->rx_packets += count;
    stats->rx_dropped += dropped;
    for (i = 0; i < count; i++) {
        packet = packets[i];
        packet_size = dp_packet_size(packet);

        if (OVS_UNLIKELY(packet_size < ETH_HEADER_LEN)) {
            /* This only protects the following multicast counting from
             * too short packets, but it does not stop the packet from
             * further processing. */
            stats->rx_errors++;
            stats->rx_length_errors++;
            continue;
        }

        netdev_dpdk_vhost_update_rx_size_counters(stats, packet_size);

        struct eth_header *eh = (struct eth_header *) dp_packet_data(packet);
        if (OVS_UNLIKELY(eth_addr_is_multicast(eh->eth_dst))) {
            stats->multicast++;
        }

        stats->rx_bytes += packet_size;
    }
}

/*
 * The receive path for the vhost port is the TX path out from guest.
 */
static int
netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
                           struct dp_packet_batch *batch, int *qfill)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
    struct ingress_policer *policer = netdev_dpdk_get_ingress_policer(dev);
    uint16_t nb_rx = 0;
    uint16_t dropped = 0;
    int qid = rxq->queue_id * VIRTIO_QNUM + VIRTIO_TXQ;
    int vid = netdev_dpdk_get_vid(dev);

    if (OVS_UNLIKELY(vid < 0 || !dev->vhost_reconfigured
                     || !(dev->flags & NETDEV_UP))) {
        return EAGAIN;
    }

    nb_rx = rte_vhost_dequeue_burst(vid, qid, dev->dpdk_mp->mp,
                                    (struct rte_mbuf **) batch->packets,
                                    NETDEV_MAX_BURST);
    if (!nb_rx) {
        return EAGAIN;
    }

    if (qfill) {
        if (nb_rx == NETDEV_MAX_BURST) {
            /* The DPDK API returns a uint32_t which often has invalid bits in
             * the upper 16-bits. Need to restrict the value to uint16_t. */
            *qfill = rte_vhost_rx_queue_count(vid, qid) & UINT16_MAX;
        } else {
            *qfill = 0;
        }
    }

    if (policer) {
        dropped = nb_rx;
        nb_rx = ingress_policer_run(policer,
                                    (struct rte_mbuf **) batch->packets,
                                    nb_rx, true);
        dropped -= nb_rx;
    }

    rte_spinlock_lock(&dev->stats_lock);
    netdev_dpdk_vhost_update_rx_counters(&dev->stats, batch->packets,
                                         nb_rx, dropped);
    rte_spinlock_unlock(&dev->stats_lock);

    batch->count = nb_rx;
    dp_packet_batch_init_packet_fields(batch);

    return 0;
}

static int
netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet_batch *batch,
                     int *qfill)
{
    struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
    struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
    struct ingress_policer *policer = netdev_dpdk_get_ingress_policer(dev);
    int nb_rx;
    int dropped = 0;

    if (OVS_UNLIKELY(!(dev->flags & NETDEV_UP))) {
        return EAGAIN;
    }

    nb_rx = rte_eth_rx_burst(rx->port_id, rxq->queue_id,
                             (struct rte_mbuf **) batch->packets,
                             NETDEV_MAX_BURST);
    if (!nb_rx) {
        return EAGAIN;
    }

    if (policer) {
        dropped = nb_rx;
        nb_rx = ingress_policer_run(policer,
                                    (struct rte_mbuf **) batch->packets,
                                    nb_rx, true);
        dropped -= nb_rx;
    }

    /* Update stats to reflect dropped packets */
    if (OVS_UNLIKELY(dropped)) {
        rte_spinlock_lock(&dev->stats_lock);
        dev->stats.rx_dropped += dropped;
        rte_spinlock_unlock(&dev->stats_lock);
    }

    batch->count = nb_rx;
    dp_packet_batch_init_packet_fields(batch);

    if (qfill) {
        if (nb_rx == NETDEV_MAX_BURST) {
            *qfill = rte_eth_rx_queue_count(rx->port_id, rxq->queue_id);
        } else {
            *qfill = 0;
        }
    }

    return 0;
}

static inline int
netdev_dpdk_qos_run(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
                    int cnt, bool should_steal)
{
    struct qos_conf *qos_conf = ovsrcu_get(struct qos_conf *, &dev->qos_conf);

    if (qos_conf) {
        rte_spinlock_lock(&qos_conf->lock);
        cnt = qos_conf->ops->qos_run(qos_conf, pkts, cnt, should_steal);
        rte_spinlock_unlock(&qos_conf->lock);
    }

    return cnt;
}

static int
netdev_dpdk_filter_packet_len(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
                              int pkt_cnt)
{
    int i = 0;
    int cnt = 0;
    struct rte_mbuf *pkt;

    for (i = 0; i < pkt_cnt; i++) {
        pkt = pkts[i];
        if (OVS_UNLIKELY(pkt->pkt_len > dev->max_packet_len)) {
            VLOG_WARN_RL(&rl, "%s: Too big size %" PRIu32 " max_packet_len %d",
                         dev->up.name, pkt->pkt_len, dev->max_packet_len);
            rte_pktmbuf_free(pkt);
            continue;
        }

        if (OVS_UNLIKELY(i != cnt)) {
            pkts[cnt] = pkt;
        }
        cnt++;
    }

    return cnt;
}

static inline void
netdev_dpdk_vhost_update_tx_counters(struct netdev_stats *stats,
                                     struct dp_packet **packets,
                                     int attempted,
                                     int dropped)
{
    int i;
    int sent = attempted - dropped;

    stats->tx_packets += sent;
    stats->tx_dropped += dropped;

    for (i = 0; i < sent; i++) {
        stats->tx_bytes += dp_packet_size(packets[i]);
    }
}

static void
__netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
                         struct dp_packet **pkts, int cnt)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct rte_mbuf **cur_pkts = (struct rte_mbuf **) pkts;
    unsigned int total_pkts = cnt;
    unsigned int dropped = 0;
    int i, retries = 0;
    int vid = netdev_dpdk_get_vid(dev);

    qid = dev->tx_q[qid % netdev->n_txq].map;

    if (OVS_UNLIKELY(vid < 0 || !dev->vhost_reconfigured || qid < 0
                     || !(dev->flags & NETDEV_UP))) {
        rte_spinlock_lock(&dev->stats_lock);
        dev->stats.tx_dropped+= cnt;
        rte_spinlock_unlock(&dev->stats_lock);
        goto out;
    }

    rte_spinlock_lock(&dev->tx_q[qid].tx_lock);

    cnt = netdev_dpdk_filter_packet_len(dev, cur_pkts, cnt);
    /* Check has QoS has been configured for the netdev */
    cnt = netdev_dpdk_qos_run(dev, cur_pkts, cnt, true);
    dropped = total_pkts - cnt;

    do {
        int vhost_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ;
        unsigned int tx_pkts;

        tx_pkts = rte_vhost_enqueue_burst(vid, vhost_qid, cur_pkts, cnt);
        if (OVS_LIKELY(tx_pkts)) {
            /* Packets have been sent.*/
            cnt -= tx_pkts;
            /* Prepare for possible retry.*/
            cur_pkts = &cur_pkts[tx_pkts];
        } else {
            /* No packets sent - do not retry.*/
            break;
        }
    } while (cnt && (retries++ <= VHOST_ENQ_RETRY_NUM));

    rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);

    rte_spinlock_lock(&dev->stats_lock);
    netdev_dpdk_vhost_update_tx_counters(&dev->stats, pkts, total_pkts,
                                         cnt + dropped);
    rte_spinlock_unlock(&dev->stats_lock);

out:
    for (i = 0; i < total_pkts - dropped; i++) {
        dp_packet_delete(pkts[i]);
    }
}

/* Tx function. Transmit packets indefinitely */
static void
dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
    OVS_NO_THREAD_SAFETY_ANALYSIS
{
    const size_t batch_cnt = dp_packet_batch_size(batch);
#if !defined(__CHECKER__) && !defined(_WIN32)
    const size_t PKT_ARRAY_SIZE = batch_cnt;
#else
    /* Sparse or MSVC doesn't like variable length array. */
    enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
#endif
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct rte_mbuf *pkts[PKT_ARRAY_SIZE];
    uint32_t cnt = batch_cnt;
    uint32_t dropped = 0;

    if (dev->type != DPDK_DEV_VHOST) {
        /* Check if QoS has been configured for this netdev. */
        cnt = netdev_dpdk_qos_run(dev, (struct rte_mbuf **) batch->packets,
                                  batch_cnt, false);
        dropped += batch_cnt - cnt;
    }

    uint32_t txcnt = 0;

    for (uint32_t i = 0; i < cnt; i++) {
        struct dp_packet *packet = batch->packets[i];
        uint32_t size = dp_packet_size(packet);

        if (OVS_UNLIKELY(size > dev->max_packet_len)) {
            VLOG_WARN_RL(&rl, "Too big size %u max_packet_len %d",
                         size, dev->max_packet_len);

            dropped++;
            continue;
        }

        pkts[txcnt] = rte_pktmbuf_alloc(dev->dpdk_mp->mp);
        if (OVS_UNLIKELY(!pkts[txcnt])) {
            dropped += cnt - i;
            break;
        }

        /* We have to do a copy for now */
        memcpy(rte_pktmbuf_mtod(pkts[txcnt], void *),
               dp_packet_data(packet), size);
        dp_packet_set_size((struct dp_packet *)pkts[txcnt], size);

        txcnt++;
    }

    if (OVS_LIKELY(txcnt)) {
        if (dev->type == DPDK_DEV_VHOST) {
            __netdev_dpdk_vhost_send(netdev, qid, (struct dp_packet **) pkts,
                                     txcnt);
        } else {
            dropped += netdev_dpdk_eth_tx_burst(dev, qid, pkts, txcnt);
        }
    }

    if (OVS_UNLIKELY(dropped)) {
        rte_spinlock_lock(&dev->stats_lock);
        dev->stats.tx_dropped += dropped;
        rte_spinlock_unlock(&dev->stats_lock);
    }
}

static int
netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
                       struct dp_packet_batch *batch,
                       bool concurrent_txq OVS_UNUSED)
{

    if (OVS_UNLIKELY(batch->packets[0]->source != DPBUF_DPDK)) {
        dpdk_do_tx_copy(netdev, qid, batch);
        dp_packet_delete_batch(batch, true);
    } else {
        __netdev_dpdk_vhost_send(netdev, qid, batch->packets, batch->count);
    }
    return 0;
}

static inline void
netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
                   struct dp_packet_batch *batch,
                   bool concurrent_txq)
{
    if (OVS_UNLIKELY(!(dev->flags & NETDEV_UP))) {
        dp_packet_delete_batch(batch, true);
        return;
    }

    if (OVS_UNLIKELY(concurrent_txq)) {
        qid = qid % dev->up.n_txq;
        rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
    }

    if (OVS_UNLIKELY(batch->packets[0]->source != DPBUF_DPDK)) {
        struct netdev *netdev = &dev->up;

        dpdk_do_tx_copy(netdev, qid, batch);
        dp_packet_delete_batch(batch, true);
    } else {
        int tx_cnt, dropped;
        int batch_cnt = dp_packet_batch_size(batch);
        struct rte_mbuf **pkts = (struct rte_mbuf **) batch->packets;

        tx_cnt = netdev_dpdk_filter_packet_len(dev, pkts, batch_cnt);
        tx_cnt = netdev_dpdk_qos_run(dev, pkts, tx_cnt, true);
        dropped = batch_cnt - tx_cnt;

        dropped += netdev_dpdk_eth_tx_burst(dev, qid, pkts, tx_cnt);

        if (OVS_UNLIKELY(dropped)) {
            rte_spinlock_lock(&dev->stats_lock);
            dev->stats.tx_dropped += dropped;
            rte_spinlock_unlock(&dev->stats_lock);
        }
    }

    if (OVS_UNLIKELY(concurrent_txq)) {
        rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
    }
}

static int
netdev_dpdk_eth_send(struct netdev *netdev, int qid,
                     struct dp_packet_batch *batch, bool concurrent_txq)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    netdev_dpdk_send__(dev, qid, batch, concurrent_txq);
    return 0;
}

static int
netdev_dpdk_set_etheraddr(struct netdev *netdev, const struct eth_addr mac)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);
    if (!eth_addr_equals(dev->hwaddr, mac)) {
        dev->hwaddr = mac;
        netdev_change_seq_changed(netdev);
    }
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

static int
netdev_dpdk_get_etheraddr(const struct netdev *netdev, struct eth_addr *mac)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);
    *mac = dev->hwaddr;
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

static int
netdev_dpdk_get_mtu(const struct netdev *netdev, int *mtup)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);
    *mtup = dev->mtu;
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

static int
netdev_dpdk_set_mtu(struct netdev *netdev, int mtu)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    /* XXX: Ensure that the overall frame length of the requested MTU does not
     * surpass the NETDEV_DPDK_MAX_PKT_LEN. DPDK device drivers differ in how
     * the L2 frame length is calculated for a given MTU when
     * rte_eth_dev_set_mtu(mtu) is called e.g. i40e driver includes 2 x vlan
     * headers, the em driver includes 1 x vlan header, the ixgbe driver does
     * not include vlan headers. As such we should use
     * MTU_TO_MAX_FRAME_LEN(mtu) which includes an additional 2 x vlan headers
     * (8 bytes) for comparison. This avoids a failure later with
     * rte_eth_dev_set_mtu(). This approach should be used until DPDK provides
     * a method to retrieve the upper bound MTU for a given device.
     */
    if (MTU_TO_MAX_FRAME_LEN(mtu) > NETDEV_DPDK_MAX_PKT_LEN
        || mtu < ETHER_MIN_MTU) {
        VLOG_WARN("%s: unsupported MTU %d\n", dev->up.name, mtu);
        return EINVAL;
    }

    ovs_mutex_lock(&dev->mutex);
    if (dev->requested_mtu != mtu) {
        dev->requested_mtu = mtu;
        netdev_request_reconfigure(netdev);
    }
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

static int
netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier);

static int
netdev_dpdk_vhost_get_stats(const struct netdev *netdev,
                            struct netdev_stats *stats)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);

    rte_spinlock_lock(&dev->stats_lock);
    /* Supported Stats */
    stats->rx_packets = dev->stats.rx_packets;
    stats->tx_packets = dev->stats.tx_packets;
    stats->rx_dropped = dev->stats.rx_dropped;
    stats->tx_dropped = dev->stats.tx_dropped;
    stats->multicast = dev->stats.multicast;
    stats->rx_bytes = dev->stats.rx_bytes;
    stats->tx_bytes = dev->stats.tx_bytes;
    stats->rx_errors = dev->stats.rx_errors;
    stats->rx_length_errors = dev->stats.rx_length_errors;

    stats->rx_1_to_64_packets = dev->stats.rx_1_to_64_packets;
    stats->rx_65_to_127_packets = dev->stats.rx_65_to_127_packets;
    stats->rx_128_to_255_packets = dev->stats.rx_128_to_255_packets;
    stats->rx_256_to_511_packets = dev->stats.rx_256_to_511_packets;
    stats->rx_512_to_1023_packets = dev->stats.rx_512_to_1023_packets;
    stats->rx_1024_to_1522_packets = dev->stats.rx_1024_to_1522_packets;
    stats->rx_1523_to_max_packets = dev->stats.rx_1523_to_max_packets;

    rte_spinlock_unlock(&dev->stats_lock);

    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

static void
netdev_dpdk_convert_xstats(struct netdev_stats *stats,
                           const struct rte_eth_xstat *xstats,
                           const struct rte_eth_xstat_name *names,
                           const unsigned int size)
{
    for (unsigned int i = 0; i < size; i++) {
        if (strcmp(XSTAT_RX_64_PACKETS, names[i].name) == 0) {
            stats->rx_1_to_64_packets = xstats[i].value;
        } else if (strcmp(XSTAT_RX_65_TO_127_PACKETS, names[i].name) == 0) {
            stats->rx_65_to_127_packets = xstats[i].value;
        } else if (strcmp(XSTAT_RX_128_TO_255_PACKETS, names[i].name) == 0) {
            stats->rx_128_to_255_packets = xstats[i].value;
        } else if (strcmp(XSTAT_RX_256_TO_511_PACKETS, names[i].name) == 0) {
            stats->rx_256_to_511_packets = xstats[i].value;
        } else if (strcmp(XSTAT_RX_512_TO_1023_PACKETS, names[i].name) == 0) {
            stats->rx_512_to_1023_packets = xstats[i].value;
        } else if (strcmp(XSTAT_RX_1024_TO_1522_PACKETS, names[i].name) == 0) {
            stats->rx_1024_to_1522_packets = xstats[i].value;
        } else if (strcmp(XSTAT_RX_1523_TO_MAX_PACKETS, names[i].name) == 0) {
            stats->rx_1523_to_max_packets = xstats[i].value;
        } else if (strcmp(XSTAT_TX_64_PACKETS, names[i].name) == 0) {
            stats->tx_1_to_64_packets = xstats[i].value;
        } else if (strcmp(XSTAT_TX_65_TO_127_PACKETS, names[i].name) == 0) {
            stats->tx_65_to_127_packets = xstats[i].value;
        } else if (strcmp(XSTAT_TX_128_TO_255_PACKETS, names[i].name) == 0) {
            stats->tx_128_to_255_packets = xstats[i].value;
        } else if (strcmp(XSTAT_TX_256_TO_511_PACKETS, names[i].name) == 0) {
            stats->tx_256_to_511_packets = xstats[i].value;
        } else if (strcmp(XSTAT_TX_512_TO_1023_PACKETS, names[i].name) == 0) {
            stats->tx_512_to_1023_packets = xstats[i].value;
        } else if (strcmp(XSTAT_TX_1024_TO_1522_PACKETS, names[i].name) == 0) {
            stats->tx_1024_to_1522_packets = xstats[i].value;
        } else if (strcmp(XSTAT_TX_1523_TO_MAX_PACKETS, names[i].name) == 0) {
            stats->tx_1523_to_max_packets = xstats[i].value;
        } else if (strcmp(XSTAT_RX_MULTICAST_PACKETS, names[i].name) == 0) {
            stats->multicast = xstats[i].value;
        } else if (strcmp(XSTAT_TX_MULTICAST_PACKETS, names[i].name) == 0) {
            stats->tx_multicast_packets = xstats[i].value;
        } else if (strcmp(XSTAT_RX_BROADCAST_PACKETS, names[i].name) == 0) {
            stats->rx_broadcast_packets = xstats[i].value;
        } else if (strcmp(XSTAT_TX_BROADCAST_PACKETS, names[i].name) == 0) {
            stats->tx_broadcast_packets = xstats[i].value;
        } else if (strcmp(XSTAT_RX_UNDERSIZED_ERRORS, names[i].name) == 0) {
            stats->rx_undersized_errors = xstats[i].value;
        } else if (strcmp(XSTAT_RX_FRAGMENTED_ERRORS, names[i].name) == 0) {
            stats->rx_fragmented_errors = xstats[i].value;
        } else if (strcmp(XSTAT_RX_JABBER_ERRORS, names[i].name) == 0) {
            stats->rx_jabber_errors = xstats[i].value;
        }
    }
}

static int
netdev_dpdk_get_stats(const struct netdev *netdev, struct netdev_stats *stats)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct rte_eth_stats rte_stats;
    bool gg;

    netdev_dpdk_get_carrier(netdev, &gg);
    ovs_mutex_lock(&dev->mutex);

    struct rte_eth_xstat *rte_xstats = NULL;
    struct rte_eth_xstat_name *rte_xstats_names = NULL;
    int rte_xstats_len, rte_xstats_new_len, rte_xstats_ret;

    if (rte_eth_stats_get(dev->port_id, &rte_stats)) {
        VLOG_ERR("Can't get ETH statistics for port: "DPDK_PORT_ID_FMT,
                 dev->port_id);
        ovs_mutex_unlock(&dev->mutex);
        return EPROTO;
    }

    /* Get length of statistics */
    rte_xstats_len = rte_eth_xstats_get_names(dev->port_id, NULL, 0);
    if (rte_xstats_len < 0) {
        VLOG_WARN("Cannot get XSTATS values for port: "DPDK_PORT_ID_FMT,
                  dev->port_id);
        goto out;
    }
    /* Reserve memory for xstats names and values */
    rte_xstats_names = xcalloc(rte_xstats_len, sizeof *rte_xstats_names);
    rte_xstats = xcalloc(rte_xstats_len, sizeof *rte_xstats);

    /* Retreive xstats names */
    rte_xstats_new_len = rte_eth_xstats_get_names(dev->port_id,
                                                  rte_xstats_names,
                                                  rte_xstats_len);
    if (rte_xstats_new_len != rte_xstats_len) {
        VLOG_WARN("Cannot get XSTATS names for port: "DPDK_PORT_ID_FMT,
                  dev->port_id);
        goto out;
    }
    /* Retreive xstats values */
    memset(rte_xstats, 0xff, sizeof *rte_xstats * rte_xstats_len);
    rte_xstats_ret = rte_eth_xstats_get(dev->port_id, rte_xstats,
                                        rte_xstats_len);
    if (rte_xstats_ret > 0 && rte_xstats_ret <= rte_xstats_len) {
        netdev_dpdk_convert_xstats(stats, rte_xstats, rte_xstats_names,
                                   rte_xstats_len);
    } else {
        VLOG_WARN("Cannot get XSTATS values for port: "DPDK_PORT_ID_FMT,
                  dev->port_id);
    }

out:
    free(rte_xstats);
    free(rte_xstats_names);

    stats->rx_packets = rte_stats.ipackets;
    stats->tx_packets = rte_stats.opackets;
    stats->rx_bytes = rte_stats.ibytes;
    stats->tx_bytes = rte_stats.obytes;
    stats->rx_errors = rte_stats.ierrors;
    stats->tx_errors = rte_stats.oerrors;

    rte_spinlock_lock(&dev->stats_lock);
    stats->tx_dropped = dev->stats.tx_dropped;
    stats->rx_dropped = dev->stats.rx_dropped;
    rte_spinlock_unlock(&dev->stats_lock);

    /* These are the available DPDK counters for packets not received due to
     * local resource constraints in DPDK and NIC respectively. */
    stats->rx_dropped += rte_stats.rx_nombuf + rte_stats.imissed;
    stats->rx_missed_errors = rte_stats.imissed;

    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

static int
netdev_dpdk_get_custom_stats(const struct netdev *netdev,
                             struct netdev_custom_stats *custom_stats)
{

    uint32_t i;
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    int rte_xstats_ret;

    ovs_mutex_lock(&dev->mutex);

    if (netdev_dpdk_configure_xstats(dev)) {
        uint64_t *values = xcalloc(dev->rte_xstats_ids_size,
                                   sizeof(uint64_t));

        rte_xstats_ret =
                rte_eth_xstats_get_by_id(dev->port_id, dev->rte_xstats_ids,
                                         values, dev->rte_xstats_ids_size);

        if (rte_xstats_ret > 0 &&
            rte_xstats_ret <= dev->rte_xstats_ids_size) {

            custom_stats->size = rte_xstats_ret;
            custom_stats->counters =
                    (struct netdev_custom_counter *) xcalloc(rte_xstats_ret,
                            sizeof(struct netdev_custom_counter));

            for (i = 0; i < rte_xstats_ret; i++) {
                ovs_strlcpy(custom_stats->counters[i].name,
                            netdev_dpdk_get_xstat_name(dev,
                                                       dev->rte_xstats_ids[i]),
                            NETDEV_CUSTOM_STATS_NAME_SIZE);
                custom_stats->counters[i].value = values[i];
            }
        } else {
            VLOG_WARN("Cannot get XSTATS values for port: "DPDK_PORT_ID_FMT,
                      dev->port_id);
            custom_stats->counters = NULL;
            custom_stats->size = 0;
            /* Let's clear statistics cache, so it will be
             * reconfigured */
            netdev_dpdk_clear_xstats(dev);
        }

        free(values);
    }

    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

static int
netdev_dpdk_get_features(const struct netdev *netdev,
                         enum netdev_features *current,
                         enum netdev_features *advertised,
                         enum netdev_features *supported,
                         enum netdev_features *peer)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct rte_eth_link link;

    ovs_mutex_lock(&dev->mutex);
    link = dev->link;
    ovs_mutex_unlock(&dev->mutex);

    if (link.link_duplex == ETH_LINK_HALF_DUPLEX) {
        if (link.link_speed == ETH_SPEED_NUM_10M) {
            *current = NETDEV_F_10MB_HD;
        }
        if (link.link_speed == ETH_SPEED_NUM_100M) {
            *current = NETDEV_F_100MB_HD;
        }
        if (link.link_speed == ETH_SPEED_NUM_1G) {
            *current = NETDEV_F_1GB_HD;
        }
    } else if (link.link_duplex == ETH_LINK_FULL_DUPLEX) {
        if (link.link_speed == ETH_SPEED_NUM_10M) {
            *current = NETDEV_F_10MB_FD;
        }
        if (link.link_speed == ETH_SPEED_NUM_100M) {
            *current = NETDEV_F_100MB_FD;
        }
        if (link.link_speed == ETH_SPEED_NUM_1G) {
            *current = NETDEV_F_1GB_FD;
        }
        if (link.link_speed == ETH_SPEED_NUM_10G) {
            *current = NETDEV_F_10GB_FD;
        }
        if (link.link_speed == ETH_SPEED_NUM_40G) {
            *current = NETDEV_F_40GB_FD;
        }
    }

    if (link.link_autoneg) {
        *current |= NETDEV_F_AUTONEG;
    }

    *advertised = *supported = *peer = 0;

    return 0;
}

static struct ingress_policer *
netdev_dpdk_policer_construct(uint32_t rate, uint32_t burst)
{
    struct ingress_policer *policer = NULL;
    uint64_t rate_bytes;
    uint64_t burst_bytes;
    int err = 0;

    policer = xmalloc(sizeof *policer);
    rte_spinlock_init(&policer->policer_lock);

    /* rte_meter requires bytes so convert kbits rate and burst to bytes. */
    rate_bytes = rate * 1000ULL / 8;
    burst_bytes = burst * 1000ULL / 8;

    policer->app_srtcm_params.cir = rate_bytes;
    policer->app_srtcm_params.cbs = burst_bytes;
    policer->app_srtcm_params.ebs = 0;
    err = rte_meter_srtcm_config(&policer->in_policer,
                                    &policer->app_srtcm_params);
    if (err) {
        VLOG_ERR("Could not create rte meter for ingress policer");
        free(policer);
        return NULL;
    }

    return policer;
}

static int
netdev_dpdk_set_policing(struct netdev* netdev, uint32_t policer_rate,
                         uint32_t policer_burst)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct ingress_policer *policer;

    /* Force to 0 if no rate specified,
     * default to 8000 kbits if burst is 0,
     * else stick with user-specified value.
     */
    policer_burst = (!policer_rate ? 0
                     : !policer_burst ? 8000
                     : policer_burst);

    ovs_mutex_lock(&dev->mutex);

    policer = ovsrcu_get_protected(struct ingress_policer *,
                                    &dev->ingress_policer);

    if (dev->policer_rate == policer_rate &&
        dev->policer_burst == policer_burst) {
        /* Assume that settings haven't changed since we last set them. */
        ovs_mutex_unlock(&dev->mutex);
        return 0;
    }

    /* Destroy any existing ingress policer for the device if one exists */
    if (policer) {
        ovsrcu_postpone(free, policer);
    }

    if (policer_rate != 0) {
        policer = netdev_dpdk_policer_construct(policer_rate, policer_burst);
    } else {
        policer = NULL;
    }
    ovsrcu_set(&dev->ingress_policer, policer);
    dev->policer_rate = policer_rate;
    dev->policer_burst = policer_burst;
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

static int
netdev_dpdk_get_ifindex(const struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);
    /* Calculate hash from the netdev name. Ensure that ifindex is a 24-bit
     * postive integer to meet RFC 2863 recommendations.
     */
    int ifindex = hash_string(netdev->name, 0) % 0xfffffe + 1;
    ovs_mutex_unlock(&dev->mutex);

    return ifindex;
}

static int
netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);
    check_link_status(dev);
    *carrier = dev->link.link_status;

    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

static int
netdev_dpdk_vhost_get_carrier(const struct netdev *netdev, bool *carrier)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);

    if (is_vhost_running(dev)) {
        *carrier = 1;
    } else {
        *carrier = 0;
    }

    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

static long long int
netdev_dpdk_get_carrier_resets(const struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    long long int carrier_resets;

    ovs_mutex_lock(&dev->mutex);
    carrier_resets = dev->link_reset_cnt;
    ovs_mutex_unlock(&dev->mutex);

    return carrier_resets;
}

static int
netdev_dpdk_set_miimon(struct netdev *netdev OVS_UNUSED,
                       long long int interval OVS_UNUSED)
{
    return EOPNOTSUPP;
}

static int
netdev_dpdk_update_flags__(struct netdev_dpdk *dev,
                           enum netdev_flags off, enum netdev_flags on,
                           enum netdev_flags *old_flagsp)
    OVS_REQUIRES(dev->mutex)
{
    if ((off | on) & ~(NETDEV_UP | NETDEV_PROMISC)) {
        return EINVAL;
    }

    *old_flagsp = dev->flags;
    dev->flags |= on;
    dev->flags &= ~off;

    if (dev->flags == *old_flagsp) {
        return 0;
    }

    if (dev->type == DPDK_DEV_ETH) {
        if (dev->flags & NETDEV_PROMISC) {
            rte_eth_promiscuous_enable(dev->port_id);
        }

        netdev_change_seq_changed(&dev->up);
    } else {
        /* If DPDK_DEV_VHOST device's NETDEV_UP flag was changed and vhost is
         * running then change netdev's change_seq to trigger link state
         * update. */

        if ((NETDEV_UP & ((*old_flagsp ^ on) | (*old_flagsp ^ off)))
            && is_vhost_running(dev)) {
            netdev_change_seq_changed(&dev->up);

            /* Clear statistics if device is getting up. */
            if (NETDEV_UP & on) {
                rte_spinlock_lock(&dev->stats_lock);
                memset(&dev->stats, 0, sizeof dev->stats);
                rte_spinlock_unlock(&dev->stats_lock);
            }
        }
    }

    return 0;
}

static int
netdev_dpdk_update_flags(struct netdev *netdev,
                         enum netdev_flags off, enum netdev_flags on,
                         enum netdev_flags *old_flagsp)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    int error;

    ovs_mutex_lock(&dev->mutex);
    error = netdev_dpdk_update_flags__(dev, off, on, old_flagsp);
    ovs_mutex_unlock(&dev->mutex);

    return error;
}

static int
netdev_dpdk_vhost_user_get_status(const struct netdev *netdev,
                                  struct smap *args)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    ovs_mutex_lock(&dev->mutex);

    bool client_mode = dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT;
    smap_add_format(args, "mode", "%s", client_mode ? "client" : "server");

    int vid = netdev_dpdk_get_vid(dev);
    if (vid < 0) {
        smap_add_format(args, "status", "disconnected");
        ovs_mutex_unlock(&dev->mutex);
        return 0;
    } else {
        smap_add_format(args, "status", "connected");
    }

    char socket_name[PATH_MAX];
    if (!rte_vhost_get_ifname(vid, socket_name, PATH_MAX)) {
        smap_add_format(args, "socket", "%s", socket_name);
    }

    uint64_t features;
    if (!rte_vhost_get_negotiated_features(vid, &features)) {
        smap_add_format(args, "features", "0x%016"PRIx64, features);
    }

    uint16_t mtu;
    if (!rte_vhost_get_mtu(vid, &mtu)) {
        smap_add_format(args, "mtu", "%d", mtu);
    }

    int numa = rte_vhost_get_numa_node(vid);
    if (numa >= 0) {
        smap_add_format(args, "numa", "%d", numa);
    }

    uint16_t vring_num = rte_vhost_get_vring_num(vid);
    if (vring_num) {
        smap_add_format(args, "num_of_vrings", "%d", vring_num);
    }

    for (int i = 0; i < vring_num; i++) {
        struct rte_vhost_vring vring;

        rte_vhost_get_vhost_vring(vid, i, &vring);
        smap_add_nocopy(args, xasprintf("vring_%d_size", i),
                        xasprintf("%d", vring.size));
    }

    ovs_mutex_unlock(&dev->mutex);
    return 0;
}

static int
netdev_dpdk_get_status(const struct netdev *netdev, struct smap *args)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct rte_eth_dev_info dev_info;

    if (!rte_eth_dev_is_valid_port(dev->port_id)) {
        return ENODEV;
    }

    ovs_mutex_lock(&dev->mutex);
    rte_eth_dev_info_get(dev->port_id, &dev_info);
    ovs_mutex_unlock(&dev->mutex);

    smap_add_format(args, "port_no", DPDK_PORT_ID_FMT, dev->port_id);
    smap_add_format(args, "numa_id", "%d",
                           rte_eth_dev_socket_id(dev->port_id));
    smap_add_format(args, "driver_name", "%s", dev_info.driver_name);
    smap_add_format(args, "min_rx_bufsize", "%u", dev_info.min_rx_bufsize);
    smap_add_format(args, "max_rx_pktlen", "%u", dev->max_packet_len);
    smap_add_format(args, "max_rx_queues", "%u", dev_info.max_rx_queues);
    smap_add_format(args, "max_tx_queues", "%u", dev_info.max_tx_queues);
    smap_add_format(args, "max_mac_addrs", "%u", dev_info.max_mac_addrs);
    smap_add_format(args, "max_hash_mac_addrs", "%u",
                           dev_info.max_hash_mac_addrs);
    smap_add_format(args, "max_vfs", "%u", dev_info.max_vfs);
    smap_add_format(args, "max_vmdq_pools", "%u", dev_info.max_vmdq_pools);

    /* Querying the DPDK library for iftype may be done in future, pending
     * support; cf. RFC 3635 Section 3.2.4. */
    enum { IF_TYPE_ETHERNETCSMACD = 6 };

    smap_add_format(args, "if_type", "%"PRIu32, IF_TYPE_ETHERNETCSMACD);
    smap_add_format(args, "if_descr", "%s %s", rte_version(),
                                               dev_info.driver_name);

    if (dev_info.pci_dev) {
        smap_add_format(args, "pci-vendor_id", "0x%x",
                        dev_info.pci_dev->id.vendor_id);
        smap_add_format(args, "pci-device_id", "0x%x",
                        dev_info.pci_dev->id.device_id);
    }

    return 0;
}

static void
netdev_dpdk_set_admin_state__(struct netdev_dpdk *dev, bool admin_state)
    OVS_REQUIRES(dev->mutex)
{
    enum netdev_flags old_flags;

    if (admin_state) {
        netdev_dpdk_update_flags__(dev, 0, NETDEV_UP, &old_flags);
    } else {
        netdev_dpdk_update_flags__(dev, NETDEV_UP, 0, &old_flags);
    }
}

static void
netdev_dpdk_set_admin_state(struct unixctl_conn *conn, int argc,
                            const char *argv[], void *aux OVS_UNUSED)
{
    bool up;

    if (!strcasecmp(argv[argc - 1], "up")) {
        up = true;
    } else if ( !strcasecmp(argv[argc - 1], "down")) {
        up = false;
    } else {
        unixctl_command_reply_error(conn, "Invalid Admin State");
        return;
    }

    if (argc > 2) {
        struct netdev *netdev = netdev_from_name(argv[1]);

        if (netdev && is_dpdk_class(netdev->netdev_class)) {
            struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

            ovs_mutex_lock(&dev->mutex);
            netdev_dpdk_set_admin_state__(dev, up);
            ovs_mutex_unlock(&dev->mutex);

            netdev_close(netdev);
        } else {
            unixctl_command_reply_error(conn, "Not a DPDK Interface");
            netdev_close(netdev);
            return;
        }
    } else {
        struct netdev_dpdk *dev;

        ovs_mutex_lock(&dpdk_mutex);
        LIST_FOR_EACH (dev, list_node, &dpdk_list) {
            ovs_mutex_lock(&dev->mutex);
            netdev_dpdk_set_admin_state__(dev, up);
            ovs_mutex_unlock(&dev->mutex);
        }
        ovs_mutex_unlock(&dpdk_mutex);
    }
    unixctl_command_reply(conn, "OK");
}

static void
netdev_dpdk_detach(struct unixctl_conn *conn, int argc OVS_UNUSED,
                   const char *argv[], void *aux OVS_UNUSED)
{
    int ret;
    char *response;
    dpdk_port_t port_id;
    char devname[RTE_ETH_NAME_MAX_LEN];
    struct netdev_dpdk *dev;

    ovs_mutex_lock(&dpdk_mutex);

    if (rte_eth_dev_get_port_by_name(argv[1], &port_id)) {
        response = xasprintf("Device '%s' not found in DPDK", argv[1]);
        goto error;
    }

    dev = netdev_dpdk_lookup_by_port_id(port_id);
    if (dev) {
        response = xasprintf("Device '%s' is being used by interface '%s'. "
                             "Remove it before detaching",
                             argv[1], netdev_get_name(&dev->up));
        goto error;
    }

    rte_eth_dev_close(port_id);

    ret = rte_eth_dev_detach(port_id, devname);
    if (ret < 0) {
        response = xasprintf("Device '%s' can not be detached", argv[1]);
        goto error;
    }

    response = xasprintf("Device '%s' has been detached", argv[1]);

    ovs_mutex_unlock(&dpdk_mutex);
    unixctl_command_reply(conn, response);
    free(response);
    return;

error:
    ovs_mutex_unlock(&dpdk_mutex);
    unixctl_command_reply_error(conn, response);
    free(response);
}

static void
netdev_dpdk_get_mempool_info(struct unixctl_conn *conn,
                             int argc, const char *argv[],
                             void *aux OVS_UNUSED)
{
    size_t size;
    FILE *stream;
    char *response = NULL;
    struct netdev *netdev = NULL;

    if (argc == 2) {
        netdev = netdev_from_name(argv[1]);
        if (!netdev || !is_dpdk_class(netdev->netdev_class)) {
            unixctl_command_reply_error(conn, "Not a DPDK Interface");
            goto out;
        }
    }

    stream = open_memstream(&response, &size);
    if (!stream) {
        response = xasprintf("Unable to open memstream: %s.",
                             ovs_strerror(errno));
        unixctl_command_reply_error(conn, response);
        goto out;
    }

    if (netdev) {
        struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

        ovs_mutex_lock(&dev->mutex);
        ovs_mutex_lock(&dpdk_mp_mutex);

        rte_mempool_dump(stream, dev->dpdk_mp->mp);

        ovs_mutex_unlock(&dpdk_mp_mutex);
        ovs_mutex_unlock(&dev->mutex);
    } else {
        ovs_mutex_lock(&dpdk_mp_mutex);
        rte_mempool_list_dump(stream);
        ovs_mutex_unlock(&dpdk_mp_mutex);
    }

    fclose(stream);

    unixctl_command_reply(conn, response);
out:
    free(response);
    netdev_close(netdev);
}

/*
 * Set virtqueue flags so that we do not receive interrupts.
 */
static void
set_irq_status(int vid)
{
    uint32_t i;

    for (i = 0; i < rte_vhost_get_vring_num(vid); i++) {
        rte_vhost_enable_guest_notification(vid, i, 0);
    }
}

/*
 * Fixes mapping for vhost-user tx queues. Must be called after each
 * enabling/disabling of queues and n_txq modifications.
 */
static void
netdev_dpdk_remap_txqs(struct netdev_dpdk *dev)
    OVS_REQUIRES(dev->mutex)
{
    int *enabled_queues, n_enabled = 0;
    int i, k, total_txqs = dev->up.n_txq;

    enabled_queues = xcalloc(total_txqs, sizeof *enabled_queues);

    for (i = 0; i < total_txqs; i++) {
        /* Enabled queues always mapped to themselves. */
        if (dev->tx_q[i].map == i) {
            enabled_queues[n_enabled++] = i;
        }
    }

    if (n_enabled == 0 && total_txqs != 0) {
        enabled_queues[0] = OVS_VHOST_QUEUE_DISABLED;
        n_enabled = 1;
    }

    k = 0;
    for (i = 0; i < total_txqs; i++) {
        if (dev->tx_q[i].map != i) {
            dev->tx_q[i].map = enabled_queues[k];
            k = (k + 1) % n_enabled;
        }
    }

    VLOG_DBG("TX queue mapping for %s\n", dev->vhost_id);
    for (i = 0; i < total_txqs; i++) {
        VLOG_DBG("%2d --> %2d", i, dev->tx_q[i].map);
    }

    free(enabled_queues);
}

/*
 * A new virtio-net device is added to a vhost port.
 */
static int
new_device(int vid)
{
    struct netdev_dpdk *dev;
    bool exists = false;
    int newnode = 0;
    char ifname[IF_NAME_SZ];

    rte_vhost_get_ifname(vid, ifname, sizeof ifname);

    ovs_mutex_lock(&dpdk_mutex);
    /* Add device to the vhost port with the same name as that passed down. */
    LIST_FOR_EACH(dev, list_node, &dpdk_list) {
        ovs_mutex_lock(&dev->mutex);
        if (strncmp(ifname, dev->vhost_id, IF_NAME_SZ) == 0) {
            uint32_t qp_num = rte_vhost_get_vring_num(vid)/VIRTIO_QNUM;

            /* Get NUMA information */
            newnode = rte_vhost_get_numa_node(vid);
            if (newnode == -1) {
#ifdef VHOST_NUMA
                VLOG_INFO("Error getting NUMA info for vHost Device '%s'",
                          ifname);
#endif
                newnode = dev->socket_id;
            }

            if (dev->requested_n_txq != qp_num
                || dev->requested_n_rxq != qp_num
                || dev->requested_socket_id != newnode) {
                dev->requested_socket_id = newnode;
                dev->requested_n_rxq = qp_num;
                dev->requested_n_txq = qp_num;
                netdev_request_reconfigure(&dev->up);
            } else {
                /* Reconfiguration not required. */
                dev->vhost_reconfigured = true;
            }

            ovsrcu_index_set(&dev->vid, vid);
            exists = true;

            /* Disable notifications. */
            set_irq_status(vid);
            netdev_change_seq_changed(&dev->up);
            ovs_mutex_unlock(&dev->mutex);
            break;
        }
        ovs_mutex_unlock(&dev->mutex);
    }
    ovs_mutex_unlock(&dpdk_mutex);

    if (!exists) {
        VLOG_INFO("vHost Device '%s' can't be added - name not found", ifname);

        return -1;
    }

    VLOG_INFO("vHost Device '%s' has been added on numa node %i",
              ifname, newnode);

    return 0;
}

/* Clears mapping for all available queues of vhost interface. */
static void
netdev_dpdk_txq_map_clear(struct netdev_dpdk *dev)
    OVS_REQUIRES(dev->mutex)
{
    int i;

    for (i = 0; i < dev->up.n_txq; i++) {
        dev->tx_q[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
    }
}

/*
 * Remove a virtio-net device from the specific vhost port.  Use dev->remove
 * flag to stop any more packets from being sent or received to/from a VM and
 * ensure all currently queued packets have been sent/received before removing
 *  the device.
 */
static void
destroy_device(int vid)
{
    struct netdev_dpdk *dev;
    bool exists = false;
    char ifname[IF_NAME_SZ];

    rte_vhost_get_ifname(vid, ifname, sizeof ifname);

    ovs_mutex_lock(&dpdk_mutex);
    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
        if (netdev_dpdk_get_vid(dev) == vid) {

            ovs_mutex_lock(&dev->mutex);
            dev->vhost_reconfigured = false;
            ovsrcu_index_set(&dev->vid, -1);
            netdev_dpdk_txq_map_clear(dev);

            netdev_change_seq_changed(&dev->up);
            ovs_mutex_unlock(&dev->mutex);
            exists = true;
            break;
        }
    }

    ovs_mutex_unlock(&dpdk_mutex);

    if (exists) {
        /*
         * Wait for other threads to quiesce after setting the 'virtio_dev'
         * to NULL, before returning.
         */
        ovsrcu_synchronize();
        /*
         * As call to ovsrcu_synchronize() will end the quiescent state,
         * put thread back into quiescent state before returning.
         */
        ovsrcu_quiesce_start();
        VLOG_INFO("vHost Device '%s' has been removed", ifname);
    } else {
        VLOG_INFO("vHost Device '%s' not found", ifname);
    }
}

static int
vring_state_changed(int vid, uint16_t queue_id, int enable)
{
    struct netdev_dpdk *dev;
    bool exists = false;
    int qid = queue_id / VIRTIO_QNUM;
    char ifname[IF_NAME_SZ];

    rte_vhost_get_ifname(vid, ifname, sizeof ifname);

    if (queue_id % VIRTIO_QNUM == VIRTIO_TXQ) {
        return 0;
    }

    ovs_mutex_lock(&dpdk_mutex);
    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
        ovs_mutex_lock(&dev->mutex);
        if (strncmp(ifname, dev->vhost_id, IF_NAME_SZ) == 0) {
            if (enable) {
                dev->tx_q[qid].map = qid;
            } else {
                dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED;
            }
            netdev_dpdk_remap_txqs(dev);
            exists = true;
            ovs_mutex_unlock(&dev->mutex);
            break;
        }
        ovs_mutex_unlock(&dev->mutex);
    }
    ovs_mutex_unlock(&dpdk_mutex);

    if (exists) {
        VLOG_INFO("State of queue %d ( tx_qid %d ) of vhost device '%s'"
                  "changed to \'%s\'", queue_id, qid, ifname,
                  (enable == 1) ? "enabled" : "disabled");
    } else {
        VLOG_INFO("vHost Device '%s' not found", ifname);
        return -1;
    }

    return 0;
}

/*
 * Retrieve the DPDK virtio device ID (vid) associated with a vhostuser
 * or vhostuserclient netdev.
 *
 * Returns a value greater or equal to zero for a valid vid or '-1' if
 * there is no valid vid associated. A vid of '-1' must not be used in
 * rte_vhost_ APi calls.
 *
 * Once obtained and validated, a vid can be used by a PMD for multiple
 * subsequent rte_vhost API calls until the PMD quiesces. A PMD should
 * not fetch the vid again for each of a series of API calls.
 */

int
netdev_dpdk_get_vid(const struct netdev_dpdk *dev)
{
    return ovsrcu_index_get(&dev->vid);
}

struct ingress_policer *
netdev_dpdk_get_ingress_policer(const struct netdev_dpdk *dev)
{
    return ovsrcu_get(struct ingress_policer *, &dev->ingress_policer);
}

static int
netdev_dpdk_class_init(void)
{
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;

    /* This function can be called for different classes.  The initialization
     * needs to be done only once */
    if (ovsthread_once_start(&once)) {
        ovs_thread_create("dpdk_watchdog", dpdk_watchdog, NULL);
        unixctl_command_register("netdev-dpdk/set-admin-state",
                                 "[netdev] up|down", 1, 2,
                                 netdev_dpdk_set_admin_state, NULL);

        unixctl_command_register("netdev-dpdk/detach",
                                 "pci address of device", 1, 1,
                                 netdev_dpdk_detach, NULL);

        unixctl_command_register("netdev-dpdk/get-mempool-info",
                                 "[netdev]", 0, 1,
                                 netdev_dpdk_get_mempool_info, NULL);

        ovsthread_once_done(&once);
    }

    return 0;
}


/* Client Rings */

static int
dpdk_ring_create(const char dev_name[], unsigned int port_no,
                 dpdk_port_t *eth_port_id)
{
    struct dpdk_ring *ring_pair;
    char *ring_name;
    int port_id;

    ring_pair = dpdk_rte_mzalloc(sizeof *ring_pair);
    if (!ring_pair) {
        return ENOMEM;
    }

    /* XXX: Add support for multiquque ring. */
    ring_name = xasprintf("%s_tx", dev_name);

    /* Create single producer tx ring, netdev does explicit locking. */
    ring_pair->cring_tx = rte_ring_create(ring_name, DPDK_RING_SIZE, SOCKET0,
                                        RING_F_SP_ENQ);
    free(ring_name);
    if (ring_pair->cring_tx == NULL) {
        rte_free(ring_pair);
        return ENOMEM;
    }

    ring_name = xasprintf("%s_rx", dev_name);

    /* Create single consumer rx ring, netdev does explicit locking. */
    ring_pair->cring_rx = rte_ring_create(ring_name, DPDK_RING_SIZE, SOCKET0,
                                        RING_F_SC_DEQ);
    free(ring_name);
    if (ring_pair->cring_rx == NULL) {
        rte_free(ring_pair);
        return ENOMEM;
    }

    port_id = rte_eth_from_rings(dev_name, &ring_pair->cring_rx, 1,
                                 &ring_pair->cring_tx, 1, SOCKET0);

    if (port_id < 0) {
        rte_free(ring_pair);
        return ENODEV;
    }

    ring_pair->user_port_id = port_no;
    ring_pair->eth_port_id = port_id;
    *eth_port_id = port_id;

    ovs_list_push_back(&dpdk_ring_list, &ring_pair->list_node);

    return 0;
}

static int
dpdk_ring_open(const char dev_name[], dpdk_port_t *eth_port_id)
    OVS_REQUIRES(dpdk_mutex)
{
    struct dpdk_ring *ring_pair;
    unsigned int port_no;
    int err = 0;

    /* Names always start with "dpdkr" */
    err = dpdk_dev_parse_name(dev_name, "dpdkr", &port_no);
    if (err) {
        return err;
    }

    /* Look through our list to find the device */
    LIST_FOR_EACH (ring_pair, list_node, &dpdk_ring_list) {
         if (ring_pair->user_port_id == port_no) {
            VLOG_INFO("Found dpdk ring device %s:", dev_name);
            /* Really all that is needed */
            *eth_port_id = ring_pair->eth_port_id;
            return 0;
         }
    }
    /* Need to create the device rings */
    return dpdk_ring_create(dev_name, port_no, eth_port_id);
}

static int
netdev_dpdk_ring_send(struct netdev *netdev, int qid,
                      struct dp_packet_batch *batch, bool concurrent_txq)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct dp_packet *packet;

    /* When using 'dpdkr' and sending to a DPDK ring, we want to ensure that
     * the rss hash field is clear. This is because the same mbuf may be
     * modified by the consumer of the ring and return into the datapath
     * without recalculating the RSS hash. */
    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
        dp_packet_mbuf_rss_flag_reset(packet);
    }

    netdev_dpdk_send__(dev, qid, batch, concurrent_txq);
    return 0;
}

static int
netdev_dpdk_ring_construct(struct netdev *netdev)
{
    dpdk_port_t port_no = 0;
    int err = 0;

    ovs_mutex_lock(&dpdk_mutex);

    err = dpdk_ring_open(netdev->name, &port_no);
    if (err) {
        goto unlock_dpdk;
    }

    err = common_construct(netdev, port_no, DPDK_DEV_ETH,
                           rte_eth_dev_socket_id(port_no));
unlock_dpdk:
    ovs_mutex_unlock(&dpdk_mutex);
    return err;
}

/* QoS Functions */

/*
 * Initialize QoS configuration operations.
 */
static void
qos_conf_init(struct qos_conf *conf, const struct dpdk_qos_ops *ops)
{
    conf->ops = ops;
    rte_spinlock_init(&conf->lock);
}

/*
 * Search existing QoS operations in qos_ops and compare each set of
 * operations qos_name to name. Return a dpdk_qos_ops pointer to a match,
 * else return NULL
 */
static const struct dpdk_qos_ops *
qos_lookup_name(const char *name)
{
    const struct dpdk_qos_ops *const *opsp;

    for (opsp = qos_confs; *opsp != NULL; opsp++) {
        const struct dpdk_qos_ops *ops = *opsp;
        if (!strcmp(name, ops->qos_name)) {
            return ops;
        }
    }
    return NULL;
}

static int
netdev_dpdk_get_qos_types(const struct netdev *netdev OVS_UNUSED,
                           struct sset *types)
{
    const struct dpdk_qos_ops *const *opsp;

    for (opsp = qos_confs; *opsp != NULL; opsp++) {
        const struct dpdk_qos_ops *ops = *opsp;
        if (ops->qos_construct && ops->qos_name[0] != '\0') {
            sset_add(types, ops->qos_name);
        }
    }
    return 0;
}

static int
netdev_dpdk_get_qos(const struct netdev *netdev,
                    const char **typep, struct smap *details)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct qos_conf *qos_conf;
    int error = 0;

    ovs_mutex_lock(&dev->mutex);
    qos_conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);
    if (qos_conf) {
        *typep = qos_conf->ops->qos_name;
        error = (qos_conf->ops->qos_get
                 ? qos_conf->ops->qos_get(qos_conf, details): 0);
    } else {
        /* No QoS configuration set, return an empty string */
        *typep = "";
    }
    ovs_mutex_unlock(&dev->mutex);

    return error;
}

static int
netdev_dpdk_set_qos(struct netdev *netdev, const char *type,
                    const struct smap *details)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    const struct dpdk_qos_ops *new_ops = NULL;
    struct qos_conf *qos_conf, *new_qos_conf = NULL;
    int error = 0;

    ovs_mutex_lock(&dev->mutex);

    qos_conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);

    new_ops = qos_lookup_name(type);

    if (!new_ops || !new_ops->qos_construct) {
        new_qos_conf = NULL;
        if (type && type[0]) {
            error = EOPNOTSUPP;
        }
    } else if (qos_conf && qos_conf->ops == new_ops
               && qos_conf->ops->qos_is_equal(qos_conf, details)) {
        new_qos_conf = qos_conf;
    } else {
        error = new_ops->qos_construct(details, &new_qos_conf);
    }

    if (error) {
        VLOG_ERR("Failed to set QoS type %s on port %s: %s",
                 type, netdev->name, rte_strerror(error));
    }

    if (new_qos_conf != qos_conf) {
        ovsrcu_set(&dev->qos_conf, new_qos_conf);
        if (qos_conf) {
            ovsrcu_postpone(qos_conf->ops->qos_destruct, qos_conf);
        }
    }

    ovs_mutex_unlock(&dev->mutex);

    return error;
}

/* egress-policer details */

struct egress_policer {
    struct qos_conf qos_conf;
    struct rte_meter_srtcm_params app_srtcm_params;
    struct rte_meter_srtcm egress_meter;
};

static void
egress_policer_details_to_param(const struct smap *details,
                                struct rte_meter_srtcm_params *params)
{
    memset(params, 0, sizeof *params);
    params->cir = smap_get_ullong(details, "cir", 0);
    params->cbs = smap_get_ullong(details, "cbs", 0);
    params->ebs = 0;
}

static int
egress_policer_qos_construct(const struct smap *details,
                             struct qos_conf **conf)
{
    struct egress_policer *policer;
    int err = 0;

    policer = xmalloc(sizeof *policer);
    qos_conf_init(&policer->qos_conf, &egress_policer_ops);
    egress_policer_details_to_param(details, &policer->app_srtcm_params);
    err = rte_meter_srtcm_config(&policer->egress_meter,
                                 &policer->app_srtcm_params);
    if (!err) {
        *conf = &policer->qos_conf;
    } else {
        free(policer);
        *conf = NULL;
        err = -err;
    }

    return err;
}

static void
egress_policer_qos_destruct(struct qos_conf *conf)
{
    struct egress_policer *policer = CONTAINER_OF(conf, struct egress_policer,
                                                  qos_conf);
    free(policer);
}

static int
egress_policer_qos_get(const struct qos_conf *conf, struct smap *details)
{
    struct egress_policer *policer =
        CONTAINER_OF(conf, struct egress_policer, qos_conf);

    smap_add_format(details, "cir", "%"PRIu64, policer->app_srtcm_params.cir);
    smap_add_format(details, "cbs", "%"PRIu64, policer->app_srtcm_params.cbs);

    return 0;
}

static bool
egress_policer_qos_is_equal(const struct qos_conf *conf,
                            const struct smap *details)
{
    struct egress_policer *policer =
        CONTAINER_OF(conf, struct egress_policer, qos_conf);
    struct rte_meter_srtcm_params params;

    egress_policer_details_to_param(details, &params);

    return !memcmp(&params, &policer->app_srtcm_params, sizeof params);
}

static int
egress_policer_run(struct qos_conf *conf, struct rte_mbuf **pkts, int pkt_cnt,
                   bool should_steal)
{
    int cnt = 0;
    struct egress_policer *policer =
        CONTAINER_OF(conf, struct egress_policer, qos_conf);

    cnt = netdev_dpdk_policer_run(&policer->egress_meter, pkts,
                                  pkt_cnt, should_steal);

    return cnt;
}

static const struct dpdk_qos_ops egress_policer_ops = {
    "egress-policer",    /* qos_name */
    egress_policer_qos_construct,
    egress_policer_qos_destruct,
    egress_policer_qos_get,
    egress_policer_qos_is_equal,
    egress_policer_run
};

static int
netdev_dpdk_reconfigure(struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    int err = 0;

    ovs_mutex_lock(&dev->mutex);

    if (netdev->n_txq == dev->requested_n_txq
        && netdev->n_rxq == dev->requested_n_rxq
        && dev->mtu == dev->requested_mtu
        && dev->lsc_interrupt_mode == dev->requested_lsc_interrupt_mode
        && dev->rxq_size == dev->requested_rxq_size
        && dev->txq_size == dev->requested_txq_size
        && dev->socket_id == dev->requested_socket_id
        && dev->started) {
        /* Reconfiguration is unnecessary */

        goto out;
    }

    rte_eth_dev_stop(dev->port_id);
    dev->started = false;

    err = netdev_dpdk_mempool_configure(dev);
    if (err && err != EEXIST) {
        goto out;
    }

    dev->lsc_interrupt_mode = dev->requested_lsc_interrupt_mode;

    netdev->n_txq = dev->requested_n_txq;
    netdev->n_rxq = dev->requested_n_rxq;

    dev->rxq_size = dev->requested_rxq_size;
    dev->txq_size = dev->requested_txq_size;

    rte_free(dev->tx_q);
    err = dpdk_eth_dev_init(dev);
    dev->tx_q = netdev_dpdk_alloc_txq(netdev->n_txq);
    if (!dev->tx_q) {
        err = ENOMEM;
    }

    netdev_change_seq_changed(netdev);

out:
    ovs_mutex_unlock(&dev->mutex);
    return err;
}

static int
dpdk_vhost_reconfigure_helper(struct netdev_dpdk *dev)
    OVS_REQUIRES(dev->mutex)
{
    dev->up.n_txq = dev->requested_n_txq;
    dev->up.n_rxq = dev->requested_n_rxq;
    int err;

    /* Enable TX queue 0 by default if it wasn't disabled. */
    if (dev->tx_q[0].map == OVS_VHOST_QUEUE_MAP_UNKNOWN) {
        dev->tx_q[0].map = 0;
    }

    netdev_dpdk_remap_txqs(dev);

    err = netdev_dpdk_mempool_configure(dev);
    if (!err) {
        /* A new mempool was created or re-used. */
        netdev_change_seq_changed(&dev->up);
    } else if (err != EEXIST){
        return err;
    }
    if (netdev_dpdk_get_vid(dev) >= 0) {
        if (dev->vhost_reconfigured == false) {
            dev->vhost_reconfigured = true;
            /* Carrier status may need updating. */
            netdev_change_seq_changed(&dev->up);
        }
    }

    return 0;
}

static int
netdev_dpdk_vhost_reconfigure(struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    int err;

    ovs_mutex_lock(&dev->mutex);
    err = dpdk_vhost_reconfigure_helper(dev);
    ovs_mutex_unlock(&dev->mutex);

    return err;
}

static int
netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    int err;
    uint64_t vhost_flags = 0;
    bool zc_enabled;

    ovs_mutex_lock(&dev->mutex);

    /* Configure vHost client mode if requested and if the following criteria
     * are met:
     *  1. Device hasn't been registered yet.
     *  2. A path has been specified.
     */
    if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)
            && strlen(dev->vhost_id)) {
        /* Register client-mode device. */
        vhost_flags |= RTE_VHOST_USER_CLIENT;

        /* Enable IOMMU support, if explicitly requested. */
        if (dpdk_vhost_iommu_enabled()) {
            vhost_flags |= RTE_VHOST_USER_IOMMU_SUPPORT;
        }

        zc_enabled = dev->vhost_driver_flags
                     & RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
        /* Enable zero copy flag, if requested */
        if (zc_enabled) {
            vhost_flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
        }

        err = rte_vhost_driver_register(dev->vhost_id, vhost_flags);
        if (err) {
            VLOG_ERR("vhost-user device setup failure for device %s\n",
                     dev->vhost_id);
            goto unlock;
        } else {
            /* Configuration successful */
            dev->vhost_driver_flags |= vhost_flags;
            VLOG_INFO("vHost User device '%s' created in 'client' mode, "
                      "using client socket '%s'",
                      dev->up.name, dev->vhost_id);
            if (zc_enabled) {
                VLOG_INFO("Zero copy enabled for vHost port %s", dev->up.name);
            }
        }

        err = rte_vhost_driver_callback_register(dev->vhost_id,
                                                 &virtio_net_device_ops);
        if (err) {
            VLOG_ERR("rte_vhost_driver_callback_register failed for "
                     "vhost user client port: %s\n", dev->up.name);
            goto unlock;
        }

        err = rte_vhost_driver_disable_features(dev->vhost_id,
                                    1ULL << VIRTIO_NET_F_HOST_TSO4
                                    | 1ULL << VIRTIO_NET_F_HOST_TSO6
                                    | 1ULL << VIRTIO_NET_F_CSUM);
        if (err) {
            VLOG_ERR("rte_vhost_driver_disable_features failed for vhost user "
                     "client port: %s\n", dev->up.name);
            goto unlock;
        }

        err = rte_vhost_driver_start(dev->vhost_id);
        if (err) {
            VLOG_ERR("rte_vhost_driver_start failed for vhost user "
                     "client port: %s\n", dev->up.name);
            goto unlock;
        }
    }

    err = dpdk_vhost_reconfigure_helper(dev);

unlock:
    ovs_mutex_unlock(&dev->mutex);

    return err;
}


/* Find rte_flow with @ufid */
static struct rte_flow *
ufid_to_rte_flow_find(const ovs_u128 *ufid) {
    size_t hash = hash_bytes(ufid, sizeof(*ufid), 0);
    struct ufid_to_rte_flow_data *data;

    CMAP_FOR_EACH_WITH_HASH (data, node, hash, &ufid_to_rte_flow) {
        if (ovs_u128_equals(*ufid, data->ufid)) {
            return data->rte_flow;
        }
    }

    return NULL;
}

static inline void
ufid_to_rte_flow_associate(const ovs_u128 *ufid,
                           struct rte_flow *rte_flow) {
    size_t hash = hash_bytes(ufid, sizeof(*ufid), 0);
    struct ufid_to_rte_flow_data *data = xzalloc(sizeof(*data));

    /*
     * We should not simply overwrite an existing rte flow.
     * We should have deleted it first before re-adding it.
     * Thus, if following assert triggers, something is wrong:
     * the rte_flow is not destroyed.
     */
    ovs_assert(ufid_to_rte_flow_find(ufid) == NULL);

    data->ufid = *ufid;
    data->rte_flow = rte_flow;

    cmap_insert(&ufid_to_rte_flow,
                CONST_CAST(struct cmap_node *, &data->node), hash);
}

static inline void
ufid_to_rte_flow_disassociate(const ovs_u128 *ufid) {
    size_t hash = hash_bytes(ufid, sizeof(*ufid), 0);
    struct ufid_to_rte_flow_data *data;

    CMAP_FOR_EACH_WITH_HASH (data, node, hash, &ufid_to_rte_flow) {
        if (ovs_u128_equals(*ufid, data->ufid)) {
            cmap_remove(&ufid_to_rte_flow,
                        CONST_CAST(struct cmap_node *, &data->node), hash);
            free(data);
            return;
        }
    }

    VLOG_WARN("ufid "UUID_FMT" is not associated with an rte flow\n",
              UUID_ARGS((struct uuid *)ufid));
}

/*
 * To avoid individual xrealloc calls for each new element, a 'curent_max'
 * is used to keep track of current allocated number of elements. Starts
 * by 8 and doubles on each xrealloc call
 */
struct flow_patterns {
    struct rte_flow_item *items;
    int cnt;
    int current_max;
};

struct flow_actions {
    struct rte_flow_action *actions;
    int cnt;
    int current_max;
};

static void
dump_flow_pattern(struct rte_flow_item *item)
{
    if (item->type == RTE_FLOW_ITEM_TYPE_ETH) {
        const struct rte_flow_item_eth *eth_spec = item->spec;
        const struct rte_flow_item_eth *eth_mask = item->mask;

        VLOG_DBG("rte flow eth pattern:\n");
        if (eth_spec) {
            VLOG_DBG("  Spec: src="ETH_ADDR_FMT", dst="ETH_ADDR_FMT", "
                     "type=0x%04" PRIx16"\n",
                     ETH_ADDR_BYTES_ARGS(eth_spec->src.addr_bytes),
                     ETH_ADDR_BYTES_ARGS(eth_spec->dst.addr_bytes),
                     ntohs(eth_spec->type));
        } else {
            VLOG_DBG("  Spec = null\n");
        }
        if (eth_mask) {
            VLOG_DBG("  Mask: src="ETH_ADDR_FMT", dst="ETH_ADDR_FMT", "
                     "type=0x%04"PRIx16"\n",
                     ETH_ADDR_BYTES_ARGS(eth_mask->src.addr_bytes),
                     ETH_ADDR_BYTES_ARGS(eth_mask->dst.addr_bytes),
                     eth_mask->type);
        } else {
            VLOG_DBG("  Mask = null\n");
        }
    }

    if (item->type == RTE_FLOW_ITEM_TYPE_VLAN) {
        const struct rte_flow_item_vlan *vlan_spec = item->spec;
        const struct rte_flow_item_vlan *vlan_mask = item->mask;

        VLOG_DBG("rte flow vlan pattern:\n");
        if (vlan_spec) {
            VLOG_DBG("  Spec: tpid=0x%"PRIx16", tci=0x%"PRIx16"\n",
                     ntohs(vlan_spec->tpid), ntohs(vlan_spec->tci));
        } else {
            VLOG_DBG("  Spec = null\n");
        }

        if (vlan_mask) {
            VLOG_DBG("  Mask: tpid=0x%"PRIx16", tci=0x%"PRIx16"\n",
                     vlan_mask->tpid, vlan_mask->tci);
        } else {
            VLOG_DBG("  Mask = null\n");
        }
    }

    if (item->type == RTE_FLOW_ITEM_TYPE_IPV4) {
        const struct rte_flow_item_ipv4 *ipv4_spec = item->spec;
        const struct rte_flow_item_ipv4 *ipv4_mask = item->mask;

        VLOG_DBG("rte flow ipv4 pattern:\n");
        if (ipv4_spec) {
            VLOG_DBG("  Spec: tos=0x%"PRIx8", ttl=%"PRIx8", proto=0x%"PRIx8
                     ", src="IP_FMT", dst="IP_FMT"\n",
                     ipv4_spec->hdr.type_of_service,
                     ipv4_spec->hdr.time_to_live,
                     ipv4_spec->hdr.next_proto_id,
                     IP_ARGS(ipv4_spec->hdr.src_addr),
                     IP_ARGS(ipv4_spec->hdr.dst_addr));
        } else {
            VLOG_DBG("  Spec = null\n");
        }
        if (ipv4_mask) {
            VLOG_DBG("  Mask: tos=0x%"PRIx8", ttl=%"PRIx8", proto=0x%"PRIx8
                     ", src="IP_FMT", dst="IP_FMT"\n",
                     ipv4_mask->hdr.type_of_service,
                     ipv4_mask->hdr.time_to_live,
                     ipv4_mask->hdr.next_proto_id,
                     IP_ARGS(ipv4_mask->hdr.src_addr),
                     IP_ARGS(ipv4_mask->hdr.dst_addr));
        } else {
            VLOG_DBG("  Mask = null\n");
        }
    }

    if (item->type == RTE_FLOW_ITEM_TYPE_UDP) {
        const struct rte_flow_item_udp *udp_spec = item->spec;
        const struct rte_flow_item_udp *udp_mask = item->mask;

        VLOG_DBG("rte flow udp pattern:\n");
        if (udp_spec) {
            VLOG_DBG("  Spec: src_port=%"PRIu16", dst_port=%"PRIu16"\n",
                     ntohs(udp_spec->hdr.src_port),
                     ntohs(udp_spec->hdr.dst_port));
        } else {
            VLOG_DBG("  Spec = null\n");
        }
        if (udp_mask) {
            VLOG_DBG("  Mask: src_port=0x%"PRIx16", dst_port=0x%"PRIx16"\n",
                     udp_mask->hdr.src_port,
                     udp_mask->hdr.dst_port);
        } else {
            VLOG_DBG("  Mask = null\n");
        }
    }

    if (item->type == RTE_FLOW_ITEM_TYPE_SCTP) {
        const struct rte_flow_item_sctp *sctp_spec = item->spec;
        const struct rte_flow_item_sctp *sctp_mask = item->mask;

        VLOG_DBG("rte flow sctp pattern:\n");
        if (sctp_spec) {
            VLOG_DBG("  Spec: src_port=%"PRIu16", dst_port=%"PRIu16"\n",
                     ntohs(sctp_spec->hdr.src_port),
                     ntohs(sctp_spec->hdr.dst_port));
        } else {
            VLOG_DBG("  Spec = null\n");
        }
        if (sctp_mask) {
            VLOG_DBG("  Mask: src_port=0x%"PRIx16", dst_port=0x%"PRIx16"\n",
                     sctp_mask->hdr.src_port,
                     sctp_mask->hdr.dst_port);
        } else {
            VLOG_DBG("  Mask = null\n");
        }
    }

    if (item->type == RTE_FLOW_ITEM_TYPE_ICMP) {
        const struct rte_flow_item_icmp *icmp_spec = item->spec;
        const struct rte_flow_item_icmp *icmp_mask = item->mask;

        VLOG_DBG("rte flow icmp pattern:\n");
        if (icmp_spec) {
            VLOG_DBG("  Spec: icmp_type=%"PRIu8", icmp_code=%"PRIu8"\n",
                     icmp_spec->hdr.icmp_type,
                     icmp_spec->hdr.icmp_code);
        } else {
            VLOG_DBG("  Spec = null\n");
        }
        if (icmp_mask) {
            VLOG_DBG("  Mask: icmp_type=0x%"PRIx8", icmp_code=0x%"PRIx8"\n",
                     icmp_spec->hdr.icmp_type,
                     icmp_spec->hdr.icmp_code);
        } else {
            VLOG_DBG("  Mask = null\n");
        }
    }

    if (item->type == RTE_FLOW_ITEM_TYPE_TCP) {
        const struct rte_flow_item_tcp *tcp_spec = item->spec;
        const struct rte_flow_item_tcp *tcp_mask = item->mask;

        VLOG_DBG("rte flow tcp pattern:\n");
        if (tcp_spec) {
            VLOG_DBG("  Spec: src_port=%"PRIu16", dst_port=%"PRIu16
                     ", data_off=0x%"PRIx8", tcp_flags=0x%"PRIx8"\n",
                     ntohs(tcp_spec->hdr.src_port),
                     ntohs(tcp_spec->hdr.dst_port),
                     tcp_spec->hdr.data_off,
                     tcp_spec->hdr.tcp_flags);
        } else {
            VLOG_DBG("  Spec = null\n");
        }
        if (tcp_mask) {
            VLOG_DBG("  Mask: src_port=%"PRIx16", dst_port=%"PRIx16
                     ", data_off=0x%"PRIx8", tcp_flags=0x%"PRIx8"\n",
                     tcp_mask->hdr.src_port,
                     tcp_mask->hdr.dst_port,
                     tcp_mask->hdr.data_off,
                     tcp_mask->hdr.tcp_flags);
        } else {
            VLOG_DBG("  Mask = null\n");
        }
    }
}

static void
add_flow_pattern(struct flow_patterns *patterns, enum rte_flow_item_type type,
                 const void *spec, const void *mask) {
    int cnt = patterns->cnt;

    if (cnt == 0) {
        patterns->current_max = 8;
        patterns->items = xcalloc(patterns->current_max,
                                  sizeof(struct rte_flow_item));
    } else if (cnt == patterns->current_max) {
        patterns->current_max *= 2;
        patterns->items = xrealloc(patterns->items, patterns->current_max *
                                   sizeof(struct rte_flow_item));
    }

    patterns->items[cnt].type = type;
    patterns->items[cnt].spec = spec;
    patterns->items[cnt].mask = mask;
    patterns->items[cnt].last = NULL;
    dump_flow_pattern(&patterns->items[cnt]);
    patterns->cnt++;
}

static void
add_flow_action(struct flow_actions *actions, enum rte_flow_action_type type,
                const void *conf)
{
    int cnt = actions->cnt;

    if (cnt == 0) {
        actions->current_max = 8;
        actions->actions = xcalloc(actions->current_max,
                                   sizeof(struct rte_flow_action));
    } else if (cnt == actions->current_max) {
        actions->current_max *= 2;
        actions->actions = xrealloc(actions->actions, actions->current_max *
                                    sizeof(struct rte_flow_action));
    }

    actions->actions[cnt].type = type;
    actions->actions[cnt].conf = conf;
    actions->cnt++;
}

static struct rte_flow_action_rss *
add_flow_rss_action(struct flow_actions *actions,
                    struct netdev *netdev) {
    int i;
    struct rte_flow_action_rss *rss;

    rss = xmalloc(sizeof(*rss) + sizeof(uint16_t) * netdev->n_rxq);
    /*
     * Setting it to NULL will let the driver use the default RSS
     * configuration we have set: &port_conf.rx_adv_conf.rss_conf.
     */
    rss->rss_conf = NULL;
    rss->num = netdev->n_rxq;

    for (i = 0; i < rss->num; i++) {
        rss->queue[i] = i;
    }

    add_flow_action(actions, RTE_FLOW_ACTION_TYPE_RSS, rss);

    return rss;
}

static int
netdev_dpdk_add_rte_flow_offload(struct netdev *netdev,
                                 const struct match *match,
                                 struct nlattr *nl_actions OVS_UNUSED,
                                 size_t actions_len OVS_UNUSED,
                                 const ovs_u128 *ufid,
                                 struct offload_info *info) {
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    const struct rte_flow_attr flow_attr = {
        .group = 0,
        .priority = 0,
        .ingress = 1,
        .egress = 0
    };
    struct flow_patterns patterns = { .items = NULL, .cnt = 0 };
    struct flow_actions actions = { .actions = NULL, .cnt = 0 };
    struct rte_flow *flow;
    struct rte_flow_error error;
    uint8_t *ipv4_next_proto_mask = NULL;
    int ret = 0;

    /* Eth */
    struct rte_flow_item_eth eth_spec;
    struct rte_flow_item_eth eth_mask;
    memset(&eth_spec, 0, sizeof(eth_spec));
    memset(&eth_mask, 0, sizeof(eth_mask));
    if (!eth_addr_is_zero(match->wc.masks.dl_src) ||
        !eth_addr_is_zero(match->wc.masks.dl_dst)) {
        rte_memcpy(&eth_spec.dst, &match->flow.dl_dst, sizeof(eth_spec.dst));
        rte_memcpy(&eth_spec.src, &match->flow.dl_src, sizeof(eth_spec.src));
        eth_spec.type = match->flow.dl_type;

        rte_memcpy(&eth_mask.dst, &match->wc.masks.dl_dst,
                   sizeof(eth_mask.dst));
        rte_memcpy(&eth_mask.src, &match->wc.masks.dl_src,
                   sizeof(eth_mask.src));
        eth_mask.type = match->wc.masks.dl_type;

        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_ETH,
                         &eth_spec, &eth_mask);
    } else {
        /*
         * If user specifies a flow (like UDP flow) without L2 patterns,
         * OVS will at least set the dl_type. Normally, it's enough to
         * create an eth pattern just with it. Unluckily, some Intel's
         * NIC (such as XL710) doesn't support that. Below is a workaround,
         * which simply matches any L2 pkts.
         */
        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_ETH, NULL, NULL);
    }

    /* VLAN */
    struct rte_flow_item_vlan vlan_spec;
    struct rte_flow_item_vlan vlan_mask;
    memset(&vlan_spec, 0, sizeof(vlan_spec));
    memset(&vlan_mask, 0, sizeof(vlan_mask));
    if (match->wc.masks.vlans[0].tci && match->flow.vlans[0].tci) {
        vlan_spec.tci  = match->flow.vlans[0].tci & ~htons(VLAN_CFI);
        vlan_mask.tci  = match->wc.masks.vlans[0].tci & ~htons(VLAN_CFI);

        /* match any protocols */
        vlan_mask.tpid = 0;

        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_VLAN,
                         &vlan_spec, &vlan_mask);
    }

    /* IP v4 */
    uint8_t proto = 0;
    struct rte_flow_item_ipv4 ipv4_spec;
    struct rte_flow_item_ipv4 ipv4_mask;
    memset(&ipv4_spec, 0, sizeof(ipv4_spec));
    memset(&ipv4_mask, 0, sizeof(ipv4_mask));
    if (match->flow.dl_type == htons(ETH_TYPE_IP)) {

        ipv4_spec.hdr.type_of_service = match->flow.nw_tos;
        ipv4_spec.hdr.time_to_live    = match->flow.nw_ttl;
        ipv4_spec.hdr.next_proto_id   = match->flow.nw_proto;
        ipv4_spec.hdr.src_addr        = match->flow.nw_src;
        ipv4_spec.hdr.dst_addr        = match->flow.nw_dst;

        ipv4_mask.hdr.type_of_service = match->wc.masks.nw_tos;
        ipv4_mask.hdr.time_to_live    = match->wc.masks.nw_ttl;
        ipv4_mask.hdr.next_proto_id   = match->wc.masks.nw_proto;
        ipv4_mask.hdr.src_addr        = match->wc.masks.nw_src;
        ipv4_mask.hdr.dst_addr        = match->wc.masks.nw_dst;

        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_IPV4,
                         &ipv4_spec, &ipv4_mask);

        /* Save proto for L4 protocol setup */
        proto = ipv4_spec.hdr.next_proto_id &
                ipv4_mask.hdr.next_proto_id;

        /* Remember proto mask address for later modification */
        ipv4_next_proto_mask = &ipv4_mask.hdr.next_proto_id;
    }

    if (proto != IPPROTO_ICMP && proto != IPPROTO_UDP  &&
        proto != IPPROTO_SCTP && proto != IPPROTO_TCP  &&
        (match->wc.masks.tp_src ||
         match->wc.masks.tp_dst ||
         match->wc.masks.tcp_flags)) {
        VLOG_DBG("L4 Protocol (%u) not supported", proto);
        ret = -1;
        goto out;
    }

    if ((match->wc.masks.tp_src && match->wc.masks.tp_src != OVS_BE16_MAX) ||
        (match->wc.masks.tp_dst && match->wc.masks.tp_dst != OVS_BE16_MAX)) {
        ret = -1;
        goto out;
    }

    struct rte_flow_item_tcp tcp_spec;
    struct rte_flow_item_tcp tcp_mask;
    memset(&tcp_spec, 0, sizeof(tcp_spec));
    memset(&tcp_mask, 0, sizeof(tcp_mask));
    if (proto == IPPROTO_TCP) {
        tcp_spec.hdr.src_port  = match->flow.tp_src;
        tcp_spec.hdr.dst_port  = match->flow.tp_dst;
        tcp_spec.hdr.data_off  = ntohs(match->flow.tcp_flags) >> 8;
        tcp_spec.hdr.tcp_flags = ntohs(match->flow.tcp_flags) & 0xff;

        tcp_mask.hdr.src_port  = match->wc.masks.tp_src;
        tcp_mask.hdr.dst_port  = match->wc.masks.tp_dst;
        tcp_mask.hdr.data_off  = ntohs(match->wc.masks.tcp_flags) >> 8;
        tcp_mask.hdr.tcp_flags = ntohs(match->wc.masks.tcp_flags) & 0xff;

        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_TCP,
                         &tcp_spec, &tcp_mask);

        /* proto == TCP and ITEM_TYPE_TCP, thus no need for proto match */
        if (ipv4_next_proto_mask) {
            *ipv4_next_proto_mask = 0;
        }
        goto end_proto_check;
    }

    struct rte_flow_item_udp udp_spec;
    struct rte_flow_item_udp udp_mask;
    memset(&udp_spec, 0, sizeof(udp_spec));
    memset(&udp_mask, 0, sizeof(udp_mask));
    if (proto == IPPROTO_UDP) {
        udp_spec.hdr.src_port = match->flow.tp_src;
        udp_spec.hdr.dst_port = match->flow.tp_dst;

        udp_mask.hdr.src_port = match->wc.masks.tp_src;
        udp_mask.hdr.dst_port = match->wc.masks.tp_dst;

        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_UDP,
                         &udp_spec, &udp_mask);

        /* proto == UDP and ITEM_TYPE_UDP, thus no need for proto match */
        if (ipv4_next_proto_mask) {
            *ipv4_next_proto_mask = 0;
        }
        goto end_proto_check;
    }

    struct rte_flow_item_sctp sctp_spec;
    struct rte_flow_item_sctp sctp_mask;
    memset(&sctp_spec, 0, sizeof(sctp_spec));
    memset(&sctp_mask, 0, sizeof(sctp_mask));
    if (proto == IPPROTO_SCTP) {
        sctp_spec.hdr.src_port = match->flow.tp_src;
        sctp_spec.hdr.dst_port = match->flow.tp_dst;

        sctp_mask.hdr.src_port = match->wc.masks.tp_src;
        sctp_mask.hdr.dst_port = match->wc.masks.tp_dst;

        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_SCTP,
                         &sctp_spec, &sctp_mask);

        /* proto == SCTP and ITEM_TYPE_SCTP, thus no need for proto match */
        if (ipv4_next_proto_mask) {
            *ipv4_next_proto_mask = 0;
        }
        goto end_proto_check;
    }

    struct rte_flow_item_icmp icmp_spec;
    struct rte_flow_item_icmp icmp_mask;
    memset(&icmp_spec, 0, sizeof(icmp_spec));
    memset(&icmp_mask, 0, sizeof(icmp_mask));
    if (proto == IPPROTO_ICMP) {
        icmp_spec.hdr.icmp_type = (uint8_t)ntohs(match->flow.tp_src);
        icmp_spec.hdr.icmp_code = (uint8_t)ntohs(match->flow.tp_dst);

        icmp_mask.hdr.icmp_type = (uint8_t)ntohs(match->wc.masks.tp_src);
        icmp_mask.hdr.icmp_code = (uint8_t)ntohs(match->wc.masks.tp_dst);

        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_ICMP,
                         &icmp_spec, &icmp_mask);

        /* proto == ICMP and ITEM_TYPE_ICMP, thus no need for proto match */
        if (ipv4_next_proto_mask) {
            *ipv4_next_proto_mask = 0;
        }
        goto end_proto_check;
    }

end_proto_check:

    add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_END, NULL, NULL);

    struct rte_flow_action_mark mark;
    mark.id = info->flow_mark;
    add_flow_action(&actions, RTE_FLOW_ACTION_TYPE_MARK, &mark);

    struct rte_flow_action_rss *rss;
    rss = add_flow_rss_action(&actions, netdev);
    add_flow_action(&actions, RTE_FLOW_ACTION_TYPE_END, NULL);

    flow = rte_flow_create(dev->port_id, &flow_attr, patterns.items,
                           actions.actions, &error);
    free(rss);
    if (!flow) {
        VLOG_ERR("rte flow creat error: %u : message : %s\n",
                 error.type, error.message);
        ret = -1;
        goto out;
    }
    ufid_to_rte_flow_associate(ufid, flow);
    VLOG_DBG("installed flow %p by ufid "UUID_FMT"\n",
             flow, UUID_ARGS((struct uuid *)ufid));

out:
    free(patterns.items);
    free(actions.actions);
    return ret;
}

static bool
is_all_zero(const void *addr, size_t n) {
    size_t i = 0;
    const uint8_t *p = (uint8_t *)addr;

    for (i = 0; i < n; i++) {
        if (p[i] != 0) {
            return false;
        }
    }

    return true;
}

/*
 * Check if any unsupported flow patterns are specified.
 */
static int
netdev_dpdk_validate_flow(const struct match *match) {
    struct match match_zero_wc;

    /* Create a wc-zeroed version of flow */
    match_init(&match_zero_wc, &match->flow, &match->wc);

    if (!is_all_zero(&match_zero_wc.flow.tunnel,
                     sizeof(match_zero_wc.flow.tunnel))) {
        goto err;
    }

    if (match->wc.masks.metadata ||
        match->wc.masks.skb_priority ||
        match->wc.masks.pkt_mark ||
        match->wc.masks.dp_hash) {
        goto err;
    }

    /* recirc id must be zero */
    if (match_zero_wc.flow.recirc_id) {
        goto err;
    }

    if (match->wc.masks.ct_state ||
        match->wc.masks.ct_nw_proto ||
        match->wc.masks.ct_zone ||
        match->wc.masks.ct_mark ||
        match->wc.masks.ct_label.u64.hi ||
        match->wc.masks.ct_label.u64.lo) {
        goto err;
    }

    if (match->wc.masks.conj_id ||
        match->wc.masks.actset_output) {
        goto err;
    }

    /* unsupported L2 */
    if (!is_all_zero(&match->wc.masks.mpls_lse,
                     sizeof(match_zero_wc.flow.mpls_lse))) {
        goto err;
    }

    /* unsupported L3 */
    if (match->wc.masks.ipv6_label ||
        match->wc.masks.ct_nw_src ||
        match->wc.masks.ct_nw_dst ||
        !is_all_zero(&match->wc.masks.ipv6_src, sizeof(struct in6_addr)) ||
        !is_all_zero(&match->wc.masks.ipv6_dst, sizeof(struct in6_addr)) ||
        !is_all_zero(&match->wc.masks.ct_ipv6_src, sizeof(struct in6_addr)) ||
        !is_all_zero(&match->wc.masks.ct_ipv6_dst, sizeof(struct in6_addr)) ||
        !is_all_zero(&match->wc.masks.nd_target, sizeof(struct in6_addr)) ||
        !is_all_zero(&match->wc.masks.nsh, sizeof(struct ovs_key_nsh)) ||
        !is_all_zero(&match->wc.masks.arp_sha, sizeof(struct eth_addr)) ||
        !is_all_zero(&match->wc.masks.arp_tha, sizeof(struct eth_addr))) {
        goto err;
    }

    /* If fragmented, then don't HW accelerate - for now */
    if (match_zero_wc.flow.nw_frag) {
        goto err;
    }

    /* unsupported L4 */
    if (match->wc.masks.igmp_group_ip4 ||
        match->wc.masks.ct_tp_src ||
        match->wc.masks.ct_tp_dst) {
        goto err;
    }

    return 0;

err:
    VLOG_ERR("cannot HW accelerate this flow due to unsupported protocols");
    return -1;
}

static int
netdev_dpdk_destroy_rte_flow(struct netdev_dpdk *dev,
                             const ovs_u128 *ufid,
                             struct rte_flow *rte_flow) {
    struct rte_flow_error error;
    int ret;

    ret = rte_flow_destroy(dev->port_id, rte_flow, &error);
    if (ret == 0) {
        ufid_to_rte_flow_disassociate(ufid);
        VLOG_DBG("removed rte flow %p associated with ufid " UUID_FMT "\n",
                 rte_flow, UUID_ARGS((struct uuid *)ufid));
    } else {
        VLOG_ERR("rte flow destroy error: %u : message : %s\n",
                 error.type, error.message);
    }

    return ret;
}

static int
netdev_dpdk_flow_put(struct netdev *netdev, struct match *match,
                     struct nlattr *actions, size_t actions_len,
                     const ovs_u128 *ufid, struct offload_info *info,
                     struct dpif_flow_stats *stats OVS_UNUSED) {
    struct rte_flow *rte_flow;
    int ret;

    /*
     * If an old rte_flow exists, it means it's a flow modification.
     * Here destroy the old rte flow first before adding a new one.
     */
    rte_flow = ufid_to_rte_flow_find(ufid);
    if (rte_flow) {
        ret = netdev_dpdk_destroy_rte_flow(netdev_dpdk_cast(netdev),
                                           ufid, rte_flow);
        if (ret < 0) {
            return ret;
        }
    }

    ret = netdev_dpdk_validate_flow(match);
    if (ret < 0) {
        return ret;
    }

    return netdev_dpdk_add_rte_flow_offload(netdev, match, actions,
                                            actions_len, ufid, info);
}

static int
netdev_dpdk_flow_del(struct netdev *netdev, const ovs_u128 *ufid,
                     struct dpif_flow_stats *stats OVS_UNUSED) {

    struct rte_flow *rte_flow = ufid_to_rte_flow_find(ufid);

    if (!rte_flow) {
        return -1;
    }

    return netdev_dpdk_destroy_rte_flow(netdev_dpdk_cast(netdev),
                                        ufid, rte_flow);
}

#define DPDK_FLOW_OFFLOAD_API                                 \
    NULL,                   /* flow_flush */                  \
    NULL,                   /* flow_dump_create */            \
    NULL,                   /* flow_dump_destroy */           \
    NULL,                   /* flow_dump_next */              \
    netdev_dpdk_flow_put,                                     \
    NULL,                   /* flow_get */                    \
    netdev_dpdk_flow_del,                                     \
    NULL                    /* init_flow_api */


#define NETDEV_DPDK_CLASS(NAME, INIT, CONSTRUCT, DESTRUCT,    \
                          SET_CONFIG, SET_TX_MULTIQ, SEND,    \
                          GET_CARRIER, GET_STATS,			  \
                          GET_CUSTOM_STATS,					  \
                          GET_FEATURES, GET_STATUS,           \
                          RECONFIGURE, RXQ_RECV)              \
{                                                             \
    NAME,                                                     \
    true,                       /* is_pmd */                  \
    INIT,                       /* init */                    \
    NULL,                       /* netdev_dpdk_run */         \
    NULL,                       /* netdev_dpdk_wait */        \
                                                              \
    netdev_dpdk_alloc,                                        \
    CONSTRUCT,                                                \
    DESTRUCT,                                                 \
    netdev_dpdk_dealloc,                                      \
    netdev_dpdk_get_config,                                   \
    SET_CONFIG,                                               \
    NULL,                       /* get_tunnel_config */       \
    NULL,                       /* build header */            \
    NULL,                       /* push header */             \
    NULL,                       /* pop header */              \
    netdev_dpdk_get_numa_id,    /* get_numa_id */             \
    SET_TX_MULTIQ,                                            \
                                                              \
    SEND,                       /* send */                    \
    NULL,                       /* send_wait */               \
                                                              \
    netdev_dpdk_set_etheraddr,                                \
    netdev_dpdk_get_etheraddr,                                \
    netdev_dpdk_get_mtu,                                      \
    netdev_dpdk_set_mtu,                                      \
    netdev_dpdk_get_ifindex,                                  \
    GET_CARRIER,                                              \
    netdev_dpdk_get_carrier_resets,                           \
    netdev_dpdk_set_miimon,                                   \
    GET_STATS,                                                \
    GET_CUSTOM_STATS,										  \
    GET_FEATURES,                                             \
    NULL,                       /* set_advertisements */      \
    NULL,                       /* get_pt_mode */             \
                                                              \
    netdev_dpdk_set_policing,                                 \
    netdev_dpdk_get_qos_types,                                \
    NULL,                       /* get_qos_capabilities */    \
    netdev_dpdk_get_qos,                                      \
    netdev_dpdk_set_qos,                                      \
    NULL,                       /* get_queue */               \
    NULL,                       /* set_queue */               \
    NULL,                       /* delete_queue */            \
    NULL,                       /* get_queue_stats */         \
    NULL,                       /* queue_dump_start */        \
    NULL,                       /* queue_dump_next */         \
    NULL,                       /* queue_dump_done */         \
    NULL,                       /* dump_queue_stats */        \
                                                              \
    NULL,                       /* set_in4 */                 \
    NULL,                       /* get_addr_list */           \
    NULL,                       /* add_router */              \
    NULL,                       /* get_next_hop */            \
    GET_STATUS,                                               \
    NULL,                       /* arp_lookup */              \
                                                              \
    netdev_dpdk_update_flags,                                 \
    RECONFIGURE,                                              \
                                                              \
    netdev_dpdk_rxq_alloc,                                    \
    netdev_dpdk_rxq_construct,                                \
    netdev_dpdk_rxq_destruct,                                 \
    netdev_dpdk_rxq_dealloc,                                  \
    RXQ_RECV,                                                 \
    NULL,                       /* rx_wait */                 \
    NULL,                       /* rxq_drain */               \
    DPDK_FLOW_OFFLOAD_API,                                    \
    NULL                        /* get_block_id */            \
}

static const struct netdev_class dpdk_class =
    NETDEV_DPDK_CLASS(
        "dpdk",
        netdev_dpdk_class_init,
        netdev_dpdk_construct,
        netdev_dpdk_destruct,
        netdev_dpdk_set_config,
        netdev_dpdk_set_tx_multiq,
        netdev_dpdk_eth_send,
        netdev_dpdk_get_carrier,
        netdev_dpdk_get_stats,
        netdev_dpdk_get_custom_stats,
        netdev_dpdk_get_features,
        netdev_dpdk_get_status,
        netdev_dpdk_reconfigure,
        netdev_dpdk_rxq_recv);

static const struct netdev_class dpdk_ring_class =
    NETDEV_DPDK_CLASS(
        "dpdkr",
        netdev_dpdk_class_init,
        netdev_dpdk_ring_construct,
        netdev_dpdk_destruct,
        netdev_dpdk_ring_set_config,
        netdev_dpdk_set_tx_multiq,
        netdev_dpdk_ring_send,
        netdev_dpdk_get_carrier,
        netdev_dpdk_get_stats,
        netdev_dpdk_get_custom_stats,
        netdev_dpdk_get_features,
        netdev_dpdk_get_status,
        netdev_dpdk_reconfigure,
        netdev_dpdk_rxq_recv);

static const struct netdev_class dpdk_vhost_class =
    NETDEV_DPDK_CLASS(
        "dpdkvhostuser",
        NULL,
        netdev_dpdk_vhost_construct,
        netdev_dpdk_vhost_destruct,
        NULL,
        NULL,
        netdev_dpdk_vhost_send,
        netdev_dpdk_vhost_get_carrier,
        netdev_dpdk_vhost_get_stats,
        NULL,
        NULL,
        netdev_dpdk_vhost_user_get_status,
        netdev_dpdk_vhost_reconfigure,
        netdev_dpdk_vhost_rxq_recv);
static const struct netdev_class dpdk_vhost_client_class =
    NETDEV_DPDK_CLASS(
        "dpdkvhostuserclient",
        NULL,
        netdev_dpdk_vhost_client_construct,
        netdev_dpdk_vhost_destruct,
        netdev_dpdk_vhost_client_set_config,
        NULL,
        netdev_dpdk_vhost_send,
        netdev_dpdk_vhost_get_carrier,
        netdev_dpdk_vhost_get_stats,
        NULL,
        NULL,
        netdev_dpdk_vhost_user_get_status,
        netdev_dpdk_vhost_client_reconfigure,
        netdev_dpdk_vhost_rxq_recv);

void
netdev_dpdk_register(void)
{
    netdev_register_provider(&dpdk_class);
    netdev_register_provider(&dpdk_ring_class);
    netdev_register_provider(&dpdk_vhost_class);
    netdev_register_provider(&dpdk_vhost_client_class);
}


More information about the discuss mailing list