[ovs-dev] [PATCH] netdev-dpdk: add support for rings in secondary processes in IVSHMEM setups
Daniele Di Proietto
diproiettod at vmware.com
Tue Jul 28 18:28:09 UTC 2015
This is interesting, thanks for the patch.
I'm definitely not an IVSHMEM expert, but I have a concern: what
happens if the secondary OVS process allocates or frees some
mbufs? (e.g., because a packet is sent to multiple destinations or is
dropped)
I found this in the DPDK documentation:
http://dpdk.org/doc/guides/prog_guide/ivshmem_lib.html#best-practices-for-writing-ivshmem-applications
I'd appreciate feedback from someone with more IVSHMEM experience
than me (Maryam, perhaps?)
Thanks,
Daniele
On 22/07/2015 21:51, "Melvin Walls" <mwalls67 at gmail.com> wrote:
>In order for OVS running inside a VM using IVSHMEM to recognize ports
>created
>on the host, you have to start vswitchd with the --proc-type=secondary EAL
>option.
>
>When creating rings in secondary processes functions like
>rte_eth_dev_configure() fail with the error code E_RTE_SECONDARY, i.e.,
>the
>operations are not allowed in secondary processes. Avoiding this requires
>some
>changes to the way secondary processes handle dpdk rings.
>
>This patch changes dpdk_ring_create() to use rte_ring_lookup() instead of
>rte_ring_create() when called from a secondary process. It also
>introduces two
>functions: netdev_dpdk_ring_rxq_recv() and netdev_dpdk_ring_send__() to
>handle
>tx/rx on dpdk rings in secondary processes.
>
>Signed-off-by: Melvin Walls <mwalls67 at gmail.com>
>Signed-off-by: Ethan Jackson <ethan at nicira.com>
>---
> lib/netdev-dpdk.c | 158
>+++++++++++++++++++++++++++++++++++++++++-------------
> 1 file changed, 122 insertions(+), 36 deletions(-)
>
>diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
>index 5ae805e..5abe90f 100644
>--- a/lib/netdev-dpdk.c
>+++ b/lib/netdev-dpdk.c
>@@ -227,6 +227,10 @@ struct netdev_dpdk {
> /* Identifier used to distinguish vhost devices from each other */
> char vhost_id[PATH_MAX];
>
>+ /* Rings for secondary processes in IVSHMEM setups, NULL otherwise */
>+ struct rte_ring *rx_ring;
>+ struct rte_ring *tx_ring;
>+
> /* In dpdk_list. */
> struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
> };
>@@ -340,12 +344,16 @@ dpdk_mp_get(int socket_id, int mtu)
>OVS_REQUIRES(dpdk_mutex)
> return NULL;
> }
>
>- dmp->mp = rte_mempool_create(mp_name, mp_size, MBUF_SIZE(mtu),
>- MP_CACHE_SZ,
>- sizeof(struct
>rte_pktmbuf_pool_private),
>- rte_pktmbuf_pool_init, NULL,
>- ovs_rte_pktmbuf_init, NULL,
>- socket_id, 0);
>+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
>+ dmp->mp = rte_mempool_create(mp_name, mp_size,
>MBUF_SIZE(mtu),
>+ MP_CACHE_SZ,
>+ sizeof(struct
>rte_pktmbuf_pool_private),
>+ rte_pktmbuf_pool_init, NULL,
>+ ovs_rte_pktmbuf_init, NULL,
>+ socket_id, 0);
>+ } else {
>+ dmp->mp = rte_mempool_lookup(mp_name);
>+ }
> } while (!dmp->mp && rte_errno == ENOMEM && (mp_size /= 2) >=
>MIN_NB_MBUF);
>
> if (dmp->mp == NULL) {
>@@ -439,39 +447,41 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
>OVS_REQUIRES(dpdk_mutex)
> dev->up.n_rxq = MIN(info.max_rx_queues, dev->up.n_rxq);
> dev->real_n_txq = MIN(info.max_tx_queues, dev->up.n_txq);
>
>- diag = rte_eth_dev_configure(dev->port_id, dev->up.n_rxq,
>dev->real_n_txq,
>- &port_conf);
>- if (diag) {
>- VLOG_ERR("eth dev config error %d. rxq:%d txq:%d", diag,
>dev->up.n_rxq,
>- dev->real_n_txq);
>- return -diag;
>- }
>-
>- for (i = 0; i < dev->real_n_txq; i++) {
>- diag = rte_eth_tx_queue_setup(dev->port_id, i,
>NIC_PORT_TX_Q_SIZE,
>- dev->socket_id, NULL);
>+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
>+ diag = rte_eth_dev_configure(dev->port_id, dev->up.n_rxq,
>dev->real_n_txq,
>+ &port_conf);
> if (diag) {
>- VLOG_ERR("eth dev tx queue setup error %d",diag);
>+ VLOG_ERR("eth dev config error %d. rxq:%d txq:%d", diag,
>dev->up.n_rxq,
>+ dev->real_n_txq);
> return -diag;
> }
>- }
>
>- for (i = 0; i < dev->up.n_rxq; i++) {
>- diag = rte_eth_rx_queue_setup(dev->port_id, i,
>NIC_PORT_RX_Q_SIZE,
>- dev->socket_id,
>- NULL, dev->dpdk_mp->mp);
>+ for (i = 0; i < dev->real_n_txq; i++) {
>+ diag = rte_eth_tx_queue_setup(dev->port_id, i,
>NIC_PORT_TX_Q_SIZE,
>+ dev->socket_id, NULL);
>+ if (diag) {
>+ VLOG_ERR("eth dev tx queue setup error %d",diag);
>+ return -diag;
>+ }
>+ }
>+
>+ for (i = 0; i < dev->up.n_rxq; i++) {
>+ diag = rte_eth_rx_queue_setup(dev->port_id, i,
>NIC_PORT_RX_Q_SIZE,
>+ dev->socket_id,
>+ NULL, dev->dpdk_mp->mp);
>+ if (diag) {
>+ VLOG_ERR("eth dev rx queue setup error %d",diag);
>+ return -diag;
>+ }
>+ }
>+
>+ diag = rte_eth_dev_start(dev->port_id);
> if (diag) {
>- VLOG_ERR("eth dev rx queue setup error %d",diag);
>+ VLOG_ERR("eth dev start error %d",diag);
> return -diag;
> }
> }
>
>- diag = rte_eth_dev_start(dev->port_id);
>- if (diag) {
>- VLOG_ERR("eth dev start error %d",diag);
>- return -diag;
>- }
>-
> rte_eth_promiscuous_enable(dev->port_id);
> rte_eth_allmulticast_enable(dev->port_id);
>
>@@ -532,6 +542,8 @@ netdev_dpdk_init(struct netdev *netdev_, unsigned int
>port_no,
> OVS_REQUIRES(dpdk_mutex)
> {
> struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_);
>+ char *rxq_name = xasprintf("%s_tx", netdev->up.name);
>+ char *txq_name = xasprintf("%s_rx", netdev->up.name);
> int sid;
> int err = 0;
>
>@@ -574,6 +586,19 @@ netdev_dpdk_init(struct netdev *netdev_, unsigned
>int port_no,
> }
> }
>
>+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
>+ netdev->rx_ring = netdev->tx_ring = NULL;
>+ } else {
>+ netdev->rx_ring = rte_ring_lookup(rxq_name);
>+ netdev->tx_ring = rte_ring_lookup(txq_name);
>+ if (!netdev->rx_ring || !netdev->tx_ring) {
>+ err = ENOMEM;
>+ }
>+ }
>+
>+ free(rxq_name);
>+ free(txq_name);
>+
> list_push_back(&dpdk_list, &netdev->list_node);
>
> unlock:
>@@ -957,6 +982,36 @@ netdev_dpdk_rxq_recv(struct netdev_rxq *rxq_, struct
>dp_packet **packets,
> return 0;
> }
>
>+static int
>+netdev_dpdk_ring_rxq_recv(struct netdev_rxq *rxq_,
>+ struct dp_packet **packets, int *c)
>+{
>+ struct netdev_dpdk *netdev = netdev_dpdk_cast(rxq_->netdev);
>+ struct rte_ring *rx_ring = netdev->rx_ring;
>+ unsigned rx_pkts = NETDEV_MAX_BURST;
>+
>+ /* Only use netdev_dpdk_ring_rxq_recv() as a secondary process.
>There are operations
>+ * performed by netdev_dpdk_rxq_recv() that primary processes are
>responsible for and
>+ * cannot be performed by secondary processes. */
>+ if (OVS_LIKELY(rte_eal_process_type() == RTE_PROC_PRIMARY)) {
>+ return netdev_dpdk_rxq_recv(rxq_,packets,c);
>+ }
>+
>+ while (OVS_UNLIKELY(rte_ring_dequeue_bulk(rx_ring, (void **)packets,
>rx_pkts) != 0) &&
>+ rx_pkts > 0) {
>+ rx_pkts = rte_ring_count(rx_ring);
>+ rx_pkts = (unsigned)MIN(rx_pkts,NETDEV_MAX_BURST);
>+ }
>+
>+ if (!rx_pkts) {
>+ return EAGAIN;
>+ }
>+
>+ *c = rx_pkts;
>+
>+ return 0;
>+}
>+
> static void
> __netdev_dpdk_vhost_send(struct netdev *netdev, struct dp_packet **pkts,
> int cnt, bool may_steal)
>@@ -1147,6 +1202,20 @@ netdev_dpdk_vhost_send(struct netdev *netdev, int
>qid OVS_UNUSED, struct dp_pack
> }
>
> static inline void
>+netdev_dpdk_ring_send__(struct netdev_dpdk *netdev,
>+ struct dp_packet **pkts, int cnt)
>+{
>+ struct rte_ring *tx_ring = netdev->tx_ring;
>+ int rslt = 0;
>+
>+ if (tx_ring != NULL) {
>+ do {
>+ rslt = rte_ring_enqueue_bulk(tx_ring, (void **)pkts, cnt);
>+ } while (rslt == -ENOBUFS);
>+ }
>+}
>+
>+static inline void
> netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
> struct dp_packet **pkts, int cnt, bool may_steal)
> {
>@@ -1812,8 +1881,13 @@ dpdk_ring_create(const char dev_name[], unsigned
>int port_no,
> }
>
> /* Create single consumer/producer rings, netdev does explicit
>locking. */
>- ivshmem->cring_tx = rte_ring_create(ring_name, DPDK_RING_SIZE,
>SOCKET0,
>- RING_F_SP_ENQ | RING_F_SC_DEQ);
>+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
>+ ivshmem->cring_tx = rte_ring_create(ring_name, DPDK_RING_SIZE,
>SOCKET0,
>+ RING_F_SP_ENQ |
>RING_F_SC_DEQ);
>+ } else {
>+ ivshmem->cring_tx = rte_ring_lookup(ring_name);
>+ }
>+
> if (ivshmem->cring_tx == NULL) {
> rte_free(ivshmem);
> return ENOMEM;
>@@ -1825,8 +1899,13 @@ dpdk_ring_create(const char dev_name[], unsigned
>int port_no,
> }
>
> /* Create single consumer/producer rings, netdev does explicit
>locking. */
>- ivshmem->cring_rx = rte_ring_create(ring_name, DPDK_RING_SIZE,
>SOCKET0,
>- RING_F_SP_ENQ | RING_F_SC_DEQ);
>+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
>+ ivshmem->cring_rx = rte_ring_create(ring_name, DPDK_RING_SIZE,
>SOCKET0,
>+ RING_F_SP_ENQ |
>RING_F_SC_DEQ);
>+ } else {
>+ ivshmem->cring_rx = rte_ring_lookup(ring_name);
>+ }
>+
> if (ivshmem->cring_rx == NULL) {
> rte_free(ivshmem);
> return ENOMEM;
>@@ -1888,7 +1967,14 @@ netdev_dpdk_ring_send(struct netdev *netdev_, int
>qid,
> dp_packet_set_rss_hash(pkts[i], 0);
> }
>
>- netdev_dpdk_send__(netdev, qid, pkts, cnt, may_steal);
>+ /* Only use netdev_dpdk_send__() as a primary process. It leads to
>the execution
>+ * of code that cannot be executed by secondary processes. */
>+ if (OVS_LIKELY(rte_eal_process_type() == RTE_PROC_PRIMARY)) {
>+ netdev_dpdk_send__(netdev, qid, pkts, cnt, may_steal);
>+ } else {
>+ netdev_dpdk_ring_send__(netdev, pkts, cnt);
>+ }
>+
> return 0;
> }
>
>@@ -2101,7 +2187,7 @@ static const struct netdev_class dpdk_ring_class =
> netdev_dpdk_get_stats,
> netdev_dpdk_get_features,
> netdev_dpdk_get_status,
>- netdev_dpdk_rxq_recv);
>+ netdev_dpdk_ring_rxq_recv);
>
> static const struct netdev_class OVS_UNUSED dpdk_vhost_cuse_class =
> NETDEV_DPDK_CLASS(
>--
>1.9.3 (Apple Git-50)
>_______________________________________________
>dev mailing list
>dev at openvswitch.org
>https://urldefense.proofpoint.com/v2/url?u=http-3A__openvswitch.org_mailma
>n_listinfo_dev&d=BQIGaQ&c=Sqcl0Ez6M0X8aeM67LKIiDJAXVeAw-YihVMNtXt-uEs&r=Sm
>B5nZacmXNq0gKCC1s_Cw5yUNjxgD4v5kJqZ2uWLlE&m=ksY9ulCC5TLZ-tQnj3WWXeuAHQUk9J
>o-mP_ruJ124K0&s=HiTz0cRNA8O4ZBvUJqeOL-KOwygu2-QlDmyVeLTpNBE&e=
More information about the dev
mailing list