[ovs-dev] [PATCH] netdev-dpdk: add support for rings in secondary processes in IVSHMEM setups
Ethan Jackson
ethan at nicira.com
Wed Jul 29 18:39:07 UTC 2015
> However, there is a bit of a problem with this model as it currently stands which is
> kind of independent of this patch and was suggested by Daniel above. Freeing
> mbufs in the guest may cause packet corruption. This was an issue with DPDK (I think
> it still is). When returning an mbuf to the mempool, DPDK will first try to return the
> mbuf to a per-lcore cache. The per-lcore cache will get periodically flushed to the
> central mempool. The per-lcore cache is not threadsafe but the central mempool is.
> This is not a major issue when everything is running on the host but when you run a
> dpdk application in the guest you potentially have two DPDK threads with an lcore of 0
> (one in the host and one in the guest!). You can hit a situation whereby both threads
> simultaneously try to access the per-lcore cache for lcore 0 and corrupt it.
Is this just an issue if two cores share lcore 0, or would the same
thing happen if two cores have lcore 1? Would a hacky solution be to
just somehow detect overlapping lcore ids between the guest and the
host, and avoid them?
> When we did something similar previously, we approached it differently by
> adding ivshmem support directly to DPDK (there is an ivshmem target). Using this, you
> are able to select which dpdk objects you would like to share to the guest from
> the host. For example, you would say "I want to share this ring and this mempool".
> Then, the DPDK process running in the guest would automatically detect that it
> was running in a virtualized environment with ivshmem (it would detect the ivshmem
> pci device) and allow the guest application to get those objects. In this model, you have
> better security (you don’t need to share an entire page - you can just share objects), you
> can run a primary process in the guest (even if the host process is a primary process).
> However, you still have the problem with per-lcore cache corruption described above.
> The way you can resolve this is :
> a) don’t ever free (or allocate) packets in the guest
This doesn't seem feasible in general. It's somewhat of an odd
requirement that would be hard to enforce I suspect.
> b) create a free and alloc ring that gets handled in the host. If the guest needs an mbuf,
> it would request one from the alloc ring. If it needs to free an mbuf, it would push it
> to the free ring.
This seems like a lot of work. Do we really care that much about
IVSHMEM? My impression was that we're more-or-less standardizing
around vhost-user. My inclination would be to drop the patch instead
of implementing this logic unless there's consensus that IVSHMEM is
going to be a relevant IO mechanism in the future.
> However, maybe for your use-case this isn’t an issue?
Yeah for our (Melvin + Me) specific use case, we'll just do (a) and
not free or allocate packets, shouldn't be too hard for us.
Ethan
>
>> >
>> >Signed-off-by: Melvin Walls <mwalls67 at gmail.com>
>> >Signed-off-by: Ethan Jackson <ethan at nicira.com>
>> >---
>> > lib/netdev-dpdk.c | 158
>> >+++++++++++++++++++++++++++++++++++++++++-------------
>> > 1 file changed, 122 insertions(+), 36 deletions(-)
>> >
>> >diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index
>> >5ae805e..5abe90f 100644
>> >--- a/lib/netdev-dpdk.c
>> >+++ b/lib/netdev-dpdk.c
>> >@@ -227,6 +227,10 @@ struct netdev_dpdk {
>> > /* Identifier used to distinguish vhost devices from each other */
>> > char vhost_id[PATH_MAX];
>> >
>> >+ /* Rings for secondary processes in IVSHMEM setups, NULL otherwise
>> */
>> >+ struct rte_ring *rx_ring;
>> >+ struct rte_ring *tx_ring;
>> >+
>> > /* In dpdk_list. */
>> > struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex); }; @@
>> >-340,12 +344,16 @@ dpdk_mp_get(int socket_id, int mtu)
>> >OVS_REQUIRES(dpdk_mutex)
>> > return NULL;
>> > }
>> >
>> >- dmp->mp = rte_mempool_create(mp_name, mp_size,
>> MBUF_SIZE(mtu),
>> >- MP_CACHE_SZ,
>> >- sizeof(struct
>> >rte_pktmbuf_pool_private),
>> >- rte_pktmbuf_pool_init, NULL,
>> >- ovs_rte_pktmbuf_init, NULL,
>> >- socket_id, 0);
>> >+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
>> >+ dmp->mp = rte_mempool_create(mp_name, mp_size,
>> >MBUF_SIZE(mtu),
>> >+ MP_CACHE_SZ,
>> >+ sizeof(struct
>> >rte_pktmbuf_pool_private),
>> >+ rte_pktmbuf_pool_init, NULL,
>> >+ ovs_rte_pktmbuf_init, NULL,
>> >+ socket_id, 0);
>> >+ } else {
>> >+ dmp->mp = rte_mempool_lookup(mp_name);
>> >+ }
>
> Does this not break when someone sets the mtu?
>
> On a side note, I saw some DPDK patches on dpdk.org that will
> enable the freeing of a memzone!
>
>> > } while (!dmp->mp && rte_errno == ENOMEM && (mp_size /= 2) >=
>> >MIN_NB_MBUF);
>> >
>> > if (dmp->mp == NULL) {
>> >@@ -439,39 +447,41 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
>> >OVS_REQUIRES(dpdk_mutex)
>> > dev->up.n_rxq = MIN(info.max_rx_queues, dev->up.n_rxq);
>> > dev->real_n_txq = MIN(info.max_tx_queues, dev->up.n_txq);
>> >
>> >- diag = rte_eth_dev_configure(dev->port_id, dev->up.n_rxq,
>> >dev->real_n_txq,
>> >- &port_conf);
>> >- if (diag) {
>> >- VLOG_ERR("eth dev config error %d. rxq:%d txq:%d", diag,
>> >dev->up.n_rxq,
>> >- dev->real_n_txq);
>> >- return -diag;
>> >- }
>> >-
>> >- for (i = 0; i < dev->real_n_txq; i++) {
>> >- diag = rte_eth_tx_queue_setup(dev->port_id, i,
>> >NIC_PORT_TX_Q_SIZE,
>> >- dev->socket_id, NULL);
>> >+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
>> >+ diag = rte_eth_dev_configure(dev->port_id, dev->up.n_rxq,
>> >dev->real_n_txq,
>> >+ &port_conf);
>> > if (diag) {
>> >- VLOG_ERR("eth dev tx queue setup error %d",diag);
>> >+ VLOG_ERR("eth dev config error %d. rxq:%d txq:%d", diag,
>> >dev->up.n_rxq,
>> >+ dev->real_n_txq);
>> > return -diag;
>> > }
>> >- }
>> >
>> >- for (i = 0; i < dev->up.n_rxq; i++) {
>> >- diag = rte_eth_rx_queue_setup(dev->port_id, i,
>> >NIC_PORT_RX_Q_SIZE,
>> >- dev->socket_id,
>> >- NULL, dev->dpdk_mp->mp);
>> >+ for (i = 0; i < dev->real_n_txq; i++) {
>> >+ diag = rte_eth_tx_queue_setup(dev->port_id, i,
>> >NIC_PORT_TX_Q_SIZE,
>> >+ dev->socket_id, NULL);
>> >+ if (diag) {
>> >+ VLOG_ERR("eth dev tx queue setup error %d",diag);
>> >+ return -diag;
>> >+ }
>> >+ }
>> >+
>> >+ for (i = 0; i < dev->up.n_rxq; i++) {
>> >+ diag = rte_eth_rx_queue_setup(dev->port_id, i,
>> >NIC_PORT_RX_Q_SIZE,
>> >+ dev->socket_id,
>> >+ NULL, dev->dpdk_mp->mp);
>> >+ if (diag) {
>> >+ VLOG_ERR("eth dev rx queue setup error %d",diag);
>> >+ return -diag;
>> >+ }
>> >+ }
>> >+
>> >+ diag = rte_eth_dev_start(dev->port_id);
>> > if (diag) {
>> >- VLOG_ERR("eth dev rx queue setup error %d",diag);
>> >+ VLOG_ERR("eth dev start error %d",diag);
>> > return -diag;
>> > }
>
> I don’t think you should enable this. Are you suggesting that a primary process
> could initialize the NIC but the vswitch would be the process that would actually use the NIC?
>
> I think that when the vswitch is started as a secondary process it should not
> be allowed to add physical ports at all.
>
>> > }
>> >
>> >- diag = rte_eth_dev_start(dev->port_id);
>> >- if (diag) {
>> >- VLOG_ERR("eth dev start error %d",diag);
>> >- return -diag;
>> >- }
>> >-
>> > rte_eth_promiscuous_enable(dev->port_id);
>> > rte_eth_allmulticast_enable(dev->port_id);
>> >
>> >@@ -532,6 +542,8 @@ netdev_dpdk_init(struct netdev *netdev_,
>> unsigned
>> >int port_no,
>> > OVS_REQUIRES(dpdk_mutex)
>> > {
>> > struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_);
>> >+ char *rxq_name = xasprintf("%s_tx", netdev->up.name);
>> >+ char *txq_name = xasprintf("%s_rx", netdev->up.name);
>
> I think this patch would be more useful if you could arbitrarily give
> the name of the ring you want to connect to rather than just the prefix
> - maybe an option to the add-port command?
>
>> > int sid;
>> > int err = 0;
>> >
>> >@@ -574,6 +586,19 @@ netdev_dpdk_init(struct netdev *netdev_,
>> unsigned
>> >int port_no,
>> > }
>> > }
>> >
>> >+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
>> >+ netdev->rx_ring = netdev->tx_ring = NULL;
>> >+ } else {
>> >+ netdev->rx_ring = rte_ring_lookup(rxq_name);
>> >+ netdev->tx_ring = rte_ring_lookup(txq_name);
>> >+ if (!netdev->rx_ring || !netdev->tx_ring) {
>> >+ err = ENOMEM;
>> >+ }
>> >+ }
>> >+
>> >+ free(rxq_name);
>> >+ free(txq_name);
>> >+
>> > list_push_back(&dpdk_list, &netdev->list_node);
>> >
>> > unlock:
>> >@@ -957,6 +982,36 @@ netdev_dpdk_rxq_recv(struct netdev_rxq *rxq_,
>> >struct dp_packet **packets,
>> > return 0;
>> > }
>> >
>> >+static int
>> >+netdev_dpdk_ring_rxq_recv(struct netdev_rxq *rxq_,
>> >+ struct dp_packet **packets, int *c) {
>> >+ struct netdev_dpdk *netdev = netdev_dpdk_cast(rxq_->netdev);
>> >+ struct rte_ring *rx_ring = netdev->rx_ring;
>> >+ unsigned rx_pkts = NETDEV_MAX_BURST;
>> >+
>> >+ /* Only use netdev_dpdk_ring_rxq_recv() as a secondary process.
>> >There are operations
>> >+ * performed by netdev_dpdk_rxq_recv() that primary processes are
>> >responsible for and
>> >+ * cannot be performed by secondary processes. */
>> >+ if (OVS_LIKELY(rte_eal_process_type() == RTE_PROC_PRIMARY)) {
>> >+ return netdev_dpdk_rxq_recv(rxq_,packets,c);
>> >+ }
>> >+
>> >+ while (OVS_UNLIKELY(rte_ring_dequeue_bulk(rx_ring, (void
>> >+ **)packets,
>> >rx_pkts) != 0) &&
>> >+ rx_pkts > 0) {
>> >+ rx_pkts = rte_ring_count(rx_ring);
>> >+ rx_pkts = (unsigned)MIN(rx_pkts,NETDEV_MAX_BURST);
>
> I wonder would it be more efficient to return EAGAIN at this point rather than retry? Would
> the following 'if' statement ever hit?
>
>> >+ }
>> >+
>> >+ if (!rx_pkts) {
>> >+ return EAGAIN;
>> >+ }
>> >+
>> >+ *c = rx_pkts;
>> >+
>> >+ return 0;
>> >+}
>> >+
>> > static void
>> > __netdev_dpdk_vhost_send(struct netdev *netdev, struct dp_packet
>> **pkts,
>> > int cnt, bool may_steal) @@ -1147,6 +1202,20
>> >@@ netdev_dpdk_vhost_send(struct netdev *netdev, int qid
>> OVS_UNUSED,
>> >struct dp_pack }
>> >
>> > static inline void
>> >+netdev_dpdk_ring_send__(struct netdev_dpdk *netdev,
>> >+ struct dp_packet **pkts, int cnt) {
>> >+ struct rte_ring *tx_ring = netdev->tx_ring;
>> >+ int rslt = 0;
>> >+
>> >+ if (tx_ring != NULL) {
>> >+ do {
>> >+ rslt = rte_ring_enqueue_bulk(tx_ring, (void **)pkts, cnt);
>> >+ } while (rslt == -ENOBUFS);
>
> Bit of a nit but convention in the file is to use err instead or rslt
>
>> >+ }
>> >+}
>> >+
>> >+static inline void
>> > netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
>> > struct dp_packet **pkts, int cnt, bool may_steal)
>> >{ @@ -1812,8 +1881,13 @@ dpdk_ring_create(const char dev_name[],
>> >unsigned int port_no,
>> > }
>> >
>> > /* Create single consumer/producer rings, netdev does explicit
>> >locking. */
>> >- ivshmem->cring_tx = rte_ring_create(ring_name, DPDK_RING_SIZE,
>> >SOCKET0,
>> >- RING_F_SP_ENQ | RING_F_SC_DEQ);
>> >+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
>> >+ ivshmem->cring_tx = rte_ring_create(ring_name, DPDK_RING_SIZE,
>> >SOCKET0,
>> >+ RING_F_SP_ENQ |
>> >RING_F_SC_DEQ);
>> >+ } else {
>> >+ ivshmem->cring_tx = rte_ring_lookup(ring_name);
>> >+ }
>> >+
>> > if (ivshmem->cring_tx == NULL) {
>> > rte_free(ivshmem);
>> > return ENOMEM;
>> >@@ -1825,8 +1899,13 @@ dpdk_ring_create(const char dev_name[],
>> unsigned
>> >int port_no,
>> > }
>> >
>> > /* Create single consumer/producer rings, netdev does explicit
>> >locking. */
>> >- ivshmem->cring_rx = rte_ring_create(ring_name, DPDK_RING_SIZE,
>> >SOCKET0,
>> >- RING_F_SP_ENQ | RING_F_SC_DEQ);
>> >+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
>> >+ ivshmem->cring_rx = rte_ring_create(ring_name, DPDK_RING_SIZE,
>> >SOCKET0,
>> >+ RING_F_SP_ENQ |
>> >RING_F_SC_DEQ);
>> >+ } else {
>> >+ ivshmem->cring_rx = rte_ring_lookup(ring_name);
>> >+ }
>> >+
>> > if (ivshmem->cring_rx == NULL) {
>> > rte_free(ivshmem);
>> > return ENOMEM;
>> >@@ -1888,7 +1967,14 @@ netdev_dpdk_ring_send(struct netdev
>> *netdev_,
>> >int qid,
>> > dp_packet_set_rss_hash(pkts[i], 0);
>> > }
>> >
>> >- netdev_dpdk_send__(netdev, qid, pkts, cnt, may_steal);
>> >+ /* Only use netdev_dpdk_send__() as a primary process. It leads to
>> >the execution
>> >+ * of code that cannot be executed by secondary processes. */
>> >+ if (OVS_LIKELY(rte_eal_process_type() == RTE_PROC_PRIMARY)) {
>> >+ netdev_dpdk_send__(netdev, qid, pkts, cnt, may_steal);
>> >+ } else {
>> >+ netdev_dpdk_ring_send__(netdev, pkts, cnt);
>
> If this is a single producer ring and there is only one qid, how do you ensure
> thread safety?
>
> What happens if may_steal is true?
>
>> >+ }
>> >+
>> > return 0;
>> > }
>> >
>> >@@ -2101,7 +2187,7 @@ static const struct netdev_class dpdk_ring_class =
>> > netdev_dpdk_get_stats,
>> > netdev_dpdk_get_features,
>> > netdev_dpdk_get_status,
>> >- netdev_dpdk_rxq_recv);
>> >+ netdev_dpdk_ring_rxq_recv);
>> >
>> > static const struct netdev_class OVS_UNUSED dpdk_vhost_cuse_class =
>> > NETDEV_DPDK_CLASS(
>> >--
>> >1.9.3 (Apple Git-50)
>> >_______________________________________________
>> >dev mailing list
>> >dev at openvswitch.org
>> >https://urldefense.proofpoint.com/v2/url?u=http-
>> 3A__openvswitch.org_mai
>> >lma
>> >n_listinfo_dev&d=BQIGaQ&c=Sqcl0Ez6M0X8aeM67LKIiDJAXVeAw-
>> YihVMNtXt-uEs&r
>> >=Sm
>> >B5nZacmXNq0gKCC1s_Cw5yUNjxgD4v5kJqZ2uWLlE&m=ksY9ulCC5TLZ-
>> tQnj3WWXeuAHQU
>> >k9J o-mP_ruJ124K0&s=HiTz0cRNA8O4ZBvUJqeOL-KOwygu2-
>> QlDmyVeLTpNBE&e=
>>
>> _______________________________________________
>> dev mailing list
>> dev at openvswitch.org
>> http://openvswitch.org/mailman/listinfo/dev
> _______________________________________________
> dev mailing list
> dev at openvswitch.org
> http://openvswitch.org/mailman/listinfo/dev
More information about the dev
mailing list