[ovs-dev] [PATCH v2] netdev-dpdk: Fix race condition with DPDK mempools in non pmd threads

Daniele Di Proietto ddiproietto at vmware.com
Sun Jul 20 18:11:26 UTC 2014


Thanks for removing the extra rte_mempool_put_bulk().
About the dpdk_buf pointer in ofpbuf, I felt that it might come in handy for
the patch 'dpif-netdev: Polling threads directly call ofproto upcall functions',
but we might change the interface as well.

Thanks again,

Daniele

On Jul 20, 2014, at 10:28 AM, Pravin Shelar <pshelar at nicira.com> wrote:

> I removed call to rte_mempool_put_bulk() from dpdk_queue_flush__().
> Also removed unnecessary dpdk_buf pointer from ofpbuf. Then Pushed to
> master.
> 
> Thanks.
> 
> On Thu, Jul 17, 2014 at 2:29 PM, Daniele Di Proietto
> <ddiproietto at vmware.com> wrote:
>> DPDK mempools rely on rte_lcore_id() to implement a thread-local cache.
>> Our non pmd threads had rte_lcore_id() == 0. This allowed concurrent access to
>> the "thread-local" cache, causing crashes.
>> 
>> This commit resolves the issue with the following changes:
>> 
>> - Every non pmd thread has the same lcore_id (0, for management reasons), which
>>  is not shared with any pmd thread (lcore_id for pmd threads now start from 1)
>> - DPDK mbufs must be allocated/freed in pmd threads. When there is the need to
>>  use mempools in non pmd threads, like in dpdk_do_tx_copy(), a mutex must be
>>  held.
>> - The previous change no longer allows us to pass DPDK mbufs to handler
>>  threads: therefore this commit partially reverts 143859ec63d45e. Now packets
>>  are copied for upcall processing. We can remove the extra memcpy by
>>  processing upcalls in the pmd thread itself.
>> 
>> With the introduction of the extra locking, the packet throughput will be lower
>> in the following cases:
>> 
>> - When using internal (tap) devices with DPDK devices on the same datapath.
>>  Anyway, to support internal devices efficiently, we needed DPDK KNI devices,
>>  which will be proper pmd devices and will not need this locking.
>> - When packets are processed in the slow path by non pmd threads. This overhead
>>  can be avoided by handling the upcalls directly in pmd threads (a change that
>>  has already been proposed by Ryan Wilson)
>> 
>> Also, the following two fixes have been introduced:
>> - In free_dpdk_buf() use rte_pktmbuf_free_seg() instead of rte_mempool_put().
>>  This allows OVS to run properly with CONFIG_RTE_LIBRTE_MBUF_DEBUG DPDK option
>> - Do not bulk free mbufs in a transmission queue. They may belong to different
>>  mempools
>> 
>> Signed-off-by: Daniele Di Proietto <ddiproietto at vmware.com>
>> ---
>> lib/dpif-netdev.c | 25 +++++++++++++++----------
>> lib/dpif-netdev.h |  1 +
>> lib/netdev-dpdk.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++---
>> lib/netdev-dpdk.h |  8 +++++++-
>> lib/ovs-thread.c  |  3 +++
>> 5 files changed, 77 insertions(+), 14 deletions(-)
>> 
>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
>> index 9270873..7ca929a 100644
>> --- a/lib/dpif-netdev.c
>> +++ b/lib/dpif-netdev.c
>> @@ -15,7 +15,7 @@
>>  */
>> 
>> #include <config.h>
>> -#include "dpif.h"
>> +#include "dpif-netdev.h"
>> 
>> #include <ctype.h>
>> #include <errno.h>
>> @@ -71,7 +71,6 @@ VLOG_DEFINE_THIS_MODULE(dpif_netdev);
>> #define NETDEV_RULE_PRIORITY 0x8000
>> 
>> #define FLOW_DUMP_MAX_BATCH 50
>> -#define NR_THREADS 1
>> /* Use per thread recirc_depth to prevent recirculation loop. */
>> #define MAX_RECIRC_DEPTH 5
>> DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
>> @@ -733,7 +732,7 @@ do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
>> 
>>     if (netdev_is_pmd(netdev)) {
>>         dp->pmd_count++;
>> -        dp_netdev_set_pmd_threads(dp, NR_THREADS);
>> +        dp_netdev_set_pmd_threads(dp, NR_PMD_THREADS);
>>         dp_netdev_reload_pmd_threads(dp);
>>     }
>>     ovs_refcount_init(&port->ref_cnt);
>> @@ -2095,6 +2094,7 @@ dp_netdev_input(struct dp_netdev *dp, struct dpif_packet **packets, int cnt,
>> 
>>                 dp_netdev_output_userspace(dp, &buf, 1, hash % dp->n_handlers,
>>                                            DPIF_UC_MISS, mfs[i], NULL);
>> +                dpif_packet_delete(packets[i]);
>>             } else {
>>                 /* No upcall queue.  Freeing the packet */
>>                 dpif_packet_delete(packets[i]);
>> @@ -2161,6 +2161,7 @@ OVS_REQUIRES(q->mutex)
>>         if (userdata) {
>>             buf_size += NLA_ALIGN(userdata->nla_len);
>>         }
>> +        buf_size += ofpbuf_size(packet);
>>         ofpbuf_init(buf, buf_size);
>> 
>>         /* Put ODP flow. */
>> @@ -2175,16 +2176,19 @@ OVS_REQUIRES(q->mutex)
>>                     NLA_ALIGN(userdata->nla_len));
>>         }
>> 
>> -        upcall->packet = *packet;
>> +        /* We have to perform a copy of the packet, because we cannot send DPDK
>> +         * mbufs to a non pmd thread. When the upcall processing will be done
>> +         * in the pmd thread, this copy can be avoided */
>> +        ofpbuf_set_data(&upcall->packet, ofpbuf_put(buf, ofpbuf_data(packet),
>> +                        ofpbuf_size(packet)));
>> +        ofpbuf_set_size(&upcall->packet, ofpbuf_size(packet));
>> 
>>         seq_change(q->seq);
>> 
>>         return 0;
>>     } else {
>> -        ofpbuf_delete(packet);
>>         return ENOBUFS;
>>     }
>> -
>> }
>> 
>> static int
>> @@ -2252,19 +2256,20 @@ dp_execute_cb(void *aux_, struct dpif_packet **packets, int cnt,
>>         miniflow_initialize(&key.flow, key.buf);
>> 
>>         for (i = 0; i < cnt; i++) {
>> -            struct ofpbuf *packet, *userspace_packet;
>> +            struct ofpbuf *packet;
>> 
>>             packet = &packets[i]->ofpbuf;
>> 
>>             miniflow_extract(packet, md, &key.flow);
>> 
>> -            userspace_packet = may_steal ? packet : ofpbuf_clone(packet);
>> -
>> -            dp_netdev_output_userspace(aux->dp, &userspace_packet, 1,
>> +            dp_netdev_output_userspace(aux->dp, &packet, 1,
>>                                        miniflow_hash_5tuple(&key.flow, 0)
>>                                            % aux->dp->n_handlers,
>>                                        DPIF_UC_ACTION, &key.flow,
>>                                        userdata);
>> +            if (may_steal) {
>> +                dpif_packet_delete(packets[i]);
>> +            }
>>         }
>>         break;
>>     }
>> diff --git a/lib/dpif-netdev.h b/lib/dpif-netdev.h
>> index af4a969..0f42d7a 100644
>> --- a/lib/dpif-netdev.h
>> +++ b/lib/dpif-netdev.h
>> @@ -40,6 +40,7 @@ static inline void dp_packet_pad(struct ofpbuf *b)
>> }
>> 
>> #define NR_QUEUE   1
>> +#define NR_PMD_THREADS 1
>> 
>> #ifdef  __cplusplus
>> }
>> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
>> index b925dd28..9315ed9 100644
>> --- a/lib/netdev-dpdk.c
>> +++ b/lib/netdev-dpdk.c
>> @@ -138,6 +138,11 @@ static struct list dpdk_list OVS_GUARDED_BY(dpdk_mutex)
>> static struct list dpdk_mp_list OVS_GUARDED_BY(dpdk_mutex)
>>     = LIST_INITIALIZER(&dpdk_mp_list);
>> 
>> +/* This mutex must be used by non pmd threads when allocating or freeing
>> + * mbufs through mempools. Since dpdk_queue_pkts() and dpdk_queue_flush() may
>> + * use mempools, a non pmd thread should hold this mutex while calling them */
>> +struct ovs_mutex nonpmd_mempool_mutex = OVS_MUTEX_INITIALIZER;
>> +
>> struct dpdk_mp {
>>     struct rte_mempool *mp;
>>     int mtu;
>> @@ -200,6 +205,8 @@ struct netdev_rxq_dpdk {
>>     int port_id;
>> };
>> 
>> +static bool thread_is_pmd(void);
>> +
>> static int netdev_dpdk_construct(struct netdev *);
>> 
>> static bool
>> @@ -223,13 +230,15 @@ dpdk_rte_mzalloc(size_t sz)
>>     return ptr;
>> }
>> 
>> +/* XXX this function should be called only by pmd threads (or by non pmd
>> + * threads holding the nonpmd_mempool_mutex) */
>> void
>> free_dpdk_buf(struct dpif_packet *p)
>> {
>>     struct ofpbuf *buf = &p->ofpbuf;
>>     struct rte_mbuf *pkt = (struct rte_mbuf *) buf->dpdk_buf;
>> 
>> -    rte_mempool_put(pkt->pool, pkt);
>> +    rte_pktmbuf_free_seg(pkt);
>> }
>> 
>> static void
>> @@ -617,7 +626,13 @@ dpdk_queue_flush__(struct netdev_dpdk *dev, int qid)
>> 
>>     nb_tx = rte_eth_tx_burst(dev->port_id, qid, txq->burst_pkts, txq->count);
>>     if (OVS_UNLIKELY(nb_tx != txq->count)) {
>> -        /* free buffers if we couldn't transmit packets */
>> +        /* free buffers, which we couldn't transmit, one at a time (each
>> +         * packet could come from a different mempool) */
>> +        int i;
>> +
>> +        for (i = nb_tx; i < txq->count; i++) {
>> +            rte_pktmbuf_free_seg(txq->burst_pkts[i]);
>> +        }
>>         rte_mempool_put_bulk(dev->dpdk_mp->mp,
>>                              (void **) &txq->burst_pkts[nb_tx],
>>                              (txq->count - nb_tx));
>> @@ -697,6 +712,7 @@ dpdk_queue_pkts(struct netdev_dpdk *dev, int qid,
>> /* Tx function. Transmit packets indefinitely */
>> static void
>> dpdk_do_tx_copy(struct netdev *netdev, struct dpif_packet ** pkts, int cnt)
>> +    OVS_NO_THREAD_SAFETY_ANALYSIS
>> {
>>     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>>     struct rte_mbuf *mbufs[cnt];
>> @@ -704,6 +720,13 @@ dpdk_do_tx_copy(struct netdev *netdev, struct dpif_packet ** pkts, int cnt)
>>     int newcnt = 0;
>>     int i;
>> 
>> +    /* If we are on a non pmd thread we have to use the mempool mutex, because
>> +     * every non pmd thread shares the same mempool cache */
>> +
>> +    if (!thread_is_pmd()) {
>> +        ovs_mutex_lock(&nonpmd_mempool_mutex);
>> +    }
>> +
>>     for (i = 0; i < cnt; i++) {
>>         int size = ofpbuf_size(&pkts[i]->ofpbuf);
>> 
>> @@ -739,6 +762,10 @@ dpdk_do_tx_copy(struct netdev *netdev, struct dpif_packet ** pkts, int cnt)
>> 
>>     dpdk_queue_pkts(dev, NON_PMD_THREAD_TX_QUEUE, mbufs, newcnt);
>>     dpdk_queue_flush(dev, NON_PMD_THREAD_TX_QUEUE);
>> +
>> +    if (!thread_is_pmd()) {
>> +        ovs_mutex_unlock(&nonpmd_mempool_mutex);
>> +    }
>> }
>> 
>> static int
>> @@ -1384,6 +1411,9 @@ dpdk_init(int argc, char **argv)
>>         argv[result] = argv[0];
>>     }
>> 
>> +    /* We are called from the main thread here */
>> +    thread_set_nonpmd();
>> +
>>     return result + 1;
>> }
>> 
>> @@ -1429,7 +1459,25 @@ pmd_thread_setaffinity_cpu(int cpu)
>>         VLOG_ERR("Thread affinity error %d",err);
>>         return err;
>>     }
>> -    RTE_PER_LCORE(_lcore_id) = cpu;
>> +    /* lcore_id 0 is reserved for use by non pmd threads. */
>> +    RTE_PER_LCORE(_lcore_id) = cpu + 1;
>> 
>>     return 0;
>> }
>> +
>> +void
>> +thread_set_nonpmd(void)
>> +{
>> +    /* We cannot have RTE_MAX_LCORE pmd threads, because lcore_id 0 is reserved
>> +     * for non pmd threads */
>> +    BUILD_ASSERT(NR_PMD_THREADS < RTE_MAX_LCORE);
>> +    /* We have to use 0 to allow non pmd threads to perform certain DPDK
>> +     * operations, like rte_eth_dev_configure(). */
>> +    RTE_PER_LCORE(_lcore_id) = 0;
>> +}
>> +
>> +static bool
>> +thread_is_pmd(void)
>> +{
>> +    return rte_lcore_id() != 0;
>> +}
>> diff --git a/lib/netdev-dpdk.h b/lib/netdev-dpdk.h
>> index b021aa5..e4ba6fc 100644
>> --- a/lib/netdev-dpdk.h
>> +++ b/lib/netdev-dpdk.h
>> @@ -2,7 +2,6 @@
>> #define NETDEV_DPDK_H
>> 
>> #include <config.h>
>> -#include "ofpbuf.h"
>> 
>> struct dpif_packet;
>> 
>> @@ -25,6 +24,7 @@ int dpdk_init(int argc, char **argv);
>> void netdev_dpdk_register(void);
>> void free_dpdk_buf(struct dpif_packet *);
>> int pmd_thread_setaffinity_cpu(int cpu);
>> +void thread_set_nonpmd(void);
>> 
>> #else
>> 
>> @@ -52,5 +52,11 @@ pmd_thread_setaffinity_cpu(int cpu OVS_UNUSED)
>>     return 0;
>> }
>> 
>> +static inline void
>> +thread_set_nonpmd(void)
>> +{
>> +    /* Nothing */
>> +}
>> +
>> #endif /* DPDK_NETDEV */
>> #endif
>> diff --git a/lib/ovs-thread.c b/lib/ovs-thread.c
>> index cbee582..fe6fb43 100644
>> --- a/lib/ovs-thread.c
>> +++ b/lib/ovs-thread.c
>> @@ -25,6 +25,7 @@
>> #include <unistd.h>
>> #include "compiler.h"
>> #include "hash.h"
>> +#include "netdev-dpdk.h"
>> #include "ovs-rcu.h"
>> #include "poll-loop.h"
>> #include "seq.h"
>> @@ -326,6 +327,8 @@ ovsthread_wrapper(void *aux_)
>>     set_subprogram_name("%s%u", aux.name, id);
>>     ovsrcu_quiesce_end();
>> 
>> +    thread_set_nonpmd();
>> +
>>     return aux.start(aux.arg);
>> }
>> 
>> --
>> 2.0.0
>> 
>> _______________________________________________
>> dev mailing list
>> dev at openvswitch.org
>> https://urldefense.proofpoint.com/v1/url?u=http://openvswitch.org/mailman/listinfo/dev&k=oIvRg1%2BdGAgOoM1BIlLLqw%3D%3D%0A&r=MV9BdLjtFIdhBDBaw5z%2BU6SSA2gAfY4L%2F1HCy3VjlKU%3D%0A&m=1aFXYM8Q%2FCc7rrU9fjucJ%2BLJAIVF5%2BeP4p1vVjkCNZQ%3D%0A&s=9ce47e62568f8df2a5ca1995ef84d665028fab559df097ba6be34a2cb80d9676




More information about the dev mailing list