[ovs-dev] [PATCHv15 2/2] netdev-afxdp: add new netdev type for AF_XDP.

Ilya Maximets i.maximets at samsung.com
Thu Jul 11 13:42:31 UTC 2019


On 09.07.2019 22:35, William Tu wrote:
> The patch introduces experimental AF_XDP support for OVS netdev.
> AF_XDP, the Address Family of the eXpress Data Path, is a new Linux socket
> type built upon the eBPF and XDP technology.  It aims to have comparable
> performance to DPDK but to cooperate better with the existing kernel's networking
> stack.  An AF_XDP socket receives and sends packets from an eBPF/XDP program
> attached to the netdev, by-passing a couple of the Linux kernel's subsystems.
> As a result, an AF_XDP socket shows much better performance than AF_PACKET.
> For more details about AF_XDP, please see the Linux kernel's
> Documentation/networking/af_xdp.rst. Note that by default, this feature is
> not compiled in.
> 
> Signed-off-by: William Tu <u9012063 at gmail.com>
> ---
> v14:
>  * Mainly address issue reported by Ilya
>    https://patchwork.ozlabs.org/patch/1118972/
>    when doing 'make check-afxdp'
>  * Fix xdp frame headroom issue
>  * Fix vlan test cases by disabling txvlan offload
>  * Skip cvlan
>  * Document TCP limitation (currently all tcp tests fail due to
>    kernel veth driver)
>  * Fix tunnel test cases due to --disable-system (another patch)
>  * Switch to use pthread_spin_lock, suggested by Ben
>  * Add coverage counter for debugging
>  * Fix buffer starvation issue at batch_send reported by Eelco
>    when using tap device with type=afxdp
> 
> v15:
>  * address review feedback from Ilya
>    https://patchwork.ozlabs.org/patch/1125476/
>  * skip TCP related test cases
>  * reclaim all CONS_NUM_DESC at complete tx
>  * add retries to kick_tx
>  * increase memory pool size
>  * remove redundant xdp flag and bind flag
>  * remove unused rx_dropped var
>  * make tx_dropped counter atomic
>  * refactor dp_packet_init_afxdp using dp_packet_init__
>  * rebase to ovs master, test with latest bpf-next kernel commit b14a260e33ddb4
>    Ilya's kernel patches are required
>    commit 455302d1c9ae ("xdp: fix hang while unregistering device bound to xdp socket")
>    commit 162c820ed896 ("xdp: hold device for umem regardless of zero-copy mode")
>  Possible issues:
>  * still lots of afxdp_cq_skip  (ovs-appctl coverage/show)
>     afxdp_cq_skip  44325273.6/sec 34362312.683/sec   572705.2114/sec   total: 2106010377
>  * TODO:
>    'make check-afxdp' still does not pass all tests
>    IP fragmentation expiry test not fixed yet; need to implement
>    deferred memory free, something like dpdk_mp_sweep.  Currently hitting
>    some missing umem descs when reclaiming.

Hi. Regarding this issue: We don't need to reclaim everything from the rings.
We only need to count the number of descriptors that are currently in the rings.
When we're closing the xdp socket, the kernel stops processing the rings; also, all
the buffers in the rings are buffers from the current umem. So, we could just count
them and wait for the number of elements in the umem pool to become
(size - n_packets_in_rings).
'outstanding_tx' already counts all the packets that are in TX and CQ or in the
middle of processing in the kernel. If we count the number of packets in RX and FQ
the same way, we'll know the total number of buffers currently in the kernel.
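
Just to illustrate the accounting (a rough standalone sketch, not the actual OVS
code; NUM_FRAMES and the struct below are made up for the example, the real
change is in the incremental at the end of this mail):

    #include <assert.h>
    #include <stdbool.h>
    #include <stdio.h>

    #define NUM_FRAMES 4096          /* Total umem frames owned by one pool. */

    struct pool_state {
        unsigned int free_in_pool;   /* Elements currently back in the umem pool. */
        unsigned int lost_in_rings;  /* outstanding_tx + available_rx at close time. */
    };

    /* The umem may be freed only when every frame is accounted for: either
     * returned to the pool or known to be stuck in the now-idle rings. */
    static bool
    umem_can_be_freed(const struct pool_state *ps)
    {
        assert(ps->free_in_pool + ps->lost_in_rings <= NUM_FRAMES);
        return ps->free_in_pool + ps->lost_in_rings == NUM_FRAMES;
    }

    int
    main(void)
    {
        /* E.g. 60 descriptors were still sitting in tx/cq/rx/fq when the
         * socket was closed; a later sweep finds all other frames back in
         * the pool, so the umem can be released. */
        struct pool_state ps = { .free_in_pool = NUM_FRAMES - 60,
                                 .lost_in_rings = 60 };
        printf("safe to free? %s\n", umem_can_be_freed(&ps) ? "yes" : "no");
        return 0;
    }

The sweep in the incremental below does exactly this check per unused pool and
frees the pool once the condition holds.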

It might be hard or even impossible to reclaim all the packets from the rings,
because the kernel doesn't update the consumer/producer heads for every packet,
and it depends on the kernel implementation which state the rings will be in
after the socket is closed.

Suggesting the following incremental that works for me:

diff --git a/lib/netdev-afxdp.c b/lib/netdev-afxdp.c
index fe9d5300a..4b9262189 100644
--- a/lib/netdev-afxdp.c
+++ b/lib/netdev-afxdp.c
@@ -34,7 +34,9 @@
 #include "coverage.h"
 #include "dp-packet.h"
 #include "dpif-netdev.h"
+#include "fatal-signal.h"
 #include "openvswitch/dynamic-string.h"
+#include "openvswitch/list.h"
 #include "openvswitch/vlog.h"
 #include "packets.h"
 #include "socket-util.h"
@@ -65,6 +67,57 @@ static void xsk_destroy(struct xsk_socket_info *xsk);
 static int xsk_configure_all(struct netdev *netdev);
 static void xsk_destroy_all(struct netdev *netdev);
 
+struct unused_pool {
+    struct xsk_umem_info *umem_info;
+    int lost_in_rings; /* Number of packets left in tx, rx, cq and fq. */
+    struct ovs_list list_node;
+};
+
+static struct ovs_mutex unused_pools_mutex = OVS_MUTEX_INITIALIZER;
+static struct ovs_list unused_pools OVS_GUARDED_BY(unused_pools_mutex) =
+    OVS_LIST_INITIALIZER(&unused_pools);
+
+static void
+netdev_afxdp_cleanup_unused_pool(struct unused_pool *pool)
+{
+    /* free the packet buffer */
+    free_pagealign(pool->umem_info->buffer);
+
+    /* cleanup umem pool */
+    umem_pool_cleanup(&pool->umem_info->mpool);
+
+    /* cleanup metadata pool */
+    xpacket_pool_cleanup(&pool->umem_info->xpool);
+
+    free(pool->umem_info);
+}
+
+static void
+netdev_afxdp_sweep_unused_pools(void *aux OVS_UNUSED)
+{
+    struct unused_pool *pool, *next;
+    unsigned int count;
+
+    ovs_mutex_lock(&unused_pools_mutex);
+    LIST_FOR_EACH_SAFE (pool, next, list_node, &unused_pools) {
+
+        count = umem_pool_count(&pool->umem_info->mpool);
+        ovs_assert(count + pool->lost_in_rings <= NUM_FRAMES);
+
+        if (count + pool->lost_in_rings == NUM_FRAMES) {
+            /* OVS doesn't use this memory pool anymore.  Kernel doesn't
+             * use it since closing the xdp socket.  So, it's safe to free
+             * the pool now. */
+            VLOG_DBG("Freeing umem pool at 0x%"PRIxPTR,
+                     (uintptr_t) pool->umem_info);
+            ovs_list_remove(&pool->list_node);
+            netdev_afxdp_cleanup_unused_pool(pool);
+            free(pool);
+        }
+    }
+    ovs_mutex_unlock(&unused_pools_mutex);
+}
+
 static struct xsk_umem_info *
 xsk_configure_umem(void *buffer, uint64_t size, int xdpmode)
 {
@@ -221,6 +274,8 @@ xsk_configure(int ifindex, int xdp_queue_id, int xdpmode)
     struct xsk_umem_info *umem;
     void *bufs;
 
+    netdev_afxdp_sweep_unused_pools(NULL);
+
     /* umem memory region */
     bufs = xmalloc_pagealign(NUM_FRAMES * FRAME_SIZE);
     memset(bufs, 0, NUM_FRAMES * FRAME_SIZE);
@@ -234,6 +289,8 @@ xsk_configure(int ifindex, int xdp_queue_id, int xdpmode)
         return NULL;
     }
 
+    VLOG_DBG("Allocated umem pool at 0x%"PRIxPTR, (uintptr_t) umem);
+
     xsk = xsk_configure_socket(umem, ifindex, xdp_queue_id, xdpmode);
     if (!xsk) {
         /* clean up umem and xpacket pool */
@@ -273,6 +330,7 @@ xsk_configure_all(struct netdev *netdev)
         dev->xsks[i] = xsk_info;
         xsk_info->tx_dropped = 0;
         xsk_info->outstanding_tx = 0;
+        xsk_info->available_rx = PROD_NUM_DESCS;
     }
 
     return 0;
@@ -286,6 +344,7 @@ static void
 xsk_destroy(struct xsk_socket_info *xsk_info)
 {
     struct xsk_umem *umem;
+    struct unused_pool *pool;
 
     xsk_socket__delete(xsk_info->xsk);
     xsk_info->xsk = NULL;
@@ -295,17 +354,17 @@ xsk_destroy(struct xsk_socket_info *xsk_info)
         VLOG_ERR("xsk_umem__delete failed");
     }
 
-    /* free the packet buffer */
-    free_pagealign(xsk_info->umem->buffer);
+    pool = xzalloc(sizeof *pool);
+    pool->umem_info = xsk_info->umem;
+    pool->lost_in_rings = xsk_info->outstanding_tx + xsk_info->available_rx;
 
-    /* cleanup umem pool */
-    umem_pool_cleanup(&xsk_info->umem->mpool);
-
-    /* cleanup metadata pool */
-    xpacket_pool_cleanup(&xsk_info->umem->xpool);
+    ovs_mutex_lock(&unused_pools_mutex);
+    ovs_list_push_back(&unused_pools, &pool->list_node);
+    ovs_mutex_unlock(&unused_pools_mutex);
 
-    free(xsk_info->umem);
     free(xsk_info);
+
+    netdev_afxdp_sweep_unused_pools(NULL);
 }
 
 static void
@@ -546,6 +605,7 @@ prepare_fill_queue(struct xsk_socket_info *xsk_info)
         idx_fq++;
     }
     xsk_ring_prod__submit(&umem->fq, BATCH_SIZE);
+    xsk_info->available_rx += BATCH_SIZE;
 }
 
 int
@@ -607,6 +667,7 @@ netdev_afxdp_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch,
     }
     /* Release the RX queue */
     xsk_ring_cons__release(&xsk_info->rx, rcvd);
+    xsk_info->available_rx -= rcvd;
 
     if (qfill) {
         /* TODO: return the number of remaining packets in the queue. */
@@ -865,10 +926,17 @@ netdev_afxdp_rxq_construct(struct netdev_rxq *rxq_ OVS_UNUSED)
 void
 netdev_afxdp_destruct(struct netdev *netdev)
 {
+    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
     struct netdev_linux *dev = netdev_linux_cast(netdev);
     int n_txqs = netdev_n_rxq(netdev);
     int i;
 
+    if (ovsthread_once_start(&once)) {
+        fatal_signal_add_hook(netdev_afxdp_sweep_unused_pools,
+                              NULL, NULL, true);
+        ovsthread_once_done(&once);
+    }
+
     /* Note: tc is by-passed when using drv-mode, but when using
      * skb-mode, we might need to clean up tc. */
 
diff --git a/lib/xdpsock.c b/lib/xdpsock.c
index 40b4462a7..f3d316b8f 100644
--- a/lib/xdpsock.c
+++ b/lib/xdpsock.c
@@ -147,6 +147,12 @@ umem_pool_cleanup(struct umem_pool *umemp)
     umemp->array = NULL;
 }
 
+unsigned int
+umem_pool_count(struct umem_pool *umemp)
+{
+    return umemp->index;
+}
+
 /* AF_XDP metadata init/destroy */
 int
 xpacket_pool_init(struct xpacket_pool *xp, unsigned int size)
diff --git a/lib/xdpsock.h b/lib/xdpsock.h
index e5f1008b7..4c4df9c8a 100644
--- a/lib/xdpsock.h
+++ b/lib/xdpsock.h
@@ -80,7 +80,8 @@ struct xsk_socket_info {
     struct xsk_ring_prod tx;
     struct xsk_umem_info *umem;
     struct xsk_socket *xsk;
-    uint32_t outstanding_tx;
+    uint32_t outstanding_tx; /* Number of descriptors filled in tx and cq. */
+    uint32_t available_rx;   /* Number of descriptors filled in rx and fq. */
     atomic_ulong tx_dropped;
 };
 
@@ -96,6 +97,7 @@ int umem_elem_pop_n(struct umem_pool *umemp, int n, void **addrs);
 
 int umem_pool_init(struct umem_pool *umemp, unsigned int size);
 void umem_pool_cleanup(struct umem_pool *umemp);
+unsigned int umem_pool_count(struct umem_pool *umemp);
 int xpacket_pool_init(struct xpacket_pool *xp, unsigned int size);
 void xpacket_pool_cleanup(struct xpacket_pool *xp);
 
-- 
2.18.1

Best regards, Ilya Maximets.

