[ovs-dev] [PATCHv17 2/2] netdev-afxdp: add new netdev type for AF_XDP.

Ilya Maximets i.maximets at samsung.com
Thu Jul 18 17:19:23 UTC 2019


On 17.07.2019 23:23, William Tu wrote:
> The patch introduces experimental AF_XDP support for OVS netdev.
> AF_XDP, the Address Family of the eXpress Data Path, is a new Linux socket
> type built upon the eBPF and XDP technology.  It is aims to have comparable
> performance to DPDK but cooperate better with existing kernel's networking
> stack.  An AF_XDP socket receives and sends packets from an eBPF/XDP program
> attached to the netdev, by-passing a couple of Linux kernel's subsystems
> As a result, AF_XDP socket shows much better performance than AF_PACKET
> For more details about AF_XDP, please see linux kernel's
> Documentation/networking/af_xdp.rst. Note that by default, this feature is
> not compiled in.
> 
> Signed-off-by: William Tu <u9012063 at gmail.com>
> ---

I prepared an incremental patch to fix some of our misunderstandings and
also cover few new issues I discovered today while reading the code.

Description:

1. I changed 'ovs_assert(false)' to asserts that I wanted to see.
   'ovs_assert' includes a LIKELY macro, so there should be no functional
   changes.

2. I checked "AF_XDP device added to bridge, remove, and added again will fail."
   with virtual interfaces and below incremental applied and everything
   worked fine. So, this limitation seems redundant. (probably #8 fixed it?)

3. I added the note about non-working TCP and vlans over virtual interfaces
   as this seems important.

4. I moved defines and build asserts out of lib/netdev-afxdp-pool.h to
   lib/netdev-afxdp.c which is the only user for them.

5. Refactored __umem_pool_alloc to avoid type casts.

6. I changed the type of xpool->array from the double pointer to a
   single pointer which it really was. This way UMEM2XPKT became redundant.

7. Found the leak of spinlocks on reconfiguration. tx_locks was freed
   without destroying the spinlocks. To fix that I moved locks allocation
   to 'xsk_configure_all' and destruction to 'xsk_destroy_all'. So, these 2
   functions are the only functions that handles 'tx_locks' and everything
   should be cleared correctly now.

8. 'netdev_linux_rxq_destroy' closes the socket. This is unwanted behaviour
    because we'll close it again on netdev_destruct.
    So, I implemented the empty 'netdev_afxdp_rxq_destruct'.

Please, check the code below and share your thoughts.

diff --git a/Documentation/intro/install/afxdp.rst b/Documentation/intro/install/afxdp.rst
index fd0407299..820e9d993 100644
--- a/Documentation/intro/install/afxdp.rst
+++ b/Documentation/intro/install/afxdp.rst
@@ -312,10 +312,11 @@ Limitations/Known Issues
 #. Device's numa ID is always 0, need a way to find numa id from a netdev.
 #. No QoS support because AF_XDP netdev by-pass the Linux TC layer. A possible
    work-around is to use OpenFlow meter action.
-#. AF_XDP device added to bridge, remove, and added again will fail.
 #. Most of the tests are done using i40e single port. Multiple ports and
    also ixgbe driver also needs to be tested.
 #. No latency test result (TODO items)
+#. Due to limitations of current upstream kernel, TCP and various offloading
+   (vlan, cvlan) is not working over virtual interfaces (i.e. veth pair).
 
 
 PVP using tap device
diff --git a/lib/netdev-afxdp-pool.c b/lib/netdev-afxdp-pool.c
index f5898af74..6d29da4d7 100644
--- a/lib/netdev-afxdp-pool.c
+++ b/lib/netdev-afxdp-pool.c
@@ -17,11 +17,7 @@
 
 #include "dp-packet.h"
 #include "netdev-afxdp-pool.h"
-#include "openvswitch/compiler.h"
-
-BUILD_ASSERT_DECL(IS_POW2(NUM_FRAMES));
-BUILD_ASSERT_DECL(PROD_NUM_DESCS == CONS_NUM_DESCS);
-BUILD_ASSERT_DECL(NUM_FRAMES == 4 * (PROD_NUM_DESCS + CONS_NUM_DESCS));
+#include "openvswitch/util.h"
 
 /* Note:
  * umem_elem_push* shouldn't overflow because we always pop
@@ -32,9 +28,7 @@ __umem_elem_push_n(struct umem_pool *umemp, int n, void **addrs)
 {
     void *ptr;
 
-    if (OVS_UNLIKELY(umemp->index + n > umemp->size)) {
-        ovs_assert(false);
-    }
+    ovs_assert(umemp->index + n <= umemp->size);
 
     ptr = &umemp->array[umemp->index];
     memcpy(ptr, addrs, n * sizeof(void *));
@@ -51,9 +45,7 @@ void umem_elem_push_n(struct umem_pool *umemp, int n, void **addrs)
 static inline void
 __umem_elem_push(struct umem_pool *umemp, void *addr)
 {
-    if (OVS_UNLIKELY(umemp->index + 1 > umemp->size)) {
-        ovs_assert(false);
-    }
+    ovs_assert(umemp->index + 1 <= umemp->size);
 
     umemp->array[umemp->index++] = addr;
 }
@@ -119,12 +111,12 @@ umem_elem_pop(struct umem_pool *umemp)
 static void **
 __umem_pool_alloc(unsigned int size)
 {
-    void *bufs;
+    void **bufs;
 
-    bufs = xmalloc_pagealign(size * sizeof bufs);
-    memset(bufs, 0, size * sizeof bufs);
+    bufs = xmalloc_pagealign(size * sizeof *bufs);
+    memset(bufs, 0, size * sizeof *bufs);
 
-    return (void **)bufs;
+    return bufs;
 }
 
 int
@@ -159,14 +151,11 @@ umem_pool_count(struct umem_pool *umemp)
 int
 xpacket_pool_init(struct xpacket_pool *xp, unsigned int size)
 {
-    void *bufs;
-
-    bufs = xmalloc_pagealign(size * sizeof(struct dp_packet_afxdp));
-    memset(bufs, 0, size * sizeof(struct dp_packet_afxdp));
-
-    xp->array = bufs;
+    xp->array = xmalloc_pagealign(size * sizeof *xp->array);
     xp->size = size;
 
+    memset(xp->array, 0, size * sizeof *xp->array);
+
     return 0;
 }
 
diff --git a/lib/netdev-afxdp-pool.h b/lib/netdev-afxdp-pool.h
index dd358ba0c..a8c7e2b8c 100644
--- a/lib/netdev-afxdp-pool.h
+++ b/lib/netdev-afxdp-pool.h
@@ -24,30 +24,10 @@
 #include <bpf/xsk.h>
 #include <errno.h>
 #include <stdbool.h>
-#include <stdio.h>
 
 #include "openvswitch/thread.h"
 #include "ovs-atomic.h"
 
-#define FRAME_HEADROOM  XDP_PACKET_HEADROOM
-#define OVS_XDP_HEADROOM     128
-#define FRAME_SIZE      XSK_UMEM__DEFAULT_FRAME_SIZE
-#define FRAME_SHIFT     XSK_UMEM__DEFAULT_FRAME_SHIFT
-#define FRAME_SHIFT_MASK    ((1 << FRAME_SHIFT) - 1)
-
-#define PROD_NUM_DESCS XSK_RING_PROD__DEFAULT_NUM_DESCS
-#define CONS_NUM_DESCS XSK_RING_CONS__DEFAULT_NUM_DESCS
-
-/* The worst case is all 4 queues TX/CQ/RX/FILL are full + some packets
- * still on processing in threads. Number of packets currently in OVS
- * processing is hard to estimate because it depends on number of ports.
- * Setting NUM_FRAMES twice as large than total of ring sizes should be
- * enough for most corner cases.
- */
-#define NUM_FRAMES      (4 * (PROD_NUM_DESCS + CONS_NUM_DESCS))
-
-#define BATCH_SIZE      NETDEV_MAX_BURST
-
 /* LIFO ptr_array. */
 struct umem_pool {
     int index;      /* Point to top. */
@@ -59,7 +39,7 @@ struct umem_pool {
 /* Array-based dp_packet_afxdp. */
 struct xpacket_pool {
     unsigned int size;
-    struct dp_packet_afxdp **array;
+    struct dp_packet_afxdp *array;
 };
 
 void umem_elem_push(struct umem_pool *umemp, void *addr);
diff --git a/lib/netdev-afxdp.c b/lib/netdev-afxdp.c
index a931c19ec..f273baca8 100644
--- a/lib/netdev-afxdp.c
+++ b/lib/netdev-afxdp.c
@@ -36,6 +36,7 @@
 #include "dp-packet.h"
 #include "dpif-netdev.h"
 #include "fatal-signal.h"
+#include "openvswitch/compiler.h"
 #include "openvswitch/dynamic-string.h"
 #include "openvswitch/list.h"
 #include "openvswitch/vlog.h"
@@ -53,12 +54,32 @@ COVERAGE_DEFINE(afxdp_tx_full);
 COVERAGE_DEFINE(afxdp_cq_skip);
 
 VLOG_DEFINE_THIS_MODULE(netdev_afxdp);
+
 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
 
+#define MAX_XSKQ            16
+#define FRAME_HEADROOM      XDP_PACKET_HEADROOM
+#define OVS_XDP_HEADROOM    128
+#define FRAME_SIZE          XSK_UMEM__DEFAULT_FRAME_SIZE
+#define FRAME_SHIFT         XSK_UMEM__DEFAULT_FRAME_SHIFT
+#define FRAME_SHIFT_MASK    ((1 << FRAME_SHIFT) - 1)
+
+#define PROD_NUM_DESCS      XSK_RING_PROD__DEFAULT_NUM_DESCS
+#define CONS_NUM_DESCS      XSK_RING_CONS__DEFAULT_NUM_DESCS
+
+/* The worst case is all 4 queues TX/CQ/RX/FILL are full + some packets
+ * still on processing in threads. Number of packets currently in OVS
+ * processing is hard to estimate because it depends on number of ports.
+ * Setting NUM_FRAMES twice as large than total of ring sizes should be
+ * enough for most corner cases.
+ */
+#define NUM_FRAMES          (4 * (PROD_NUM_DESCS + CONS_NUM_DESCS))
+#define BATCH_SIZE          NETDEV_MAX_BURST
+
+BUILD_ASSERT_DECL(IS_POW2(NUM_FRAMES));
+BUILD_ASSERT_DECL(PROD_NUM_DESCS == CONS_NUM_DESCS);
+
 #define UMEM2DESC(elem, base) ((uint64_t)((char *)elem - (char *)base))
-#define UMEM2XPKT(base, i) \
-                  ALIGNED_CAST(struct dp_packet_afxdp *, (char *)base + \
-                               i * sizeof(struct dp_packet_afxdp))
 
 static struct xsk_socket_info *xsk_configure(int ifindex, int xdp_queue_id,
                                              int mode);
@@ -201,7 +222,7 @@ xsk_configure_umem(void *buffer, uint64_t size, int xdpmode)
         struct dp_packet_afxdp *xpacket;
         struct dp_packet *packet;
 
-        xpacket = UMEM2XPKT(umem->xpool.array, i);
+        xpacket = &umem->xpool.array[i];
         xpacket->mpool = &umem->mpool;
 
         packet = &xpacket->packet;
@@ -222,7 +243,7 @@ xsk_configure_socket(struct xsk_umem_info *umem, uint32_t ifindex,
     int ret;
     int i;
 
-    xsk = xzalloc(sizeof(*xsk));
+    xsk = xzalloc(sizeof *xsk);
     xsk->umem = umem;
     cfg.rx_size = CONS_NUM_DESCS;
     cfg.tx_size = PROD_NUM_DESCS;
@@ -328,12 +349,15 @@ xsk_configure_all(struct netdev *netdev)
 {
     struct netdev_linux *dev = netdev_linux_cast(netdev);
     struct xsk_socket_info *xsk_info;
-    int i, ifindex, n_rxq;
+    int i, ifindex, n_rxq, n_txq;
 
     ifindex = linux_get_ifindex(netdev_get_name(netdev));
 
+    ovs_assert(dev->xsks == NULL);
+    ovs_assert(dev->tx_locks == NULL);
+
     n_rxq = netdev_n_rxq(netdev);
-    dev->xsks = xzalloc(n_rxq * sizeof *dev->xsks);
+    dev->xsks = xcalloc(n_rxq, sizeof *dev->xsks);
 
     /* Configure each queue. */
     for (i = 0; i < n_rxq; i++) {
@@ -351,6 +375,13 @@ xsk_configure_all(struct netdev *netdev)
         xsk_info->available_rx = PROD_NUM_DESCS;
     }
 
+    n_txq = netdev_n_txq(netdev);
+    dev->tx_locks = xcalloc(n_txq, sizeof *dev->tx_locks);
+
+    for (i = 0; i < n_txq; i++) {
+        ovs_spin_init(&dev->tx_locks[i]);
+    }
+
     return 0;
 
 err:
@@ -391,21 +422,30 @@ xsk_destroy_all(struct netdev *netdev)
     struct netdev_linux *dev = netdev_linux_cast(netdev);
     int i, ifindex;
 
-    ifindex = linux_get_ifindex(netdev_get_name(netdev));
-
-    for (i = 0; i < netdev_n_rxq(netdev); i++) {
-        if (dev->xsks && dev->xsks[i]) {
-            xsk_destroy(dev->xsks[i]);
-            dev->xsks[i] = NULL;
-            VLOG_INFO("Destroyed xsk[%d].", i);
+    if (dev->xsks) {
+        for (i = 0; i < netdev_n_rxq(netdev); i++) {
+            if (dev->xsks[i]) {
+                xsk_destroy(dev->xsks[i]);
+                dev->xsks[i] = NULL;
+                VLOG_INFO("Destroyed xsk[%d].", i);
+            }
         }
+
+        free(dev->xsks);
+        dev->xsks = NULL;
     }
 
     VLOG_INFO("%s: Removing xdp program.", netdev_get_name(netdev));
+    ifindex = linux_get_ifindex(netdev_get_name(netdev));
     xsk_remove_xdp_program(ifindex, dev->xdpmode);
 
-    free(dev->xsks);
-    dev->xsks = NULL;
+    if (dev->tx_locks) {
+        for (i = 0; i < netdev_n_txq(netdev); i++) {
+            ovs_spin_destroy(&dev->tx_locks[i]);
+        }
+        free(dev->tx_locks);
+        dev->tx_locks = NULL;
+    }
 }
 
 static inline void OVS_UNUSED
@@ -475,20 +515,6 @@ netdev_afxdp_get_config(const struct netdev *netdev, struct smap *args)
     return 0;
 }
 
-static void
-netdev_afxdp_alloc_txq(struct netdev *netdev)
-{
-    struct netdev_linux *dev = netdev_linux_cast(netdev);
-    int n_txqs = netdev_n_txq(netdev);
-    int i;
-
-    dev->tx_locks = xcalloc(n_txqs, sizeof *dev->tx_locks);
-
-    for (i = 0; i < n_txqs; i++) {
-        ovs_spin_init(&dev->tx_locks[i]);
-    }
-}
-
 int
 netdev_afxdp_reconfigure(struct netdev *netdev)
 {
@@ -504,11 +530,9 @@ netdev_afxdp_reconfigure(struct netdev *netdev)
     }
 
     xsk_destroy_all(netdev);
-    free(dev->tx_locks);
 
     netdev->n_rxq = dev->requested_n_rxq;
     netdev->n_txq = netdev->n_rxq;
-    netdev_afxdp_alloc_txq(netdev);
 
     if (dev->requested_xdpmode == XDP_ZEROCOPY) {
         dev->xdpmode = XDP_ZEROCOPY;
@@ -664,7 +688,7 @@ netdev_afxdp_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch,
 
         pkt = xsk_umem__get_data(umem->buffer, addr);
         index = addr >> FRAME_SHIFT;
-        xpacket = UMEM2XPKT(umem->xpool.array, index);
+        xpacket = &umem->xpool.array[index];
         packet = &xpacket->packet;
 
         /* Initialize the struct dp_packet. */
@@ -935,13 +959,17 @@ netdev_afxdp_rxq_construct(struct netdev_rxq *rxq_ OVS_UNUSED)
    return 0;
 }
 
+void
+netdev_afxdp_rxq_destruct(struct netdev_rxq *rxq_ OVS_UNUSED)
+{
+    /* Nothing. */
+}
+
 void
 netdev_afxdp_destruct(struct netdev *netdev)
 {
     static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
     struct netdev_linux *dev = netdev_linux_cast(netdev);
-    int n_txqs = netdev_n_txq(netdev);
-    int i;
 
     if (ovsthread_once_start(&once)) {
         fatal_signal_add_hook(netdev_afxdp_sweep_unused_pools,
@@ -954,10 +982,6 @@ netdev_afxdp_destruct(struct netdev *netdev)
 
     xsk_destroy_all(netdev);
     ovs_mutex_destroy(&dev->mutex);
-
-    for (i = 0; i < n_txqs; i++) {
-        ovs_spin_destroy(&dev->tx_locks[i]);
-    }
 }
 
 int
diff --git a/lib/netdev-afxdp.h b/lib/netdev-afxdp.h
index c7d5a0a7b..e9e031c33 100644
--- a/lib/netdev-afxdp.h
+++ b/lib/netdev-afxdp.h
@@ -27,8 +27,6 @@
 /* These functions are Linux AF_XDP specific, so they should be used directly
  * only by Linux-specific code. */
 
-#define MAX_XSKQ 16
-
 struct netdev;
 struct xsk_socket_info;
 struct xdp_umem;
@@ -40,6 +38,7 @@ struct netdev_stats;
 struct netdev_custom_stats;
 
 int netdev_afxdp_rxq_construct(struct netdev_rxq *rxq_);
+void netdev_afxdp_rxq_destruct(struct netdev_rxq *rxq_);
 void netdev_afxdp_destruct(struct netdev *netdev_);
 
 int netdev_afxdp_rxq_recv(struct netdev_rxq *rxq_,
diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
index c55f79261..ad73c98e6 100644
--- a/lib/netdev-linux.c
+++ b/lib/netdev-linux.c
@@ -3236,7 +3236,6 @@ exit:
     .arp_lookup = netdev_linux_arp_lookup,                      \
     .update_flags = netdev_linux_update_flags,                  \
     .rxq_alloc = netdev_linux_rxq_alloc,                        \
-    .rxq_destruct = netdev_linux_rxq_destruct,                  \
     .rxq_dealloc = netdev_linux_rxq_dealloc,                    \
     .rxq_wait = netdev_linux_rxq_wait,                          \
     .rxq_drain = netdev_linux_rxq_drain
@@ -3253,6 +3252,7 @@ const struct netdev_class netdev_linux_class = {
     .get_block_id = netdev_linux_get_block_id,
     .send = netdev_linux_send,
     .rxq_construct = netdev_linux_rxq_construct,
+    .rxq_destruct = netdev_linux_rxq_destruct,
     .rxq_recv = netdev_linux_rxq_recv,
 };
 
@@ -3267,6 +3267,7 @@ const struct netdev_class netdev_tap_class = {
     .get_status = netdev_linux_get_status,
     .send = netdev_linux_send,
     .rxq_construct = netdev_linux_rxq_construct,
+    .rxq_destruct = netdev_linux_rxq_destruct,
     .rxq_recv = netdev_linux_rxq_recv,
 };
 
@@ -3280,6 +3281,7 @@ const struct netdev_class netdev_internal_class = {
     .get_status = netdev_internal_get_status,
     .send = netdev_linux_send,
     .rxq_construct = netdev_linux_rxq_construct,
+    .rxq_destruct = netdev_linux_rxq_destruct,
     .rxq_recv = netdev_linux_rxq_recv,
 };
 
@@ -3299,6 +3301,7 @@ const struct netdev_class netdev_afxdp_class = {
     .get_numa_id = netdev_afxdp_get_numa_id,
     .send = netdev_afxdp_batch_send,
     .rxq_construct = netdev_afxdp_rxq_construct,
+    .rxq_destruct = netdev_afxdp_rxq_destruct,
     .rxq_recv = netdev_afxdp_rxq_recv,
 };
 #endif
---


Best regards, Ilya Maximets.


More information about the dev mailing list