[ovs-dev] PATCH 1/1 : netdev-dpdk / add dpdk rings to netdev-dpdk

Rogers, Gerald gerald.rogers at intel.com
Wed Apr 30 20:01:50 UTC 2014


This patch enables the client dpdk rings within the netdev-dpdk.  It adds
a new
dpdk device called dpdkr (other naming suggestions?).  This allows for the
use
of shared memory to communicate with other dpdk applications, on the host
or
within a virtual machine.  Instructions for use are in INSTALL.DPDK.

This has been tested on Intel multi-core platforms and with the client
application
within the host.

Signed-off-by: Gerald Rogers <gerald.rogers at intel.com>
Signed-off-by: Gerald Rogers <gerald.rogers at intel.com>


diff --git a/INSTALL.DPDK b/INSTALL.DPDK
index 3e0247a..ec1b814 100644
--- a/INSTALL.DPDK
+++ b/INSTALL.DPDK
@@ -74,6 +74,44 @@ Once first DPDK port is added vswitchd, it creates
Polling thread and
 polls dpdk device in continuous loop. Therefore CPU utilization
 for that thread is always 100%.
 
+DPDK Rings :
+
+Following the steps above to create a bridge, you can now add dpdk rings
+as a port to the vswitch.  OVS will expect the DPDK ring device name to
+start with dpdkr and end with portid.
+
+    ovs-vsctl add-port br0 dpdkr0 -- set Interface dpdkr0 type=dpdkr
+
+DPDK rings client test application
+
+Included in the test directory is a sample DPDK application for testing
+the rings.  This is from the base dpdk directory and modified to work
+with the ring naming used within ovs.
+
+location tests/ovs_client
+
+To run the client in tests directory :
+
+ovsclient -c 1 -n 4 --proc-type=secondary -- --n "port id you gave dpdkr"
+
+It is essential to have --proc-type=secondary
+
+The application simply receives an mbuf on the receive queue of the
+ethernet ring and then places that same mbuf on the transmit ring of
+the ethernet ring.  It is a trivial loopback application.
+
+In addition to executing the client in the host, you can execute it
within 
+a guest VM. To do so you will need a patched qemu.  You can download the
+patch and getting started guid at :
+
+https://01.org/packet-processing/downloads
+
+A general rule of thumb for better performance is that the client
+application shouldn't be assigned the same dpdk core mask "-c" as
+the vswitchd.
+
+
+
 Restrictions:
 -------------
 
@@ -86,6 +124,11 @@ Restrictions:
     device queues configured.
   - Work with 1500 MTU, needs few changes in DPDK lib to fix this issue.
   - Currently DPDK port does not make use any offload functionality.
+  ivshmem
+  - The shared memory is currently restricted to the use of a 1GB
+   huge pages.  
+  - All huge pages are shared amongst the host, clients, virtual
+   machines etc.  
 
 Bug Reporting:
 --------------
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index fd991ab..a90913f 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -148,6 +148,23 @@ struct dpdk_tx_queue {
     struct rte_mbuf *burst_pkts[MAX_TX_QUEUE_LEN];
 };
 
+/* dpdk has no way to remove dpdk ring ethernet devices
+   so we have to keep them around once they've been created
+*/
+
+static struct list dpdk_ring_list OVS_GUARDED_BY(dpdk_mutex)
+    = LIST_INITIALIZER(&dpdk_ring_list);
+
+struct dpdk_ring {
+    /* For the client rings */
+    struct rte_ring * cring_tx;
+    struct rte_ring * cring_rx;
+    int port_id; /* dpdkr port id */
+    int eth_port_id; /* ethernet device port id */
+    struct list list_node OVS_GUARDED_BY(mutex);
+
+};
+
 struct netdev_dpdk {
     struct netdev up;
     int port_id;
@@ -167,9 +184,14 @@ struct netdev_dpdk {
     uint8_t hwaddr[ETH_ADDR_LEN];
     enum netdev_flags flags;
 
+
     struct rte_eth_link link;
     int link_reset_cnt;
 
+    /* If a shared memory ring */
+ 
+    struct dpdk_ring *ivshmem;
+
     /* In dpdk_list. */
     struct list list_node OVS_GUARDED_BY(dpdk_mutex);
 };
@@ -571,6 +593,7 @@ dpdk_queue_flush(struct netdev_dpdk *dev, int qid)
     if (txq->count == 0) {
         return;
     }
+
     rte_spinlock_lock(&txq->tx_lock);
     nb_tx = rte_eth_tx_burst(dev->port_id, qid, txq->burst_pkts,
txq->count);
     if (nb_tx != txq->count) {
@@ -593,6 +616,7 @@ netdev_dpdk_rxq_recv(struct netdev_rxq *rxq_, struct
ofpbuf **packets, int *c)
 
     dpdk_queue_flush(dev, rxq_->queue_id);
 
+
     nb_rx = rte_eth_rx_burst(rx->port_id, rxq_->queue_id,
                              (struct rte_mbuf **) packets,
MAX_RX_QUEUE_LEN);
     if (!nb_rx) {
@@ -1170,6 +1194,200 @@ static struct netdev_class netdev_dpdk_class = {
     NULL,                       /* rxq_drain */
 };
 
+/* Client Rings */
+
+static int
+dpdk_classr_init(void)
+{
+  VLOG_INFO("Initialized dpdk client handlers:\n");
+  return 0;
+}
+
+static int
+netdev_dpdkr_construct(struct netdev *netdev_)
+{
+    struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_);
+    struct dpdk_ring *ivshmem;
+    unsigned int port_no;
+    char *cport;
+    int err=0;
+    int found=0;
+    char name[10];
+
+
+    if (rte_eal_init_ret) {
+        return rte_eal_init_ret;
+    }
+
+    ovs_mutex_lock(&dpdk_mutex);
+    cport = netdev_->name + 5; /* Names always start with "dpdkr" */
+
+    if (strncmp(netdev_->name, "dpdkr", 5)) {
+        VLOG_ERR("Not a valid dpdkr name %s:\n",netdev_->name);
+        err = ENODEV;
+        goto unlock_dpdk;
+    }
+
+    port_no = strtol(cport, 0, 0); /* string must be null terminated */
+
+    ovs_mutex_init(&netdev->mutex);
+
+    ovs_mutex_lock(&netdev->mutex);
+    netdev->flags = 0;
+
+    netdev->mtu = ETHER_MTU;
+    netdev->max_packet_len = MTU_TO_MAX_LEN(netdev->mtu);
+
+    /* TODO: need to discover device node at run time. */
+    netdev->socket_id = SOCKET0;
+
+    netdev->dpdk_mp = dpdk_mp_get(netdev->socket_id, netdev->mtu);
+    if (!netdev->dpdk_mp) {
+        VLOG_ERR("Unable to get memory pool\n");
+        err = ENOMEM;
+        goto unlock_dev;
+    }
+
+     /* look through our list to find the device */
+     LIST_FOR_EACH(ivshmem, list_node, &dpdk_ring_list){
+         if (ivshmem->port_id == port_no) {
+            VLOG_INFO("Found dpdk ring device %s:\n", netdev_->name);
+            found = 1;
+            break;
+         }
+    }
+    if (found) {
+        netdev->ivshmem = ivshmem;
+        netdev->port_id = ivshmem->eth_port_id; /* really all that is
needed */
+    } 
+    else {
+        /* Need to create the device rings */
+        
+        ivshmem = dpdk_rte_mzalloc(sizeof *ivshmem);
+        if (ivshmem == NULL)
+            goto unlock_dev;
+
+        snprintf(name,10,"%s_tx",netdev_->name);
+        ivshmem->cring_tx = rte_ring_create(name, MAX_RX_QUEUE_LEN,
SOCKET0, 0);
+        if (ivshmem->cring_tx == NULL)
+            goto unlock_dev;
+
+        snprintf(name,10,"%s_rx",netdev_->name);
+        ivshmem->cring_rx = rte_ring_create(name, MAX_RX_QUEUE_LEN,
SOCKET0, 0);
+        if (ivshmem->cring_rx == NULL)
+            goto unlock_dev;
+
+        err = rte_eth_from_rings(&ivshmem->cring_rx, 1,
&ivshmem->cring_tx, 1, SOCKET0);
+        if (err < 0) {
+           goto unlock_dev;
+         }
+
+        ivshmem->port_id = port_no;
+        ivshmem->eth_port_id = netdev->port_id = rte_eth_dev_count()-1 ;
+       
+        netdev->ivshmem=ivshmem;
+
+        list_push_back(&dpdk_ring_list, &ivshmem->list_node);
+
+    }
+
+    err = dpdk_eth_dev_init(netdev);
+    if (err) {
+        goto unlock_dev;
+    }
+
+    list_push_back(&dpdk_list, &netdev->list_node);
+
+    err = 0;
+unlock_dev:
+    ovs_mutex_unlock(&netdev->mutex);
+unlock_dpdk:
+    ovs_mutex_unlock(&dpdk_mutex);
+    return err;
+}
+
+static void
+netdev_dpdkr_destruct(struct netdev *netdev_)
+{
+    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev_);
+
+    ovs_mutex_lock(&dev->mutex);
+    rte_eth_dev_stop(dev->port_id);
+
+    ovs_mutex_unlock(&dev->mutex);
+
+    ovs_mutex_lock(&dpdk_mutex);
+    list_remove(&dev->list_node);
+    dpdk_mp_put(dev->dpdk_mp);
+    ovs_mutex_unlock(&dpdk_mutex);
+
+    ovs_mutex_destroy(&dev->mutex);
+}
+
+static struct netdev_class netdev_dpdkr_class = {
+    "dpdkr",
+    dpdk_classr_init,            /* init */
+    NULL,                       /* netdev_dpdk_run */
+    NULL,                       /* netdev_dpdk_wait */
+
+    netdev_dpdk_alloc,
+    netdev_dpdkr_construct,
+    netdev_dpdkr_destruct,
+    netdev_dpdk_dealloc,
+    netdev_dpdk_get_config,
+    NULL,                       /* netdev_dpdk_set_config */
+    NULL,                       /* get_tunnel_config */
+
+    netdev_dpdk_send,           /* send */
+    NULL,                       /* send_wait */
+
+    netdev_dpdk_set_etheraddr,
+    netdev_dpdk_get_etheraddr,
+    netdev_dpdk_get_mtu,
+    netdev_dpdk_set_mtu,
+    netdev_dpdk_get_ifindex,
+    netdev_dpdk_get_carrier,
+    netdev_dpdk_get_carrier_resets,
+    netdev_dpdk_set_miimon,
+    netdev_dpdk_get_stats,
+    netdev_dpdk_set_stats,
+    netdev_dpdk_get_features,
+    NULL,                       /* set_advertisements */
+
+    NULL,                       /* set_policing */
+    NULL,                       /* get_qos_types */
+    NULL,                       /* get_qos_capabilities */
+    NULL,                       /* get_qos */
+    NULL,                       /* set_qos */
+    NULL,                       /* get_queue */
+    NULL,                       /* set_queue */
+    NULL,                       /* delete_queue */
+    NULL,                       /* get_queue_stats */
+    NULL,                       /* queue_dump_start */
+    NULL,                       /* queue_dump_next */
+    NULL,                       /* queue_dump_done */
+    NULL,                       /* dump_queue_stats */
+
+    NULL,                       /* get_in4 */
+    NULL,                       /* set_in4 */
+    NULL,                       /* get_in6 */
+    NULL,                       /* add_router */
+    NULL,                       /* get_next_hop */
+    netdev_dpdk_get_status,
+    NULL,                       /* arp_lookup */
+
+    netdev_dpdk_update_flags,
+
+    netdev_dpdk_rxq_alloc,
+    netdev_dpdk_rxq_construct,
+    netdev_dpdk_rxq_destruct,
+    netdev_dpdk_rxq_dealloc,
+    netdev_dpdk_rxq_recv,
+    NULL,                       /* rx_wait */
+    NULL,                       /* rxq_drain */
+};
+
+
 int
 dpdk_init(int argc, char **argv)
 {
@@ -1196,6 +1414,7 @@ void
 netdev_dpdk_register(void)
 {
     netdev_register_provider(&netdev_dpdk_class);
+    netdev_register_provider(&netdev_dpdkr_class);
 }
 
 int
diff --git a/lib/netdev-dpdk.h b/lib/netdev-dpdk.h
index 2807310..8cb715a 100644
--- a/lib/netdev-dpdk.h
+++ b/lib/netdev-dpdk.h
@@ -10,6 +10,7 @@
 #include <rte_eal.h>
 #include <rte_debug.h>
 #include <rte_ethdev.h>
+#include <rte_eth_ring.h>
 #include <rte_errno.h>
 #include <rte_memzone.h>
 #include <rte_memcpy.h>
diff --git a/lib/netdev.c b/lib/netdev.c
index 2fc1834..610de13 100644
--- a/lib/netdev.c
+++ b/lib/netdev.c
@@ -98,7 +98,7 @@ netdev_n_rxq(const struct netdev *netdev)
 bool
 netdev_is_pmd(const struct netdev *netdev)
 {
-    return !strcmp(netdev->netdev_class->type, "dpdk");
+    return !strncmp(netdev->netdev_class->type, "dpdk",4);
 }
 
 static void
diff --git a/tests/automake.mk b/tests/automake.mk
index bf80702..0067812 100644
--- a/tests/automake.mk
+++ b/tests/automake.mk
@@ -204,6 +204,11 @@ tests/idltest.ovsidl: $(IDLTEST_IDL_FILES)
 
 tests/idltest.c: tests/idltest.h
 
+noinst_PROGRAMS += tests/ovsclient
+tests_ovsclient_SOURCES = \
+	tests/ovs_client/ovs_client.c
+tests_ovsclient_LDADD = lib/libopenvswitch.la $(LIBS)
+
 noinst_PROGRAMS += tests/ovstest
 tests_ovstest_SOURCES = \
 	tests/ovstest.c \
diff --git a/tests/ovs_client/ovs_client.c b/tests/ovs_client/ovs_client.c
new file mode 100644
index 0000000..07f49ed
--- /dev/null
+++ b/tests/ovs_client/ovs_client.c
@@ -0,0 +1,217 @@
+/*
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <getopt.h>
+
+#include <config.h>
+#include <rte_config.h>
+#include <rte_mbuf.h>
+#include <rte_ether.h>
+#include <rte_string_fns.h>
+#include <rte_ip.h>
+#include <rte_byteorder.h>
+
+/* Number of packets to attempt to read from queue */
+#define PKT_READ_SIZE  ((uint16_t)32)
+
+/* define common names for structures shared between ovs_dpdk and client
*/
+#define MP_CLIENT_RXQ_NAME "dpdkr%u_tx"
+#define MP_CLIENT_TXQ_NAME "dpdkr%u_rx"
+
+#define RTE_LOGTYPE_APP RTE_LOGTYPE_USER1
+
+#define BASE_10 10
+
+/* our client id number - tells us which rx queue to read, and tx
+ * queue to write to. */
+static uint8_t client_id = 0;
+
+int no_pkt;
+int pkt;
+
+/*
+ * Given the rx queue name template above, get the queue name
+ */
+static inline const char *
+get_rx_queue_name(unsigned id)
+{
+	/* buffer for return value. Size calculated by %u being replaced
+	 * by maximum 3 digits (plus an extra byte for safety) */
+	static char buffer[sizeof(MP_CLIENT_RXQ_NAME) + 2];
+
+	rte_snprintf(buffer, sizeof(buffer) - 1, MP_CLIENT_RXQ_NAME, id);
+	return buffer;
+}
+
+/*
+ * Given the tx queue name template above, get the queue name
+ */
+static inline const char *
+get_tx_queue_name(unsigned id)
+{
+	/* buffer for return value. Size calculated by %u being replaced
+	 * by maximum 3 digits (plus an extra byte for safety) */
+	static char buffer[sizeof(MP_CLIENT_TXQ_NAME) + 2];
+
+	rte_snprintf(buffer, sizeof(buffer) - 1, MP_CLIENT_TXQ_NAME, id);
+	return buffer;
+}
+
+/*
+ * print a usage message
+ */
+static void
+usage(const char *progname)
+{
+	printf("\nUsage: %s [EAL args] -- -n <client_id>\n", progname);
+}
+
+/*
+ * Convert the client id number from a string to an int.
+ */
+static int
+parse_client_num(const char *client)
+{
+	char *end = NULL;
+	unsigned long temp = 0;
+
+	if (client == NULL || *client == '\0')
+		return -1;
+
+	temp = strtoul(client, &end, BASE_10);
+	/* If valid string argument is provided, terminating '/0' character
+	 * is stored in 'end' */
+	if (end == NULL || *end != '\0')
+		return -1;
+
+	client_id = (uint8_t)temp;
+	return 0;
+}
+
+/*
+ * Parse the application arguments to the client app.
+ */
+static int
+parse_app_args(int argc, char *argv[])
+{
+	int option_index = 0, opt = 0;
+	char **argvopt = argv;
+	const char *progname = NULL;
+	static struct option lgopts[] = {
+		{NULL, 0, 0, 0 }
+	};
+	progname = argv[0];
+
+	while ((opt = getopt_long(argc, argvopt, "n:s:d:", lgopts,
+		&option_index)) != EOF){
+		switch (opt){
+			case 'n':
+				if (parse_client_num(optarg) != 0){
+					usage(progname);
+					return -1;
+				}
+				break;
+			default:
+				usage(progname);
+				return -1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Application main function - loops through
+ * receiving and processing packets. Never returns
+ */
+int
+main(int argc, char *argv[])
+{
+	struct rte_ring *rx_ring = NULL;
+	struct rte_ring *tx_ring = NULL;
+	int retval = 0;
+	void *pkts[PKT_READ_SIZE];
+	int rslt = 0;
+
+	if ((retval = rte_eal_init(argc, argv)) < 0)
+		return -1;
+
+	argc -= retval;
+	argv += retval;
+
+	if (parse_app_args(argc, argv) < 0)
+		rte_exit(EXIT_FAILURE, "Invalid command-line arguments\n");
+
+	rx_ring = rte_ring_lookup(get_rx_queue_name(client_id));
+	if (rx_ring == NULL)
+		rte_exit(EXIT_FAILURE, "Cannot get RX ring - is server process
running?\n");
+
+	tx_ring = rte_ring_lookup(get_tx_queue_name(client_id));
+	if (tx_ring == NULL)
+		rte_exit(EXIT_FAILURE, "Cannot get TX ring - is server process
running?\n");
+
+	RTE_LOG(INFO, APP, "Finished Process Init.\n");
+
+	printf("\nClient process %d handling packets\n", client_id);
+	printf("[Press Ctrl-C to quit ...]\n");
+
+	for (;;) {
+		unsigned rx_pkts = PKT_READ_SIZE;
+
+		/* try dequeuing max possible packets first, if that fails, get the
+		 * most we can. Loop body should only execute once, maximum */
+		while (rx_pkts > 0 &&
+				unlikely(rte_ring_dequeue_bulk(rx_ring, pkts, rx_pkts) != 0))
+			rx_pkts = (uint16_t)RTE_MIN(rte_ring_count(rx_ring), PKT_READ_SIZE);
+
+                if(rx_pkts>0)
+                {
+
+                    pkt++;
+
+		    /* blocking enqueue */
+		    do {
+			    rslt = rte_ring_enqueue_bulk(tx_ring, pkts, rx_pkts);
+		    } while (rslt == -ENOBUFS);
+                }
+                else
+                   no_pkt++;
+
+               if (!(pkt %  100000))
+               {
+                     printf("pkt %d %d\n", pkt, no_pkt);
+                     pkt=no_pkt=0;
+               }
+	}
+}





More information about the dev mailing list