[ovs-dev] PATCH 1/1 : netdev-dpdk / add dpdk rings to netdev-dpdk
Pravin Shelar
pshelar at nicira.com
Tue May 6 21:49:27 UTC 2014
Thanks for patch, It looks good, I have comments mostly related to coding style.
Subject line should follow following format
[PATCH <n>/<m>] <area>: <summary>
On Thu, May 1, 2014 at 3:58 AM, Gerald Rogers <gerald.rogers at intel.com> wrote:
> This patch enables the client dpdk rings within the netdev-dpdk. It adds
> a new dpdk device called dpdkr (other naming suggestions?). This allows
> for the use of shared memory to communicate with other dpdk applications,
> on the host or within a virtual machine. Instructions for use are in
> INSTALL.DPDK.
>
> This has been tested on Intel multi-core platforms and with the client
> application within the host.
>
>
> Signed-off-by: Gerald Rogers <gerald.rogers at intel.com>
> Signed-off-by: Gerald Rogers <gerald.rogers at intel.com>
>
Duplicate signed-off line.
> diff --git a/INSTALL.DPDK b/INSTALL.DPDK
> index 3e0247a..ec1b814 100644
> --- a/INSTALL.DPDK
> +++ b/INSTALL.DPDK
> @@ -74,6 +74,44 @@ Once first DPDK port is added vswitchd, it creates Polling thread and
> polls dpdk device in continuous loop. Therefore CPU utilization
> for that thread is always 100%.
>
> +DPDK Rings :
> +
> +Following the steps above to create a bridge, you can now add dpdk rings
> +as a port to the vswitch. OVS will expect the DPDK ring device name to
> +start with dpdkr and end with portid.
> +
> + ovs-vsctl add-port br0 dpdkr0 -- set Interface dpdkr0 type=dpdkr
> +
> +DPDK rings client test application
> +
> +Included in the test directory is a sample DPDK application for testing
> +the rings. This is from the base dpdk directory and modified to work
> +with the ring naming used within ovs.
> +
> +location tests/ovs_client
> +
> +To run the client :
> +
> +ovsclient -c 1 -n 4 --proc-type=secondary -- --n "port id you gave dpdkr"
> +
> +It is essential to have --proc-type=secondary
> +
> +The application simply receives an mbuf on the receive queue of the
> +ethernet ring and then places that same mbuf on the transmit ring of
> +the ethernet ring. It is a trivial loopback application.
> +
> +In addition to executing the client in the host, you can execute it within
> +a guest VM. To do so you will need a patched qemu. You can download the
> +patch and getting started guid at :
> +
> +https://01.org/packet-processing/downloads
> +
> +A general rule of thumb for better performance is that the client
> +application shouldn't be assigned the same dpdk core mask "-c" as
> +the vswitchd.
> +
> +
> +
> Restrictions:
> -------------
>
> @@ -86,6 +124,11 @@ Restrictions:
> device queues configured.
> - Work with 1500 MTU, needs few changes in DPDK lib to fix this issue.
> - Currently DPDK port does not make use any offload functionality.
> + ivshmem
> + - The shared memory is currently restricted to the use of a 1GB
> + huge pages.
I am not sure why do we have this limit.
> + - All huge pages are shared amongst the host, clients, virtual
> + machines etc.
>
> Bug Reporting:
> --------------
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index fd991ab..a90913f 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -148,6 +148,23 @@ struct dpdk_tx_queue {
> struct rte_mbuf *burst_pkts[MAX_TX_QUEUE_LEN];
> };
>
> +/* dpdk has no way to remove dpdk ring ethernet devices
> + so we have to keep them around once they've been created
> +*/
> +
> +static struct list dpdk_ring_list OVS_GUARDED_BY(dpdk_mutex)
> + = LIST_INITIALIZER(&dpdk_ring_list);
> +
> +struct dpdk_ring {
> + /* For the client rings */
> + struct rte_ring * cring_tx;
> + struct rte_ring * cring_rx;
No need to have space after *.
> + int port_id; /* dpdkr port id */
> + int eth_port_id; /* ethernet device port id */
> + struct list list_node OVS_GUARDED_BY(mutex);
> +
> +};
Extra blank line.
> +
> struct netdev_dpdk {
> struct netdev up;
> int port_id;
> @@ -167,9 +184,14 @@ struct netdev_dpdk {
> uint8_t hwaddr[ETH_ADDR_LEN];
> enum netdev_flags flags;
>
> +
> struct rte_eth_link link;
> int link_reset_cnt;
>
> + /* If a shared memory ring */
> +
> + struct dpdk_ring *ivshmem;
> +
> /* In dpdk_list. */
> struct list list_node OVS_GUARDED_BY(dpdk_mutex);
> };
> @@ -571,6 +593,7 @@ dpdk_queue_flush(struct netdev_dpdk *dev, int qid)
> if (txq->count == 0) {
> return;
> }
> +
> rte_spinlock_lock(&txq->tx_lock);
> nb_tx = rte_eth_tx_burst(dev->port_id, qid, txq->burst_pkts, txq->count);
> if (nb_tx != txq->count) {
> @@ -593,6 +616,7 @@ netdev_dpdk_rxq_recv(struct netdev_rxq *rxq_, struct ofpbuf **packets, int *c)
>
> dpdk_queue_flush(dev, rxq_->queue_id);
>
> +
> nb_rx = rte_eth_rx_burst(rx->port_id, rxq_->queue_id,
> (struct rte_mbuf **) packets, MAX_RX_QUEUE_LEN);
> if (!nb_rx) {
> @@ -1170,6 +1194,200 @@ static struct netdev_class netdev_dpdk_class = {
> NULL, /* rxq_drain */
> };
>
> +/* Client Rings */
> +
> +static int
> +dpdk_classr_init(void)
> +{
> + VLOG_INFO("Initialized dpdk client handlers:\n");
> + return 0;
> +}
> +
> +static int
> +netdev_dpdkr_construct(struct netdev *netdev_)
> +{
> + struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_);
> + struct dpdk_ring *ivshmem;
> + unsigned int port_no;
> + char *cport;
> + int err=0;
> + int found=0;
> + char name[10];
We can move name-definition to if-else block where is used.
> +
> +
Extra blank line.
> + if (rte_eal_init_ret) {
> + return rte_eal_init_ret;
> + }
> +
> + ovs_mutex_lock(&dpdk_mutex);
> + cport = netdev_->name + 5; /* Names always start with "dpdkr" */
> +
> + if (strncmp(netdev_->name, "dpdkr", 5)) {
> + VLOG_ERR("Not a valid dpdkr name %s:\n",netdev_->name);
> + err = ENODEV;
> + goto unlock_dpdk;
> + }
> +
> + port_no = strtol(cport, 0, 0); /* string must be null terminated */
> +
> + ovs_mutex_init(&netdev->mutex);
> +
> + ovs_mutex_lock(&netdev->mutex);
> + netdev->flags = 0;
> +
> + netdev->mtu = ETHER_MTU;
> + netdev->max_packet_len = MTU_TO_MAX_LEN(netdev->mtu);
> +
> + /* TODO: need to discover device node at run time. */
> + netdev->socket_id = SOCKET0;
> +
> + netdev->dpdk_mp = dpdk_mp_get(netdev->socket_id, netdev->mtu);
> + if (!netdev->dpdk_mp) {
> + VLOG_ERR("Unable to get memory pool\n");
> + err = ENOMEM;
> + goto unlock_dev;
> + }
> +
> + /* look through our list to find the device */
> + LIST_FOR_EACH(ivshmem, list_node, &dpdk_ring_list){
> + if (ivshmem->port_id == port_no) {
> + VLOG_INFO("Found dpdk ring device %s:\n", netdev_->name);
> + found = 1;
> + break;
> + }
> + }
> + if (found) {
> + netdev->ivshmem = ivshmem;
"netdev->ivshmem" is only assigned not used anywhere in code.
> + netdev->port_id = ivshmem->eth_port_id; /* really all that is needed */
> + }
> + else {
> + /* Need to create the device rings */
> +
> + ivshmem = dpdk_rte_mzalloc(sizeof *ivshmem);
> + if (ivshmem == NULL)
> + goto unlock_dev;
> +
> + snprintf(name,10,"%s_tx",netdev_->name);
> + ivshmem->cring_tx = rte_ring_create(name, MAX_RX_QUEUE_LEN, SOCKET0, 0);
> + if (ivshmem->cring_tx == NULL)
> + goto unlock_dev;
> +
> + snprintf(name,10,"%s_rx",netdev_->name);
> + ivshmem->cring_rx = rte_ring_create(name, MAX_RX_QUEUE_LEN, SOCKET0, 0);
Do we need to relation between ring size when we create device and
when we open it?
> + if (ivshmem->cring_rx == NULL)
> + goto unlock_dev;
Need to free memory in error case.
> +
> + err = rte_eth_from_rings(&ivshmem->cring_rx, 1, &ivshmem->cring_tx, 1, SOCKET0);
> + if (err < 0) {
> + goto unlock_dev;
> + }
Extra space
> +
> + ivshmem->port_id = port_no;
> + ivshmem->eth_port_id = netdev->port_id = rte_eth_dev_count()-1 ;
> +
> + netdev->ivshmem=ivshmem;
> +
Need space.
> + list_push_back(&dpdk_ring_list, &ivshmem->list_node);
> +
> + }
> +
> + err = dpdk_eth_dev_init(netdev);
> + if (err) {
> + goto unlock_dev;
> + }
> +
> + list_push_back(&dpdk_list, &netdev->list_node);
> +
> + err = 0;
> +unlock_dev:
> + ovs_mutex_unlock(&netdev->mutex);
> +unlock_dpdk:
> + ovs_mutex_unlock(&dpdk_mutex);
> + return err;
> +}
> +
> +static void
> +netdev_dpdkr_destruct(struct netdev *netdev_)
> +{
> + struct netdev_dpdk *dev = netdev_dpdk_cast(netdev_);
> +
> + ovs_mutex_lock(&dev->mutex);
> + rte_eth_dev_stop(dev->port_id);
> +
> + ovs_mutex_unlock(&dev->mutex);
> +
> + ovs_mutex_lock(&dpdk_mutex);
> + list_remove(&dev->list_node);
> + dpdk_mp_put(dev->dpdk_mp);
> + ovs_mutex_unlock(&dpdk_mutex);
> +
> + ovs_mutex_destroy(&dev->mutex);
> +}
> +
> +static struct netdev_class netdev_dpdkr_class = {
> + "dpdkr",
> + dpdk_classr_init, /* init */
> + NULL, /* netdev_dpdk_run */
> + NULL, /* netdev_dpdk_wait */
> +
> + netdev_dpdk_alloc,
> + netdev_dpdkr_construct,
> + netdev_dpdkr_destruct,
> + netdev_dpdk_dealloc,
> + netdev_dpdk_get_config,
> + NULL, /* netdev_dpdk_set_config */
> + NULL, /* get_tunnel_config */
> +
> + netdev_dpdk_send, /* send */
> + NULL, /* send_wait */
> +
> + netdev_dpdk_set_etheraddr,
> + netdev_dpdk_get_etheraddr,
> + netdev_dpdk_get_mtu,
> + netdev_dpdk_set_mtu,
> + netdev_dpdk_get_ifindex,
> + netdev_dpdk_get_carrier,
> + netdev_dpdk_get_carrier_resets,
> + netdev_dpdk_set_miimon,
> + netdev_dpdk_get_stats,
> + netdev_dpdk_set_stats,
> + netdev_dpdk_get_features,
> + NULL, /* set_advertisements */
> +
> + NULL, /* set_policing */
> + NULL, /* get_qos_types */
> + NULL, /* get_qos_capabilities */
> + NULL, /* get_qos */
> + NULL, /* set_qos */
> + NULL, /* get_queue */
> + NULL, /* set_queue */
> + NULL, /* delete_queue */
> + NULL, /* get_queue_stats */
> + NULL, /* queue_dump_start */
> + NULL, /* queue_dump_next */
> + NULL, /* queue_dump_done */
> + NULL, /* dump_queue_stats */
> +
> + NULL, /* get_in4 */
> + NULL, /* set_in4 */
> + NULL, /* get_in6 */
> + NULL, /* add_router */
> + NULL, /* get_next_hop */
> + netdev_dpdk_get_status,
> + NULL, /* arp_lookup */
> +
> + netdev_dpdk_update_flags,
> +
> + netdev_dpdk_rxq_alloc,
> + netdev_dpdk_rxq_construct,
> + netdev_dpdk_rxq_destruct,
> + netdev_dpdk_rxq_dealloc,
> + netdev_dpdk_rxq_recv,
> + NULL, /* rx_wait */
> + NULL, /* rxq_drain */
> +};
> +
> +
We can avoid dup definition, checkout TUNNEL_CLASS() in netdev-vport.c
> int
> dpdk_init(int argc, char **argv)
> {
> @@ -1196,6 +1414,7 @@ void
> netdev_dpdk_register(void)
> {
> netdev_register_provider(&netdev_dpdk_class);
> + netdev_register_provider(&netdev_dpdkr_class);
> }
>
> int
> diff --git a/lib/netdev-dpdk.h b/lib/netdev-dpdk.h
> index 2807310..8cb715a 100644
> --- a/lib/netdev-dpdk.h
> +++ b/lib/netdev-dpdk.h
> @@ -10,6 +10,7 @@
> #include <rte_eal.h>
> #include <rte_debug.h>
> #include <rte_ethdev.h>
> +#include <rte_eth_ring.h>
> #include <rte_errno.h>
> #include <rte_memzone.h>
> #include <rte_memcpy.h>
> diff --git a/lib/netdev.c b/lib/netdev.c
> index 2fc1834..610de13 100644
> --- a/lib/netdev.c
> +++ b/lib/netdev.c
> @@ -98,7 +98,7 @@ netdev_n_rxq(const struct netdev *netdev)
> bool
> netdev_is_pmd(const struct netdev *netdev)
> {
> - return !strcmp(netdev->netdev_class->type, "dpdk");
> + return !strncmp(netdev->netdev_class->type, "dpdk",4);
> }
It would be easier to understand if dkdpr is explicitly checked.
>
> static void
> diff --git a/tests/automake.mk b/tests/automake.mk
> index bf80702..0067812 100644
> --- a/tests/automake.mk
> +++ b/tests/automake.mk
> @@ -204,6 +204,11 @@ tests/idltest.ovsidl: $(IDLTEST_IDL_FILES)
>
> tests/idltest.c: tests/idltest.h
>
> +noinst_PROGRAMS += tests/ovsclient
> +tests_ovsclient_SOURCES = \
> + tests/ovs_client/ovs_client.c
> +tests_ovsclient_LDADD = lib/libopenvswitch.la $(LIBS)
> +
> noinst_PROGRAMS += tests/ovstest
> tests_ovstest_SOURCES = \
> tests/ovstest.c \
> diff --git a/tests/ovs_client/ovs_client.c b/tests/ovs_client/ovs_client.c
> new file mode 100644
> index 0000000..07f49ed
> --- /dev/null
> +++ b/tests/ovs_client/ovs_client.c
This file need lot of coding style fixes.
> @@ -0,0 +1,217 @@
> +/*
> + * BSD LICENSE
> + *
> + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
> + * All rights reserved.
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + *
> + * * Redistributions of source code must retain the above copyright
> + * notice, this list of conditions and the following disclaimer.
> + * * Redistributions in binary form must reproduce the above copyright
> + * notice, this list of conditions and the following disclaimer in
> + * the documentation and/or other materials provided with the
> + * distribution.
> + * * Neither the name of Intel Corporation nor the names of its
> + * contributors may be used to endorse or promote products derived
> + * from this software without specific prior written permission.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + *
> + */
> +
> +#include <getopt.h>
> +
> +#include <config.h>
> +#include <rte_config.h>
> +#include <rte_mbuf.h>
> +#include <rte_ether.h>
> +#include <rte_string_fns.h>
> +#include <rte_ip.h>
> +#include <rte_byteorder.h>
> +
> +/* Number of packets to attempt to read from queue */
> +#define PKT_READ_SIZE ((uint16_t)32)
> +
> +/* define common names for structures shared between ovs_dpdk and client */
> +#define MP_CLIENT_RXQ_NAME "dpdkr%u_tx"
> +#define MP_CLIENT_TXQ_NAME "dpdkr%u_rx"
> +
> +#define RTE_LOGTYPE_APP RTE_LOGTYPE_USER1
> +
> +#define BASE_10 10
> +
> +/* our client id number - tells us which rx queue to read, and tx
> + * queue to write to. */
> +static uint8_t client_id = 0;
> +
> +int no_pkt;
> +int pkt;
> +
> +/*
> + * Given the rx queue name template above, get the queue name
> + */
> +static inline const char *
> +get_rx_queue_name(unsigned id)
> +{
> + /* buffer for return value. Size calculated by %u being replaced
> + * by maximum 3 digits (plus an extra byte for safety) */
> + static char buffer[sizeof(MP_CLIENT_RXQ_NAME) + 2];
> +
> + rte_snprintf(buffer, sizeof(buffer) - 1, MP_CLIENT_RXQ_NAME, id);
> + return buffer;
> +}
> +
> +/*
> + * Given the tx queue name template above, get the queue name
> + */
> +static inline const char *
> +get_tx_queue_name(unsigned id)
> +{
> + /* buffer for return value. Size calculated by %u being replaced
> + * by maximum 3 digits (plus an extra byte for safety) */
> + static char buffer[sizeof(MP_CLIENT_TXQ_NAME) + 2];
> +
> + rte_snprintf(buffer, sizeof(buffer) - 1, MP_CLIENT_TXQ_NAME, id);
> + return buffer;
> +}
> +
> +/*
> + * print a usage message
> + */
> +static void
> +usage(const char *progname)
> +{
> + printf("\nUsage: %s [EAL args] -- -n <client_id>\n", progname);
> +}
> +
> +/*
> + * Convert the client id number from a string to an int.
> + */
> +static int
> +parse_client_num(const char *client)
> +{
> + char *end = NULL;
> + unsigned long temp = 0;
> +
> + if (client == NULL || *client == '\0')
> + return -1;
> +
> + temp = strtoul(client, &end, BASE_10);
> + /* If valid string argument is provided, terminating '/0' character
> + * is stored in 'end' */
> + if (end == NULL || *end != '\0')
> + return -1;
> +
> + client_id = (uint8_t)temp;
> + return 0;
> +}
> +
> +/*
> + * Parse the application arguments to the client app.
> + */
> +static int
> +parse_app_args(int argc, char *argv[])
> +{
> + int option_index = 0, opt = 0;
> + char **argvopt = argv;
> + const char *progname = NULL;
> + static struct option lgopts[] = {
> + {NULL, 0, 0, 0 }
> + };
> + progname = argv[0];
> +
> + while ((opt = getopt_long(argc, argvopt, "n:s:d:", lgopts,
> + &option_index)) != EOF){
> + switch (opt){
> + case 'n':
> + if (parse_client_num(optarg) != 0){
> + usage(progname);
> + return -1;
> + }
> + break;
> + default:
> + usage(progname);
> + return -1;
> + }
> + }
> +
> + return 0;
> +}
> +
> +/*
> + * Application main function - loops through
> + * receiving and processing packets. Never returns
> + */
> +int
> +main(int argc, char *argv[])
> +{
> + struct rte_ring *rx_ring = NULL;
> + struct rte_ring *tx_ring = NULL;
> + int retval = 0;
> + void *pkts[PKT_READ_SIZE];
> + int rslt = 0;
> +
> + if ((retval = rte_eal_init(argc, argv)) < 0)
> + return -1;
> +
> + argc -= retval;
> + argv += retval;
> +
> + if (parse_app_args(argc, argv) < 0)
> + rte_exit(EXIT_FAILURE, "Invalid command-line arguments\n");
> +
> + rx_ring = rte_ring_lookup(get_rx_queue_name(client_id));
> + if (rx_ring == NULL)
> + rte_exit(EXIT_FAILURE, "Cannot get RX ring - is server process running?\n");
> +
> + tx_ring = rte_ring_lookup(get_tx_queue_name(client_id));
> + if (tx_ring == NULL)
> + rte_exit(EXIT_FAILURE, "Cannot get TX ring - is server process running?\n");
> +
> + RTE_LOG(INFO, APP, "Finished Process Init.\n");
> +
> + printf("\nClient process %d handling packets\n", client_id);
> + printf("[Press Ctrl-C to quit ...]\n");
> +
> + for (;;) {
> + unsigned rx_pkts = PKT_READ_SIZE;
> +
> + /* try dequeuing max possible packets first, if that fails, get the
> + * most we can. Loop body should only execute once, maximum */
> + while (rx_pkts > 0 &&
> + unlikely(rte_ring_dequeue_bulk(rx_ring, pkts, rx_pkts) != 0))
> + rx_pkts = (uint16_t)RTE_MIN(rte_ring_count(rx_ring), PKT_READ_SIZE);
> +
> + if(rx_pkts>0)
> + {
> +
> + pkt++;
> +
> + /* blocking enqueue */
> + do {
> + rslt = rte_ring_enqueue_bulk(tx_ring, pkts, rx_pkts);
> + } while (rslt == -ENOBUFS);
> + }
> + else
> + no_pkt++;
> +
> + if (!(pkt % 100000))
> + {
> + printf("pkt %d %d\n", pkt, no_pkt);
> + pkt=no_pkt=0;
> + }
> + }
> +}
> _______________________________________________
> dev mailing list
> dev at openvswitch.org
> http://openvswitch.org/mailman/listinfo/dev
More information about the dev
mailing list