[ovs-dev] [PATCH 2/4] netdev-dpdk: Apply ingress_sched config to dpdk phy ports

Kavanagh, Mark B mark.b.kavanagh at intel.com
Fri Aug 4 15:13:49 UTC 2017


>From: ovs-dev-bounces at openvswitch.org [mailto:ovs-dev-bounces at openvswitch.org]
>On Behalf Of Billy O'Mahony
>Sent: Thursday, July 20, 2017 5:11 PM
>To: dev at openvswitch.org
>Subject: [ovs-dev] [PATCH 2/4] netdev-dpdk: Apply ingress_sched config to dpdk
>phy ports
>
>Ingress scheduling configuration is given effect by way of Flow Director
>filters. A small subset of the ingress scheduling possible is
>implemented in this patch.
>
>Signed-off-by: Billy O'Mahony <billy.o.mahony at intel.com>

Hi Billy,

As a general comment, this patch doesn't apply to HEAD of master, so please rebase as part of rework.

Review comments inline.

Thanks,
Mark 


>---
> include/openvswitch/ofp-parse.h |   3 +
> lib/dpif-netdev.c               |   1 +
> lib/netdev-dpdk.c               | 167 ++++++++++++++++++++++++++++++++++++++-
>-
> vswitchd/bridge.c               |   2 +
> 4 files changed, 166 insertions(+), 7 deletions(-)
>
>diff --git a/include/openvswitch/ofp-parse.h b/include/openvswitch/ofp-parse.h
>index fc5784e..08d6086 100644
>--- a/include/openvswitch/ofp-parse.h
>+++ b/include/openvswitch/ofp-parse.h
>@@ -37,6 +37,9 @@ struct ofputil_table_mod;
> struct ofputil_bundle_msg;
> struct ofputil_tlv_table_mod;
> struct simap;
>+struct tun_table;
>+struct flow_wildcards;
>+struct ofputil_port_map;
> enum ofputil_protocol;
>
> char *parse_ofp_str(struct ofputil_flow_mod *, int command, const char *str_,
>diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
>index 47a9fa0..d35566f 100644
>--- a/lib/dpif-netdev.c
>+++ b/lib/dpif-netdev.c
>@@ -44,6 +44,7 @@
> #include "dp-packet.h"
> #include "dpif.h"
> #include "dpif-provider.h"
>+#include "netdev-provider.h"

If a setter function for modifying netdev->ingress_sched_str is implemented, there is no need to include netdev-provider.h

> #include "dummy.h"
> #include "fat-rwlock.h"
> #include "flow.h"
>diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
>index e74c50f..e393abf 100644
>--- a/lib/netdev-dpdk.c
>+++ b/lib/netdev-dpdk.c
>@@ -33,6 +33,8 @@
> #include <rte_meter.h>
> #include <rte_virtio_net.h>
>
>+#include <openvswitch/ofp-parse.h>
>+#include <openvswitch/ofp-util.h>

Move these include below, with the other openvswitch include file.

> #include "dirs.h"
> #include "dp-packet.h"
> #include "dpdk.h"
>@@ -169,6 +171,10 @@ static const struct rte_eth_conf port_conf = {
>     .txmode = {
>         .mq_mode = ETH_MQ_TX_NONE,
>     },
>+    .fdir_conf = {
>+        .mode = RTE_FDIR_MODE_PERFECT,

As you mentioned in your cover letter, you've only tested on a Fortville NIC.
How widely supported are the Flow Director features across DPDK-supported NICs? 

>+    },
>+
> };
>
> enum { DPDK_RING_SIZE = 256 };
>@@ -330,6 +336,11 @@ enum dpdk_hw_ol_features {
>     NETDEV_RX_CHECKSUM_OFFLOAD = 1 << 0,
> };
>
>+union ingress_filter {
>+    struct rte_eth_ethertype_filter ethertype;
>+    struct rte_eth_fdir_filter fdir;
>+};
>+
> struct netdev_dpdk {
>     struct netdev up;
>     dpdk_port_t port_id;
>@@ -369,8 +380,11 @@ struct netdev_dpdk {
>     /* If true, device was attached by rte_eth_dev_attach(). */
>     bool attached;
>
>-    /* Ingress Scheduling config */
>+    /* Ingress Scheduling config & state. */
>     char *ingress_sched_str;
>+    bool ingress_sched_changed;
>+    enum rte_filter_type ingress_filter_type;
>+    union ingress_filter ingress_filter;
>
>     /* In dpdk_list. */
>     struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
>@@ -653,6 +667,15 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int
>n_rxq, int n_txq)
>     int i;
>     struct rte_eth_conf conf = port_conf;
>
>+    /* Ingress scheduling requires ETH_MQ_RX_NONE so limit it to when exactly
>+     * two rxqs are defined. Otherwise MQ will not work as expected. */

So if a user later enables multiq (i.e. > 2 rxqs), that implicitly disables ingress scheduling?

In any event, the comment here needs to be much clearer, as this is the first mention within the code that
ingress scheduling is limited to just 2 rxqs. Is that limitation just for the PoC?

It may also be useful to briefly explain why RSS prevents ingress scheduling. 

>+    if (dev->ingress_sched_str && n_rxq == 2) {
>+        conf.rxmode.mq_mode = ETH_MQ_RX_NONE;
>+    }
>+    else {
>+        conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
>+    }
>+
>     if (dev->mtu > ETHER_MTU) {
>         conf.rxmode.jumbo_frame = 1;
>         conf.rxmode.max_rx_pkt_len = dev->max_packet_len;
>@@ -730,6 +753,121 @@ dpdk_eth_flow_ctrl_setup(struct netdev_dpdk *dev)
>OVS_REQUIRES(dev->mutex)
>     }
> }
>
>+static void

Why no return value?

>+dpdk_apply_ingress_scheduling(struct netdev_dpdk *dev, int n_rxq)
>+{
>+    if (!dev->ingress_sched_str) {
>+        return;
>+    }
>+
>+    if (n_rxq != 2) {

If n_rxq is always 2, then remove that parameter from this function, and perform the check (i.e. n_rxq ==2) in dpdk_eth_dev_init instead.

>+        VLOG_ERR("Interface %s: Ingress scheduling config ignored; " \
>+                 "Requires n_rxq==2.", dev->up.name);
>+    }
>+
>+    int priority_q_id = n_rxq-1;

So, this is always 1, right?

>+    char *key, *val, *str, *iter;
>+
>+    ovs_be32 ip_src, ip_dst;
>+    ip_src = ip_dst = 0;
>+
>+    uint16_t eth_type, port_src, port_dst;
>+    eth_type = port_src = port_dst = 0;
>+    uint8_t ip_proto = 0;
>+    int diag = 0;
>+
>+    /* delete any existing filter */
>+    if (dev->ingress_filter_type == RTE_ETH_FILTER_FDIR) {
>+        diag = rte_eth_dev_filter_ctrl(dev->port_id, RTE_ETH_FILTER_FDIR,
>+            RTE_ETH_FILTER_DELETE, &dev->ingress_filter.fdir);
>+    } else if (dev->ingress_filter_type == RTE_ETH_FILTER_ETHERTYPE) {
>+        diag = rte_eth_dev_filter_ctrl(dev->port_id,
>RTE_ETH_FILTER_ETHERTYPE,
>+            RTE_ETH_FILTER_DELETE, &dev->ingress_filter.ethertype);
>+    }
>+
>+    char *mallocd_str; /* str_to_x returns malloc'd str we'll need to free */

Move inline comment above the line. Also, malloc'd string is only returned in the event of error (otherwise NULL), so you might want to clarify the comment.

>+    /* Parse the configuration into local vars */
>+    iter = str = xstrdup(dev->ingress_sched_str);

Why are two strings (i.e. iter and str) needed here?

>+    while (ofputil_parse_key_value (&iter, &key, &val)) {
>+        if (strcmp(key, "nw_src") == 0 || strcmp(key, "ip_src") == 0) {
>+            mallocd_str = str_to_ip(val, &ip_src);
>+        } else if (strcmp(key, "nw_dst") == 0 || strcmp(key, "ip_dst") == 0)
>{
>+            mallocd_str = str_to_ip(val, &ip_dst);
>+        } else if (strcmp(key, "dl_type") == 0 ||
>+                   strcmp(key, "eth_type") == 0) {
>+            mallocd_str = str_to_u16(val, "eth_type/dl_type", &eth_type);
>+        } else if (strcmp(key, "tcp_src") == 0 ||
>+                  strcmp(key, "tp_src") == 0 ||
>+                  strcmp(key, "udp_src") == 0) {
>+            mallocd_str = str_to_u16(val, "tcp/udp_src", &port_src);
>+        } else if (strcmp(key, "tcp_dst") == 0 ||
>+                   strcmp(key, "tp_dst") == 0 ||
>+                   strcmp(key, "udp_dst") == 0) {
>+            mallocd_str = str_to_u16(val, "tcp/udp_dst", &port_dst);
>+        } else if (strcmp(key, "ip") == 0) {

Use defined macros for well-known values - examples below.

>+            eth_type = 0x0800;

ETH_P_IP <linux/if_ether.h>

>+        } else if (strcmp(key, "udp") == 0) {
>+            eth_type = 0x0800;
>+            ip_proto = 17;

IPPROTO_UDP <netinet/in.h>

>+        } else if (strcmp(key, "tcp") == 0) {
>+            eth_type = 0x0800;
>+            ip_proto = 6;

IPPROTO_TCP <netinet/in.h>

>+        } else {
>+            VLOG_WARN("Ignoring unsupported ingress scheduling field '%s'", \
>+                      key);
>+        }
>+        if (mallocd_str) {
>+            VLOG_ERR ("%s", mallocd_str);
>+            free(mallocd_str);
>+            mallocd_str = NULL;
>+        }
>+    }
>+    free (str);

Breaking out the parsing into a separate function would make this function more readable.

>+
>+    /* Set the filters */
>+    if (eth_type && ip_src && ip_dst && port_src && port_dst && ip_proto) {
>+        struct rte_eth_fdir_filter entry = { 0 };

This causes a compiler error for me.

>+        entry.input.flow_type = RTE_ETH_FLOW_NONFRAG_IPV4_UDP;

Is only UDP flow supported for the POC?

>+        entry.input.flow.udp4_flow.ip.src_ip = ip_src;
>+        entry.input.flow.udp4_flow.ip.dst_ip = ip_dst;
>+        entry.input.flow.udp4_flow.src_port = htons(port_src);
>+        entry.input.flow.udp4_flow.dst_port = htons(port_dst);
>+        entry.action.rx_queue = priority_q_id;
>+        entry.action.behavior = RTE_ETH_FDIR_ACCEPT;
>+        entry.action.report_status = RTE_ETH_FDIR_REPORT_ID;
>+        diag = rte_eth_dev_filter_ctrl(dev->port_id,
>+                RTE_ETH_FILTER_FDIR, RTE_ETH_FILTER_ADD, &entry);
>+        dev->ingress_filter_type = RTE_ETH_FILTER_FDIR;
>+        dev->ingress_filter.fdir = entry;
>+    }
>+    else if (eth_type && !ip_src && !ip_dst && !port_src
>+             && !port_dst && !ip_proto) {
>+        struct rte_eth_ethertype_filter entry = {0};
>+        memset (&entry, 0, sizeof entry);
>+        entry.ether_type = eth_type;
>+        entry.flags = 0;
>+        entry.queue = priority_q_id;
>+        diag = rte_eth_dev_filter_ctrl(dev->port_id,
>+                RTE_ETH_FILTER_ETHERTYPE, RTE_ETH_FILTER_ADD, &entry);
>+        dev->ingress_filter.ethertype = entry;
>+        dev->ingress_filter_type = RTE_ETH_FILTER_ETHERTYPE;
>+    }
>+    else {
>+        VLOG_ERR("Unsupported ingress scheduling match-field combination.");
>+        dev->ingress_filter_type = RTE_ETH_FILTER_NONE;
>+        return;
>+    }
>+
>+    if (diag) {
>+        dev->ingress_filter_type = RTE_ETH_FILTER_NONE;
>+        VLOG_ERR("Failed to add ingress scheduling filter.");
>+    }
>+    else {
>+        /* Mark the appropriate q as prioritized */
>+        dev->up.priority_rxq = priority_q_id;
>+    }
>+}
>+
> static int
> dpdk_eth_dev_init(struct netdev_dpdk *dev)
>     OVS_REQUIRES(dev->mutex)
>@@ -764,6 +902,8 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
>         return -diag;
>     }
>
>+    dpdk_apply_ingress_scheduling(dev, n_rxq);
>+
>     diag = rte_eth_dev_start(dev->port_id);
>     if (diag) {
>         VLOG_ERR("Interface %s start error: %s", dev->up.name,
>@@ -870,6 +1010,9 @@ common_construct(struct netdev *netdev, dpdk_port_t
>port_no,
>     dev->requested_rxq_size = NIC_PORT_DEFAULT_RXQ_SIZE;
>     dev->requested_txq_size = NIC_PORT_DEFAULT_TXQ_SIZE;
>
>+    dev->ingress_sched_str = NULL;
>+    dev->ingress_sched_changed = false;
>+    dev->ingress_filter_type = RTE_ETH_FILTER_NONE;
>     /* Initialize the flow control to NULL */
>     memset(&dev->fc_conf, 0, sizeof dev->fc_conf);
>
>@@ -1950,11 +2093,20 @@ netdev_dpdk_set_ingress_sched(struct netdev *netdev,
> {
>     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>
>-    free(dev->ingress_sched_str);
>-    if (ingress_sched_str) {
>-        dev->ingress_sched_str = xstrdup(ingress_sched_str);
>+    if ((ingress_sched_str && dev->ingress_sched_str &&
>+        strcmp(ingress_sched_str, dev->ingress_sched_str) == 0) ||
>+        (!ingress_sched_str && !dev->ingress_sched_str)) {
>+        /* no-op; new cfg == old cfg or else both are NULL */

Just one may be NULL, not necessarily both.

>+        return 0;
>+    } else {
>+        /* free the old, copy in the new */
>+        free(dev->ingress_sched_str);
>+        if (ingress_sched_str) {
>+            dev->ingress_sched_str = xstrdup(ingress_sched_str);
>+        }
>+        dev->ingress_sched_changed = true;
>+        netdev_request_reconfigure(netdev);

Why trigger a netdev_request_reconfigure here? The ingress_sched_str has already been set, so it doesn't make sense.
A solution may be to add another netdev member 'requested_ingress_sched_str', which is assigned the value of 'ingress_sched_str' in this function.
Then, in netdev_dpdk_reconfigure, dev->ingress_sched_str = dev->requested_ingress_sched_str.

>     }
>-
>     return 0;
> }
>
>@@ -3112,12 +3264,13 @@ netdev_dpdk_reconfigure(struct netdev *netdev)
>         && dev->mtu == dev->requested_mtu
>         && dev->rxq_size == dev->requested_rxq_size
>         && dev->txq_size == dev->requested_txq_size
>-        && dev->socket_id == dev->requested_socket_id) {
>+        && dev->socket_id == dev->requested_socket_id
>+        && !dev->ingress_sched_changed) {

If you make the changes I've suggested above, the last line above should read 'dev->requested_ingress_sched_str == dev->ingress_sched_str'.


>         /* Reconfiguration is unnecessary */
>-
>         goto out;
>     }
>
>+    dev->ingress_sched_changed = false;
>     rte_eth_dev_stop(dev->port_id);
>
>     if (dev->mtu != dev->requested_mtu
>diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c
>index 9113195..2c5dfd3 100644
>--- a/vswitchd/bridge.c
>+++ b/vswitchd/bridge.c
>@@ -831,6 +831,8 @@ bridge_delete_or_reconfigure_ports(struct bridge *br)
>         }
>
>         iface_set_netdev_mtu(iface->cfg, iface->netdev);
>+        netdev_set_ingress_sched(iface->netdev,
>+            smap_get(&iface->cfg->other_config, "ingress_sched"));
>
>         /* If the requested OpenFlow port for 'iface' changed, and it's not
>          * already the correct port, then we might want to temporarily delete
>--
>2.7.4
>
>_______________________________________________
>dev mailing list
>dev at openvswitch.org
>https://mail.openvswitch.org/mailman/listinfo/ovs-dev


More information about the dev mailing list