[ovs-dev] [PATCH v2 2/4] netdev-dpdk: Apply ingress_sched config to dpdk phy ports

Kavanagh, Mark B mark.b.kavanagh at intel.com
Wed Aug 30 15:59:16 UTC 2017


>From: O Mahony, Billy
>Sent: Thursday, August 17, 2017 3:24 PM
>To: dev at openvswitch.org
>Cc: Kavanagh, Mark B <mark.b.kavanagh at intel.com>; O Mahony, Billy
><billy.o.mahony at intel.com>
>Subject: [PATCH v2 2/4] netdev-dpdk: Apply ingress_sched config to dpdk phy
>ports
>
>Ingress scheduling configuration is given effect by way of Flow Director
>filters. A small subset of the ingress scheduling possible is
>implemented in this patch.
>
>Signed-off-by: Billy O'Mahony <billy.o.mahony at intel.com>

Hi Billy,

As a general comment, this patch doesn't compile for me:

	gcc version: gcc (GCC) 4.8.3 20140911 (Red Hat 4.8.3-7)

	Compilation flags: -march=native -g --enable-Werror

	Error:
	lib/netdev-dpdk.c:886:16: error: missing braces around initializer [-Werror=missing-braces]
         struct rte_eth_ethertype_filter entry = {0};
                ^
	lib/netdev-dpdk.c:886:16: error: (near initialization for 'entry.mac_addr') [-Werror=missing-braces]
	lib/netdev-dpdk.c:886:16: error: missing initializer for field 'ether_type' of 'struct rte_eth_ethertype_filter' [-Werror=missing-field-initializers]
	In file included from <redacted>/x86_64-native-linuxapp-gcc/include/rte_ethdev.h:186:0,
      	           from lib/netdev-dpdk.c:33:
	<redacted>/x86_64-native-linuxapp-gcc/include/rte_eth_ctrl.h:158:11: note: 'ether_type' declared here
	  uint16_t ether_type;          /**< Ether type to match */
           ^
	cc1: all warnings being treated as errors
	make[2]: *** [lib/netdev-dpdk.lo] Error 1

The following modification resolves the issue:

	-	struct rte_eth_ethertype_filter entry = {0};
	+	struct rte_eth_ethertype_filter entry;

Review comments also inline, as usual.

Thanks,
Mark


>---
> include/openvswitch/ofp-parse.h |   3 +
> lib/dpif-netdev.c               |   1 +
> lib/netdev-dpdk.c               | 181 ++++++++++++++++++++++++++++++++++++++-
>-
> vswitchd/bridge.c               |   2 +
> 4 files changed, 180 insertions(+), 7 deletions(-)
>
>diff --git a/include/openvswitch/ofp-parse.h b/include/openvswitch/ofp-parse.h
>index 013a8f3..1991694 100644
>--- a/include/openvswitch/ofp-parse.h
>+++ b/include/openvswitch/ofp-parse.h
>@@ -41,6 +41,9 @@ struct ofputil_table_mod;
> struct ofputil_bundle_msg;
> struct ofputil_tlv_table_mod;
> struct simap;
>+struct tun_table;
>+struct flow_wildcards;
>+struct ofputil_port_map;
> enum ofputil_protocol;
>
> char *parse_ofp_str(struct ofputil_flow_mod *, int command, const char *str_,
>diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
>index e2cd931..9ce3456 100644
>--- a/lib/dpif-netdev.c
>+++ b/lib/dpif-netdev.c
>@@ -44,6 +44,7 @@
> #include "dp-packet.h"
> #include "dpif.h"
> #include "dpif-provider.h"
>+#include "netdev-provider.h"

Is this needed? Maybe be an artifact from a previous version.

> #include "dummy.h"
> #include "fat-rwlock.h"
> #include "flow.h"
>diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
>index 1ffedd4..d9aab4f 100644
>--- a/lib/netdev-dpdk.c
>+++ b/lib/netdev-dpdk.c
>@@ -49,6 +49,8 @@
> #include "openvswitch/list.h"
> #include "openvswitch/ofp-print.h"
> #include "openvswitch/vlog.h"
>+#include "openvswitch/ofp-parse.h"
>+#include "openvswitch/ofp-util.h"

The defacto standard is to include includes in alphabetical order - move these above the 'vlog.h' include.

> #include "ovs-numa.h"
> #include "ovs-thread.h"
> #include "ovs-rcu.h"
>@@ -175,6 +177,10 @@ static const struct rte_eth_conf port_conf = {
>     .txmode = {
>         .mq_mode = ETH_MQ_TX_NONE,
>     },
>+    .fdir_conf = {
>+        .mode = RTE_FDIR_MODE_PERFECT,
>+    },
>+
> };
>
> /*
>@@ -351,6 +357,11 @@ enum dpdk_hw_ol_features {
>     NETDEV_RX_CHECKSUM_OFFLOAD = 1 << 0,
> };
>
>+union ingress_filter {
>+    struct rte_eth_ethertype_filter ethertype;
>+    struct rte_eth_fdir_filter fdir;
>+};
>+
> struct netdev_dpdk {
>     struct netdev up;
>     dpdk_port_t port_id;
>@@ -390,8 +401,11 @@ struct netdev_dpdk {
>     /* If true, device was attached by rte_eth_dev_attach(). */
>     bool attached;
>
>-    /* Ingress Scheduling config */
>+    /* Ingress Scheduling config & state. */
>     char *ingress_sched_str;
>+    bool ingress_sched_changed;
>+    enum rte_filter_type ingress_filter_type;
>+    union ingress_filter ingress_filter;
>
>     /* In dpdk_list. */
>     struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
>@@ -674,6 +688,22 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int
>n_rxq, int n_txq)
>     int i;
>     struct rte_eth_conf conf = port_conf;
>
>+    /* Ingress scheduling uses an extra RX queue reserved for prioritized
>+       frames but using RSS will 'pollute' that queue by distributing
>+       non-priority packets on to it.  Therefore RSS is not compatible with
>+       ingress scheduling.  Also requesting anything other than two queues
>+       with ingress scheduling is wasteful as without RSS only two queues are
>+       required.  Rather than force n_rxq to two here and overriding the
>+       configured value it's less surprising for the user to issue a warning
>+       (see dpdk_apply_ingress_scheduling()) and not enable ingress
>shceduling.
>+    */
>+    if (dev->ingress_sched_str && n_rxq == 2) {
>+        conf.rxmode.mq_mode = ETH_MQ_RX_NONE;
>+    }
>+    else {
>+        conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
>+    }
>+
>     /* For some NICs (e.g. Niantic), scatter_rx mode needs to be explicitly
>      * enabled. */
>     if (dev->mtu > ETHER_MTU) {
>@@ -757,6 +787,128 @@ dpdk_eth_flow_ctrl_setup(struct netdev_dpdk *dev)
>OVS_REQUIRES(dev->mutex)
>     }
> }
>
>+static void
>+dpdk_apply_ingress_scheduling(struct netdev_dpdk *dev, int n_rxq)

As a general comment on this function, I find it to be too long/unwieldy, and would
like to see parts of it split out into 'helper' functions (parsing dev->ingress_str, for example).

>+{
>+    if (!dev->ingress_sched_str) {
>+        return;
>+    }
>+
>+    /* See dpdk_eth_dev_queue_setup for n_rxq requirement */
>+    if (n_rxq != 2) {
>+        VLOG_ERR("Interface %s: Ingress scheduling config ignored; " \
>+                 "Requires n_rxq==2.", dev->up.name);
>+        return;
>+    }
>+
>+    int priority_q_id = n_rxq-1;

Same comment as made in the previous set applies here - since n_rxq is always 2, this calculation is redundant.
If a solution to the queue limitation is found in future, then the code can be refactored accordingly at that time.

>+    char *key, *val, *str, *iter;
>+
>+    ovs_be32 ip_src, ip_dst;
>+    ip_src = ip_dst = 0;
>+
>+    uint16_t eth_type, port_src, port_dst;
>+    eth_type = port_src = port_dst = 0;
>+    uint8_t ip_proto = 0;
>+    int diag = 0;
>+
>+    /* delete any existing filter */
>+    if (dev->ingress_filter_type == RTE_ETH_FILTER_FDIR) {
>+        diag = rte_eth_dev_filter_ctrl(dev->port_id, RTE_ETH_FILTER_FDIR,
>+            RTE_ETH_FILTER_DELETE, &dev->ingress_filter.fdir);
>+    } else if (dev->ingress_filter_type == RTE_ETH_FILTER_ETHERTYPE) {
>+        diag = rte_eth_dev_filter_ctrl(dev->port_id,
>RTE_ETH_FILTER_ETHERTYPE,
>+            RTE_ETH_FILTER_DELETE, &dev->ingress_filter.ethertype);
>+    }
>
>+    /* str_to_x on error returns malloc'd str we'll need to free */
>+    char *mallocd_str;
>+    /* Parse the configuration into local vars */
>+    iter = str = xstrdup(dev->ingress_sched_str);
>+    while (ofputil_parse_key_value (&iter, &key, &val)) {
>+        if (strcmp(key, "nw_src") == 0 || strcmp(key, "ip_src") == 0) {
>+            mallocd_str = str_to_ip(val, &ip_src);
>+        } else if (strcmp(key, "nw_dst") == 0 || strcmp(key, "ip_dst") == 0)
>{
>+            mallocd_str = str_to_ip(val, &ip_dst);
>+        } else if (strcmp(key, "dl_type") == 0 ||
>+                   strcmp(key, "eth_type") == 0) {
>+            mallocd_str = str_to_u16(val, "eth_type/dl_type", &eth_type);
>+        } else if (strcmp(key, "tcp_src") == 0 ||
>+                  strcmp(key, "tp_src") == 0 ||
>+                  strcmp(key, "udp_src") == 0) {
>+            mallocd_str = str_to_u16(val, "tcp/udp_src", &port_src);
>+        } else if (strcmp(key, "tcp_dst") == 0 ||
>+                   strcmp(key, "tp_dst") == 0 ||
>+                   strcmp(key, "udp_dst") == 0) {
>+            mallocd_str = str_to_u16(val, "tcp/udp_dst", &port_dst);
>+        } else if (strcmp(key, "ip") == 0) {
>+            eth_type = ETH_P_IP;
>+        } else if (strcmp(key, "udp") == 0) {
>+            eth_type = ETH_P_IP;
>+            ip_proto = IPPROTO_UDP;
>+        } else if (strcmp(key, "tcp") == 0) {
>+            eth_type = ETH_P_IP;
>+            ip_proto = IPPROTO_TCP;
>+        } else {
>+            VLOG_WARN("Ignoring unsupported ingress scheduling field '%s'", \
>+                      key);
>+        }
>+        if (mallocd_str) {
>+            VLOG_ERR ("%s", mallocd_str);
>+            free(mallocd_str);
>+            mallocd_str = NULL;
>+        }
>+    }
>+    free (str);
>+
>+    /* Set the filters */
>+    if (eth_type && ip_src && ip_dst && port_src && port_dst && ip_proto) {
>+        struct rte_eth_fdir_filter entry = { 0 };
>+        if (ip_proto == IPPROTO_TCP) {
>+            entry.input.flow_type = RTE_ETH_FLOW_NONFRAG_IPV4_TCP;
>+        } else {
>+            entry.input.flow_type = RTE_ETH_FLOW_NONFRAG_IPV4_UDP;
>+        }
>+        entry.input.flow.udp4_flow.ip.src_ip = ip_src;
>+        entry.input.flow.udp4_flow.ip.dst_ip = ip_dst;
>+        entry.input.flow.udp4_flow.src_port = htons(port_src);

Shouldn't all of these statements be within the body of the else statement above?
(with a corresponding set of entry.input.flow.tcp4_flow statements?)

>+        entry.input.flow.udp4_flow.dst_port = htons(port_dst);

rte_cpu_to_be_16 _may_ be more performant than htons.

>+        entry.action.rx_queue = priority_q_id;

Again, in the current implementation, this is always 1.

>+        entry.action.behavior = RTE_ETH_FDIR_ACCEPT;
>+        entry.action.report_status = RTE_ETH_FDIR_REPORT_ID;
>+        diag = rte_eth_dev_filter_ctrl(dev->port_id,
>+                RTE_ETH_FILTER_FDIR, RTE_ETH_FILTER_ADD, &entry);
>+        dev->ingress_filter_type = RTE_ETH_FILTER_FDIR;
>+        dev->ingress_filter.fdir = entry;
>+    }
>+    else if (eth_type && !ip_src && !ip_dst && !port_src
>+             && !port_dst && !ip_proto) {
>+        struct rte_eth_ethertype_filter entry = {0};
>+        memset (&entry, 0, sizeof entry);

Why memset here, but not in the fdir filter case?

>+        entry.ether_type = eth_type;
>+        entry.flags = 0;
>+        entry.queue = priority_q_id;
>+        diag = rte_eth_dev_filter_ctrl(dev->port_id,
>+                RTE_ETH_FILTER_ETHERTYPE, RTE_ETH_FILTER_ADD, &entry);
>+        dev->ingress_filter.ethertype = entry;
>+        dev->ingress_filter_type = RTE_ETH_FILTER_ETHERTYPE;
>+    }
>+    else {
>+        VLOG_ERR("Unsupported ingress scheduling match-field combination.");
>+        dev->ingress_filter_type = RTE_ETH_FILTER_NONE;
>+        return;
>+    }
>+
>+    if (diag) {
>+        dev->ingress_filter_type = RTE_ETH_FILTER_NONE;
>+        VLOG_ERR("Failed to add ingress scheduling filter.");
>+    }
>+    else {
>+        /* Mark the appropriate q as prioritized */
>+        dev->up.priority_rxq = priority_q_id;
>+    }
>+}
>+
> static int
> dpdk_eth_dev_init(struct netdev_dpdk *dev)
>     OVS_REQUIRES(dev->mutex)
>@@ -791,6 +943,8 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
>         return -diag;
>     }
>
>+    dpdk_apply_ingress_scheduling(dev, n_rxq);

I'd prefer to see dpdk_apply_ingress_scheduling return a value, and have warnings logged here, instead of in that function.

>+
>     diag = rte_eth_dev_start(dev->port_id);
>     if (diag) {
>         VLOG_ERR("Interface %s start error: %s", dev->up.name,
>@@ -897,6 +1051,9 @@ common_construct(struct netdev *netdev, dpdk_port_t
>port_no,
>     dev->requested_rxq_size = NIC_PORT_DEFAULT_RXQ_SIZE;
>     dev->requested_txq_size = NIC_PORT_DEFAULT_TXQ_SIZE;
>
>+    dev->ingress_sched_str = NULL;
>+    dev->ingress_sched_changed = false;
>+    dev->ingress_filter_type = RTE_ETH_FILTER_NONE;

Would this code not be more appropriate in netdev_dpdk_construct?
(since Flow Director is not applicable to vhost devices, and presumably, for vHost devices,
the set_ingress_sched function is NULL)?

>     /* Initialize the flow control to NULL */
>     memset(&dev->fc_conf, 0, sizeof dev->fc_conf);
>
>@@ -2015,11 +2172,20 @@ netdev_dpdk_set_ingress_sched(struct netdev *netdev,
> {
>     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>
>-    free(dev->ingress_sched_str);
>-    if (ingress_sched_str) {
>-        dev->ingress_sched_str = xstrdup(ingress_sched_str);
>+    if ((ingress_sched_str && dev->ingress_sched_str &&
>+        strcmp(ingress_sched_str, dev->ingress_sched_str) == 0) ||
>+        (!ingress_sched_str && !dev->ingress_sched_str)) {
>+        /* no-op; new cfg == old cfg or else both are NULL */
>+        return 0;
>+    } else {
>+        /* free the old, copy in the new */
>+        free(dev->ingress_sched_str);
>+        if (ingress_sched_str) {
>+            dev->ingress_sched_str = xstrdup(ingress_sched_str);
>+        }
>+        dev->ingress_sched_changed = true;
>+        netdev_request_reconfigure(netdev);

I presume that the purpose of the netdev_reconfigure here is to apply the scheduling policy to
the PMDs for the physical devices (which requires them to be torn down and restarted)?

>     }
>-
>     return 0;
> }
>
>@@ -3185,12 +3351,13 @@ netdev_dpdk_reconfigure(struct netdev *netdev)
>         && dev->mtu == dev->requested_mtu
>         && dev->rxq_size == dev->requested_rxq_size
>         && dev->txq_size == dev->requested_txq_size
>-        && dev->socket_id == dev->requested_socket_id) {
>+        && dev->socket_id == dev->requested_socket_id
>+        && !dev->ingress_sched_changed) {
>         /* Reconfiguration is unnecessary */
>-
>         goto out;
>     }
>
>+    dev->ingress_sched_changed = false;
>     rte_eth_dev_stop(dev->port_id);
>
>     if (dev->mtu != dev->requested_mtu
>diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c
>index 93c91f6..077a6d6 100644
>--- a/vswitchd/bridge.c
>+++ b/vswitchd/bridge.c
>@@ -831,6 +831,8 @@ bridge_delete_or_reconfigure_ports(struct bridge *br)
>         }
>
>         iface_set_netdev_mtu(iface->cfg, iface->netdev);
>+        netdev_set_ingress_sched(iface->netdev,
>+            smap_get(&iface->cfg->other_config, "ingress_sched"));

Same comment as before WRT iface_set_ingress_sched.

>
>         /* If the requested OpenFlow port for 'iface' changed, and it's not
>          * already the correct port, then we might want to temporarily delete
>--
>2.7.4



More information about the dev mailing list