[ovs-dev] [PATCH v2 2/4] netdev-dpdk: Apply ingress_sched config to dpdk phy ports
Kavanagh, Mark B
mark.b.kavanagh at intel.com
Wed Aug 30 15:59:16 UTC 2017
>From: O Mahony, Billy
>Sent: Thursday, August 17, 2017 3:24 PM
>To: dev at openvswitch.org
>Cc: Kavanagh, Mark B <mark.b.kavanagh at intel.com>; O Mahony, Billy
><billy.o.mahony at intel.com>
>Subject: [PATCH v2 2/4] netdev-dpdk: Apply ingress_sched config to dpdk phy
>ports
>
>Ingress scheduling configuration is given effect by way of Flow Director
>filters. A small subset of the ingress scheduling possible is
>implemented in this patch.
>
>Signed-off-by: Billy O'Mahony <billy.o.mahony at intel.com>
Hi Billy,
As a general comment, this patch doesn't compile for me:
gcc version: gcc (GCC) 4.8.3 20140911 (Red Hat 4.8.3-7)
Compilation flags: -march=native -g --enable-Werror
Error:
lib/netdev-dpdk.c:886:16: error: missing braces around initializer [-Werror=missing-braces]
struct rte_eth_ethertype_filter entry = {0};
^
lib/netdev-dpdk.c:886:16: error: (near initialization for 'entry.mac_addr') [-Werror=missing-braces]
lib/netdev-dpdk.c:886:16: error: missing initializer for field 'ether_type' of 'struct rte_eth_ethertype_filter' [-Werror=missing-field-initializers]
In file included from <redacted>/x86_64-native-linuxapp-gcc/include/rte_ethdev.h:186:0,
from lib/netdev-dpdk.c:33:
<redacted>/x86_64-native-linuxapp-gcc/include/rte_eth_ctrl.h:158:11: note: 'ether_type' declared here
uint16_t ether_type; /**< Ether type to match */
^
cc1: all warnings being treated as errors
make[2]: *** [lib/netdev-dpdk.lo] Error 1
The following modification resolves the issue:
- struct rte_eth_ethertype_filter entry = {0};
+ struct rte_eth_ethertype_filter entry;
Review comments also inline, as usual.
Thanks,
Mark
>---
> include/openvswitch/ofp-parse.h | 3 +
> lib/dpif-netdev.c | 1 +
> lib/netdev-dpdk.c | 181 ++++++++++++++++++++++++++++++++++++++-
>-
> vswitchd/bridge.c | 2 +
> 4 files changed, 180 insertions(+), 7 deletions(-)
>
>diff --git a/include/openvswitch/ofp-parse.h b/include/openvswitch/ofp-parse.h
>index 013a8f3..1991694 100644
>--- a/include/openvswitch/ofp-parse.h
>+++ b/include/openvswitch/ofp-parse.h
>@@ -41,6 +41,9 @@ struct ofputil_table_mod;
> struct ofputil_bundle_msg;
> struct ofputil_tlv_table_mod;
> struct simap;
>+struct tun_table;
>+struct flow_wildcards;
>+struct ofputil_port_map;
> enum ofputil_protocol;
>
> char *parse_ofp_str(struct ofputil_flow_mod *, int command, const char *str_,
>diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
>index e2cd931..9ce3456 100644
>--- a/lib/dpif-netdev.c
>+++ b/lib/dpif-netdev.c
>@@ -44,6 +44,7 @@
> #include "dp-packet.h"
> #include "dpif.h"
> #include "dpif-provider.h"
>+#include "netdev-provider.h"
Is this needed? Maybe be an artifact from a previous version.
> #include "dummy.h"
> #include "fat-rwlock.h"
> #include "flow.h"
>diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
>index 1ffedd4..d9aab4f 100644
>--- a/lib/netdev-dpdk.c
>+++ b/lib/netdev-dpdk.c
>@@ -49,6 +49,8 @@
> #include "openvswitch/list.h"
> #include "openvswitch/ofp-print.h"
> #include "openvswitch/vlog.h"
>+#include "openvswitch/ofp-parse.h"
>+#include "openvswitch/ofp-util.h"
The defacto standard is to include includes in alphabetical order - move these above the 'vlog.h' include.
> #include "ovs-numa.h"
> #include "ovs-thread.h"
> #include "ovs-rcu.h"
>@@ -175,6 +177,10 @@ static const struct rte_eth_conf port_conf = {
> .txmode = {
> .mq_mode = ETH_MQ_TX_NONE,
> },
>+ .fdir_conf = {
>+ .mode = RTE_FDIR_MODE_PERFECT,
>+ },
>+
> };
>
> /*
>@@ -351,6 +357,11 @@ enum dpdk_hw_ol_features {
> NETDEV_RX_CHECKSUM_OFFLOAD = 1 << 0,
> };
>
>+union ingress_filter {
>+ struct rte_eth_ethertype_filter ethertype;
>+ struct rte_eth_fdir_filter fdir;
>+};
>+
> struct netdev_dpdk {
> struct netdev up;
> dpdk_port_t port_id;
>@@ -390,8 +401,11 @@ struct netdev_dpdk {
> /* If true, device was attached by rte_eth_dev_attach(). */
> bool attached;
>
>- /* Ingress Scheduling config */
>+ /* Ingress Scheduling config & state. */
> char *ingress_sched_str;
>+ bool ingress_sched_changed;
>+ enum rte_filter_type ingress_filter_type;
>+ union ingress_filter ingress_filter;
>
> /* In dpdk_list. */
> struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
>@@ -674,6 +688,22 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int
>n_rxq, int n_txq)
> int i;
> struct rte_eth_conf conf = port_conf;
>
>+ /* Ingress scheduling uses an extra RX queue reserved for prioritized
>+ frames but using RSS will 'pollute' that queue by distributing
>+ non-priority packets on to it. Therefore RSS is not compatible with
>+ ingress scheduling. Also requesting anything other than two queues
>+ with ingress scheduling is wasteful as without RSS only two queues are
>+ required. Rather than force n_rxq to two here and overriding the
>+ configured value it's less surprising for the user to issue a warning
>+ (see dpdk_apply_ingress_scheduling()) and not enable ingress
>shceduling.
>+ */
>+ if (dev->ingress_sched_str && n_rxq == 2) {
>+ conf.rxmode.mq_mode = ETH_MQ_RX_NONE;
>+ }
>+ else {
>+ conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
>+ }
>+
> /* For some NICs (e.g. Niantic), scatter_rx mode needs to be explicitly
> * enabled. */
> if (dev->mtu > ETHER_MTU) {
>@@ -757,6 +787,128 @@ dpdk_eth_flow_ctrl_setup(struct netdev_dpdk *dev)
>OVS_REQUIRES(dev->mutex)
> }
> }
>
>+static void
>+dpdk_apply_ingress_scheduling(struct netdev_dpdk *dev, int n_rxq)
As a general comment on this function, I find it to be too long/unwieldy, and would
like to see parts of it split out into 'helper' functions (parsing dev->ingress_str, for example).
>+{
>+ if (!dev->ingress_sched_str) {
>+ return;
>+ }
>+
>+ /* See dpdk_eth_dev_queue_setup for n_rxq requirement */
>+ if (n_rxq != 2) {
>+ VLOG_ERR("Interface %s: Ingress scheduling config ignored; " \
>+ "Requires n_rxq==2.", dev->up.name);
>+ return;
>+ }
>+
>+ int priority_q_id = n_rxq-1;
Same comment as made in the previous set applies here - since n_rxq is always 2, this calculation is redundant.
If a solution to the queue limitation is found in future, then the code can be refactored accordingly at that time.
>+ char *key, *val, *str, *iter;
>+
>+ ovs_be32 ip_src, ip_dst;
>+ ip_src = ip_dst = 0;
>+
>+ uint16_t eth_type, port_src, port_dst;
>+ eth_type = port_src = port_dst = 0;
>+ uint8_t ip_proto = 0;
>+ int diag = 0;
>+
>+ /* delete any existing filter */
>+ if (dev->ingress_filter_type == RTE_ETH_FILTER_FDIR) {
>+ diag = rte_eth_dev_filter_ctrl(dev->port_id, RTE_ETH_FILTER_FDIR,
>+ RTE_ETH_FILTER_DELETE, &dev->ingress_filter.fdir);
>+ } else if (dev->ingress_filter_type == RTE_ETH_FILTER_ETHERTYPE) {
>+ diag = rte_eth_dev_filter_ctrl(dev->port_id,
>RTE_ETH_FILTER_ETHERTYPE,
>+ RTE_ETH_FILTER_DELETE, &dev->ingress_filter.ethertype);
>+ }
>
>+ /* str_to_x on error returns malloc'd str we'll need to free */
>+ char *mallocd_str;
>+ /* Parse the configuration into local vars */
>+ iter = str = xstrdup(dev->ingress_sched_str);
>+ while (ofputil_parse_key_value (&iter, &key, &val)) {
>+ if (strcmp(key, "nw_src") == 0 || strcmp(key, "ip_src") == 0) {
>+ mallocd_str = str_to_ip(val, &ip_src);
>+ } else if (strcmp(key, "nw_dst") == 0 || strcmp(key, "ip_dst") == 0)
>{
>+ mallocd_str = str_to_ip(val, &ip_dst);
>+ } else if (strcmp(key, "dl_type") == 0 ||
>+ strcmp(key, "eth_type") == 0) {
>+ mallocd_str = str_to_u16(val, "eth_type/dl_type", ð_type);
>+ } else if (strcmp(key, "tcp_src") == 0 ||
>+ strcmp(key, "tp_src") == 0 ||
>+ strcmp(key, "udp_src") == 0) {
>+ mallocd_str = str_to_u16(val, "tcp/udp_src", &port_src);
>+ } else if (strcmp(key, "tcp_dst") == 0 ||
>+ strcmp(key, "tp_dst") == 0 ||
>+ strcmp(key, "udp_dst") == 0) {
>+ mallocd_str = str_to_u16(val, "tcp/udp_dst", &port_dst);
>+ } else if (strcmp(key, "ip") == 0) {
>+ eth_type = ETH_P_IP;
>+ } else if (strcmp(key, "udp") == 0) {
>+ eth_type = ETH_P_IP;
>+ ip_proto = IPPROTO_UDP;
>+ } else if (strcmp(key, "tcp") == 0) {
>+ eth_type = ETH_P_IP;
>+ ip_proto = IPPROTO_TCP;
>+ } else {
>+ VLOG_WARN("Ignoring unsupported ingress scheduling field '%s'", \
>+ key);
>+ }
>+ if (mallocd_str) {
>+ VLOG_ERR ("%s", mallocd_str);
>+ free(mallocd_str);
>+ mallocd_str = NULL;
>+ }
>+ }
>+ free (str);
>+
>+ /* Set the filters */
>+ if (eth_type && ip_src && ip_dst && port_src && port_dst && ip_proto) {
>+ struct rte_eth_fdir_filter entry = { 0 };
>+ if (ip_proto == IPPROTO_TCP) {
>+ entry.input.flow_type = RTE_ETH_FLOW_NONFRAG_IPV4_TCP;
>+ } else {
>+ entry.input.flow_type = RTE_ETH_FLOW_NONFRAG_IPV4_UDP;
>+ }
>+ entry.input.flow.udp4_flow.ip.src_ip = ip_src;
>+ entry.input.flow.udp4_flow.ip.dst_ip = ip_dst;
>+ entry.input.flow.udp4_flow.src_port = htons(port_src);
Shouldn't all of these statements be within the body of the else statement above?
(with a corresponding set of entry.input.flow.tcp4_flow statements?)
>+ entry.input.flow.udp4_flow.dst_port = htons(port_dst);
rte_cpu_to_be_16 _may_ be more performant than htons.
>+ entry.action.rx_queue = priority_q_id;
Again, in the current implementation, this is always 1.
>+ entry.action.behavior = RTE_ETH_FDIR_ACCEPT;
>+ entry.action.report_status = RTE_ETH_FDIR_REPORT_ID;
>+ diag = rte_eth_dev_filter_ctrl(dev->port_id,
>+ RTE_ETH_FILTER_FDIR, RTE_ETH_FILTER_ADD, &entry);
>+ dev->ingress_filter_type = RTE_ETH_FILTER_FDIR;
>+ dev->ingress_filter.fdir = entry;
>+ }
>+ else if (eth_type && !ip_src && !ip_dst && !port_src
>+ && !port_dst && !ip_proto) {
>+ struct rte_eth_ethertype_filter entry = {0};
>+ memset (&entry, 0, sizeof entry);
Why memset here, but not in the fdir filter case?
>+ entry.ether_type = eth_type;
>+ entry.flags = 0;
>+ entry.queue = priority_q_id;
>+ diag = rte_eth_dev_filter_ctrl(dev->port_id,
>+ RTE_ETH_FILTER_ETHERTYPE, RTE_ETH_FILTER_ADD, &entry);
>+ dev->ingress_filter.ethertype = entry;
>+ dev->ingress_filter_type = RTE_ETH_FILTER_ETHERTYPE;
>+ }
>+ else {
>+ VLOG_ERR("Unsupported ingress scheduling match-field combination.");
>+ dev->ingress_filter_type = RTE_ETH_FILTER_NONE;
>+ return;
>+ }
>+
>+ if (diag) {
>+ dev->ingress_filter_type = RTE_ETH_FILTER_NONE;
>+ VLOG_ERR("Failed to add ingress scheduling filter.");
>+ }
>+ else {
>+ /* Mark the appropriate q as prioritized */
>+ dev->up.priority_rxq = priority_q_id;
>+ }
>+}
>+
> static int
> dpdk_eth_dev_init(struct netdev_dpdk *dev)
> OVS_REQUIRES(dev->mutex)
>@@ -791,6 +943,8 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
> return -diag;
> }
>
>+ dpdk_apply_ingress_scheduling(dev, n_rxq);
I'd prefer to see dpdk_apply_ingress_scheduling return a value, and have warnings logged here, instead of in that function.
>+
> diag = rte_eth_dev_start(dev->port_id);
> if (diag) {
> VLOG_ERR("Interface %s start error: %s", dev->up.name,
>@@ -897,6 +1051,9 @@ common_construct(struct netdev *netdev, dpdk_port_t
>port_no,
> dev->requested_rxq_size = NIC_PORT_DEFAULT_RXQ_SIZE;
> dev->requested_txq_size = NIC_PORT_DEFAULT_TXQ_SIZE;
>
>+ dev->ingress_sched_str = NULL;
>+ dev->ingress_sched_changed = false;
>+ dev->ingress_filter_type = RTE_ETH_FILTER_NONE;
Would this code not be more appropriate in netdev_dpdk_construct?
(since Flow Director is not applicable to vhost devices, and presumably, for vHost devices,
the set_ingress_sched function is NULL)?
> /* Initialize the flow control to NULL */
> memset(&dev->fc_conf, 0, sizeof dev->fc_conf);
>
>@@ -2015,11 +2172,20 @@ netdev_dpdk_set_ingress_sched(struct netdev *netdev,
> {
> struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>
>- free(dev->ingress_sched_str);
>- if (ingress_sched_str) {
>- dev->ingress_sched_str = xstrdup(ingress_sched_str);
>+ if ((ingress_sched_str && dev->ingress_sched_str &&
>+ strcmp(ingress_sched_str, dev->ingress_sched_str) == 0) ||
>+ (!ingress_sched_str && !dev->ingress_sched_str)) {
>+ /* no-op; new cfg == old cfg or else both are NULL */
>+ return 0;
>+ } else {
>+ /* free the old, copy in the new */
>+ free(dev->ingress_sched_str);
>+ if (ingress_sched_str) {
>+ dev->ingress_sched_str = xstrdup(ingress_sched_str);
>+ }
>+ dev->ingress_sched_changed = true;
>+ netdev_request_reconfigure(netdev);
I presume that the purpose of the netdev_reconfigure here is to apply the scheduling policy to
the PMDs for the physical devices (which requires them to be torn down and restarted)?
> }
>-
> return 0;
> }
>
>@@ -3185,12 +3351,13 @@ netdev_dpdk_reconfigure(struct netdev *netdev)
> && dev->mtu == dev->requested_mtu
> && dev->rxq_size == dev->requested_rxq_size
> && dev->txq_size == dev->requested_txq_size
>- && dev->socket_id == dev->requested_socket_id) {
>+ && dev->socket_id == dev->requested_socket_id
>+ && !dev->ingress_sched_changed) {
> /* Reconfiguration is unnecessary */
>-
> goto out;
> }
>
>+ dev->ingress_sched_changed = false;
> rte_eth_dev_stop(dev->port_id);
>
> if (dev->mtu != dev->requested_mtu
>diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c
>index 93c91f6..077a6d6 100644
>--- a/vswitchd/bridge.c
>+++ b/vswitchd/bridge.c
>@@ -831,6 +831,8 @@ bridge_delete_or_reconfigure_ports(struct bridge *br)
> }
>
> iface_set_netdev_mtu(iface->cfg, iface->netdev);
>+ netdev_set_ingress_sched(iface->netdev,
>+ smap_get(&iface->cfg->other_config, "ingress_sched"));
Same comment as before WRT iface_set_ingress_sched.
>
> /* If the requested OpenFlow port for 'iface' changed, and it's not
> * already the correct port, then we might want to temporarily delete
>--
>2.7.4
More information about the dev
mailing list