[ovs-dev] [dpdk-howl PATCH v2] netdev-dpdk: Upgrade to dpdk v18.08.0

Eelco Chaudron echaudro at redhat.com
Mon Sep 17 16:38:44 UTC 2018


Hi Ophir,

I did some basic PVP testing root vs non-root and it all seems to work 
as expected (did not do any performance comparison).

I do see compiler warnings about usage of deprecated DPDK functions, I 
guess these need to be fixed:


lib/netdev-dpdk.c: In function 'netdev_dpdk_destruct':
lib/netdev-dpdk.c:1336:9: error: 'rte_eth_dev_detach' is deprecated 
(declared at 
/home/root/OVS_dpdk-hwol_DPDK_v18.08/dpdk-stable/x86_64-native-linuxapp-gcc/include/rte_ethdev.h:1451) 
[-Werror=deprecated-declarations]
          if (rte_eth_dev_detach(dev->port_id, devname) < 0) {
          ^
lib/netdev-dpdk.c: In function 'netdev_dpdk_process_devargs':
lib/netdev-dpdk.c:1629:13: error: 'rte_eth_dev_attach' is deprecated 
(declared at 
/home/root/OVS_dpdk-hwol_DPDK_v18.08/dpdk-stable/x86_64-native-linuxapp-gcc/include/rte_ethdev.h:1435) 
[-Werror=deprecated-declarations]
              if (!rte_eth_dev_attach(devargs, &new_port_id)) {
              ^
lib/netdev-dpdk.c: In function 'netdev_dpdk_detach':
lib/netdev-dpdk.c:3159:5: error: 'rte_eth_dev_detach' is deprecated 
(declared at 
/home/root/OVS_dpdk-hwol_DPDK_v18.08/dpdk-stable/x86_64-native-linuxapp-gcc/include/rte_ethdev.h:1451) 
[-Werror=deprecated-declarations]
      ret = rte_eth_dev_detach(port_id, devname);
      ^

See more comments inline…

//Eelco


	
On 11 Sep 2018, at 1:04, Ophir Munk wrote:

> 1. Enable compilation and linkage with dpdk 18.08.0
> The following dpdk commits which were introduced after dpdk 17.11.x
> require OVS updates to accommodate to the dpdk changes.
> - ce17eddefc20 ("ethdev: introduce Rx queue offloads API")
> - ab3ce1e0c193 ("ethdev: remove old offload API")
> - c06ddf9698e0 ("meter: add configuration profile")
> - e58638c32411 ("ethdev: fix TPID handling in flow API")
> - cd8c7c7ce241 ("ethdev: replace bus specific struct with generic 
> dev")
> - ac8d22de2394 ("ethdev: flatten RSS configuration in flow API")
>
> 2. Limit configured rss hash functions to only those supported
> by the eth device.
>
> 3. Update references to DPDK version in Documentation
>
> 4. Update DPDK version in travis' linux-build script
>
> Signed-off-by: Ophir Munk <ophirmu at mellanox.com>
> ---
> v1:
> First version
>
> v2:
> Avoid seg faults cases as described in
> https://patchwork.ozlabs.org/patch/965451/
> by using the patch in:
> https://github.com/kevintraynor/ovs-dpdk-master/commit/88f46cc5ab338eb4f3ca5db1eacd0effefe4fa0c
>
>  .travis/linux-build.sh         |   2 +-
>  Documentation/faq/releases.rst |   2 +-
>  lib/netdev-dpdk.c              | 142 
> +++++++++++++++++++++++++++++------------
>  3 files changed, 104 insertions(+), 42 deletions(-)
>
> diff --git a/.travis/linux-build.sh b/.travis/linux-build.sh
> index 4b9fc4a..c60ac71 100755
> --- a/.travis/linux-build.sh
> +++ b/.travis/linux-build.sh
> @@ -83,7 +83,7 @@ fi
>
>  if [ "$DPDK" ]; then
>      if [ -z "$DPDK_VER" ]; then
> -        DPDK_VER="17.11.3"
> +        DPDK_VER="18.08.0"
>      fi
>      install_dpdk $DPDK_VER
>      if [ "$CC" = "clang" ]; then
> diff --git a/Documentation/faq/releases.rst 
> b/Documentation/faq/releases.rst
> index 41d41e3..646ae09 100644
> --- a/Documentation/faq/releases.rst
> +++ b/Documentation/faq/releases.rst
> @@ -168,7 +168,7 @@ Q: What DPDK version does each Open vSwitch 
> release work with?
>      2.7.x        16.11.7
>      2.8.x        17.05.2
>      2.9.x        17.11.3
> -    2.10.x       17.11.3
> +    2.10.x       18.08.0
>      ============ =======
>
>  Q: Are all the DPDK releases that OVS versions work with maintained?
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index f91aa27..6879e9a 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -168,11 +168,7 @@ static const struct rte_eth_conf port_conf = {
>      .rxmode = {
>          .mq_mode = ETH_MQ_RX_RSS,
>          .split_hdr_size = 0,
> -        .header_split   = 0, /* Header Split disabled */
> -        .hw_ip_checksum = 0, /* IP checksum offload disabled */
> -        .hw_vlan_filter = 0, /* VLAN filtering disabled */
> -        .jumbo_frame    = 0, /* Jumbo Frame Support disabled */
> -        .hw_strip_crc   = 0,
> +        .offloads = 0,
>      },
>      .rx_adv_conf = {
>          .rss_conf = {
> @@ -364,6 +360,7 @@ struct dpdk_ring {
>  struct ingress_policer {
>      struct rte_meter_srtcm_params app_srtcm_params;
>      struct rte_meter_srtcm in_policer;
> +    struct rte_meter_srtcm_profile in_prof;
>      rte_spinlock_t policer_lock;
>  };
> @@ -894,6 +891,8 @@ dpdk_eth_dev_port_config(struct netdev_dpdk *dev, 
> int n_rxq, int n_txq)
>      struct rte_eth_dev_info info;
>      uint16_t conf_mtu;
> +    rte_eth_dev_info_get(dev->port_id, &info);
> +
>      /* As of DPDK 17.11.1 a few PMDs require to explicitly enable
>       * scatter to support jumbo RX. Checking the offload capabilities
>       * is not an option as PMDs are not required yet to report
> @@ -901,20 +900,24 @@ dpdk_eth_dev_port_config(struct netdev_dpdk 
> *dev, int n_rxq, int n_txq)
>       * (testing or code review). Listing all such PMDs feels harder
>       * than highlighting the one known not to need scatter */
>      if (dev->mtu > ETHER_MTU) {
> -        rte_eth_dev_info_get(dev->port_id, &info);
>          if (strncmp(info.driver_name, "net_nfp", 7)) {
> -            conf.rxmode.enable_scatter = 1;
> +            conf.rxmode.offloads |= DEV_RX_OFFLOAD_SCATTER;
>          }
>      }

EC> The above was supposed to be fixed in the drivers for 18+, was this 
the case? If so we should be able to change this hard-coded nic name? 
I’ve copied in Pablo who added the code above.


>      conf.intr_conf.lsc = dev->lsc_interrupt_mode;
> -    conf.rxmode.hw_ip_checksum = (dev->hw_ol_features &
> -                                  NETDEV_RX_CHECKSUM_OFFLOAD) != 0;
> +    conf.rxmode.offloads |= ((dev->hw_ol_features &
> +                             NETDEV_RX_CHECKSUM_OFFLOAD) != 0) ?
> +                             DEV_RX_OFFLOAD_CHECKSUM : 0;

EC> Maybe convert the above also to an if, as below, to make it more 
clear?

>      if (dev->hw_ol_features & NETDEV_RX_HW_CRC_STRIP) {
> -        conf.rxmode.hw_strip_crc = 1;
> +        conf.rxmode.offloads |= DEV_RX_OFFLOAD_CRC_STRIP;
>      }
> +    /* Limit configured rss hash functions to only those supported
> +     * by the eth device. */
> +    conf.rx_adv_conf.rss_conf.rss_hf &= info.flow_type_rss_offloads;
> +
>      /* A device may report more queues than it makes available (this 
> has
>       * been observed for Intel xl710, which reserves some of them for
>       * SRIOV):  rte_eth_*_queue_setup will fail if a queue is not
> @@ -1932,16 +1935,18 @@ netdev_dpdk_eth_tx_burst(struct netdev_dpdk 
> *dev, int qid,
>  static inline bool
>  netdev_dpdk_policer_pkt_handle(struct rte_meter_srtcm *meter,
> +                               struct rte_meter_srtcm_profile 
> *profile,
>                                 struct rte_mbuf *pkt, uint64_t time)
>  {
>      uint32_t pkt_len = rte_pktmbuf_pkt_len(pkt) - sizeof(struct 
> ether_hdr);
> -    return rte_meter_srtcm_color_blind_check(meter, time, pkt_len) ==
> +    return rte_meter_srtcm_color_blind_check(meter, profile, time, 
> pkt_len) ==
>                                                  e_RTE_METER_GREEN;

EC> Indentation looks odd

>  }
>  static int
>  netdev_dpdk_policer_run(struct rte_meter_srtcm *meter,
> +                        struct rte_meter_srtcm_profile *profile,
>                          struct rte_mbuf **pkts, int pkt_cnt,
>                          bool should_steal)
>  {
> @@ -1953,7 +1958,8 @@ netdev_dpdk_policer_run(struct rte_meter_srtcm 
> *meter,
>      for (i = 0; i < pkt_cnt; i++) {
>          pkt = pkts[i];
>          /* Handle current packet */
> -        if (netdev_dpdk_policer_pkt_handle(meter, pkt, current_time)) 
> {
> +        if (netdev_dpdk_policer_pkt_handle(meter, profile, pkt,
> +                                                 current_time)) {

EC> Indentation
>
>              if (cnt != i) {
>                  pkts[cnt] = pkt;
>              }
> @@ -1975,8 +1981,8 @@ ingress_policer_run(struct ingress_policer 
> *policer, struct rte_mbuf **pkts,
>      int cnt = 0;
>      rte_spinlock_lock(&policer->policer_lock);
> -    cnt = netdev_dpdk_policer_run(&policer->in_policer, pkts,
> -                                  pkt_cnt, should_steal);
> +    cnt = netdev_dpdk_policer_run(&policer->in_policer, 
> &policer->in_prof,
> +                                  pkts, pkt_cnt, should_steal);
>      rte_spinlock_unlock(&policer->policer_lock);
>      return cnt;
> @@ -2767,8 +2773,15 @@ netdev_dpdk_policer_construct(uint32_t rate, 
> uint32_t burst)
>      policer->app_srtcm_params.cir = rate_bytes;
>      policer->app_srtcm_params.cbs = burst_bytes;
>      policer->app_srtcm_params.ebs = 0;
> +    err = rte_meter_srtcm_profile_config(&policer->in_prof,
> +                                         &policer->app_srtcm_params);
> +    if (err) {
> +        VLOG_ERR("Could not create rte meter profile for ingress 
> policer");
> +        free(policer);
> +        return NULL;
> +    }
>      err = rte_meter_srtcm_config(&policer->in_policer,
> -                                    &policer->app_srtcm_params);
> +                                    &policer->in_prof);

EC> indentation

>      if (err) {
>          VLOG_ERR("Could not create rte meter for ingress policer");
>          free(policer);
> @@ -3043,13 +3056,18 @@ netdev_dpdk_get_status(const struct netdev 
> *netdev, struct smap *args)
>      smap_add_format(args, "if_descr", "%s %s", rte_version(),
>                                                 dev_info.driver_name);
> -    if (dev_info.pci_dev) {
> -        smap_add_format(args, "pci-vendor_id", "0x%x",
> -                        dev_info.pci_dev->id.vendor_id);
> -        smap_add_format(args, "pci-device_id", "0x%x",
> -                        dev_info.pci_dev->id.device_id);
> +    const struct rte_bus *bus;
> +    const struct rte_pci_device *pci_dev;
> +    bus = rte_bus_find_by_device(dev_info.device);
> +    if (bus && !strcmp(bus->name, "pci")) {
> +        pci_dev = RTE_DEV_TO_PCI(dev_info.device);
> +        if (pci_dev) {
> +            smap_add_format(args, "pci-vendor_id", "0x%u",
> +                            pci_dev->id.vendor_id);
> +             smap_add_format(args, "pci-device_id", "0x%x",
> +                             pci_dev->id.device_id);

EC> Guess this should be %x, and odd indentation, but I see Ilya got 
this also.

> +        }
>      }
> -
>      return 0;
>  }
> @@ -3727,6 +3745,7 @@ struct egress_policer {
>      struct qos_conf qos_conf;
>      struct rte_meter_srtcm_params app_srtcm_params;
>      struct rte_meter_srtcm egress_meter;
> +    struct rte_meter_srtcm_profile egress_prof;
>  };
>  static void
> @@ -3749,8 +3768,16 @@ egress_policer_qos_construct(const struct smap 
> *details,
>      policer = xmalloc(sizeof *policer);
>      qos_conf_init(&policer->qos_conf, &egress_policer_ops);
>      egress_policer_details_to_param(details, 
> &policer->app_srtcm_params);
> +    err = rte_meter_srtcm_profile_config(&policer->egress_prof,
> +                                         &policer->app_srtcm_params);
> +    if (err) {
> +        free(policer);
> +        *conf = NULL;
> +        return -err;
> +    }
>      err = rte_meter_srtcm_config(&policer->egress_meter,
> -                                 &policer->app_srtcm_params);
> +                                 &policer->egress_prof);
> +
>      if (!err) {
>          *conf = &policer->qos_conf;
>      } else {
> @@ -3803,7 +3830,8 @@ egress_policer_run(struct qos_conf *conf, struct 
> rte_mbuf **pkts, int pkt_cnt,
>      struct egress_policer *policer =
>          CONTAINER_OF(conf, struct egress_policer, qos_conf);
> -    cnt = netdev_dpdk_policer_run(&policer->egress_meter, pkts,
> +    cnt = netdev_dpdk_policer_run(&policer->egress_meter,
> +                                  &policer->egress_prof, pkts,
>                                    pkt_cnt, should_steal);
>      return cnt;
> @@ -4103,15 +4131,15 @@ dump_flow_pattern(struct rte_flow_item *item)
>          VLOG_DBG("rte flow vlan pattern:\n");
>          if (vlan_spec) {
> -            VLOG_DBG("  Spec: tpid=0x%"PRIx16", tci=0x%"PRIx16"\n",
> -                     ntohs(vlan_spec->tpid), ntohs(vlan_spec->tci));
> +            VLOG_DBG("  Spec: inner_type=0x%"PRIx16", 
> tci=0x%"PRIx16"\n",
> +                     ntohs(vlan_spec->inner_type), 
> ntohs(vlan_spec->tci));
>          } else {
>              VLOG_DBG("  Spec = null\n");
>          }
>          if (vlan_mask) {
> -            VLOG_DBG("  Mask: tpid=0x%"PRIx16", tci=0x%"PRIx16"\n",
> -                     vlan_mask->tpid, vlan_mask->tci);
> +            VLOG_DBG("  Mask: inner_type=0x%"PRIx16", 
> tci=0x%"PRIx16"\n",
> +                     vlan_mask->inner_type, vlan_mask->tci);

EC> is the mask always in host order vs the value being in network 
order? Or is this an existing bug?

>          } else {
>              VLOG_DBG("  Mask = null\n");
>          }
> @@ -4281,27 +4309,56 @@ add_flow_action(struct flow_actions *actions, 
> enum rte_flow_action_type type,
>      actions->cnt++;
>  }
> +/*
> + * Storage for struct rte_flow_action_rss
> + * including storage for key and queue array
> + */
> +#define MAX_RSS_HASH_KEY_LENGTH  128
> +#define MAX_ACTION_RSS_QUEUE_NUM 128
> +struct action_rss_data {
> +    struct rte_flow_action_rss conf;
> +    uint16_t queue[MAX_ACTION_RSS_QUEUE_NUM];
> +    uint8_t key[MAX_RSS_HASH_KEY_LENGTH];
> +};
> +
>  static struct rte_flow_action_rss *
>  add_flow_rss_action(struct flow_actions *actions,
>                      struct netdev *netdev) {
>      int i;
> -    struct rte_flow_action_rss *rss;
> +    struct action_rss_data *rss_data;
> -    rss = xmalloc(sizeof(*rss) + sizeof(uint16_t) * netdev->n_rxq);
> -    /*
> -     * Setting it to NULL will let the driver use the default RSS
> -     * configuration we have set: &port_conf.rx_adv_conf.rss_conf.
> -     */
> -    rss->rss_conf = NULL;
> -    rss->num = netdev->n_rxq;
> +    if (netdev->n_rxq > MAX_ACTION_RSS_QUEUE_NUM) {
> +        VLOG_ERR("Num of rxq (%u) must not be greater " \
> +                 "than max rss num of queues (%u)",
> +                 netdev->n_rxq, MAX_ACTION_RSS_QUEUE_NUM);
> +        return NULL;
> +    }
> -    for (i = 0; i < rss->num; i++) {
> -        rss->queue[i] = i;
> +    rss_data = xmalloc(sizeof(struct action_rss_data));
> +    *rss_data = (struct action_rss_data){
> +        .conf = (struct rte_flow_action_rss){
> +            .func = RTE_ETH_HASH_FUNCTION_DEFAULT,
> +            .level = 0,
> +            .types = ETH_RSS_IP,
> +            .key_len = 0,
> +            .queue_num = netdev->n_rxq,
> +            .queue = rss_data->queue,
> +            .key = rss_data->key,
> +        },
> +        .key = { 0 },
> +        .queue = { 0 },
> +    };
> +

EC> See Ilya's comment on xzalloc() so only non-zero values are needed.

> +    /* TODO: Override key with default */

EC> Any reason why this todo is left in?

> +    /* Override queue array with default */
> +    for (i = 0; i < rss_data->conf.queue_num; i++) {
> +       rss_data->queue[i] = i;
>      }
> -    add_flow_action(actions, RTE_FLOW_ACTION_TYPE_RSS, rss);
> +    add_flow_action(actions, RTE_FLOW_ACTION_TYPE_RSS, 
> &rss_data->conf);
> -    return rss;
> +    return &rss_data->conf;
>  }
>  static int
> @@ -4365,7 +4422,7 @@ netdev_dpdk_add_rte_flow_offload(struct netdev 
> *netdev,
>          vlan_mask.tci  = match->wc.masks.vlans[0].tci & 
> ~htons(VLAN_CFI);
>          /* match any protocols */
> -        vlan_mask.tpid = 0;
> +        vlan_mask.inner_type = 0;
>          add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_VLAN,
>                           &vlan_spec, &vlan_mask);
> @@ -4516,6 +4573,11 @@ end_proto_check:
>      struct rte_flow_action_rss *rss;
>      rss = add_flow_rss_action(&actions, netdev);
> +    if (!rss) {
> +        VLOG_ERR("add_flow_rss_action error.\n");
> +        ret = -1;
> +        goto out;
> +    }
>      add_flow_action(&actions, RTE_FLOW_ACTION_TYPE_END, NULL);
>      flow = rte_flow_create(dev->port_id, &flow_attr, patterns.items,


More information about the dev mailing list