[ovs-dev] [PATCH v6 2/6] dpif-netdev: retrieve flow directly from the flow mark

Stokes, Ian ian.stokes at intel.com
Wed Mar 7 17:00:01 UTC 2018


> So that we could skip some very costly CPU operations, including but not
> limiting to miniflow_extract, emc lookup, dpcls lookup, etc. Thus,
> performance could be greatly improved.
> 
> A PHY-PHY forwarding with 1000 mega flows (udp,tp_src=1000-1999) and
> 1 million streams (tp_src=1000-1999, tp_dst=2000-2999) show more that 260%
> performance boost.
> 
> Note that though the heavy miniflow_extract is skipped, we still have to
> do per packet checking, due to we have to check the tcp_flags.
> 

Hi Yuanhan,

This patch breaks a number of tests in the OVS Travis nightly build, these will have to be fixed as each patch must be able to compile independently.

I've included a link to the Travis build where the failures occur (note the tests passed without issue for patch 1 of the series).

https://travis-ci.org/istokes/ovs/builds/350372444

Few more comments below.

> Co-authored-by: Finn Christensen <fc at napatech.com>
> Signed-off-by: Yuanhan Liu <yliu at fridaylinux.org>
> Signed-off-by: Finn Christensen <fc at napatech.com>
> 
> ---
> v5: - do not fetch the flow if the flow has been dead
> ---
>  lib/dp-packet.h   |  13 +++++
>  lib/dpif-netdev.c |  43 ++++++++++++---
>  lib/flow.c        | 155 +++++++++++++++++++++++++++++++++++++++++++------
> -----
>  lib/flow.h        |   1 +
>  4 files changed, 174 insertions(+), 38 deletions(-)
> 
> diff --git a/lib/dp-packet.h b/lib/dp-packet.h index b4b721c..ea1194c
> 100644
> --- a/lib/dp-packet.h
> +++ b/lib/dp-packet.h
> @@ -691,6 +691,19 @@ reset_dp_packet_checksum_ol_flags(struct dp_packet
> *p)  #define reset_dp_packet_checksum_ol_flags(arg)
>  #endif
> 
> +static inline bool
> +dp_packet_has_flow_mark(struct dp_packet *p OVS_UNUSED,
> +                        uint32_t *mark OVS_UNUSED) { #ifdef DPDK_NETDEV
> +    if (p->mbuf.ol_flags & PKT_RX_FDIR_ID) {
> +        *mark = p->mbuf.hash.fdir.hi;
> +        return true;
> +    }
> +#endif
> +    return false;
> +}
> +
>  enum { NETDEV_MAX_BURST = 32 }; /* Maximum number packets in a batch. */
> 
>  struct dp_packet_batch {
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index a514de8..e16f5ce
> 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -2013,6 +2013,22 @@ flow_mark_flush(struct dp_netdev_pmd_thread *pmd)
>      }
>  }
> 
> +static struct dp_netdev_flow *
> +mark_to_flow_find(const struct dp_netdev_pmd_thread *pmd,
> +                  const uint32_t mark)
> +{
> +    struct dp_netdev_flow *flow;
> +
> +    CMAP_FOR_EACH_WITH_HASH (flow, mark_node, mark,
> &flow_mark.mark_to_flow) {
> +        if (flow->mark == mark && flow->pmd_id == pmd->core_id &&
> +            flow->dead == false) {
> +            return flow;
> +        }
> +    }
> +
> +    return NULL;
> +}
> +
>  static void
>  dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd,
>                            struct dp_netdev_flow *flow) @@ -5203,10
> +5219,10 @@ struct packet_batch_per_flow {  static inline void
> packet_batch_per_flow_update(struct packet_batch_per_flow *batch,
>                               struct dp_packet *packet,
> -                             const struct miniflow *mf)
> +                             uint16_t tcp_flags)
>  {
>      batch->byte_count += dp_packet_size(packet);
> -    batch->tcp_flags |= miniflow_get_tcp_flags(mf);
> +    batch->tcp_flags |= tcp_flags;
>      batch->array.packets[batch->array.count++] = packet;  }
> 
> @@ -5240,7 +5256,7 @@ packet_batch_per_flow_execute(struct
> packet_batch_per_flow *batch,
> 
>  static inline void
>  dp_netdev_queue_batches(struct dp_packet *pkt,
> -                        struct dp_netdev_flow *flow, const struct
> miniflow *mf,
> +                        struct dp_netdev_flow *flow, uint16_t
> + tcp_flags,
>                          struct packet_batch_per_flow *batches,
>                          size_t *n_batches)  { @@ -5251,7 +5267,7 @@
> dp_netdev_queue_batches(struct dp_packet *pkt,
>          packet_batch_per_flow_init(batch, flow);
>      }
> 
> -    packet_batch_per_flow_update(batch, pkt, mf);
> +    packet_batch_per_flow_update(batch, pkt, tcp_flags);
>  }
> 
>  /* Try to process all ('cnt') the 'packets' using only the exact match
> cache @@ -5282,6 +5298,7 @@ emc_processing(struct dp_netdev_pmd_thread
> *pmd,
>      const size_t cnt = dp_packet_batch_size(packets_);
>      uint32_t cur_min;
>      int i;
> +    uint16_t tcp_flags;
> 
>      atomic_read_relaxed(&pmd->dp->emc_insert_min, &cur_min);
>      pmd_perf_update_counter(&pmd->perf_stats,
> @@ -5290,6 +5307,7 @@ emc_processing(struct dp_netdev_pmd_thread *pmd,
> 
>      DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
>          struct dp_netdev_flow *flow;
> +        uint32_t flow_mark;

This will cause a compilation failure

lib/dpif-netdev.c:5310:18: error: declaration of 'flow_mark' shadows a global declaration [-Werror=shadow]
         uint32_t flow_mark;
                  ^
lib/dpif-netdev.c:1860:25: error: shadowed declaration is here [-Werror=shadow]
 static struct flow_mark flow_mark = {

> 
>          if (OVS_UNLIKELY(dp_packet_size(packet) < ETH_HEADER_LEN)) {
>              dp_packet_delete(packet);
> @@ -5297,6 +5315,16 @@ emc_processing(struct dp_netdev_pmd_thread *pmd,
>              continue;
>          }
> 
> +        if (dp_packet_has_flow_mark(packet, &flow_mark)) {
> +            flow = mark_to_flow_find(pmd, flow_mark);
> +            if (flow) {
> +                tcp_flags = parse_tcp_flags(packet);
> +                dp_netdev_queue_batches(packet, flow, tcp_flags, batches,
> +                                        n_batches);
> +                continue;
> +            }
> +        }
> +
>          if (i != cnt - 1) {
>              struct dp_packet **packets = packets_->packets;
>              /* Prefetch next packet data and metadata. */ @@ -5322,7
> +5350,8 @@ emc_processing(struct dp_netdev_pmd_thread *pmd,
>              flow = NULL;
>          }
>          if (OVS_LIKELY(flow)) {
> -            dp_netdev_queue_batches(packet, flow, &key->mf, batches,
> +            tcp_flags = miniflow_get_tcp_flags(&key->mf);
> +            dp_netdev_queue_batches(packet, flow, tcp_flags, batches,
>                                      n_batches);
>          } else {
>              /* Exact match cache missed. Group missed packets together at
> @@ -5501,7 +5530,9 @@ fast_path_processing(struct dp_netdev_pmd_thread
> *pmd,
>          flow = dp_netdev_flow_cast(rules[i]);
> 
>          emc_probabilistic_insert(pmd, &keys[i], flow);
> -        dp_netdev_queue_batches(packet, flow, &keys[i].mf, batches,
> n_batches);
> +        dp_netdev_queue_batches(packet, flow,
> +                                miniflow_get_tcp_flags(&keys[i].mf),
> +                                batches, n_batches);
>      }
> 
>      pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_HIT, diff -
> -git a/lib/flow.c b/lib/flow.c index f9d7c2a..80718d4 100644
> --- a/lib/flow.c
> +++ b/lib/flow.c
> @@ -624,6 +624,70 @@ flow_extract(struct dp_packet *packet, struct flow
> *flow)
>      miniflow_expand(&m.mf, flow);
>  }
> 

I'm not a fan of the sanity checks for IPv4,IPv6,fragementaion etc. being included in this patch.

I understand the motivation of doing this but nowhere is it mentioned in the commit message that these changes occur.

As the miniflow extract function is an important function that affects users regardless of HWOL usage, I think these changes should be applied in a separate patch and called out clearly in the commit so that users are aware.

> +static inline bool
> +ipv4_sanity_check(const struct ip_header *nh, size_t size,
> +                  int *ip_lenp, uint16_t *tot_lenp) {
> +    int ip_len;
> +    uint16_t tot_len;
> +
> +    if (OVS_UNLIKELY(size < IP_HEADER_LEN)) {
> +        return false;
> +    }
> +    ip_len = IP_IHL(nh->ip_ihl_ver) * 4;
> +
> +    if (OVS_UNLIKELY(ip_len < IP_HEADER_LEN || size < ip_len)) {
> +        return false;
> +    }
> +
> +    tot_len = ntohs(nh->ip_tot_len);
> +    if (OVS_UNLIKELY(tot_len > size || ip_len > tot_len ||
> +                size - tot_len > UINT8_MAX)) {
> +        return false;
> +    }
> +
> +    *ip_lenp = ip_len;
> +    *tot_lenp = tot_len;
> +
> +    return true;
> +}
> +
> +static inline uint8_t
> +ipv4_get_nw_frag(const struct ip_header *nh) {
> +    uint8_t nw_frag = 0;
> +
> +    if (OVS_UNLIKELY(IP_IS_FRAGMENT(nh->ip_frag_off))) {
> +        nw_frag = FLOW_NW_FRAG_ANY;
> +        if (nh->ip_frag_off & htons(IP_FRAG_OFF_MASK)) {
> +            nw_frag |= FLOW_NW_FRAG_LATER;
> +        }
> +    }
> +
> +    return nw_frag;
> +}
> +
> +static inline bool
> +ipv6_sanity_check(const struct ovs_16aligned_ip6_hdr *nh, size_t size)
> +{
> +    uint16_t plen;
> +
> +    if (OVS_UNLIKELY(size < sizeof *nh)) {
> +        return false;
> +    }
> +
> +    plen = ntohs(nh->ip6_plen);
> +    if (OVS_UNLIKELY(plen > size)) {
> +        return false;
> +    }
> +    /* Jumbo Payload option not supported yet. */
> +    if (OVS_UNLIKELY(size - plen > UINT8_MAX)) {
> +        return false;
> +    }
> +
> +    return true;
> +}
> +
>  /* Caller is responsible for initializing 'dst' with enough storage for
>   * FLOW_U64S * 8 bytes. */
>  void
> @@ -748,22 +812,7 @@ miniflow_extract(struct dp_packet *packet, struct
> miniflow *dst)
>          int ip_len;
>          uint16_t tot_len;
> 
> -        if (OVS_UNLIKELY(size < IP_HEADER_LEN)) {
> -            goto out;
> -        }
> -        ip_len = IP_IHL(nh->ip_ihl_ver) * 4;
> -
> -        if (OVS_UNLIKELY(ip_len < IP_HEADER_LEN)) {
> -            goto out;
> -        }
> -        if (OVS_UNLIKELY(size < ip_len)) {
> -            goto out;
> -        }
> -        tot_len = ntohs(nh->ip_tot_len);
> -        if (OVS_UNLIKELY(tot_len > size || ip_len > tot_len)) {
> -            goto out;
> -        }
> -        if (OVS_UNLIKELY(size - tot_len > UINT8_MAX)) {
> +        if (OVS_UNLIKELY(!ipv4_sanity_check(nh, size, &ip_len,
> + &tot_len))) {
>              goto out;
>          }
>          dp_packet_set_l2_pad_size(packet, size - tot_len); @@ -786,31
> +835,19 @@ miniflow_extract(struct dp_packet *packet, struct miniflow
> *dst)
>          nw_tos = nh->ip_tos;
>          nw_ttl = nh->ip_ttl;
>          nw_proto = nh->ip_proto;
> -        if (OVS_UNLIKELY(IP_IS_FRAGMENT(nh->ip_frag_off))) {
> -            nw_frag = FLOW_NW_FRAG_ANY;
> -            if (nh->ip_frag_off & htons(IP_FRAG_OFF_MASK)) {
> -                nw_frag |= FLOW_NW_FRAG_LATER;
> -            }
> -        }
> +        nw_frag = ipv4_get_nw_frag(nh);
>          data_pull(&data, &size, ip_len);
>      } else if (dl_type == htons(ETH_TYPE_IPV6)) {
> -        const struct ovs_16aligned_ip6_hdr *nh;
> +        const struct ovs_16aligned_ip6_hdr *nh = data;
>          ovs_be32 tc_flow;
>          uint16_t plen;
> 
> -        if (OVS_UNLIKELY(size < sizeof *nh)) {
> +        if (OVS_UNLIKELY(!ipv6_sanity_check(nh, size))) {
>              goto out;
>          }
> -        nh = data_pull(&data, &size, sizeof *nh);
> +        data_pull(&data, &size, sizeof *nh);
> 
>          plen = ntohs(nh->ip6_plen);
> -        if (OVS_UNLIKELY(plen > size)) {
> -            goto out;
> -        }
> -        /* Jumbo Payload option not supported yet. */
> -        if (OVS_UNLIKELY(size - plen > UINT8_MAX)) {
> -            goto out;
> -        }
>          dp_packet_set_l2_pad_size(packet, size - plen);
>          size = plen;   /* Never pull padding. */
> 
> @@ -982,6 +1019,60 @@ parse_dl_type(const struct eth_header *data_, size_t
> size)
>      return parse_ethertype(&data, &size);  }
> 
> +uint16_t
> +parse_tcp_flags(struct dp_packet *packet) {
> +    const void *data = dp_packet_data(packet);
> +    size_t size = dp_packet_size(packet);
> +    ovs_be16 dl_type;
> +    uint8_t nw_frag = 0, nw_proto = 0;
> +
> +    if (packet->packet_type != htonl(PT_ETH)) {
> +        return 0;
> +    }
> +
> +    data_pull(&data, &size, ETH_ADDR_LEN * 2);
> +    dl_type = parse_ethertype(&data, &size);
> +    if (OVS_LIKELY(dl_type == htons(ETH_TYPE_IP))) {
> +        const struct ip_header *nh = data;
> +        int ip_len;
> +        uint16_t tot_len;
> +
> +        if (OVS_UNLIKELY(!ipv4_sanity_check(nh, size, &ip_len,
> &tot_len))) {
> +            return 0;
> +        }
> +        nw_proto = nh->ip_proto;
> +        nw_frag = ipv4_get_nw_frag(nh);
> +
> +        size = tot_len;   /* Never pull padding. */
> +        data_pull(&data, &size, ip_len);
> +    } else if (dl_type == htons(ETH_TYPE_IPV6)) {
> +        const struct ovs_16aligned_ip6_hdr *nh = data;
> +
> +        if (OVS_UNLIKELY(!ipv6_sanity_check(nh, size))) {
> +            return 0;
> +        }
> +        data_pull(&data, &size, sizeof *nh);
> +
> +        size = ntohs(nh->ip6_plen); /* Never pull padding. */
> +        if (!parse_ipv6_ext_hdrs__(&data, &size, &nw_proto, &nw_frag)) {
> +            return 0;
> +        }
> +        nw_proto = nh->ip6_nxt;
> +    } else {
> +        return 0;
> +    }
> +
> +    if (!(nw_frag & FLOW_NW_FRAG_LATER) && nw_proto == IPPROTO_TCP &&
> +        size >= TCP_HEADER_LEN) {
> +        const struct tcp_header *tcp = data;
> +
> +        return TCP_FLAGS_BE32(tcp->tcp_ctl);
> +    }
> +
> +    return 0;
> +}
> +
>  /* For every bit of a field that is wildcarded in 'wildcards', sets the
>   * corresponding bit in 'flow' to zero. */  void diff --git a/lib/flow.h
> b/lib/flow.h index eb1e2bf..6be4199 100644
> --- a/lib/flow.h
> +++ b/lib/flow.h
> @@ -130,6 +130,7 @@ bool parse_ipv6_ext_hdrs(const void **datap, size_t
> *sizep, uint8_t *nw_proto,
>                           uint8_t *nw_frag);
>  ovs_be16 parse_dl_type(const struct eth_header *data_, size_t size);
> bool parse_nsh(const void **datap, size_t *sizep, struct ovs_key_nsh
> *key);
> +uint16_t parse_tcp_flags(struct dp_packet *packet);
> 
>  static inline uint64_t
>  flow_get_xreg(const struct flow *flow, int idx)
> --
> 2.7.4



More information about the dev mailing list