[ovs-dev] [PATCH v11 4/5] dpif-netdev: Refactor generic implementation

Van Haaren, Harry harry.van.haaren at intel.com
Wed Jul 17 10:29:18 UTC 2019


> -----Original Message-----
> From: Ilya Maximets [mailto:i.maximets at samsung.com]
> Sent: Wednesday, July 17, 2019 11:14 AM
> To: Van Haaren, Harry <harry.van.haaren at intel.com>; dev at openvswitch.org
> Cc: echaudro at redhat.com; malvika.gupta at arm.com; Stokes, Ian
> <ian.stokes at intel.com>
> Subject: Re: [PATCH v11 4/5] dpif-netdev: Refactor generic implementation

<snip>

> >> diff --git a/lib/dpif-netdev-lookup-generic.c b/lib/dpif-netdev-lookup-generic.c
> >> index 833abf54f..abd166fc3 100644
> >> --- a/lib/dpif-netdev-lookup-generic.c
> >> +++ b/lib/dpif-netdev-lookup-generic.c
> >> @@ -30,68 +30,210 @@
> >>  #include "packets.h"
> >>  #include "pvector.h"
> >>
> >> -/* Returns a hash value for the bits of 'key' where there are 1-bits in
> >> - * 'mask'. */
> >> -static inline uint32_t
> >> -netdev_flow_key_hash_in_mask(const struct netdev_flow_key *key,
> >> -                             const struct netdev_flow_key *mask)
> >> +VLOG_DEFINE_THIS_MODULE(dpif_lookup_generic);
> >
> > dpif_netdev_lookup_generic?
> >
> > BTW, this might be not needed if we'll remove logging in the patch #5.

Refer to my feedback on 5/5 just now - leaving the logging in the implementation
is the better solution, IMO.


> >> +/* Lookup functions below depends on the internal structure of flowmap. */
> >> +BUILD_ASSERT_DECL(FLOWMAP_UNITS == 2);
> >> +
> >> +/* Given a packet, table and mf_masks, this function iterates over each bit
> >> + * set in the subtable, and calculates the appropriate metadata to store in the
> >> + * blocks_scratch[].
> >> + *
> >> + * The results of the blocks_scratch[] can be used for hashing, and later for
> >> + * verification of if a rule matches the given packet.
> >> + */
> >> +static inline void
> >> +netdev_flow_key_flatten_unit(const uint64_t *pkt_blocks,
> >> +                             const uint64_t *tbl_blocks,
> >> +                             const uint64_t *mf_masks,
> >> +                             uint64_t *blocks_scratch,
> >> +                             const uint64_t pkt_mf_bits,
> >> +                             const uint32_t count)
> >>  {
> >> -    const uint64_t *p = miniflow_get_values(&mask->mf);
> >> -    uint32_t hash = 0;
> >> -    uint64_t value;
> >> +    uint32_t i;
> >> +
> >> +    for (i = 0; i < count; i++) {
> >> +        uint64_t mf_mask = mf_masks[i];
> >> +        /* Calculate the block index for the packet metadata. */
> >> +        uint64_t idx_bits = mf_mask & pkt_mf_bits;
> >> +        const uint32_t pkt_idx = count_1bits(idx_bits);
> >> +
> >> +        /* Check if the packet has the subtable miniflow bit set. If yes, the
> >> +         * block at the above pkt_idx will be stored, otherwise it is masked
> >> +         * out to be zero.
> >> +         */
> >> +        uint64_t pkt_has_mf_bit = (mf_mask + 1) & pkt_mf_bits;
> >> +        uint64_t no_bit = ((!pkt_has_mf_bit) > 0) - 1;
> >>
> >> -    NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP (value, key, mask->mf.map) {
> >> -        hash = hash_add64(hash, value & *p);
> >> -        p++;
> >> +        /* Mask packet block by table block, and mask to zero if packet
> >> +         * doesn't actually contain this block of metadata.
> >> +         */
> >> +        blocks_scratch[i] = pkt_blocks[pkt_idx] & tbl_blocks[i] & no_bit;
> >>      }
> >> +}
> >> +
> >> +/* This function takes a packet, and subtable and writes an array of uint64_t
> >> + * blocks. The blocks contain the metadata that the subtable matches on, in
> >> + * the same order as the subtable, allowing linear iteration over the blocks.
> >> + *
> >> + * To calculate the blocks contents, the netdev_flow_key_flatten_unit function
> >> + * is called twice, once for each "unit" of the miniflow. This call can be
> >> + * inlined by the compiler for performance.
> >> + *
> >> + * Note that the u0_count and u1_count variables can be compile-time constants,
> >> + * allowing the loop in the inlined flatten_unit() function to be compile-time
> >> + * unrolled, or possibly removed totally by unrolling by the loop iterations.
> >> + * The compile time optimizations enabled by this design improves performance.
> >> + */
> >> +static inline void
> >> +netdev_flow_key_flatten(const struct netdev_flow_key *key,
> >> +                        const struct netdev_flow_key *mask,
> >> +                        const uint64_t *mf_masks,
> >> +                        uint64_t *blocks_scratch,
> >> +                        const uint32_t u0_count,
> >> +                        const uint32_t u1_count)
> >> +{
> >> +    /* Load mask from subtable, mask with packet mf, popcount to get idx. */
> >> +    const uint64_t *pkt_blocks = miniflow_get_values(&key->mf);
> >> +    const uint64_t *tbl_blocks = miniflow_get_values(&mask->mf);
> >>
> >> -    return hash_finish(hash, (p - miniflow_get_values(&mask->mf)) * 8);
> >> +    /* Packet miniflow bits to be masked by pre-calculated mf_masks. */
> >> +    const uint64_t pkt_bits_u0 = key->mf.map.bits[0];
> >> +    const uint32_t pkt_bits_u0_pop = count_1bits(pkt_bits_u0);
> >> +    const uint64_t pkt_bits_u1 = key->mf.map.bits[1];
> >> +
> >> +    /* Unit 0 flattening */
> >> +    netdev_flow_key_flatten_unit(&pkt_blocks[0],
> >> +                                 &tbl_blocks[0],
> >> +                                 &mf_masks[0],
> >> +                                 &blocks_scratch[0],
> >> +                                 pkt_bits_u0,
> >> +                                 u0_count);
> >> +
> >> +    /* Unit 1 flattening:
> >> +     * Move the pointers forward in the arrays based on u0 offsets, NOTE:
> >> +     * 1) pkt blocks indexed by actual popcount of u0, which is NOT always
> >> +     *    the same as the amount of bits set in the subtable.
> >> +     * 2) mf_masks, tbl_block and blocks_scratch are all "flat" arrays, so
> >> +     *    the index is always u0_count.
> >> +     */
> >> +    netdev_flow_key_flatten_unit(&pkt_blocks[pkt_bits_u0_pop],
> >> +                                 &tbl_blocks[u0_count],
> >> +                                 &mf_masks[u0_count],
> >> +                                 &blocks_scratch[u0_count],
> >> +                                 pkt_bits_u1,
> >> +                                 u1_count);
> >>  }
> >>
> >> -uint32_t
> >> -dpcls_subtable_lookup_generic(struct dpcls_subtable *subtable,
> >> -                              uint32_t keys_map,
> >> -                              const struct netdev_flow_key *keys[],
> >> -                              struct dpcls_rule **rules)
> >> +/* Compares a rule and the blocks representing a key, returns 1 on a match. */
> >> +static inline uint64_t
> >> +netdev_rule_matches_key(const struct dpcls_rule *rule,
> >> +                        const uint32_t mf_bits_total,
> >> +                        const uint64_t *blocks_scratch)
> >>  {
> >> -    int i;
> >> -    uint32_t found_map;
> >> +    const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
> >> +    const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
> >> +    uint64_t not_match = 0;
> >> +
> >> +    for (int i = 0; i < mf_bits_total; i++) {
> >> +        not_match |= (blocks_scratch[i] & maskp[i]) != keyp[i];
> >> +    }
> >>
> >> -    /* Compute hashes for the remaining keys.  Each search-key is
> >> -     * masked with the subtable's mask to avoid hashing the wildcarded
> >> -     * bits. */
> >> +    /* invert result to show match as 1 */
> >
> > Missed comment (style).

Will fix in v12.


<snip>
> >> +    /* Hash the now linearized blocks of packet metadata. */
> >>      ULLONG_FOR_EACH_1 (i, keys_map) {
> >> -        hashes[i] = netdev_flow_key_hash_in_mask(keys[i], &subtable->mask);
> >> +        uint64_t *block_ptr = &blocks_scratch[i * bit_count_total];
> >> +        uint32_t hash = hash_add_words64(0, block_ptr, bit_count_total);
> >> +        hashes[i] = hash_finish(hash, bit_count_total * 8);
> 
> You mentioned before that 'hash_words64_inline()' has a different finish
> value, but I don't think that it matters to preserve the actual value of
> a hash here. Even 'hash_words64()' should be fine, because it checks if
> the argument is a compile time constant and calls the 'hash_words64_inline'
> in this case.

Be very careful reading the hash.h inline functions: there are lots of them, and
they're #ifdef-ed based on platform etc. It is quite confusing, IMO.

Notice that the call I'm making is to "hash_add_words64()", not "hash_words64()".
hash_add_words64() provides only the accumulation loop and does not call
hash_finish(); my code above calls hash_finish() itself with the correct value.

You're right that hash_words64() calls the _inline() flavor if the n_words
variable is a compile-time constant, but that flavor ends up calling
hash_finish() internally with its own finish value, and hence is not valid here.


I've prototyped a variety of different code-paths here; the above (as of v11)
makes the most re-use of the existing hash code. v10 and earlier used the same
logic, but with hand-rolled loops of its own.

I suggest using the v11 version as it re-uses the hash loops - which is cleaner.
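
To make the difference concrete, here's a minimal sketch of the two call
patterns (the first matches the hunk quoted above; the second is the
hash_words64() alternative being discussed; 'i', 'hashes', 'block_ptr' and
'bit_count_total' are assumed from the surrounding code):

    /* v11 pattern: re-use only the accumulation loop from hash.h, then
     * finalize explicitly so the value passed to hash_finish() stays
     * under our control. */
    uint32_t hash = hash_add_words64(0, block_ptr, bit_count_total);
    hashes[i] = hash_finish(hash, bit_count_total * 8);

    /* Rejected pattern: hash_words64() accumulates AND finalizes
     * internally (via the _inline() flavor when n_words is a compile-time
     * constant), so the caller cannot choose the finish value. */
    hashes[i] = hash_words64(block_ptr, bit_count_total, 0);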


> >> -    /* Lookup. */
> >> +    /* Lookup: this returns a bitmask of packets where the hash table had
> >> +     * an entry for the given hash key. Presence of a hash key does not
> >> +     * guarantee matching the key, as there can be hash collisions.
> >> +     */
> >> +    uint32_t found_map;
> >>      const struct cmap_node *nodes[NETDEV_MAX_BURST];
> >> +
> >>      found_map = cmap_find_batch(&subtable->rules, keys_map, hashes, nodes);
> >>
> >> -    /* Check results.  When the i-th bit of found_map is set, it means
> >> -     * that a set of nodes with a matching hash value was found for the
> >> -     * i-th search-key.  Due to possible hash collisions we need to check
> >> -     * which of the found rules, if any, really matches our masked
> >> -     * search-key. */
> >> +    /* Verify that packet actually matched rule. If not found, a hash
> >> +     * collision has taken place, so continue searching with the next node.
> >> +     */
> >>      ULLONG_FOR_EACH_1 (i, found_map) {
> >>          struct dpcls_rule *rule;
> >>
> >>          CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
> >> -            if (OVS_LIKELY(dpcls_rule_matches_key(rule, keys[i]))) {
> >> +            const uint32_t cidx = i * bit_count_total;
> >> +            uint32_t match = netdev_rule_matches_key(rule, bit_count_total,
> >> +                                                     &blocks_scratch[cidx]);
> >> +
> >> +            if (OVS_LIKELY(match)) {
> >>                  rules[i] = rule;
> >> -                /* Even at 20 Mpps the 32-bit hit_cnt cannot wrap
> >> -                 * within one second optimization interval. */
> >>                  subtable->hit_cnt++;
> >>                  goto next;
> >>              }
> >>          }
> >> -        /* None of the found rules was a match.  Reset the i-th bit to
> >> -         * keep searching this key in the next subtable. */
> >> -        ULLONG_SET0(found_map, i);  /* Did not match. */
> >> +
> >> +        /* None of the found rules was a match.  Clear the i-th bit to
> >> +         * search for this key in the next subtable. */
> >> +        ULLONG_SET0(found_map, i);
> >>      next:
> >> -        ;             /* Keep Sparse happy. */
> >> +        ;                     /* Keep Sparse happy. */
> >
> > You're changing the indentation of this comment back and forth few times
> > around the patch series.

Will investigate and fix in v12.


<snip>

> >> -/* Prototype for generic lookup func, using same code path as before. */
> >> +/* Prototype for generic lookup func, using generic scalar code path. */
> >>  uint32_t
> >>  dpcls_subtable_lookup_generic(struct dpcls_subtable *subtable,
> >> +                              uint64_t *blocks_scratch,
> >>                                uint32_t keys_map,
> >>                                const struct netdev_flow_key *keys[],
> >>                                struct dpcls_rule **rules);
> >> @@ -84,13 +86,13 @@ struct dpcls_subtable {
> >>      uint8_t mf_bits_set_unit0;
> >>      uint8_t mf_bits_set_unit1;
> >>
> >> -    /* the lookup function to use for this subtable. If there is a known
> >> +    /* The lookup function to use for this subtable. If there is a known
> >>       * property of the subtable (eg: only 3 bits of miniflow metadata is
> >>       * used for the lookup) then this can point at an optimized version of
> >>       * the lookup function for this particular subtable. */
> >>      dpcls_subtable_lookup_func lookup_func;
> >>
> >> -    /* caches the masks to match a packet to, reducing runtime calculations */
> >> +    /* Caches the masks to match a packet to, reducing runtime calculations. */
> >
> > Above two changes should be part of the previous patches.
> >
> > BTW, introduction of the 'mf_masks' and 'mf_bits*' should happen in this patch,
> > not in the "dpif-netdev: Move dpcls lookup structures to .h".

Will investigate and fix in v12.


> >>      uint64_t *mf_masks;
> >>
> >>      struct netdev_flow_key mask; /* Wildcards for fields (const). */
> >> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> >> index b42ca35e3..8acc1445a 100644
> >> --- a/lib/dpif-netdev.c
> >> +++ b/lib/dpif-netdev.c
> >> @@ -233,6 +233,15 @@ struct dpcls {
> >>      odp_port_t in_port;
> >>      struct cmap subtables_map;
> >>      struct pvector subtables;
> >> +
> >> +    /* Region of memory for this DPCLS instance to use as scratch.
> >> +     * Size is garaunteed to be large enough to hold all blocks required for
> >
> > s/garaunteed/guaranteed/

Typo - will fix.


> >> +     * the subtable's to match on. This allows each dpcls lookup to flatten
> >> +     * the packet miniflows into this blocks_scratch area, without using
> >> +     * variable lenght arrays. This region is allocated on subtable create, and
> >
> > s/lenght/length/

Typo - will fix.

<snip>

