[ovs-dev] [PATCH v9 4/5] dpif-netdev: refactor generic implementation
Harry van Haaren
harry.van.haaren at intel.com
Wed May 8 15:13:20 UTC 2019
This commit refactors the generic implementation. The
goal of this refactor is to simplify the code to enable
"specialization" of the functions at compile time.
Given compile-time optimizations, the compiler is able
to unroll loops, and create optimized code sequences due
to compile time knowledge of loop-trip counts.
In order to enable these compiler optimizations, we must
refactor the code to pass the loop-trip counts to functions
as compile time constants.
This patch allows the number of miniflow-bits set per "unit"
in the miniflow to be passed around as a function argument.
Note that this patch does NOT yet take advantage of doing so,
this is only a refactor to enable it in the next patches.
Signed-off-by: Harry van Haaren <harry.van.haaren at intel.com>
---
v9:
- Use count_1bits in favour of __builtin_popcount (Ilya)
- Use ALWAYS_INLINE instead of __attribute__ syntax (Ilya)
v8:
- Rework block_cache and mf_masks to avoid variable-length array
due to compiler issues. Provisioning for worst case is not a
good solution due to magnitude of over-provisioning required.
- Rework netdev_flatten function removing unused parameter
---
lib/dpif-netdev-lookup-generic.c | 239 ++++++++++++++++++++++++-------
lib/dpif-netdev.c | 79 +++++++++-
lib/dpif-netdev.h | 20 ++-
3 files changed, 283 insertions(+), 55 deletions(-)
diff --git a/lib/dpif-netdev-lookup-generic.c b/lib/dpif-netdev-lookup-generic.c
index 2e4003408..901d28ff0 100644
--- a/lib/dpif-netdev-lookup-generic.c
+++ b/lib/dpif-netdev-lookup-generic.c
@@ -28,67 +28,204 @@
#include "packets.h"
#include "pvector.h"
-/* Returns a hash value for the bits of 'key' where there are 1-bits in
- * 'mask'. */
-static inline uint32_t
-netdev_flow_key_hash_in_mask(const struct netdev_flow_key *key,
- const struct netdev_flow_key *mask)
+VLOG_DEFINE_THIS_MODULE(dpif_lookup_generic);
+
+/* netdev_flow_key_flatten_unit:
+ * Given a packet, table and mf_masks, this function iterates over each bit
+ * set in the subtable, and calculates the appropriate metadata to store in the
+ * blocks_scratch[].
+ *
+ * The results of the blocks_scratch[] can be used for hashing, and later for
+ * verification of if a rule matches the given packet.
+ */
+static inline void
+netdev_flow_key_flatten_unit(const uint64_t * restrict pkt_blocks,
+ const uint64_t * restrict tbl_blocks,
+ const uint64_t * restrict mf_masks,
+ uint64_t * restrict blocks_scratch,
+ const uint64_t pkt_mf_bits,
+ const uint32_t count)
{
- const uint64_t *p = miniflow_get_values(&mask->mf);
- uint32_t hash = 0;
- uint64_t value;
+ uint32_t i;
+ for (i = 0; i < count; i++) {
+ uint64_t mf_mask = mf_masks[i];
+ /* Calculate the block index for the packet metadata */
+ uint64_t idx_bits = mf_mask & pkt_mf_bits;
+ const uint32_t pkt_idx = count_1bits(idx_bits);
- NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP (value, key, mask->mf.map) {
- hash = hash_add64(hash, value & *p);
- p++;
+ /* check if the packet has the subtable miniflow bit set. If yes, the
+ * block at the above pkt_idx will be stored, otherwise it is masked
+ * out to be zero.
+ */
+ uint64_t pkt_has_mf_bit = (mf_mask + 1) & pkt_mf_bits;
+ uint64_t no_bit = ((!pkt_has_mf_bit) > 0) - 1;
+
+ /* mask packet block by table block, and mask to zero if packet
+ * doesn't actually contain this block of metadata
+ */
+ blocks_scratch[i] = pkt_blocks[pkt_idx] & tbl_blocks[i] & no_bit;
}
+}
+
+/* netdev_flow_key_flatten:
+ * This function takes a packet, and subtable and writes an array of uint64_t
+ * blocks. The blocks contain the metadata that the subtable matches on, in
+ * the same order as the subtable, allowing linear iteration over the blocks.
+ *
+ * To calculate the blocks contents, the netdev_flow_key_flatten_unit function
+ * is called twice, once for each "unit" of the miniflow. This call can be
+ * inlined by the compiler for performance.
+ *
+ * Note that the u0_count and u1_count variables can be compile-time constants,
+ * allowing the loop in the inlined flatten_unit() function to be compile-time
+ * unrolled, or possibly removed entirely, based on the loop iteration count.
+ * The compile time optimizations enabled by this design improve performance.
+ */
+static inline void
+netdev_flow_key_flatten(const struct netdev_flow_key * restrict key,
+ const struct netdev_flow_key * restrict mask,
+ const uint64_t * restrict mf_masks,
+ uint64_t * restrict blocks_scratch,
+ const uint32_t u0_count,
+ const uint32_t u1_count)
+{
+ /* load mask from subtable, mask with packet mf, popcount to get idx */
+ const uint64_t *pkt_blocks = miniflow_get_values(&key->mf);
+ const uint64_t *tbl_blocks = miniflow_get_values(&mask->mf);
+
+ /* packet miniflow bits to be masked by pre-calculated mf_masks */
+ const uint64_t pkt_bits_u0 = key->mf.map.bits[0];
+ const uint32_t pkt_bits_u0_pop = count_1bits(pkt_bits_u0);
+ const uint64_t pkt_bits_u1 = key->mf.map.bits[1];
- return hash_finish(hash, (p - miniflow_get_values(&mask->mf)) * 8);
+ /* Unit 0 flattening */
+ netdev_flow_key_flatten_unit(&pkt_blocks[0],
+ &tbl_blocks[0],
+ &mf_masks[0],
+ &blocks_scratch[0],
+ pkt_bits_u0,
+ u0_count);
+
+ /* Unit 1 flattening:
+ * Move the pointers forward in the arrays based on u0 offsets, NOTE:
+ * 1) pkt blocks indexed by actual popcount of u0, which is NOT always
+ * the same as the amount of bits set in the subtable.
+ * 2) mf_masks, tbl_block and blocks_scratch are all "flat" arrays, so
+ * the index is always u0_count.
+ */
+ netdev_flow_key_flatten_unit(&pkt_blocks[pkt_bits_u0_pop],
+ &tbl_blocks[u0_count],
+ &mf_masks[u0_count],
+ &blocks_scratch[u0_count],
+ pkt_bits_u1,
+ u1_count);
+}
+
+static inline uint64_t
+netdev_rule_matches_key(const struct dpcls_rule * restrict rule,
+ const uint32_t mf_bits_total,
+ const uint64_t * restrict blocks_scratch)
+{
+ const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
+ const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
+
+ uint64_t not_match = 0;
+ for (int i = 0; i < mf_bits_total; i++) {
+ not_match |= (blocks_scratch[i] & maskp[i]) != keyp[i];
+ }
+
+ /* invert result to show match as 1 */
+ return !not_match;
}
+/* const prop version of the function: note that mf bits total and u0 are
+ * explicitly passed in here, while they're also available at runtime from the
+ * subtable pointer. By making them compile time, we enable the compiler to
+ * unroll loops and flatten out code-sequences based on the knowledge of the
+ * mf_bits_* compile time values. This results in improved performance.
+ */
+static inline uint32_t ALWAYS_INLINE
+lookup_generic_impl(struct dpcls_subtable *subtable,
+ uint64_t *blocks_scratch,
+ uint32_t keys_map,
+ const struct netdev_flow_key *keys[],
+ struct dpcls_rule **rules,
+ const uint32_t bit_count_u0,
+ const uint32_t bit_count_u1)
+{
+ const uint32_t n_pkts = count_1bits(keys_map);
+ ovs_assert(NETDEV_MAX_BURST >= n_pkts);
+ uint32_t hashes[NETDEV_MAX_BURST];
+
+ const uint32_t bit_count_total = bit_count_u0 + bit_count_u1;
+ uint64_t *mf_masks = subtable->mf_masks;
+ int i;
+
+ /* Flatten the packet metadata into the blocks_scratch[] using subtable */
+ ULLONG_FOR_EACH_1(i, keys_map) {
+ netdev_flow_key_flatten(keys[i],
+ &subtable->mask,
+ mf_masks,
+ &blocks_scratch[i * bit_count_total],
+ bit_count_u0,
+ bit_count_u1);
+ }
+
+ /* Hash the now linearized blocks of packet metadata */
+ ULLONG_FOR_EACH_1(i, keys_map) {
+ uint32_t hash = 0;
+ uint32_t i_off = i * bit_count_total;
+ for (int h = 0; h < bit_count_total; h++) {
+ hash = hash_add64(hash, blocks_scratch[i_off + h]);
+ }
+ hashes[i] = hash_finish(hash, bit_count_total * 8);
+ }
+
+ /* Lookup: this returns a bitmask of packets where the hash table had
+ * an entry for the given hash key. Presence of a hash key does not
+ * guarantee matching the key, as there can be hash collisions.
+ */
+ uint32_t found_map;
+ const struct cmap_node *nodes[NETDEV_MAX_BURST];
+ found_map = cmap_find_batch(&subtable->rules, keys_map, hashes, nodes);
+
+ /* Verify that packet actually matched rule. If not found, a hash
+ * collision has taken place, so continue searching with the next node.
+ */
+ ULLONG_FOR_EACH_1(i, found_map) {
+ struct dpcls_rule *rule;
+
+ CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
+ const uint32_t cidx = i * bit_count_total;
+ uint32_t match = netdev_rule_matches_key(rule, bit_count_total,
+ &blocks_scratch[cidx]);
+
+ if (OVS_LIKELY(match)) {
+ rules[i] = rule;
+ subtable->hit_cnt++;
+ goto next;
+ }
+ }
+
+ /* None of the found rules was a match. Clear the i-th bit to
+ * search for this key in the next subtable. */
+ ULLONG_SET0(found_map, i);
+ next:
+ ; /* Keep Sparse happy. */
+ }
+
+ return found_map;
+}
+
+/* Generic - use runtime provided mf bits */
uint32_t
dpcls_subtable_lookup_generic(struct dpcls_subtable *subtable,
+ uint64_t *blocks_scratch,
uint32_t keys_map,
const struct netdev_flow_key *keys[],
struct dpcls_rule **rules)
{
- int i;
- /* Compute hashes for the remaining keys. Each search-key is
- * masked with the subtable's mask to avoid hashing the wildcarded
- * bits. */
- uint32_t hashes[NETDEV_MAX_BURST];
- ULLONG_FOR_EACH_1(i, keys_map) {
- hashes[i] = netdev_flow_key_hash_in_mask(keys[i],
- &subtable->mask);
- }
-
- /* Lookup. */
- const struct cmap_node *nodes[NETDEV_MAX_BURST];
- uint32_t found_map =
- cmap_find_batch(&subtable->rules, keys_map, hashes, nodes);
- /* Check results. When the i-th bit of found_map is set, it means
- * that a set of nodes with a matching hash value was found for the
- * i-th search-key. Due to possible hash collisions we need to check
- * which of the found rules, if any, really matches our masked
- * search-key. */
- ULLONG_FOR_EACH_1(i, found_map) {
- struct dpcls_rule *rule;
-
- CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
- if (OVS_LIKELY(dpcls_rule_matches_key(rule, keys[i]))) {
- rules[i] = rule;
- /* Even at 20 Mpps the 32-bit hit_cnt cannot wrap
- * within one second optimization interval. */
- subtable->hit_cnt++;
- goto next;
- }
- }
- /* None of the found rules was a match. Reset the i-th bit to
- * keep searching this key in the next subtable. */
- ULLONG_SET0(found_map, i); /* Did not match. */
- next:
- ; /* Keep Sparse happy. */
- }
-
- return found_map;
+ return lookup_generic_impl(subtable, blocks_scratch, keys_map, keys,
+ rules, subtable->mf_bits_set_unit0,
+ subtable->mf_bits_set_unit1);
}
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 0eb34d9d7..33b93cfdf 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -232,6 +232,15 @@ struct dpcls {
odp_port_t in_port;
struct cmap subtables_map;
struct pvector subtables;
+
+ /* Region of memory for this DPCLS instance to use as scratch.
+ * Size is guaranteed to be large enough to hold all blocks required for
+ * the subtables to match on. This allows each dpcls lookup to flatten
+ * the packet miniflows into this blocks_scratch area, without using
+ * variable length arrays. This region is allocated on subtable create, and
+ * will be resized as required if a larger subtable is added. */
+ uint64_t *blocks_scratch;
+ uint32_t blocks_scratch_size;
};
/* Data structure to keep packet order till fastpath processing. */
@@ -7536,6 +7545,7 @@ static void
dpcls_subtable_destroy_cb(struct dpcls_subtable *subtable)
{
cmap_destroy(&subtable->rules);
+ ovsrcu_postpone(free, subtable->mf_masks);
ovsrcu_postpone(free, subtable);
}
@@ -7546,6 +7556,8 @@ dpcls_init(struct dpcls *cls)
{
cmap_init(&cls->subtables_map);
pvector_init(&cls->subtables);
+ cls->blocks_scratch = 0;
+ cls->blocks_scratch_size = 0;
}
static void
@@ -7573,6 +7585,7 @@ dpcls_destroy(struct dpcls *cls)
}
cmap_destroy(&cls->subtables_map);
pvector_destroy(&cls->subtables);
+ free(cls->blocks_scratch);
}
}
@@ -7588,7 +7601,29 @@ dpcls_create_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
subtable->hit_cnt = 0;
netdev_flow_key_clone(&subtable->mask, mask);
- /* decide which hash/lookup/verify function to use */
+ /* The count of bits in the mask defines the space required for masks.
+ * Then call gen_masks() to create the appropriate masks, avoiding the cost
+ * of doing runtime calculations */
+ uint32_t unit0 = count_1bits(mask->mf.map.bits[0]);
+ uint32_t unit1 = count_1bits(mask->mf.map.bits[1]);
+ subtable->mf_bits_set_unit0 = unit0;
+ subtable->mf_bits_set_unit1 = unit1;
+
+ subtable->mf_masks = xmalloc(sizeof(uint64_t) * (unit0 + unit1));
+ netdev_flow_key_gen_masks(mask, subtable->mf_masks, unit0, unit1);
+
+
+ /* allocate blocks scratch space only if subtable requires more size than
+ * is currently allocated */
+ const uint32_t blocks_required_per_pkt = unit0 + unit1;
+ if (cls->blocks_scratch_size < blocks_required_per_pkt) {
+ free(cls->blocks_scratch);
+ cls->blocks_scratch = xmalloc(sizeof(uint64_t) * NETDEV_MAX_BURST *
+ blocks_required_per_pkt);
+ cls->blocks_scratch_size = blocks_required_per_pkt;
+ }
+
+ /* Assign the generic lookup - this works with any miniflow fingerprint */
subtable->lookup_func = dpcls_subtable_lookup_generic;
cmap_insert(&cls->subtables_map, &subtable->cmap_node, mask->hash);
@@ -7733,6 +7768,43 @@ dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule)
}
}
+/* inner loop for mask generation of a unit, see netdev_flow_key_gen_masks */
+static inline void
+netdev_flow_key_gen_mask_unit(uint64_t iter,
+ const uint64_t count,
+ uint64_t *mf_masks)
+{
+ int i;
+ for (i = 0; i < count; i++) {
+ uint64_t lowest_bit = (iter & -iter);
+ iter &= ~lowest_bit;
+ mf_masks[i] = (lowest_bit - 1);
+ }
+ /* checks that count has covered all bits in the iter bitmap */
+ ovs_assert(iter == 0);
+}
+
+/* generate a mask for each block in the miniflow, based on the bits set. This
+ * allows easily masking packets with the generated array here, without
+ * calculations. This replaces runtime-calculating the masks.
+ * @param key The table to generate the mf_masks for
+ * @param mf_masks Pointer to a u64 array of at least *mf_bits* in size
+ * @param mf_bits_total Number of bits set in the whole miniflow (both units)
+ * @param mf_bits_unit0 Number of bits set in unit0 of the miniflow
+ */
+void
+netdev_flow_key_gen_masks(const struct netdev_flow_key *tbl,
+ uint64_t *mf_masks,
+ const uint32_t mf_bits_u0,
+ const uint32_t mf_bits_u1)
+{
+ uint64_t iter_u0 = tbl->mf.map.bits[0];
+ uint64_t iter_u1 = tbl->mf.map.bits[1];
+
+ netdev_flow_key_gen_mask_unit(iter_u0, mf_bits_u0, &mf_masks[0]);
+ netdev_flow_key_gen_mask_unit(iter_u1, mf_bits_u1, &mf_masks[mf_bits_u0]);
+}
+
/* Returns true if 'target' satisfies 'key' in 'mask', that is, if each 1-bit
* in 'mask' the values in 'key' and 'target' are the same. */
bool
@@ -7773,6 +7845,7 @@ dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key *keys[],
BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST);
struct dpcls_subtable *subtable;
+ uint64_t *blocks_scratch = cls->blocks_scratch;
uint32_t keys_map = TYPE_MAXIMUM(uint32_t); /* Set all bits. */
@@ -7792,8 +7865,8 @@ dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key *keys[],
* non-overlapping. */
PVECTOR_FOR_EACH (subtable, &cls->subtables) {
/* call the subtable specific lookup function */
- uint32_t found_map = subtable->lookup_func(subtable, keys_map,
- keys, rules);
+ uint32_t found_map = subtable->lookup_func(subtable, blocks_scratch,
+ keys_map, keys, rules);
/* Count the number of subtables searched for this packet match. This
* estimates the "spread" of subtables looked at per matched packet */
diff --git a/lib/dpif-netdev.h b/lib/dpif-netdev.h
index 27145d721..9263256a9 100644
--- a/lib/dpif-netdev.h
+++ b/lib/dpif-netdev.h
@@ -66,12 +66,14 @@ struct dpcls_rule {
* CPU instruction set available at runtime.
*/
typedef uint32_t (*dpcls_subtable_lookup_func)(struct dpcls_subtable *subtable,
- uint32_t keys_map, const struct netdev_flow_key *keys[],
+ uint64_t *blocks_scratch, uint32_t keys_map,
+ const struct netdev_flow_key *keys[],
struct dpcls_rule **rules);
/* Prototype for generic lookup func, using same code path as before */
uint32_t
dpcls_subtable_lookup_generic(struct dpcls_subtable *subtable,
+ uint64_t *blocks_scratch,
uint32_t keys_map,
const struct netdev_flow_key *keys[],
struct dpcls_rule **rules);
@@ -92,8 +94,18 @@ struct dpcls_subtable {
* subtable matches on. The miniflow "bits" are used to select the actual
* dpcls lookup implementation at subtable creation time.
*/
+ uint8_t mf_bits_set_unit0;
+ uint8_t mf_bits_set_unit1;
+
+ /* the lookup function to use for this subtable. If there is a known
+ * property of the subtable (eg: only 3 bits of miniflow metadata is
+ * used for the lookup) then this can point at an optimized version of
+ * the lookup function for this particular subtable. */
dpcls_subtable_lookup_func lookup_func;
+ /* caches the masks to match a packet to, reducing runtime calculations */
+ uint64_t *mf_masks;
+
struct netdev_flow_key mask; /* Wildcards for fields (const). */
/* 'mask' must be the last field, additional space is allocated here. */
};
@@ -102,6 +114,12 @@ struct dpcls_subtable {
#define NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(VALUE, KEY, FLOWMAP) \
MINIFLOW_FOR_EACH_IN_FLOWMAP (VALUE, &(KEY)->mf, FLOWMAP)
+void
+netdev_flow_key_gen_masks(const struct netdev_flow_key *tbl,
+ uint64_t *mf_masks,
+ const uint32_t mf_bits_u0,
+ const uint32_t mf_bits_u1);
+
bool dpcls_rule_matches_key(const struct dpcls_rule *rule,
const struct netdev_flow_key *target);
--
2.17.1
More information about the dev
mailing list