[ovs-dev] [RFC Patch] dpif-netdev: Sorted subtable vectors per in_port in dpcls

Jan Scheurich jan.scheurich at ericsson.com
Thu Jun 16 13:56:00 UTC 2016


The user-space datapath (dpif-netdev) consists of a first level "exact match cache" (EMC) matching on 5-tuples and the normal megaflow classifier. With many parallel packet flows (e.g. TCP connections) the EMC becomes inefficient and the OVS forwarding performance is determined by the megaflow classifier.

The megaflow classifier (dpcls) consists of a variable number of hash tables (aka subtables), each containing megaflow entries with the same mask of packet header and metadata fields to match upon. A dpcls lookup matches a given packet against all subtables in sequence until it hits a match. As megaflow cache entries are by construction non-overlapping, the first match is the only match.

Today the order of the subtables in the dpcls is essentially random so that on average a dpcls lookup has to visit N/2 subtables for a hit, when N is the total number of subtables. Even though every single hash-table lookup is fast, the performance of the current dpcls degrades when there are many subtables.

How does the patch address this issue:

In reality there is often a strong correlation between the ingress port and a small subset of subtables that have hits. The entire megaflow cache typically decomposes nicely into partitions that are hit only by packets entering from a range of similar ports (e.g. traffic from Phy  -> VM vs. traffic from VM -> Phy). 

Therefore, keeping a separate list of subtables per ingress port, sorted by frequency of hits, reduces the average number of subtables lookups in the dpcls to a minimum, even if the total number of subtables gets large. 

The patch introduces 32 subtable vectors per dpcls and hashes the ingress port to select the subtable vector. The patch also counts matches per 32 slots in each vector (hashing the subtable pointer to obtain the slot) and sorts the vectors according to match frequency every second.

To monitor the effectiveness of the patch we have enhanced the ovs-appctl dpif-netdev/pmd-stats-show command with an extra line "avg. subtable lookups per hit" to report the average number of subtable lookups needed for a megaflow match. Ideally, this should be close to 1 and much smaller than N/2.

I have benchmarked a cloud L3 overlay pipeline with a VXLAN overlay mesh. With pure L3 tenant traffic between VMs on different nodes the resulting netdev dpcls contains N=4 subtables.

Disabling the EMC, I have measured a baseline performance (in+out) of ~1.32 Mpps (64 bytes, 1000 L4 flows). The average number of subtable lookups per dpcls match is 2.5.

With the patch the average number of subtable lookups per dpcls match goes down to 1.25 (apparently there are still two ports of different nature hashed to the same vector, otherwise it should be exactly one). Even so, the forwarding performance grows by ~30% to 1.72 Mpps.

As the number of subtables will often be higher in reality, we can assume that this is at the lower end of the speed-up one can expect from this optimization. Just running a parallel ping between the VXLAN tunnel endpoints increases the number of subtables and hence the average number of subtable lookups from 2.5 to 3.5 with a corresponding decrease of throughput to 1.14 Mpps. With the patch the parallel ping has no impact on average number of subtable lookups and performance. The performance gain is then ~50%.

Signed-off-by: Jan Scheurich <jan.scheurich at ericsson.com>

-----

 lib/dpif-netdev.c | 110 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 92 insertions(+), 18 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 8d39d9e..c491110 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -149,9 +149,15 @@ struct emc_cache {
 
 /* Simple non-wildcarding single-priority classifier. */

+#define NUM_SUBTABLE_VECTORS 32  /* Per-dpcls subtable vectors; in_port hashes to one. */
+#define NUM_SUBTABLE_SLOTS 32    /* Hit-count slots per vector; subtable ptr hashes to one. */
+#define DPCLS_OPTIMIATION_INTERVAL 1000  /* Re-sort interval in msec. NOTE(review): misspelled, s/OPTIMIATION/OPTIMIZATION/. */
+
 struct dpcls {
     struct cmap subtables_map;
-    struct pvector subtables;
+    struct pvector subtables[NUM_SUBTABLE_VECTORS];  /* One sorted subtable vector per in_port hash bucket. */
+    uint64_t hit_cnt[NUM_SUBTABLE_VECTORS][NUM_SUBTABLE_SLOTS];  /* Match counts per vector/slot for the current interval; slot collisions merge counters. */
+    long long int next_optimization;  /* Time (msec) when the vectors are next re-sorted. */
 };

 /* A rule to be inserted to the classifier. */
@@ -164,12 +170,14 @@ struct dpcls_rule {

 static void dpcls_init(struct dpcls *);
 static void dpcls_destroy(struct dpcls *);
+static void dpcls_try_optimize(struct dpcls *);
 static void dpcls_insert(struct dpcls *, struct dpcls_rule *,
                          const struct netdev_flow_key *mask);
 static void dpcls_remove(struct dpcls *, struct dpcls_rule *);
-static bool dpcls_lookup(const struct dpcls *cls,
+static bool dpcls_lookup(struct dpcls *cls,
                          const struct netdev_flow_key keys[],
-                         struct dpcls_rule **rules, size_t cnt);
+                         struct dpcls_rule **rules, size_t cnt,
+                         const odp_port_t port_no, int *num_lookups_p);
 
 /* Datapath based on the network device interface from netdev.h.
  *
@@ -239,6 +247,7 @@ enum dp_stat_type {
     DP_STAT_MASKED_HIT,         /* Packets that matched in the flow table. */
     DP_STAT_MISS,               /* Packets that did not match. */
     DP_STAT_LOST,               /* Packets not passed up to the client. */
+    DP_STATS_LOOKUP_HIT,        /* Number of subtable lookups for flow table hits */
     DP_N_STATS
 };

@@ -657,8 +666,10 @@ pmd_info_show_stats(struct ds *reply,

     ds_put_format(reply,
                   "\temc hits:%llu\n\tmegaflow hits:%llu\n"
+                  "\tavg. subtable lookups per hit:%5.2f\n"
                   "\tmiss:%llu\n\tlost:%llu\n",
                   stats[DP_STAT_EXACT_HIT], stats[DP_STAT_MASKED_HIT],
+                  stats[DP_STAT_MASKED_HIT] > 0 ? (1.0*stats[DP_STATS_LOOKUP_HIT])/stats[DP_STAT_MASKED_HIT] : 0,
                   stats[DP_STAT_MISS], stats[DP_STAT_LOST]);

     if (total_cycles == 0) {
@@ -1855,13 +1866,13 @@ emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key)
 }

 static struct dp_netdev_flow *
-dp_netdev_pmd_lookup_flow(const struct dp_netdev_pmd_thread *pmd,
+dp_netdev_pmd_lookup_flow(struct dp_netdev_pmd_thread *pmd,
                           const struct netdev_flow_key *key)
 {
     struct dp_netdev_flow *netdev_flow;
     struct dpcls_rule *rule;

-    dpcls_lookup(&pmd->cls, key, &rule, 1);
+    dpcls_lookup(&pmd->cls, key, &rule, 1, 0, 0);  /* Use 0 as dummy in_port */
     netdev_flow = dp_netdev_flow_cast(rule);

     return netdev_flow;
@@ -2874,6 +2885,7 @@ reload:

             emc_cache_slow_sweep(&pmd->flow_cache);
             coverage_try_clear();
+            dpcls_try_optimize(&pmd->cls);
             ovsrcu_quiesce();

             atomic_read_relaxed(&pmd->change_seq, &seq);
@@ -3814,7 +3826,8 @@ static inline void
 fast_path_processing(struct dp_netdev_pmd_thread *pmd,
                      struct dp_packet_batch *packets_,
                      struct netdev_flow_key *keys,
-                     struct packet_batch_per_flow batches[], size_t *n_batches)
+                     struct packet_batch_per_flow batches[], size_t *n_batches,
+                     odp_port_t port_no)
 {
     int cnt = packets_->count;
 #if !defined(__CHECKER__) && !defined(_WIN32)
@@ -3827,7 +3840,7 @@ fast_path_processing(struct dp_netdev_pmd_thread *pmd,
     struct dpcls_rule *rules[PKT_ARRAY_SIZE];
     struct dp_netdev *dp = pmd->dp;
     struct emc_cache *flow_cache = &pmd->flow_cache;
-    int miss_cnt = 0, lost_cnt = 0;
+    int miss_cnt = 0, lost_cnt = 0, lookup_cnt;
     bool any_miss;
     size_t i;

@@ -3835,7 +3848,7 @@ fast_path_processing(struct dp_netdev_pmd_thread *pmd,
         /* Key length is needed in all the cases, hash computed on demand. */
         keys[i].len = netdev_flow_key_size(miniflow_n_values(&keys[i].mf));
     }
-    any_miss = !dpcls_lookup(&pmd->cls, keys, rules, cnt);
+    any_miss = !dpcls_lookup(&pmd->cls, keys, rules, cnt, port_no, &lookup_cnt);
     if (OVS_UNLIKELY(any_miss) && !fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
         uint64_t actions_stub[512 / 8], slow_stub[512 / 8];
         struct ofpbuf actions, put_actions;
@@ -3893,6 +3906,7 @@ fast_path_processing(struct dp_netdev_pmd_thread *pmd,
     }

     dp_netdev_count_packet(pmd, DP_STAT_MASKED_HIT, cnt - miss_cnt);
+    dp_netdev_count_packet(pmd, DP_STATS_LOOKUP_HIT, lookup_cnt);
     dp_netdev_count_packet(pmd, DP_STAT_MISS, miss_cnt);
     dp_netdev_count_packet(pmd, DP_STAT_LOST, lost_cnt);
 }
@@ -3925,7 +3939,7 @@ dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
                             md_is_valid, port_no);
     if (OVS_UNLIKELY(newcnt)) {
         packets->count = newcnt;
-        fast_path_processing(pmd, packets, keys, batches, &n_batches);
+        fast_path_processing(pmd, packets, keys, batches, &n_batches, port_no);
     }

     for (i = 0; i < n_batches; i++) {
@@ -4354,14 +4368,24 @@ struct dpcls_subtable {
 static void
 dpcls_init(struct dpcls *cls)
 {
+    int i;
+
     cmap_init(&cls->subtables_map);
-    pvector_init(&cls->subtables);
+    for (i=0; i<NUM_SUBTABLE_VECTORS; i++) {  /* One pvector per in_port hash bucket. */
+        pvector_init(&cls->subtables[i]);
+    }
+    memset(cls->hit_cnt, 0, sizeof cls->hit_cnt);  /* Clear per-interval hit counters. */
+    cls->next_optimization = time_msec() + DPCLS_OPTIMIATION_INTERVAL;  /* Schedule the first re-sort. */
 }

 static void
 dpcls_destroy_subtable(struct dpcls *cls, struct dpcls_subtable *subtable)
 {
-    pvector_remove(&cls->subtables, subtable);
+    int i;
+
+    for (i=0; i<NUM_SUBTABLE_VECTORS; i++) {
+        pvector_remove(&cls->subtables[i], subtable);
+    }
     cmap_remove(&cls->subtables_map, &subtable->cmap_node,
                 subtable->mask.hash);
     cmap_destroy(&subtable->rules);
@@ -4376,13 +4400,16 @@ dpcls_destroy(struct dpcls *cls)
 {
     if (cls) {
         struct dpcls_subtable *subtable;
+        int i;

         CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) {
             ovs_assert(cmap_count(&subtable->rules) == 0);
             dpcls_destroy_subtable(cls, subtable);
         }
         cmap_destroy(&cls->subtables_map);
-        pvector_destroy(&cls->subtables);
+        for (i=0; i<NUM_SUBTABLE_VECTORS; i++) {
+            pvector_destroy(&cls->subtables[i]);
+        }
     }
 }

@@ -4390,6 +4417,7 @@ static struct dpcls_subtable *
 dpcls_create_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
 {
     struct dpcls_subtable *subtable;
+    int i;

     /* Need to add one. */
     subtable = xmalloc(sizeof *subtable
@@ -4397,8 +4425,11 @@ dpcls_create_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
     cmap_init(&subtable->rules);
     netdev_flow_key_clone(&subtable->mask, mask);
     cmap_insert(&cls->subtables_map, &subtable->cmap_node, mask->hash);
-    pvector_insert(&cls->subtables, subtable, 0);
-    pvector_publish(&cls->subtables);
+    for (i=0; i<NUM_SUBTABLE_VECTORS; i++) {
+        /* Insert new subtable with priority zero (no hits) */
+        pvector_insert(&cls->subtables[i], subtable, 0);
+        pvector_publish(&cls->subtables[i]);
+    }

     return subtable;
 }
@@ -4417,6 +4448,33 @@ dpcls_find_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
     return dpcls_create_subtable(cls, mask);
 }

+
+/* Re-sorts every per-port subtable vector by the hit counts gathered since the last interval, so hot subtables are probed first by dpcls_lookup(). */
+static inline void
+dpcls_try_optimize(struct dpcls *cls)
+{
+    long long int now = time_msec();
+
+    if (now > cls->next_optimization) {
+        struct dpcls_subtable *subtable;
+        int port_idx, subtable_idx;
+
+        for (port_idx=0; port_idx<NUM_SUBTABLE_VECTORS; port_idx++) {
+            struct pvector *pvec = &cls->subtables[port_idx];
+            PVECTOR_FOR_EACH (subtable, pvec) {
+                subtable_idx = hash_pointer(subtable, 210365) % NUM_SUBTABLE_SLOTS;  /* Same seed as the hit accounting in dpcls_lookup(). */
+                pvector_change_priority(pvec, subtable,
+                        cls->hit_cnt[port_idx][subtable_idx]);  /* Priority = hit count, so hotter subtables sort first. */
+            }
+            pvector_publish(pvec);  /* Expose the re-sorted vector to concurrent lookups. */
+        }
+
+        /* Start a new measuring interval. */
+        memset(cls->hit_cnt, 0, sizeof cls->hit_cnt);
+        cls->next_optimization = now + DPCLS_OPTIMIATION_INTERVAL;
+    }
+}
+
 /* Insert 'rule' into 'cls'. */
 static void
 dpcls_insert(struct dpcls *cls, struct dpcls_rule *rule,
@@ -4433,6 +4491,7 @@ static void
 dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule)
 {
     struct dpcls_subtable *subtable;
+    int i;

     ovs_assert(rule->mask);

@@ -4441,7 +4500,9 @@ dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule)
     if (cmap_remove(&subtable->rules, &rule->cmap_node, rule->flow.hash)
         == 0) {
         dpcls_destroy_subtable(cls, subtable);
-        pvector_publish(&cls->subtables);
+        for (i=0; i<NUM_SUBTABLE_VECTORS; i++) {
+            pvector_publish(&cls->subtables[i]);
+        }
     }
 }

@@ -4474,8 +4535,9 @@ dpcls_rule_matches_key(const struct dpcls_rule *rule,
  *
  * Returns true if all flows found a corresponding rule. */
 static bool
-dpcls_lookup(const struct dpcls *cls, const struct netdev_flow_key keys[],
-             struct dpcls_rule **rules, const size_t cnt)
+dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key keys[],
+             struct dpcls_rule **rules, const size_t cnt,
+             const odp_port_t port_no, int *num_lookups_p)
 {
     /* The batch size 16 was experimentally found faster than 8 or 32. */
     typedef uint16_t map_type;
@@ -4495,7 +4557,12 @@ dpcls_lookup(const struct dpcls *cls, const struct netdev_flow_key keys[],
     }
     memset(rules, 0, cnt * sizeof *rules);

-    PVECTOR_FOR_EACH (subtable, &cls->subtables) {
+    /* Select the subtable vector for the in_port */
+    uint32_t port_idx = hash_port_no(port_no) % NUM_SUBTABLE_VECTORS;
+    uint64_t *hit_cnt = cls->hit_cnt[port_idx];
+    int lookups_match = 0, subtable_pos = 1;
+
+    PVECTOR_FOR_EACH (subtable, &cls->subtables[port_idx]) {
         const struct netdev_flow_key *mkeys = keys;
         struct dpcls_rule **mrules = rules;
         map_type remains = 0;
@@ -4527,6 +4594,10 @@ dpcls_lookup(const struct dpcls *cls, const struct netdev_flow_key keys[],
                 CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
                     if (OVS_LIKELY(dpcls_rule_matches_key(rule, &mkeys[i]))) {
                         mrules[i] = rule;
+                        /* Update the subtable hit statistics for the in_port vector */
+                        int subtable_idx = hash_pointer(subtable, 210365) % NUM_SUBTABLE_SLOTS;
+                        hit_cnt[subtable_idx]++;
+                        lookups_match += subtable_pos;
                         goto next;
                     }
                 }
@@ -4538,8 +4609,11 @@ dpcls_lookup(const struct dpcls *cls, const struct netdev_flow_key keys[],
             remains |= maps[m];
         }
         if (!remains) {
+            if (num_lookups_p) *num_lookups_p = lookups_match;
             return true;              /* All found. */
         }
+        subtable_pos++;
     }
+    if (num_lookups_p) *num_lookups_p = lookups_match;
     return false;                     /* Some misses. */
 }




More information about the dev mailing list