[ovs-dev] [PATCH 4/4] lib/pvector: Non-intrusive RCU priority vector.

Thu May 29 23:37:17 UTC 2014

Factor out the priority vector code from the classifier.

Making the classifier use RCU instead of locking requires parallel
access to the priority vector, pointing to subtables in descending
priority order.  When a new subtable is added, a new copy of the
priority vector is allocated, while the current readers can keep on
using the old copy they started with.  Adding and removing subtables
is usually less frequent than adding and removing rules, so this
should not have a visible performance implication.  As on optimization
for the userspace datapath use, where all the subtables have the same
priority, new subtables can be added to the end of the vector without
reallocation and without disturbing readers.

cls_subtables_reset() is now removed, as it served it's purpose in bug
hunting.  Checks on the new pvector are now incorporated into
tests/test-classifier.c.

Signed-off-by: Jarno Rajahalme <jrajahalme at nicira.com>Summary:
Signed-off-by: Jarno Rajahalme <jrajahalme at nicira.com>
---
 lib/automake.mk         |    2 +
 lib/classifier.c        |  445 +++++++----------------------------------------
 lib/pvector.c           |  213 +++++++++++++++++++++++
 lib/pvector.h           |  196 +++++++++++++++++++++
 tests/test-classifier.c |   34 ++++
 5 files changed, 508 insertions(+), 382 deletions(-)
 create mode 100644 lib/pvector.c
 create mode 100644 lib/pvector.h

diff --git a/lib/automake.mk b/lib/automake.mk
index dc2ca0e..ac7185a 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -173,6 +173,8 @@ lib_libopenvswitch_la_SOURCES = \
 	lib/poll-loop.h \
 	lib/process.c \
 	lib/process.h \
+	lib/pvector.c \
+	lib/pvector.h \
 	lib/random.c \
 	lib/random.h \
 	lib/rconn.c \
diff --git a/lib/classifier.c b/lib/classifier.c
index 7bad3ac..f448ded 100644
--- a/lib/classifier.c
+++ b/lib/classifier.c
@@ -29,6 +29,7 @@
 #include "ofp-util.h"
 #include "ovs-thread.h"
 #include "packets.h"
+#include "pvector.h"
 #include "tag.h"
 #include "util.h"
 #include "vlog.h"
@@ -48,18 +49,6 @@ struct cls_trie {
     struct trie_node *root;       /* NULL if none. */
 };
 
-struct cls_subtable_entry {
-    struct cls_subtable *subtable;
-    tag_type tag;
-    unsigned int max_priority;
-};
-
-struct cls_subtables {
-    size_t count;          /* One past last valid array element. */
-    size_t alloc_size;     /* Number of allocated elements. */
-    struct cls_subtable_entry *array;
-};
-
 enum {
     CLS_MAX_INDICES = 3   /* Maximum number of lookup indices per subtable. */
 };
@@ -70,7 +59,7 @@ struct cls_classifier {
     uint8_t flow_segments[CLS_MAX_INDICES]; /* Flow segment boundaries to use
                                              * for staged lookup. */
     struct hmap subtables_map;      /* Contains "struct cls_subtable"s.  */
-    struct cls_subtables subtables;
+    struct pvector subtables;
     struct hmap partitions;         /* Contains "struct cls_partition"s. */
     struct cls_trie tries[CLS_MAX_TRIES]; /* Prefix tries. */
     unsigned int n_tries;
@@ -143,13 +132,6 @@ static struct cls_subtable *insert_subtable(struct cls_classifier *,
 
 static void destroy_subtable(struct cls_classifier *, struct cls_subtable *);
 
-static void update_subtables_after_insertion(struct cls_classifier *,
-                                             struct cls_subtable *,
-                                             unsigned int new_priority);
-static void update_subtables_after_removal(struct cls_classifier *,
-                                           struct cls_subtable *,
-                                           unsigned int del_priority);
-
 static struct cls_match *find_match_wc(const struct cls_subtable *,
                                        const struct flow *, struct trie_ctx *,
                                        unsigned int n_tries,
@@ -190,200 +172,6 @@ static void mask_set_prefix_bits(struct flow_wildcards *, uint8_t be32ofs,
                                  unsigned int nbits);
 static bool mask_prefix_bits_set(const struct flow_wildcards *,
                                  uint8_t be32ofs, unsigned int nbits);
-
-static void
-cls_subtables_init(struct cls_subtables *subtables)
-{
-    memset(subtables, 0, sizeof *subtables);
-}
-
-static void
-cls_subtables_destroy(struct cls_subtables *subtables)
-{
-    free(subtables->array);
-    memset(subtables, 0, sizeof *subtables);
-}
-
-/* Subtables insertion. */
-static void
-cls_subtables_push_back(struct cls_subtables *subtables,
-                        struct cls_subtable_entry a)
-{
-    if (subtables->count == subtables->alloc_size) {
-        subtables->array = x2nrealloc(subtables->array, &subtables->alloc_size,
-                                      sizeof a);
-    }
-
-    subtables->array[subtables->count++] = a;
-}
-
-/* Move subtable entry at 'from' to 'to', shifting the elements in between
- * (including the one at 'to') accordingly. */
-static inline void
-cls_subtables_move(struct cls_subtable_entry *to,
-                   struct cls_subtable_entry *from)
-{
-    if (to != from) {
-        struct cls_subtable_entry temp = *from;
-
-        if (to > from) {
-            /* Shift entries (from,to] backwards to make space at 'to'. */
-            memmove(from, from + 1, (to - from) * sizeof *to);
-        } else {
-            /* Shift entries [to,from) forward to make space at 'to'. */
-            memmove(to + 1, to, (from - to) * sizeof *to);
-        }
-
-        *to = temp;
-    }
-}
-
-/* Subtables removal. */
-static inline void
-cls_subtables_remove(struct cls_subtables *subtables,
-                     struct cls_subtable_entry *elem)
-{
-    ssize_t size = (&subtables->array[subtables->count]
-                    - (elem + 1)) * sizeof *elem;
-    if (size > 0) {
-        memmove(elem, elem + 1, size);
-    }
-    subtables->count--;
-}
-
-#define CLS_SUBTABLES_FOR_EACH(SUBTABLE, ITER, SUBTABLES)  \
-    for ((ITER) = (SUBTABLES)->array;                      \
-         (ITER) < &(SUBTABLES)->array[(SUBTABLES)->count]  \
-             && OVS_LIKELY((SUBTABLE) = (ITER)->subtable); \
-         ++(ITER))
-#define CLS_SUBTABLES_FOR_EACH_CONTINUE(SUBTABLE, ITER, SUBTABLES) \
-    for (++(ITER);                                                 \
-         (ITER) < &(SUBTABLES)->array[(SUBTABLES)->count]          \
-             && OVS_LIKELY((SUBTABLE) = (ITER)->subtable);         \
-         ++(ITER))
-#define CLS_SUBTABLES_FOR_EACH_REVERSE(SUBTABLE, ITER, SUBTABLES)  \
-    for ((ITER) = &(SUBTABLES)->array[(SUBTABLES)->count];         \
-         (ITER) > (SUBTABLES)->array                               \
-             && OVS_LIKELY((SUBTABLE) = (--(ITER))->subtable);)
-
-static void
-cls_subtables_verify(struct cls_subtables *subtables)
-{
-    struct cls_subtable *table;
-    struct cls_subtable_entry *iter;
-    unsigned int priority = 0;
-
-    CLS_SUBTABLES_FOR_EACH_REVERSE (table, iter, subtables) {
-        if (iter->max_priority != table->max_priority) {
-            VLOG_WARN("Subtable %p has mismatching priority in cache (%u != %u)",
-                      table, iter->max_priority, table->max_priority);
-        }
-        if (iter->max_priority < priority) {
-            VLOG_WARN("Subtable cache is out of order (%u < %u)",
-                      iter->max_priority, priority);
-        }
-        priority = iter->max_priority;
-    }
-}
-
-static void
-cls_subtables_reset(struct cls_classifier *cls)
-{
-    struct cls_subtables old = cls->subtables;
-    struct cls_subtable *subtable;
-
-    VLOG_WARN("Resetting subtable cache.");
-
-    cls_subtables_verify(&cls->subtables);
-
-    cls_subtables_init(&cls->subtables);
-
-    HMAP_FOR_EACH (subtable, hmap_node, &cls->subtables_map) {
-        struct cls_match *head;
-        struct cls_subtable_entry elem;
-        struct cls_subtable *table;
-        struct cls_subtable_entry *iter, *from = NULL;
-        unsigned int new_max = 0;
-        unsigned int max_count = 0;
-        bool found;
-
-        /* Verify max_priority. */
-        HMAP_FOR_EACH (head, hmap_node, &subtable->rules) {
-            if (head->priority > new_max) {
-                new_max = head->priority;
-                max_count = 1;
-            } else if (head->priority == new_max) {
-                max_count++;
-            }
-        }
-        if (new_max != subtable->max_priority ||
-            max_count != subtable->max_count) {
-            VLOG_WARN("subtable %p (%u rules) has mismatching max_priority "
-                      "(%u) or max_count (%u). Highest priority found was %u, "
-                      "count: %u",
-                      subtable, subtable->n_rules, subtable->max_priority,
-                      subtable->max_count, new_max, max_count);
-            subtable->max_priority = new_max;
-            subtable->max_count = max_count;
-        }
-
-        /* Locate the subtable from the old cache. */
-        found = false;
-        CLS_SUBTABLES_FOR_EACH (table, iter, &old) {
-            if (table == subtable) {
-                if (iter->max_priority != new_max) {
-                    VLOG_WARN("Subtable %p has wrong max priority (%u != %u) "
-                              "in the old cache.",
-                              subtable, iter->max_priority, new_max);
-                }
-                if (found) {
-                    VLOG_WARN("Subtable %p duplicated in the old cache.",
-                              subtable);
-                }
-                found = true;
-            }
-        }
-        if (!found) {
-            VLOG_WARN("Subtable %p not found from the old cache.", subtable);
-        }
-
-        elem.subtable = subtable;
-        elem.tag = subtable->tag;
-        elem.max_priority = subtable->max_priority;
-        cls_subtables_push_back(&cls->subtables, elem);
-
-        /* Possibly move 'subtable' earlier in the priority array.  If
-         * we break out of the loop, then the subtable (at 'from')
-         * should be moved to the position right after the current
-         * element.  If the loop terminates normally, then 'iter' will
-         * be at the first array element and we'll move the subtable
-         * to the front of the array. */
-        CLS_SUBTABLES_FOR_EACH_REVERSE (table, iter, &cls->subtables) {
-            if (table == subtable) {
-                from = iter; /* Locate the subtable as we go. */
-            } else if (table->max_priority >= new_max) {
-                ovs_assert(from != NULL);
-                iter++; /* After this. */
-                break;
-            }
-        }
-
-        /* Move subtable at 'from' to 'iter'. */
-        cls_subtables_move(iter, from);
-    }
-
-    /* Verify that the old and the new have the same size. */
-    if (old.count != cls->subtables.count) {
-        VLOG_WARN("subtables cache sizes differ: old (%"PRIuSIZE
-                  ") != new (%"PRIuSIZE").",
-                  old.count, cls->subtables.count);
-    }
-
-    cls_subtables_destroy(&old);
-
-    cls_subtables_verify(&cls->subtables);
-}
-
 
 /* flow/miniflow/minimask/minimatch utilities.
  * These are only used by the classifier, so place them here to allow
@@ -673,7 +461,7 @@ classifier_init(struct classifier *cls_, const uint8_t *flow_segments)
 
     cls->n_rules = 0;
     hmap_init(&cls->subtables_map);
-    cls_subtables_init(&cls->subtables);
+    pvector_init(&cls->subtables);
     hmap_init(&cls->partitions);
     cls->n_flow_segments = 0;
     if (flow_segments) {
@@ -718,7 +506,7 @@ classifier_destroy(struct classifier *cls_)
         }
         hmap_destroy(&cls->partitions);
 
-        cls_subtables_destroy(&cls->subtables);
+        pvector_destroy(&cls->subtables);
         free(cls);
     }
 }
@@ -772,7 +560,6 @@ trie_init(struct cls_classifier *cls, int trie_idx,
 {
     struct cls_trie *trie = &cls->tries[trie_idx];
     struct cls_subtable *subtable;
-    struct cls_subtable_entry *iter;
 
     if (trie_idx < cls->n_tries) {
         trie_destroy(trie->root);
@@ -781,7 +568,7 @@ trie_init(struct cls_classifier *cls, int trie_idx,
     trie->field = field;
 
     /* Add existing rules to the trie. */
-    CLS_SUBTABLES_FOR_EACH (subtable, iter, &cls->subtables) {
+    HMAP_FOR_EACH (subtable, hmap_node, &cls->subtables_map) {
         unsigned int plen;
 
         plen = field ? minimask_get_prefix_len(&subtable->mask, field) : 0;
@@ -883,11 +670,13 @@ classifier_replace(struct classifier *cls_, struct cls_rule *rule)
     struct cls_subtable *subtable;
 
     subtable = find_subtable(cls, &rule->match.mask);
+
     if (!subtable) {
         subtable = insert_subtable(cls, &rule->match.mask);
     }
 
     old_rule = insert_rule(cls, subtable, rule);
+
     if (!old_rule) {
         int i;
 
@@ -898,7 +687,6 @@ classifier_replace(struct classifier *cls_, struct cls_rule *rule)
                                                           metadata);
         }
 
-        subtable->n_rules++;
         cls->n_rules++;
 
         for (i = 0; i < cls->n_tries; i++) {
@@ -918,7 +706,6 @@ classifier_replace(struct classifier *cls_, struct cls_rule *rule)
             trie_insert_prefix(&subtable->ports_trie, &masked_ports,
                                subtable->ports_mask_len);
         }
-
         return NULL;
     } else {
         struct cls_rule *old_cls_rule = old_rule->cls_rule;
@@ -1004,8 +791,22 @@ classifier_remove(struct classifier *cls_, struct cls_rule *rule)
 
     if (--subtable->n_rules == 0) {
         destroy_subtable(cls, subtable);
-    } else {
-        update_subtables_after_removal(cls, subtable, cls_match->priority);
+    } else if (subtable->max_priority == cls_match->priority
+               && --subtable->max_count == 0) {
+        /* Find the new 'max_priority' and 'max_count'. */
+        struct cls_match *head;
+        unsigned int max_priority = 0;
+
+        HMAP_FOR_EACH (head, hmap_node, &subtable->rules) {
+            if (head->priority > max_priority) {
+                max_priority = head->priority;
+                subtable->max_count = 1;
+            } else if (head->priority == max_priority) {
+                ++subtable->max_count;
+            }
+        }
+        subtable->max_priority = max_priority;
+        pvector_change_priority(&cls->subtables, subtable, max_priority);
     }
 
     cls->n_rules--;
@@ -1034,12 +835,6 @@ trie_ctx_init(struct trie_ctx *ctx, const struct cls_trie *trie)
     ctx->lookup_done = false;
 }
 
-static inline void
-lookahead_subtable(const struct cls_subtable_entry *subtables)
-{
-    ovs_prefetch_range(subtables->subtable, sizeof *subtables->subtable);
-}
-
 /* Finds and returns the highest-priority rule in 'cls' that matches 'flow'.
  * Returns a null pointer if no rules in 'cls' match 'flow'.  If multiple rules
  * of equal priority match 'flow', returns one arbitrarily.
@@ -1055,15 +850,10 @@ classifier_lookup(const struct classifier *cls_, const struct flow *flow,
     struct cls_classifier *cls = cls_->cls;
     const struct cls_partition *partition;
     tag_type tags;
-    struct cls_match *best;
-    struct trie_ctx trie_ctx[CLS_MAX_TRIES];
-    int i;
-    struct cls_subtable_entry *subtables = cls->subtables.array;
-    int n_subtables = cls->subtables.count;
     int64_t best_priority = -1;
-
-    /* Prefetch the subtables array. */
-    ovs_prefetch_range(subtables, n_subtables * sizeof *subtables);
+    const struct cls_match *best;
+    struct trie_ctx trie_ctx[CLS_MAX_TRIES];
+    struct pvector_cursor cursor = pvector_cursor_init(&cls->subtables);
 
     /* Determine 'tags' such that, if 'subtable->tag' doesn't intersect them,
      * then 'flow' cannot possibly match in 'subtable':
@@ -1090,37 +880,34 @@ classifier_lookup(const struct classifier *cls_, const struct flow *flow,
     tags = partition ? partition->tags : TAG_ARBITRARY;
 
     /* Initialize trie contexts for match_find_wc(). */
-    for (i = 0; i < cls->n_tries; i++) {
+    for (int i = 0; i < cls->n_tries; i++) {
         trie_ctx_init(&trie_ctx[i], &cls->tries[i]);
     }
 
     /* Prefetch the first subtables. */
-    if (n_subtables > 1) {
-        lookahead_subtable(subtables);
-        lookahead_subtable(subtables + 1);
-    }
+    pvector_cursor_lookahead(&cursor, 0, sizeof(struct cls_subtable));
+    pvector_cursor_lookahead(&cursor, 1, sizeof(struct cls_subtable));
 
     best = NULL;
-    for (i = 0; OVS_LIKELY(i < n_subtables); i++) {
+    while (pvector_cursor_next(&cursor)) {
+        struct cls_subtable *subtable;
         struct cls_match *rule;
 
-        if ((int64_t)subtables[i].max_priority <= best_priority) {
+        if ((int64_t)pvector_cursor_priority(&cursor) <= best_priority) {
             /* Subtables are in descending priority order,
              * can not find anything better. */
             break;
         }
+        subtable = pvector_cursor_ptr(&cursor);
 
         /* Prefetch a forthcoming subtable. */
-        if (i + 2 < n_subtables) {
-            lookahead_subtable(&subtables[i + 2]);
-        }
+        pvector_cursor_lookahead(&cursor, 1, sizeof(struct cls_subtable));
 
-        if (!tag_intersects(tags, subtables[i].tag)) {
+        if (!tag_intersects(tags, subtable->tag)) {
             continue;
         }
 
-        rule = find_match_wc(subtables[i].subtable, flow, trie_ctx,
-                             cls->n_tries, wc);
+        rule = find_match_wc(subtable, flow, trie_ctx, cls->n_tries, wc);
         if (rule && (int64_t)rule->priority > best_priority) {
             best_priority = (int64_t)rule->priority;
             best = rule;
@@ -1181,9 +968,8 @@ struct cls_rule *classifier_lookup_miniflow_first(const struct classifier *cls_,
 {
     struct cls_classifier *cls = cls_->cls;
     struct cls_subtable *subtable;
-    struct cls_subtable_entry *iter;
 
-    CLS_SUBTABLES_FOR_EACH (subtable, iter, &cls->subtables) {
+    PVECTOR_FOR_EACH (subtable, &cls->subtables) {
         struct cls_match *rule;
 
         rule = find_match_miniflow(subtable, flow,
@@ -1210,6 +996,7 @@ classifier_find_rule_exactly(const struct classifier *cls_,
     struct cls_subtable *subtable;
 
     subtable = find_subtable(cls, &target->match.mask);
+
     if (!subtable) {
         return NULL;
     }
@@ -1257,15 +1044,15 @@ classifier_rule_overlaps(const struct classifier *cls_,
 {
     struct cls_classifier *cls = cls_->cls;
     struct cls_subtable *subtable;
-    struct cls_subtable_entry *iter;
+    unsigned int priority;
 
     /* Iterate subtables in the descending max priority order. */
-    CLS_SUBTABLES_FOR_EACH (subtable, iter, &cls->subtables) {
+    PVECTOR_FOR_EACH_PRIORITY (subtable, priority, &cls->subtables) {
         uint32_t storage[FLOW_U32S];
         struct minimask mask;
         struct cls_match *head;
 
-        if (target->priority > iter->max_priority) {
+        if (target->priority > priority) {
             break; /* Can skip this and the rest of the subtables. */
         }
 
@@ -1450,7 +1237,6 @@ insert_subtable(struct cls_classifier *cls, const struct minimask *mask)
     int i, index = 0;
     struct flow_wildcards old, new;
     uint8_t prev;
-    struct cls_subtable_entry elem;
     int count = count_1bits(mask->masks.map);
 
     subtable = xzalloc(sizeof *subtable - sizeof mask->masks.inline_values
@@ -1501,10 +1287,6 @@ insert_subtable(struct cls_classifier *cls, const struct minimask *mask)
         = 32 - ctz32(ntohl(MINIFLOW_GET_BE32(&mask->masks, tp_src)));
 
     hmap_insert(&cls->subtables_map, &subtable->hmap_node, hash);
-    elem.subtable = subtable;
-    elem.tag = subtable->tag;
-    elem.max_priority = subtable->max_priority;
-    cls_subtables_push_back(&cls->subtables, elem);
 
     return subtable;
 }
@@ -1513,133 +1295,17 @@ static void
 destroy_subtable(struct cls_classifier *cls, struct cls_subtable *subtable)
 {
     int i;
-    struct cls_subtable *table = NULL;
-    struct cls_subtable_entry *iter;
-
-    CLS_SUBTABLES_FOR_EACH (table, iter, &cls->subtables) {
-        if (table == subtable) {
-            cls_subtables_remove(&cls->subtables, iter);
-            break;
-        }
-    }
 
+    pvector_remove(&cls->subtables, subtable);
     trie_destroy(subtable->ports_trie);
 
     for (i = 0; i < subtable->n_indices; i++) {
         hindex_destroy(&subtable->indices[i]);
     }
-    minimask_destroy(&subtable->mask);
     hmap_remove(&cls->subtables_map, &subtable->hmap_node);
+    minimask_destroy(&subtable->mask);
     hmap_destroy(&subtable->rules);
-    free(subtable);
-}
-
-/* This function performs the following updates for 'subtable' in 'cls'
- * following the addition of a new rule with priority 'new_priority' to
- * 'subtable':
- *
- *    - Update 'subtable->max_priority' and 'subtable->max_count' if necessary.
- *
- *    - Update 'subtable''s position in 'cls->subtables' if necessary.
- *
- * This function should only be called after adding a new rule, not after
- * replacing a rule by an identical one or modifying a rule in-place. */
-static void
-update_subtables_after_insertion(struct cls_classifier *cls,
-                                 struct cls_subtable *subtable,
-                                 unsigned int new_priority)
-{
-    if (new_priority == subtable->max_priority) {
-        ++subtable->max_count;
-    } else if (new_priority > subtable->max_priority) {
-        struct cls_subtable *table;
-        struct cls_subtable_entry *iter, *from = NULL;
-
-        subtable->max_priority = new_priority;
-        subtable->max_count = 1;
-
-        /* Possibly move 'subtable' earlier in the priority array.  If
-         * we break out of the loop, then the subtable (at 'from')
-         * should be moved to the position right after the current
-         * element.  If the loop terminates normally, then 'iter' will
-         * be at the first array element and we'll move the subtable
-         * to the front of the array. */
-        CLS_SUBTABLES_FOR_EACH_REVERSE (table, iter, &cls->subtables) {
-            if (table == subtable) {
-                from = iter; /* Locate the subtable as we go. */
-                iter->max_priority = new_priority;
-            } else if (table->max_priority >= new_priority) {
-                if (from == NULL) {
-                    /* Corrupted cache? */
-                    cls_subtables_reset(cls);
-                    VLOG_ABORT("update_subtables_after_insertion(): Subtable priority list corrupted.");
-                    OVS_NOT_REACHED();
-                }
-                iter++; /* After this. */
-                break;
-            }
-        }
-
-        /* Move subtable at 'from' to 'iter'. */
-        cls_subtables_move(iter, from);
-    }
-}
-
-/* This function performs the following updates for 'subtable' in 'cls'
- * following the deletion of a rule with priority 'del_priority' from
- * 'subtable':
- *
- *    - Update 'subtable->max_priority' and 'subtable->max_count' if necessary.
- *
- *    - Update 'subtable''s position in 'cls->subtables' if necessary.
- *
- * This function should only be called after removing a rule, not after
- * replacing a rule by an identical one or modifying a rule in-place. */
-static void
-update_subtables_after_removal(struct cls_classifier *cls,
-                               struct cls_subtable *subtable,
-                               unsigned int del_priority)
-{
-    if (del_priority == subtable->max_priority && --subtable->max_count == 0) {
-        struct cls_match *head;
-        struct cls_subtable *table;
-        struct cls_subtable_entry *iter, *from = NULL;
-
-        subtable->max_priority = 0;
-        HMAP_FOR_EACH (head, hmap_node, &subtable->rules) {
-            if (head->priority > subtable->max_priority) {
-                subtable->max_priority = head->priority;
-                subtable->max_count = 1;
-            } else if (head->priority == subtable->max_priority) {
-                ++subtable->max_count;
-            }
-        }
-
-        /* Possibly move 'subtable' later in the priority array.
-         * After the loop the 'iter' will point right after the position
-         * at which the subtable should be moved (either at a subtable
-         * with an equal or lower priority, or just past the array),
-         * so it is decremented once. */
-        CLS_SUBTABLES_FOR_EACH (table, iter, &cls->subtables) {
-            if (table == subtable) {
-                from = iter; /* Locate the subtable as we go. */
-                iter->max_priority = subtable->max_priority;
-            } else if (table->max_priority <= subtable->max_priority) {
-                if (from == NULL) {
-                    /* Corrupted cache? */
-                    cls_subtables_reset(cls);
-                    VLOG_ABORT("update_subtables_after_removal(): Subtable priority list corrupted.");
-                    OVS_NOT_REACHED();
-                }
-                break;
-            }
-        }
-        /* Now at one past the destination. */
-        iter--;
-
-        /* Move subtable at 'from' to 'iter'. */
-        cls_subtables_move(iter, from);
-    }
+    ovsrcu_postpone(free, subtable);
 }
 
 struct range {
@@ -1897,7 +1563,8 @@ insert_rule(struct cls_classifier *cls, struct cls_subtable *subtable,
         FOR_EACH_RULE_IN_LIST (rule, head) {
             if (cls_match->priority >= rule->priority) {
                 if (rule == head) {
-                    /* 'new' is the new highest-priority flow in the list. */
+                    /* 'cls_match' is the new highest-priority flow in the
+                     * list. */
                     hmap_replace(&subtable->rules,
                                  &rule->hmap_node, &cls_match->hmap_node);
                 }
@@ -1905,11 +1572,10 @@ insert_rule(struct cls_classifier *cls, struct cls_subtable *subtable,
                 if (cls_match->priority == rule->priority) {
                     list_replace(&cls_match->list, &rule->list);
                     old = rule;
-                    goto out;
                 } else {
                     list_insert(&rule->list, &cls_match->list);
-                    goto out;
                 }
+                goto out;
             }
         }
 
@@ -1919,13 +1585,28 @@ insert_rule(struct cls_classifier *cls, struct cls_subtable *subtable,
 
  out:
     if (!old) {
-        update_subtables_after_insertion(cls, subtable, cls_match->priority);
+        subtable->n_rules++;
+
+        /* Rule was added, not replaced.  Update 'subtable's 'max_priority'
+         * and 'max_count', if necessary. */
+        if (subtable->n_rules == 1) {
+            subtable->max_priority = cls_match->priority;
+            subtable->max_count = 1;
+            pvector_insert(&cls->subtables, subtable, cls_match->priority);
+        } else if (subtable->max_priority == cls_match->priority) {
+            ++subtable->max_count;
+        } else if (cls_match->priority > subtable->max_priority) {
+            subtable->max_priority = cls_match->priority;
+            subtable->max_count = 1;
+            pvector_change_priority(&cls->subtables, subtable, cls_match->priority);
+        }
     } else {
         /* Remove old node from indices. */
         for (i = 0; i < subtable->n_indices; i++) {
             hindex_remove(&subtable->indices[i], &old->index_nodes[i]);
         }
     }
+
     return old;
 }
 
diff --git a/lib/pvector.c b/lib/pvector.c
new file mode 100644
index 0000000..551b71d
--- /dev/null
+++ b/lib/pvector.c
@@ -0,0 +1,213 @@
+/*
+ * Copyright (c) 2014 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+#include "pvector.h"
+
+static struct pvector_impl *
+pvector_impl_get(const struct pvector *pvec)
+{
+    return ovsrcu_get(struct pvector_impl *, &pvec->impl);
+}
+
+static struct pvector_impl *
+pvector_impl_alloc(size_t size)
+{
+    struct pvector_impl *impl;
+
+    impl = xmalloc(sizeof *impl + size * sizeof impl->vector[0]);
+    impl->size = 0;
+    impl->allocated = size;
+
+    return impl;
+}
+
+static struct pvector_impl *
+pvector_impl_dup(struct pvector_impl *old)
+{
+    return xmemdup(old, old->allocated * sizeof old->vector[0]);
+}
+
+/* Initializes 'pvec' as an empty concurrent priority vector. */
+void
+pvector_init(struct pvector *pvec)
+{
+    ovsrcu_set(&pvec->impl, pvector_impl_alloc(0));
+}
+
+/* Destroys 'pvec'.
+ *
+ * The client is responsible for destroying any data previously held in
+ * 'pvec'. */
+void
+pvector_destroy(struct pvector *pvec)
+{
+    ovsrcu_postpone(free, pvector_impl_get(pvec));
+    ovsrcu_set(&pvec->impl, NULL); /* Poison. */
+}
+
+/* Iterators for callers that need the 'index' afterward. */
+#define PVECTOR_IMPL_FOR_EACH(ENTRY, INDEX, IMPL)          \
+    for ((INDEX) = 0;                                      \
+         (INDEX) < (IMPL)->size                            \
+             && ((ENTRY) = &(IMPL)->vector[INDEX], true);  \
+         (INDEX)++)
+#define PVECTOR_IMPL_FOR_EACH_REVERSE(ENTRY, INDEX, IMPL)  \
+    for ((INDEX) = (int)(IMPL)->size - 1;                  \
+         (int)(INDEX) >= 0                                 \
+             && ((ENTRY) = &(IMPL)->vector[INDEX], true);  \
+         (INDEX)--)
+
+static int
+pvector_entry_cmp(const void *a_, const void *b_)
+{
+    unsigned int a = ((const struct pvector_entry *)a_)->priority;
+    unsigned int b = ((const struct pvector_entry *)b_)->priority;
+
+    return a > b ? -1 : a < b;
+}
+
+static void
+pvector_impl_sort(struct pvector_impl *impl)
+{
+    qsort(impl->vector, impl->size, sizeof *impl->vector, pvector_entry_cmp);
+}
+
+/* Returns the index with priority equal or lower than 'target_priority',
+ * which will be one past the vector if none exists. */
+static int
+pvector_impl_find_priority(struct pvector_impl *impl,
+                           unsigned int target_priority)
+{
+    const struct pvector_entry *entry;
+    int index;
+
+    PVECTOR_IMPL_FOR_EACH (entry, index, impl) {
+        if (entry->priority <= target_priority) {
+            break;
+        }
+    }
+    return index;
+}
+
+/* Returns the index of the 'ptr' in the vector, or -1 if none is found. */
+static int
+pvector_impl_find(struct pvector_impl *impl, void *target)
+{
+    const struct pvector_entry *entry;
+    int index;
+
+    PVECTOR_IMPL_FOR_EACH (entry, index, impl) {
+        if (entry->ptr == target) {
+            return index;
+        }
+    }
+    return -1;
+}
+
+void
+pvector_insert(struct pvector *pvec, void *ptr, unsigned int priority)
+{
+    struct pvector_impl *old, *new;
+    int index;
+
+    old = pvector_impl_get(pvec);
+
+    /* Check if can add to the end without reallocation. */
+    if (old->allocated > old->size &&
+        (!old->size || priority <= old->vector[old->size - 1].priority)) {
+        old->vector[old->size].ptr = ptr;
+        old->vector[old->size].priority = priority;
+        /* Size increment must not be visible to the readers before the new
+         * entry is stored. */
+        atomic_thread_fence(memory_order_release);
+        ++old->size;
+    } else {
+        new = pvector_impl_alloc(old->allocated + 1);
+
+        index = pvector_impl_find_priority(old, priority);
+        /* Now at the insertion index. */
+        memcpy(new->vector, old->vector, index * sizeof old->vector[0]);
+        new->vector[index].ptr = ptr;
+        new->vector[index].priority = priority;
+        memcpy(&new->vector[index + 1], &old->vector[index],
+               (old->size - index) * sizeof old->vector[0]);
+        new->size = old->size + 1;
+
+        ovsrcu_set(&pvec->impl, new);
+        ovsrcu_postpone(free, old);
+    }
+}
+
+void
+pvector_remove(struct pvector *pvec, void *ptr)
+{
+    struct pvector_impl *old, *new;
+    int index;
+
+    old = pvector_impl_get(pvec);
+
+    ovs_assert(old->size > 0);
+
+    index = pvector_impl_find(old, ptr);
+    ovs_assert(index >= 0);
+    /* Now at the index of the entry to be deleted. */
+
+    /* We do not try to delete the last entry without reallocation so that
+     * the readers can read the 'size' once in the beginning of each iteration.
+     */
+
+    /* Keep extra space for insertions to the end. */
+    new = pvector_impl_alloc(old->size - 1 + PVECTOR_EXTRA_ALLOC);
+
+    memcpy(new->vector, old->vector, index * sizeof old->vector[0]);
+    memcpy(&new->vector[index], &old->vector[index + 1],
+           (old->size - (index + 1)) * sizeof old->vector[0]);
+
+    new->size = old->size - 1;
+
+    ovsrcu_set(&pvec->impl, new);
+    ovsrcu_postpone(free, old);
+}
+
+/* Change entry's 'priority' and keep the vector ordered. */
+void
+pvector_change_priority(struct pvector *pvec, void *ptr, unsigned int priority)
+{
+    struct pvector_impl *old = pvector_impl_get(pvec);
+    int index = pvector_impl_find(old, ptr);
+
+    ovs_assert(index >= 0);
+    /* Now at the index of the entry to be updated. */
+
+    if ((priority > old->vector[index].priority && index > 0
+         && priority > old->vector[index - 1].priority)
+        || (priority < old->vector[index].priority && index < old->size - 1
+            && priority < old->vector[index + 1].priority)) {
+        /* Have to reallocate to reorder. */
+        struct pvector_impl *new = pvector_impl_dup(old);
+
+        new->vector[index].priority = priority;
+        pvector_impl_sort(new);
+
+        ovsrcu_set(&pvec->impl, new);
+        ovsrcu_postpone(free, old);
+    } else {
+        /* Can update in place. Readers are free to use either value,
+         * so we do not try to synchronize here. */
+        old->vector[index].priority = priority;
+    }
+}
diff --git a/lib/pvector.h b/lib/pvector.h
new file mode 100644
index 0000000..22ff1d0
--- /dev/null
+++ b/lib/pvector.h
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2014 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef PVECTOR_H
+#define PVECTOR_H 1
+
+#include <stdbool.h>
+#include <stdint.h>
+#include "ovs-rcu.h"
+#include "util.h"
+
+/* Concurrent Priority Vector
+ * ==========================
+ *
+ * Readers can assume that once they have taken a pointer to the vector
+ * with pvector_cursor_init(), the 'size' member will not decrease, so that
+ * they can safely read 'size' entries from 'vector', and find that each
+ * entry has a valid, non-NULL 'ptr', and the vector is in order from highest
+ * to lowest 'priority'.  The 'priority' values can change any time, but only
+ * so that the order of the entries does not change, so readers can use
+ * 'priority' values read at any time after aquisition of the vector pointer.
+ *
+ * Writers can concurrently add entries to the end of the vector, incrementing
+ * 'size', or update the 'priority' value of an entry, but only if that does
+ * not affect the ordering of the entries.  Writers will never change the 'ptr'
+ * values, or decrement the 'size' on a copy that readers have access to.
+ */
+
+struct pvector_entry {
+    unsigned int priority;
+    void *ptr;
+};
+
+/* Writers will preallocate space for some entries at the end to avoid future
+ * reallocations. */
+enum { PVECTOR_EXTRA_ALLOC = 4 };
+
+struct pvector_impl {
+    size_t size;       /* Number of entries in the vector. */
+    size_t allocated;  /* Number of allocated entries. */
+    struct pvector_entry vector[];
+};
+
+/* Concurrent priority vector. */
+struct pvector {
+    OVSRCU_TYPE(struct pvector_impl *) impl;
+};
+
+/* Initialization. */
+void pvector_init(struct pvector *);
+void pvector_destroy(struct pvector *);
+
+/* Count. */
+static inline size_t pvector_count(const struct pvector *);
+static inline bool pvector_is_empty(const struct pvector *);
+
+/* Insertion and deletion. */
+void pvector_insert(struct pvector *, void *, unsigned int);
+void pvector_change_priority(struct pvector *, void *, unsigned int);
+void pvector_remove(struct pvector *, void *);
+
+/* Iteration.
+ *
+ *
+ * Thread-safety
+ * =============
+ *
+ * Iteration is safe even in a pvector that is changing concurrently.
+ * Multiple writers must exclude each other via e.g., a mutex.
+ *
+ * Example
+ * =======
+ *
+ *     struct my_node {
+ *         int data;
+ *     };
+ *
+ *     struct my_node elem1, elem2, *iter;
+ *     struct pvector my_pvector;
+ *
+ *     pvector_init(&my_pvector);
+ *     ...add data...
+ *     pvector_insert(&my_pvector, &elem1, 1);
+ *     pvector_insert(&my_pvector, &elem2, 2);
+ *     ...
+ *     PVECTOR_FOR_EACH (iter, &my_pvector) {
+ *         ...operate on '*iter'...
+ *         ...elem2 to be seen before elem1...
+ *     }
+ *     ...
+ *     pvector_destroy(&my_pvector);
+ *
+ * There is no PVECTOR_FOR_EACH_SAFE variant as iteration is performed on RCU
+ * protected instance of the priority vector.  Any concurrent modifications
+ * that would be disruptive for readers (such as deletions), will be peformed
+ * on a new instance.  To see any of the modifications, a new iteration loop
+ * has to be started.
+ *
+ * The PVECTOR_FOR_EACH_PRIORITY also gives the priority of each entry during
+ * the iteration.
+ *
+ * The iteration loop must be completed without entering the OVS RCU quiescent
+ * period.  That is, an old iteration loop must not be continued after any
+ * blocking IO (VLOG is non-blocking, so that is OK).
+ */
+struct pvector_cursor {
+    size_t size;       /* Number of entries in the vector. */
+    size_t entry_idx;  /* Current index. */
+    const struct pvector_entry *vector;
+};
+
+static inline struct pvector_cursor pvector_cursor_init(const struct pvector *);
+static inline bool pvector_cursor_next(struct pvector_cursor *);
+static inline void *pvector_cursor_ptr(const struct pvector_cursor *);
+static inline unsigned int pvector_cursor_priority(const struct pvector_cursor *);
+static inline void pvector_cursor_lookahead(const struct pvector_cursor *,
+                                            int n, size_t size);
+
+#define PVECTOR_FOR_EACH(PTR, PVECTOR)                                  \
+    for (struct pvector_cursor cursor__ = pvector_cursor_init(PVECTOR); \
+         pvector_cursor_next(&cursor__) &&                              \
+             ((PTR) = pvector_cursor_ptr(&cursor__), true); )
+
+#define PVECTOR_FOR_EACH_PRIORITY(PTR, PRIORITY, PVECTOR)               \
+    for (struct pvector_cursor cursor__ = pvector_cursor_init(PVECTOR); \
+         pvector_cursor_next(&cursor__) &&                              \
+             ((PTR) = pvector_cursor_ptr(&cursor__),                    \
+              (PRIORITY) = pvector_cursor_priority(&cursor__), true); )
+
+
+/* Inline implementations. */
+
+static inline struct pvector_cursor
+pvector_cursor_init(const struct pvector *pvec)
+{
+    const struct pvector_impl *impl;
+    struct pvector_cursor cursor;
+
+    impl = ovsrcu_get(struct pvector_impl *, &pvec->impl);
+
+    cursor.size = impl->size;
+    cursor.vector = impl->vector;
+    cursor.entry_idx = -1;
+
+    ovs_prefetch_range(cursor.vector, cursor.size * sizeof cursor.vector[0]);
+
+    return cursor;
+}
+
+static inline bool pvector_cursor_next(struct pvector_cursor *cursor)
+{
+    return ++cursor->entry_idx < cursor->size;
+}
+
+static inline void *pvector_cursor_ptr(const struct pvector_cursor *cursor)
+{
+    return cursor->vector[cursor->entry_idx].ptr;
+}
+
+static inline unsigned int pvector_cursor_priority(const struct pvector_cursor *cursor)
+{
+    return cursor->vector[cursor->entry_idx].priority;
+}
+
+static inline void pvector_cursor_lookahead(const struct pvector_cursor *cursor,
+                                            int n, size_t size)
+{
+    if (cursor->entry_idx + n < cursor->size) {
+        ovs_prefetch_range(cursor->vector[cursor->entry_idx + n].ptr, size);
+    }
+}
+
+static inline size_t pvector_count(const struct pvector *pvec)
+{
+    return ovsrcu_get(struct pvector_impl *, &pvec->impl)->size;
+}
+
+static inline bool pvector_is_empty(const struct pvector *pvec)
+{
+    return pvector_count(pvec) == 0;
+}
+
+#endif /* pvector.h */
diff --git a/tests/test-classifier.c b/tests/test-classifier.c
index 42c18bc..6b7556d 100644
--- a/tests/test-classifier.c
+++ b/tests/test-classifier.c
@@ -464,6 +464,21 @@ destroy_classifier(struct classifier *cls)
 }
 
 static void
+pvector_verify(struct pvector *pvec)
+{
+    void *ptr;
+    unsigned int priority, prev_priority = UINT_MAX;
+
+    PVECTOR_FOR_EACH_PRIORITY (ptr, priority, pvec) {
+        if (priority > prev_priority) {
+            VLOG_ABORT("Priority vector is out of order (%u > %u)",
+                       priority, prev_priority);
+        }
+        prev_priority = priority;
+    }
+}
+
+static void
 check_tables(const struct classifier *cls, int n_tables, int n_rules,
              int n_dups) OVS_REQ_RDLOCK(cls->rwlock)
 {
@@ -475,10 +490,28 @@ check_tables(const struct classifier *cls, int n_tables, int n_rules,
     int found_dups = 0;
     int found_rules2 = 0;
 
+    pvector_verify(&cls->cls->subtables);
+
     HMAP_FOR_EACH (table, hmap_node, &cls->cls->subtables_map) {
         const struct cls_match *head;
         unsigned int max_priority = 0;
         unsigned int max_count = 0;
+        bool found = false;
+        const struct cls_subtable *iter;
+
+        /* Locate the subtable from 'subtables'. */
+        PVECTOR_FOR_EACH (iter, &cls->cls->subtables) {
+            if (iter == table) {
+                if (found) {
+                    VLOG_ABORT("Subtable %p duplicated in 'subtables'.",
+                               table);
+                }
+                found = true;
+            }
+        }
+        if (!found) {
+            VLOG_ABORT("Subtable %p not found from 'subtables'.", table);
+        }
 
         assert(!hmap_is_empty(&table->rules));
 
@@ -511,6 +544,7 @@ check_tables(const struct classifier *cls, int n_tables, int n_rules,
     }
 
     assert(found_tables == hmap_count(&cls->cls->subtables_map));
+    assert(found_tables == pvector_count(&cls->cls->subtables));
     assert(n_tables == -1 || n_tables == hmap_count(&cls->cls->subtables_map));
     assert(n_rules == -1 || found_rules == n_rules);
     assert(n_dups == -1 || found_dups == n_dups);
-- 
1.7.10.4