[ovs-dev] [PATCH ovn v3 5/5] ovn-controller: Fix port group conjunction flow explosion problem.

Han Zhou hzhou at ovn.org
Tue Apr 27 23:41:14 UTC 2021


For an ACL with match: outport == @PG && ip4.src == $PG_AS, given below
scale:

P: PG size
LP: number of local lports
D: number of all datapaths (logical switches)
LD: number of datapaths that contain local lports

With current OVN implementation, the total number of OF flows is:
    LP + (P * D) + D

The reason is, firstly, datapath is not part of the conjunction, so for
each datapath the lflow is reparsed.

Secondly, although ovn-controller tries to filter out the flows that are
for non-local lports, with the conjunction match, the logic that filters
out non-local flows doesn't work for the conjunction part that doesn't
have the lport in the match (the P * D part). When there is only one
port on each LS it is fine, because no conjunction will be used because
SB port groups are split per datapath, so each port group would have
only one port. However, when more than one ports are on each LS the flow
explosion happens.

This patch deal with the second reason above, by refining the SB port
groups to store only locally bound lports: empty const sets will not
generate any flows. This reduces the related flow number from
LP + (P * D) + D to LP + (P * LD) + LD.

Since LD is expected to be small, so even if it is a multiplier, the
total number is reduced significantly. In particular, in ovn-k8s use
cases the LD is always 1, so the formula above becomes LP + P + LD.

With a scale of 1k k8s nodes, each has 4 ports for the same PG: P = 4k,
LP = 4, D = 1k, LD = 1. The current implementation generates ~4m flows.
With this patch it becomes only ~4k.

Reported-by: Girish Moodalbail <gmoodalbail at nvidia.com>
Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2021-March/381082.html
Reported-by: Dumitru Ceara <dceara at redhat.com>
Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=1944098
Tested-by: Zhen Wang <zhewang at nvidia.com>
Signed-off-by: Han Zhou <hzhou at ovn.org>
---
 controller/binding.c        |  20 ++++
 controller/binding.h        |   9 ++
 controller/ovn-controller.c | 212 +++++++++++++++++++++++++++++++-----
 include/ovn/expr.h          |   3 +-
 lib/expr.c                  |  12 +-
 tests/ovn.at                |  55 ++++++++++
 tests/test-ovn.c            |   4 +-
 utilities/ovn-trace.c       |   2 +-
 8 files changed, 281 insertions(+), 36 deletions(-)

diff --git a/controller/binding.c b/controller/binding.c
index 514f5f33f..5aca964cc 100644
--- a/controller/binding.c
+++ b/controller/binding.c
@@ -2987,3 +2987,23 @@ cleanup:
 
     return b_lport;
 }
+
+struct sset *
+binding_collect_local_binding_lports(struct local_binding_data *lbinding_data)
+{
+    struct sset *lports = xzalloc(sizeof *lports);
+    sset_init(lports);
+    struct shash_node *shash_node;
+    SHASH_FOR_EACH (shash_node, &lbinding_data->lports) {
+        struct binding_lport *b_lport = shash_node->data;
+        sset_add(lports, b_lport->name);
+    }
+    return lports;
+}
+
+void
+binding_destroy_local_binding_lports(struct sset *lports)
+{
+    sset_destroy(lports);
+    free(lports);
+}
diff --git a/controller/binding.h b/controller/binding.h
index 4fc9ef207..cd573dbbe 100644
--- a/controller/binding.h
+++ b/controller/binding.h
@@ -128,4 +128,13 @@ void binding_seqno_run(struct local_binding_data *lbinding_data);
 void binding_seqno_install(struct local_binding_data *lbinding_data);
 void binding_seqno_flush(void);
 void binding_dump_local_bindings(struct local_binding_data *, struct ds *);
+
+/* Generates a sset of lport names from local_binding_data.
+ * Note: the caller is responsible for destroying and freeing the returned
+ * sset, by calling binding_detroy_local_binding_lports(). */
+struct sset *binding_collect_local_binding_lports(struct local_binding_data *);
+
+/* Destroy and free the lports sset returned by
+ * binding_collect_local_binding_lports(). */
+void binding_destroy_local_binding_lports(struct sset *lports);
 #endif /* controller/binding.h */
diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c
index 00ae49eb9..f86d780cc 100644
--- a/controller/ovn-controller.c
+++ b/controller/ovn-controller.c
@@ -1367,6 +1367,7 @@ addr_sets_update(const struct sbrec_address_set_table *address_set_table,
         }
     }
 }
+
 static void
 en_addr_sets_run(struct engine_node *node, void *data)
 {
@@ -1415,20 +1416,72 @@ addr_sets_sb_address_set_handler(struct engine_node *node, void *data)
 }
 
 struct ed_type_port_groups{
-    struct shash port_groups;
+    /* A copy of SB port_groups, each converted as a sset for efficient lport
+     * lookup. */
+    struct shash port_group_ssets;
+
+    /* Const sets containing local lports, used for expr parsing. */
+    struct shash port_groups_cs_local;
+
     bool change_tracked;
     struct sset new;
     struct sset deleted;
     struct sset updated;
 };
 
+static void
+port_group_ssets_add_or_update(struct shash *port_group_ssets,
+                               const struct sbrec_port_group *pg)
+{
+    struct sset *lports = shash_find_data(port_group_ssets, pg->name);
+    if (lports) {
+        sset_clear(lports);
+    } else {
+        lports = xzalloc(sizeof *lports);
+        sset_init(lports);
+        shash_add(port_group_ssets, pg->name, lports);
+    }
+
+    for (size_t i = 0; i < pg->n_ports; i++) {
+        sset_add(lports, pg->ports[i]);
+    }
+}
+
+static void
+port_group_ssets_delete(struct shash *port_group_ssets,
+                        const char *pg_name)
+{
+    struct shash_node *node = shash_find(port_group_ssets, pg_name);
+    if (node) {
+        struct sset *lports = node->data;
+        shash_delete(port_group_ssets, node);
+        sset_destroy(lports);
+        free(lports);
+    }
+}
+
+/* Delete and free all ssets in port_group_ssets, but not
+ * destroying the shash itself. */
+static void
+port_group_ssets_clear(struct shash *port_group_ssets)
+{
+    struct shash_node *node, *next;
+    SHASH_FOR_EACH_SAFE (node, next, port_group_ssets) {
+        struct sset *lports = node->data;
+        shash_delete(port_group_ssets, node);
+        sset_destroy(lports);
+        free(lports);
+    }
+}
+
 static void *
 en_port_groups_init(struct engine_node *node OVS_UNUSED,
                     struct engine_arg *arg OVS_UNUSED)
 {
     struct ed_type_port_groups *pg = xzalloc(sizeof *pg);
 
-    shash_init(&pg->port_groups);
+    shash_init(&pg->port_group_ssets);
+    shash_init(&pg->port_groups_cs_local);
     pg->change_tracked = false;
     sset_init(&pg->new);
     sset_init(&pg->deleted);
@@ -1440,41 +1493,52 @@ static void
 en_port_groups_cleanup(void *data)
 {
     struct ed_type_port_groups *pg = data;
-    expr_const_sets_destroy(&pg->port_groups);
-    shash_destroy(&pg->port_groups);
+
+    expr_const_sets_destroy(&pg->port_groups_cs_local);
+    shash_destroy(&pg->port_groups_cs_local);
+
+    port_group_ssets_clear(&pg->port_group_ssets);
+    shash_destroy(&pg->port_group_ssets);
+
     sset_destroy(&pg->new);
     sset_destroy(&pg->deleted);
     sset_destroy(&pg->updated);
 }
 
-/* Iterate port groups in the southbound database.  Create and update the
- * corresponding symtab entries as necessary. */
- static void
+static void
 port_groups_init(const struct sbrec_port_group_table *port_group_table,
-                 struct shash *port_groups)
+                 const struct sset *local_lports,
+                 struct shash *port_group_ssets,
+                 struct shash *port_groups_cs_local)
 {
     const struct sbrec_port_group *pg;
     SBREC_PORT_GROUP_TABLE_FOR_EACH (pg, port_group_table) {
-        expr_const_sets_add_strings(port_groups, pg->name,
+        port_group_ssets_add_or_update(port_group_ssets, pg);
+        expr_const_sets_add_strings(port_groups_cs_local, pg->name,
                                     (const char *const *) pg->ports,
-                                    pg->n_ports);
+                                    pg->n_ports, local_lports);
     }
 }
 
 static void
 port_groups_update(const struct sbrec_port_group_table *port_group_table,
-                   struct shash *port_groups, struct sset *new,
-                   struct sset *deleted, struct sset *updated)
+                   const struct sset *local_lports,
+                   struct shash *port_group_ssets,
+                   struct shash *port_groups_cs_local,
+                   struct sset *new, struct sset *deleted,
+                   struct sset *updated)
 {
     const struct sbrec_port_group *pg;
     SBREC_PORT_GROUP_TABLE_FOR_EACH_TRACKED (pg, port_group_table) {
         if (sbrec_port_group_is_deleted(pg)) {
-            expr_const_sets_remove(port_groups, pg->name);
+            expr_const_sets_remove(port_groups_cs_local, pg->name);
+            port_group_ssets_delete(port_group_ssets, pg->name);
             sset_add(deleted, pg->name);
         } else {
-            expr_const_sets_add_strings(port_groups, pg->name,
+            port_group_ssets_add_or_update(port_group_ssets, pg);
+            expr_const_sets_add_strings(port_groups_cs_local, pg->name,
                                         (const char *const *) pg->ports,
-                                        pg->n_ports);
+                                        pg->n_ports, local_lports);
             if (sbrec_port_group_is_new(pg)) {
                 sset_add(new, pg->name);
             } else {
@@ -1485,22 +1549,36 @@ port_groups_update(const struct sbrec_port_group_table *port_group_table,
 }
 
 static void
-en_port_groups_run(struct engine_node *node, void *data)
+en_port_groups_clear_tracked_data(void *data_)
 {
-    struct ed_type_port_groups *pg = data;
-
+    struct ed_type_port_groups *pg = data_;
     sset_clear(&pg->new);
     sset_clear(&pg->deleted);
     sset_clear(&pg->updated);
-    expr_const_sets_destroy(&pg->port_groups);
+    pg->change_tracked = false;
+}
+
+static void
+en_port_groups_run(struct engine_node *node, void *data)
+{
+    struct ed_type_port_groups *pg = data;
+
+    expr_const_sets_destroy(&pg->port_groups_cs_local);
+    port_group_ssets_clear(&pg->port_group_ssets);
 
     struct sbrec_port_group_table *pg_table =
         (struct sbrec_port_group_table *)EN_OVSDB_GET(
             engine_get_input("SB_port_group", node));
 
-    port_groups_init(pg_table, &pg->port_groups);
+    struct ed_type_runtime_data *rt_data =
+        engine_get_input_data("runtime_data", node);
+
+    struct sset *local_b_lports = binding_collect_local_binding_lports(
+        &rt_data->lbinding_data);
+    port_groups_init(pg_table, local_b_lports, &pg->port_group_ssets,
+                     &pg->port_groups_cs_local);
+    binding_destroy_local_binding_lports(local_b_lports);
 
-    pg->change_tracked = false;
     engine_set_node_state(node, EN_UPDATED);
 }
 
@@ -1509,16 +1587,19 @@ port_groups_sb_port_group_handler(struct engine_node *node, void *data)
 {
     struct ed_type_port_groups *pg = data;
 
-    sset_clear(&pg->new);
-    sset_clear(&pg->deleted);
-    sset_clear(&pg->updated);
-
     struct sbrec_port_group_table *pg_table =
         (struct sbrec_port_group_table *)EN_OVSDB_GET(
             engine_get_input("SB_port_group", node));
 
-    port_groups_update(pg_table, &pg->port_groups, &pg->new,
-                     &pg->deleted, &pg->updated);
+    struct ed_type_runtime_data *rt_data =
+        engine_get_input_data("runtime_data", node);
+
+    struct sset *local_b_lports = binding_collect_local_binding_lports(
+        &rt_data->lbinding_data);
+    port_groups_update(pg_table, local_b_lports, &pg->port_group_ssets,
+                       &pg->port_groups_cs_local, &pg->new, &pg->deleted,
+                       &pg->updated);
+    binding_destroy_local_binding_lports(local_b_lports);
 
     if (!sset_is_empty(&pg->new) || !sset_is_empty(&pg->deleted) ||
             !sset_is_empty(&pg->updated)) {
@@ -1531,6 +1612,73 @@ port_groups_sb_port_group_handler(struct engine_node *node, void *data)
     return true;
 }
 
+static bool
+port_groups_runtime_data_handler(struct engine_node *node, void *data)
+{
+    struct ed_type_port_groups *pg = data;
+
+    struct sbrec_port_group_table *pg_table =
+        (struct sbrec_port_group_table *)EN_OVSDB_GET(
+            engine_get_input("SB_port_group", node));
+
+    struct ed_type_runtime_data *rt_data =
+        engine_get_input_data("runtime_data", node);
+
+    if (!rt_data->tracked) {
+        return false;
+    }
+
+    if (hmap_is_empty(&rt_data->tracked_dp_bindings)) {
+        goto out;
+    }
+
+    struct sset *local_b_lports = binding_collect_local_binding_lports(
+        &rt_data->lbinding_data);
+
+    const struct sbrec_port_group *pg_sb;
+    SBREC_PORT_GROUP_TABLE_FOR_EACH (pg_sb, pg_table) {
+        struct sset *pg_lports = shash_find_data(&pg->port_group_ssets,
+                                                 pg_sb->name);
+        ovs_assert(pg_lports);
+
+        struct tracked_binding_datapath *tdp;
+        bool need_update = false;
+        HMAP_FOR_EACH (tdp, node, &rt_data->tracked_dp_bindings) {
+            struct shash_node *shash_node;
+            SHASH_FOR_EACH (shash_node, &tdp->lports) {
+                struct tracked_binding_lport *lport = shash_node->data;
+                if (sset_contains(pg_lports, lport->pb->logical_port)) {
+                    /* At least one local port-binding change is related to the
+                     * port_group, so the port_group_cs_local needs update. */
+                    need_update = true;
+                    break;
+                }
+            }
+            if (need_update) {
+                break;
+            }
+        }
+        if (need_update) {
+            expr_const_sets_add_strings(&pg->port_groups_cs_local, pg_sb->name,
+                                        (const char *const *) pg_sb->ports,
+                                        pg_sb->n_ports, local_b_lports);
+            sset_add(&pg->updated, pg_sb->name);
+        }
+    }
+
+    binding_destroy_local_binding_lports(local_b_lports);
+
+out:
+    if (!sset_is_empty(&pg->new) || !sset_is_empty(&pg->deleted) ||
+            !sset_is_empty(&pg->updated)) {
+        engine_set_node_state(node, EN_UPDATED);
+    } else {
+        engine_set_node_state(node, EN_UNCHANGED);
+    }
+    pg->change_tracked = true;
+    return true;
+}
+
 /* Connection tracking zones. */
 struct ed_type_ct_zones {
     unsigned long bitmap[BITMAP_N_LONGS(MAX_CT_ZONES)];
@@ -1880,7 +2028,7 @@ static void init_lflow_ctx(struct engine_node *node,
 
     struct ed_type_port_groups *pg_data =
         engine_get_input_data("port_groups", node);
-    struct shash *port_groups = &pg_data->port_groups;
+    struct shash *port_groups = &pg_data->port_groups_cs_local;
 
     l_ctx_in->sbrec_multicast_group_by_name_datapath =
         sbrec_mc_group_by_name_dp;
@@ -2540,7 +2688,7 @@ main(int argc, char *argv[])
                                       "physical_flow_changes");
     ENGINE_NODE(flow_output, "flow_output");
     ENGINE_NODE(addr_sets, "addr_sets");
-    ENGINE_NODE(port_groups, "port_groups");
+    ENGINE_NODE_WITH_CLEAR_TRACK_DATA(port_groups, "port_groups");
 
 #define SB_NODE(NAME, NAME_STR) ENGINE_NODE_SB(NAME, NAME_STR);
     SB_NODES
@@ -2556,6 +2704,10 @@ main(int argc, char *argv[])
                      addr_sets_sb_address_set_handler);
     engine_add_input(&en_port_groups, &en_sb_port_group,
                      port_groups_sb_port_group_handler);
+    /* port_groups computation requires runtime_data's lbinding_data for the
+     * locally bound ports. */
+    engine_add_input(&en_port_groups, &en_runtime_data,
+                     port_groups_runtime_data_handler);
 
     /* Engine node physical_flow_changes indicates whether
      * we can recompute only physical flows or we can
@@ -2986,7 +3138,7 @@ main(int argc, char *argv[])
                     engine_get_data(&en_port_groups);
                 if (br_int && chassis && as_data && pg_data) {
                     char *error = ofctrl_inject_pkt(br_int, pending_pkt.flow_s,
-                        &as_data->addr_sets, &pg_data->port_groups);
+                        &as_data->addr_sets, &pg_data->port_groups_cs_local);
                     if (error) {
                         unixctl_command_reply_error(pending_pkt.conn, error);
                         free(error);
diff --git a/include/ovn/expr.h b/include/ovn/expr.h
index 96435038a..de90e87e1 100644
--- a/include/ovn/expr.h
+++ b/include/ovn/expr.h
@@ -548,7 +548,8 @@ void expr_constant_set_destroy(struct expr_constant_set *cs);
 void expr_const_sets_add_integers(struct shash *const_sets, const char *name,
                                   const char * const *values, size_t n_values);
 void expr_const_sets_add_strings(struct shash *const_sets, const char *name,
-                                 const char * const *values, size_t n_values);
+                                 const char * const *values, size_t n_values,
+                                 const struct sset *filter);
 void expr_const_sets_remove(struct shash *const_sets, const char *name);
 void expr_const_sets_destroy(struct shash *const_sets);
 
diff --git a/lib/expr.c b/lib/expr.c
index cfc1082e1..7d4f47c9d 100644
--- a/lib/expr.c
+++ b/lib/expr.c
@@ -1103,10 +1103,13 @@ expr_const_sets_add_integers(struct shash *const_sets, const char *name,
 
 /* Adds an constant set named 'name' to 'const_sets', replacing any existing
  * constant set entry with the given name. Unlike expr_const_sets_add_integers,
- * the 'values' will not be converted but stored as is. */
+ * the 'values' will not be converted but stored as is.
+ * 'filter', if not NULL, specifies a set of eligible values that are allowed
+ * to be added from 'values'. */
 void
 expr_const_sets_add_strings(struct shash *const_sets, const char *name,
-                            const char *const *values, size_t n_values)
+                            const char *const *values, size_t n_values,
+                            const struct sset *filter)
 {
     /* Replace any existing entry for this name. */
     expr_const_sets_remove(const_sets, name);
@@ -1117,6 +1120,11 @@ expr_const_sets_add_strings(struct shash *const_sets, const char *name,
     cs->values = xmalloc(n_values * sizeof *cs->values);
     cs->type = EXPR_C_STRING;
     for (size_t i = 0; i < n_values; i++) {
+        if (filter && !sset_find(filter, values[i])) {
+            VLOG_DBG("Skip constant set entry '%s' for '%s'",
+                     values[i], name);
+            continue;
+        }
         union expr_constant *c = &cs->values[cs->n_values++];
         c->string = xstrdup(values[i]);
     }
diff --git a/tests/ovn.at b/tests/ovn.at
index e52bb55cd..1ee44f1c2 100644
--- a/tests/ovn.at
+++ b/tests/ovn.at
@@ -26275,3 +26275,58 @@ primary lport : [[sw0p2]]
 OVN_CLEANUP([hv1], [hv2])
 AT_CLEANUP
 ])
+
+# Tests the efficiency of conjunction flow generation when port groups are used
+# by ACLs. Make sure there is no open flow explosion in table 45 for an ACL
+# with self-referencing match condition of a port group:
+#
+# "outport == @pg1 && ip4.src == $pg1_ip4"
+#
+# 10 LSes (ls[0-9]), each has 2 LSPs (lsp[0-9][01]). All LSPs belong to port
+# group pg1, but only LSPs of LS0 are bound on HV1.
+#
+# The expected number of conjunction flows is 2 + 20 = 22.
+# - 20 for expanding the address set $pg1_ip4 to 20 ip addresses.
+# - 2 for expanding the port group @pg1 to the 2 locally bound lports.
+OVN_FOR_EACH_NORTHD([
+AT_SETUP([ovn -- ACL with Port Group conjunction flow efficiency])
+ovn_start
+
+net_add n1
+
+sim_add hv1
+as hv1
+ovs-vsctl add-br br-phys
+ovn_attach n1 br-phys 192.168.0.1
+
+ovn-nbctl lr-add lr0
+
+for i in $(seq 0 9); do
+    ovn-nbctl ls-add ls$i
+    ovn-nbctl lrp-add lr0 lrp_lr0_ls$i aa:bb:bb:00:00:0$i 192.168.${i}.1/24
+
+    ovn-nbctl lsp-add ls$i lsp_ls${i}_lr0 -- \
+        lsp-set-addresses lsp_ls${i}_lr0 router -- \
+        lsp-set-type lsp_ls${i}_lr0 router -- \
+        lsp-set-options lsp_ls${i}_lr0 router-port=lrp_lr0_ls${i}
+
+    for j in 0 1; do
+        ovn-nbctl lsp-add ls$i lsp${i}-${j} -- \
+            lsp-set-addresses lsp${i}-${j} "aa:aa:aa:00:0$i:0$j 192.168.$i.1$j"
+    done
+done
+
+ovn-nbctl pg-add pg1
+ovn-nbctl pg-set-ports pg1 $(for i in 0 1 2 3 4 5 6 7 8 9; do for j in 0 1; do echo lsp${i}-${j}; done; done)
+
+ovn-nbctl --type=port-group acl-add pg1 to-lport 100 "outport == @pg1 && ip4.src == \$pg1_ip4" allow
+
+ovs-vsctl add-port br-int lsp0-0 -- set interface lsp0-0 external_ids:iface-id=lsp0-0
+ovs-vsctl add-port br-int lsp0-1 -- set interface lsp0-1 external_ids:iface-id=lsp0-1
+
+check ovn-nbctl --wait=hv sync
+AT_CHECK([test $(ovs-ofctl dump-flows br-int table=45 | grep conjunction | wc -l) == 22])
+
+OVN_CLEANUP([hv1])
+AT_CLEANUP
+])
diff --git a/tests/test-ovn.c b/tests/test-ovn.c
index 8b13cd472..98cc2c503 100644
--- a/tests/test-ovn.c
+++ b/tests/test-ovn.c
@@ -243,8 +243,8 @@ create_port_groups(struct shash *port_groups)
     };
     static const char *const pg2[] = { NULL };
 
-    expr_const_sets_add_strings(port_groups, "0_pg1", pg1, 3);
-    expr_const_sets_add_strings(port_groups, "0_pg_empty", pg2, 0);
+    expr_const_sets_add_strings(port_groups, "0_pg1", pg1, 3, NULL);
+    expr_const_sets_add_strings(port_groups, "0_pg_empty", pg2, 0, NULL);
 }
 
 static bool
diff --git a/utilities/ovn-trace.c b/utilities/ovn-trace.c
index 5df5f72d6..3b26b5af1 100644
--- a/utilities/ovn-trace.c
+++ b/utilities/ovn-trace.c
@@ -833,7 +833,7 @@ read_port_groups(void)
     SBREC_PORT_GROUP_FOR_EACH (sbpg, ovnsb_idl) {
         expr_const_sets_add_strings(&port_groups, sbpg->name,
                                     (const char *const *) sbpg->ports,
-                                    sbpg->n_ports);
+                                    sbpg->n_ports, NULL);
     }
 }
 
-- 
2.30.2



More information about the dev mailing list