[ovs-dev] [PATCH ovn 1/2] northd: Use address sets for ARP responder flows for VIPs.

Dumitru Ceara dceara at redhat.com
Fri Oct 1 13:46:54 UTC 2021


Otherwise the S_ROUTER_IN_IP_INPUT aggregated flows that reply to ARP
requests targeting load balancer VIPs get completely regenerated every
time a new VIP/LB is added.  This affects SB memory usage as RAFT log
entries are huge.  Use an address set instead.  Updating an address set
is incremental, because it's performed with a "mutate" operation.

On a large scale ovn-kubernetes deployment with a high number of
load balancers (services) this change reduces memory usage of
ovsdb-servers running the OVN_Southbound cluster by 50%, from ~2GB
RSS to ~1GB RSS.

Signed-off-by: Dumitru Ceara <dceara at redhat.com>
---
 northd/northd.c     |   61 +++++++++++++++++++++++++++++++++------------------
 tests/ovn-northd.at |   44 +++++++++++++++++--------------------
 2 files changed, 59 insertions(+), 46 deletions(-)

diff --git a/northd/northd.c b/northd/northd.c
index cf2467fe1..5699745d6 100644
--- a/northd/northd.c
+++ b/northd/northd.c
@@ -11937,39 +11937,28 @@ build_lrouter_ipv4_ip_input(struct ovn_port *op,
                               op->cr_port->json_key);
             }
 
-            struct ds load_balancer_ips_v4 = DS_EMPTY_INITIALIZER;
-
-            /* For IPv4 we can just create one rule with all required IPs. */
-            ds_put_cstr(&load_balancer_ips_v4, "{ ");
-            ds_put_and_free_cstr(&load_balancer_ips_v4,
-                                 sset_join(&op->od->lb_ips_v4, ", ", " }"));
-
-            build_lrouter_arp_flow(op->od, op, ds_cstr(&load_balancer_ips_v4),
+            /* Create a single ARP rule for all IPs that are used as VIPs. */
+            char *lb_ips_v4_as = xasprintf("$%s_lb_ip4", op->od->nbr->name);
+            build_lrouter_arp_flow(op->od, op, lb_ips_v4_as,
                                    REG_INPORT_ETH_ADDR,
                                    match, false, 90, NULL, lflows);
-            ds_destroy(&load_balancer_ips_v4);
+            free(lb_ips_v4_as);
         }
 
         if (sset_count(&op->od->lb_ips_v6)) {
             ds_clear(match);
-            ds_clear(actions);
-
-            struct ds load_balancer_ips_v6 = DS_EMPTY_INITIALIZER;
-
-            ds_put_cstr(&load_balancer_ips_v6, "{ ");
-            ds_put_and_free_cstr(&load_balancer_ips_v6,
-                                 sset_join(&op->od->lb_ips_v6, ", ", " }"));
 
             if (is_l3dgw_port(op)) {
                 ds_put_format(match, "is_chassis_resident(%s)",
                               op->cr_port->json_key);
             }
-            build_lrouter_nd_flow(op->od, op, "nd_na",
-                                  ds_cstr(&load_balancer_ips_v6), NULL,
+
+            /* Create a single ND rule for all IPs that are used as VIPs. */
+            char *lb_ips_v6_as = xasprintf("$%s_lb_ip6", op->od->nbr->name);
+            build_lrouter_nd_flow(op->od, op, "nd_na", lb_ips_v6_as, NULL,
                                   REG_INPORT_ETH_ADDR, match, false, 90,
                                   NULL, lflows, meter_groups);
-
-            ds_destroy(&load_balancer_ips_v6);
+            free(lb_ips_v6_as);
         }
 
         if (!op->od->is_gw_router && !op->od->n_l3dgw_ports) {
@@ -13541,7 +13530,7 @@ sync_address_set(struct northd_context *ctx, const char *name,
  * in OVN_Northbound, so that the address sets used in Logical_Flows in
  * OVN_Southbound is checked against the proper set.*/
 static void
-sync_address_sets(struct northd_context *ctx)
+sync_address_sets(struct northd_context *ctx, struct hmap *datapaths)
 {
     struct shash sb_address_sets = SHASH_INITIALIZER(&sb_address_sets);
 
@@ -13588,6 +13577,34 @@ sync_address_sets(struct northd_context *ctx)
         svec_destroy(&ipv6_addrs);
     }
 
+    /* Sync router load balancer VIP generated address sets. */
+    struct ovn_datapath *od;
+    HMAP_FOR_EACH (od, key_node, datapaths) {
+        if (!od->nbr) {
+            continue;
+        }
+
+        if (sset_count(&od->lb_ips_v4)) {
+            char *ipv4_addrs_name = xasprintf("%s_lb_ip4", od->nbr->name);
+            const char **ipv4_addrs = sset_array(&od->lb_ips_v4);
+
+            sync_address_set(ctx, ipv4_addrs_name, ipv4_addrs,
+                             sset_count(&od->lb_ips_v4), &sb_address_sets);
+            free(ipv4_addrs_name);
+            free(ipv4_addrs);
+        }
+
+        if (sset_count(&od->lb_ips_v6)) {
+            char *ipv6_addrs_name = xasprintf("%s_lb_ip6", od->nbr->name);
+            const char **ipv6_addrs = sset_array(&od->lb_ips_v6);
+
+            sync_address_set(ctx, ipv6_addrs_name, ipv6_addrs,
+                             sset_count(&od->lb_ips_v6), &sb_address_sets);
+            free(ipv6_addrs_name);
+            free(ipv6_addrs);
+        }
+    }
+
     /* sync user defined address sets, which may overwrite port group
      * generated address sets if same name is used */
     const struct nbrec_address_set *nb_address_set;
@@ -14328,7 +14345,7 @@ ovnnb_db_run(struct northd_context *ctx,
     stopwatch_start(CLEAR_LFLOWS_CTX_STOPWATCH_NAME, time_msec());
     ovn_update_ipv6_prefix(ports);
 
-    sync_address_sets(ctx);
+    sync_address_sets(ctx, datapaths);
     sync_port_groups(ctx, &port_groups);
     sync_meters(ctx, &meter_groups);
     sync_dns_entries(ctx, datapaths);
diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
index 5de554455..aece7aba4 100644
--- a/tests/ovn-northd.at
+++ b/tests/ovn-northd.at
@@ -1658,8 +1658,8 @@ ovn-nbctl lb-add lb1 "192.168.2.1:8080" "10.0.0.4:8080"
 ovn-nbctl lb-add lb2 "192.168.2.4:8080" "10.0.0.5:8080" udp
 ovn-nbctl lb-add lb3 "192.168.2.5:8080" "10.0.0.6:8080"
 ovn-nbctl lb-add lb4 "192.168.2.6:8080" "10.0.0.7:8080"
-ovn-nbctl lb-add lb5 "fe80::200:ff:fe00:101:8080" "fe02::200:ff:fe00:101:8080"
-ovn-nbctl lb-add lb5 "fe80::200:ff:fe00:102:8080" "fe02::200:ff:fe00:102:8080"
+ovn-nbctl lb-add lb5 "[[fe80::200:ff:fe00:101]]:8080" "[[fe02::200:ff:fe00:101]]:8080"
+ovn-nbctl lb-add lb5 "[[fe80::200:ff:fe00:102]]:8080" "[[fe02::200:ff:fe00:102]]:8080"
 
 ovn-nbctl lr-lb-add lr lb1
 ovn-nbctl lr-lb-add lr lb2
@@ -1669,6 +1669,10 @@ ovn-nbctl lr-lb-add lr lb5
 
 ovn-nbctl --wait=sb sync
 
+# Check generated VIP address sets.
+check_column '192.168.2.1 192.168.2.4 192.168.2.5 192.168.2.6' Address_Set addresses name=lr_lb_ip4
+check_column 'fe80::200:ff:fe00:101 fe80::200:ff:fe00:102' Address_Set addresses name=lr_lb_ip6
+
 # Ingress router port ETH address is stored in lr_in_admission.
 AT_CHECK([ovn-sbctl lflow-list | grep -E "lr_in_admission.*xreg0\[[0..47\]]" | sort], [0], [dnl
   table=0 (lr_in_admission    ), priority=50   , dnl
@@ -1685,16 +1689,8 @@ match=(eth.mcast && inport == "lrp-public"), dnl
 action=(xreg0[[0..47]] = 00:00:00:00:01:00; next;)
 ])
 
-# The order of the VIP addresses in the flow table entries doesn't
-# matter, so just replace each of them with a generic $vip for
-# testing.  It would be better if we could ensure each one appeared
-# exactly once, but that's hard with sed.
-sed_vips() {
-    sed 's/192\.168\.2\.[[1456]]/$vip/g'
-}
-
 # Ingress router port ETH address is used for ARP reply/NA in lr_in_ip_input.
-AT_CHECK([ovn-sbctl lflow-list | grep -E "lr_in_ip_input.*priority=90" | grep "arp\|nd" | sed_vips | sort], [0], [dnl
+AT_CHECK([ovn-sbctl lflow-list | grep -E "lr_in_ip_input.*priority=90" | grep "arp\|nd" | sort], [0], [dnl
   table=3 (lr_in_ip_input     ), priority=90   , dnl
 match=(arp.op == 1 && arp.tpa == 43.43.43.150), dnl
 action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa <-> arp.spa; outport = inport; flags.loopback = 1; output;)
@@ -1708,28 +1704,28 @@ action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP reply */
 match=(arp.op == 1 && arp.tpa == 43.43.43.4), dnl
 action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa <-> arp.spa; outport = inport; flags.loopback = 1; output;)
   table=3 (lr_in_ip_input     ), priority=90   , dnl
-match=(inport == "lrp" && arp.op == 1 && arp.tpa == 42.42.42.1 && arp.spa == 42.42.42.0/24), dnl
+match=(inport == "lrp" && arp.op == 1 && arp.tpa == $lr_lb_ip4), dnl
 action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa <-> arp.spa; outport = inport; flags.loopback = 1; output;)
   table=3 (lr_in_ip_input     ), priority=90   , dnl
-match=(inport == "lrp" && arp.op == 1 && arp.tpa == { $vip, $vip, $vip, $vip }), dnl
+match=(inport == "lrp" && arp.op == 1 && arp.tpa == 42.42.42.1 && arp.spa == 42.42.42.0/24), dnl
 action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa <-> arp.spa; outport = inport; flags.loopback = 1; output;)
   table=3 (lr_in_ip_input     ), priority=90   , dnl
 match=(inport == "lrp" && ip6.dst == {fe80::200:ff:fe00:1, ff02::1:ff00:1} && nd_ns && nd.target == fe80::200:ff:fe00:1), dnl
 action=(nd_na_router { eth.src = xreg0[[0..47]]; ip6.src = nd.target; nd.tll = xreg0[[0..47]]; outport = inport; flags.loopback = 1; output; };)
   table=3 (lr_in_ip_input     ), priority=90   , dnl
-match=(inport == "lrp" && nd_ns && nd.target == { fe80::200:ff:fe00:101:8080, fe80::200:ff:fe00:102:8080 }), dnl
+match=(inport == "lrp" && nd_ns && nd.target == $lr_lb_ip6), dnl
 action=(nd_na { eth.src = xreg0[[0..47]]; ip6.src = nd.target; nd.tll = xreg0[[0..47]]; outport = inport; flags.loopback = 1; output; };)
   table=3 (lr_in_ip_input     ), priority=90   , dnl
-match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.1 && arp.spa == 43.43.43.0/24), dnl
+match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == $lr_lb_ip4), dnl
 action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa <-> arp.spa; outport = inport; flags.loopback = 1; output;)
   table=3 (lr_in_ip_input     ), priority=90   , dnl
-match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == { $vip, $vip, $vip, $vip }), dnl
+match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.1 && arp.spa == 43.43.43.0/24), dnl
 action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa <-> arp.spa; outport = inport; flags.loopback = 1; output;)
   table=3 (lr_in_ip_input     ), priority=90   , dnl
 match=(inport == "lrp-public" && ip6.dst == {fe80::200:ff:fe00:100, ff02::1:ff00:100} && nd_ns && nd.target == fe80::200:ff:fe00:100), dnl
 action=(nd_na_router { eth.src = xreg0[[0..47]]; ip6.src = nd.target; nd.tll = xreg0[[0..47]]; outport = inport; flags.loopback = 1; output; };)
   table=3 (lr_in_ip_input     ), priority=90   , dnl
-match=(inport == "lrp-public" && nd_ns && nd.target == { fe80::200:ff:fe00:101:8080, fe80::200:ff:fe00:102:8080 }), dnl
+match=(inport == "lrp-public" && nd_ns && nd.target == $lr_lb_ip6), dnl
 action=(nd_na { eth.src = xreg0[[0..47]]; ip6.src = nd.target; nd.tll = xreg0[[0..47]]; outport = inport; flags.loopback = 1; output; };)
 ])
 
@@ -1763,7 +1759,7 @@ action=(xreg0[[0..47]] = 00:00:00:00:01:00; next;)
 # Ingress router port is used for ARP reply/NA in lr_in_ip_input.
 # xxreg0[0..47] is used unless external_mac is set.
 # Priority 90 flows (per router).
-AT_CHECK([ovn-sbctl lflow-list | grep -E "lr_in_ip_input.*priority=90" | grep "arp\|nd" | sed_vips | sort], [0], [dnl
+AT_CHECK([ovn-sbctl lflow-list | grep -E "lr_in_ip_input.*priority=90" | grep "arp\|nd" | sort], [0], [dnl
   table=3 (lr_in_ip_input     ), priority=90   , dnl
 match=(arp.op == 1 && arp.tpa == 43.43.43.150), dnl
 action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa <-> arp.spa; outport = inport; flags.loopback = 1; output;)
@@ -1777,28 +1773,28 @@ action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP reply */
 match=(arp.op == 1 && arp.tpa == 43.43.43.4), dnl
 action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa <-> arp.spa; outport = inport; flags.loopback = 1; output;)
   table=3 (lr_in_ip_input     ), priority=90   , dnl
-match=(inport == "lrp" && arp.op == 1 && arp.tpa == 42.42.42.1 && arp.spa == 42.42.42.0/24), dnl
+match=(inport == "lrp" && arp.op == 1 && arp.tpa == $lr_lb_ip4), dnl
 action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa <-> arp.spa; outport = inport; flags.loopback = 1; output;)
   table=3 (lr_in_ip_input     ), priority=90   , dnl
-match=(inport == "lrp" && arp.op == 1 && arp.tpa == { $vip, $vip, $vip, $vip }), dnl
+match=(inport == "lrp" && arp.op == 1 && arp.tpa == 42.42.42.1 && arp.spa == 42.42.42.0/24), dnl
 action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa <-> arp.spa; outport = inport; flags.loopback = 1; output;)
   table=3 (lr_in_ip_input     ), priority=90   , dnl
 match=(inport == "lrp" && ip6.dst == {fe80::200:ff:fe00:1, ff02::1:ff00:1} && nd_ns && nd.target == fe80::200:ff:fe00:1), dnl
 action=(nd_na_router { eth.src = xreg0[[0..47]]; ip6.src = nd.target; nd.tll = xreg0[[0..47]]; outport = inport; flags.loopback = 1; output; };)
   table=3 (lr_in_ip_input     ), priority=90   , dnl
-match=(inport == "lrp" && nd_ns && nd.target == { fe80::200:ff:fe00:101:8080, fe80::200:ff:fe00:102:8080 }), dnl
+match=(inport == "lrp" && nd_ns && nd.target == $lr_lb_ip6), dnl
 action=(nd_na { eth.src = xreg0[[0..47]]; ip6.src = nd.target; nd.tll = xreg0[[0..47]]; outport = inport; flags.loopback = 1; output; };)
   table=3 (lr_in_ip_input     ), priority=90   , dnl
-match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.1 && arp.spa == 43.43.43.0/24), dnl
+match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == $lr_lb_ip4 && is_chassis_resident("cr-lrp-public")), dnl
 action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa <-> arp.spa; outport = inport; flags.loopback = 1; output;)
   table=3 (lr_in_ip_input     ), priority=90   , dnl
-match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == { $vip, $vip, $vip, $vip } && is_chassis_resident("cr-lrp-public")), dnl
+match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.1 && arp.spa == 43.43.43.0/24), dnl
 action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa <-> arp.spa; outport = inport; flags.loopback = 1; output;)
   table=3 (lr_in_ip_input     ), priority=90   , dnl
 match=(inport == "lrp-public" && ip6.dst == {fe80::200:ff:fe00:100, ff02::1:ff00:100} && nd_ns && nd.target == fe80::200:ff:fe00:100 && is_chassis_resident("cr-lrp-public")), dnl
 action=(nd_na_router { eth.src = xreg0[[0..47]]; ip6.src = nd.target; nd.tll = xreg0[[0..47]]; outport = inport; flags.loopback = 1; output; };)
   table=3 (lr_in_ip_input     ), priority=90   , dnl
-match=(inport == "lrp-public" && nd_ns && nd.target == { fe80::200:ff:fe00:101:8080, fe80::200:ff:fe00:102:8080 } && is_chassis_resident("cr-lrp-public")), dnl
+match=(inport == "lrp-public" && nd_ns && nd.target == $lr_lb_ip6 && is_chassis_resident("cr-lrp-public")), dnl
 action=(nd_na { eth.src = xreg0[[0..47]]; ip6.src = nd.target; nd.tll = xreg0[[0..47]]; outport = inport; flags.loopback = 1; output; };)
 ])
 



More information about the dev mailing list