[ovs-dev] [PATCH ovn 1/2] northd: Use address sets for ARP responder flows for VIPs.

Numan Siddique numans at ovn.org
Wed Oct 6 01:01:00 UTC 2021


On Fri, Oct 1, 2021 at 9:47 AM Dumitru Ceara <dceara at redhat.com> wrote:
>
> Otherwise the S_ROUTER_IN_IP_INPUT aggregated flows that reply to ARP
> requests targeting load balancer VIPs get completely regenerated every
> time a new VIP/LB is added.  This affects SB memory usage as RAFT log
> entries are huge.  Use an address set instead.  Updating an address set
> is incremental, because it's performed with a "mutate" operation.
>
> On a large scale ovn-kubernetes deployment with a high number of
> load balancers (services) this change reduces memory usage of
> ovsdb-servers running the OVN_Southbound cluster by 50%, from ~2GB
> RSS to ~1GB RSS.
>
> Signed-off-by: Dumitru Ceara <dceara at redhat.com>

Thanks Dumitru for the patches.

Both the patches LGTM.   I applied both to the main branch.

Numan

> ---
>  northd/northd.c     |   61 +++++++++++++++++++++++++++++++++------------------
>  tests/ovn-northd.at |   44 +++++++++++++++++--------------------
>  2 files changed, 59 insertions(+), 46 deletions(-)
>
> diff --git a/northd/northd.c b/northd/northd.c
> index cf2467fe1..5699745d6 100644
> --- a/northd/northd.c
> +++ b/northd/northd.c
> @@ -11937,39 +11937,28 @@ build_lrouter_ipv4_ip_input(struct ovn_port *op,
>                                op->cr_port->json_key);
>              }
>
> -            struct ds load_balancer_ips_v4 = DS_EMPTY_INITIALIZER;
> -
> -            /* For IPv4 we can just create one rule with all required IPs. */
> -            ds_put_cstr(&load_balancer_ips_v4, "{ ");
> -            ds_put_and_free_cstr(&load_balancer_ips_v4,
> -                                 sset_join(&op->od->lb_ips_v4, ", ", " }"));
> -
> -            build_lrouter_arp_flow(op->od, op, ds_cstr(&load_balancer_ips_v4),
> +            /* Create a single ARP rule for all IPs that are used as VIPs. */
> +            char *lb_ips_v4_as = xasprintf("$%s_lb_ip4", op->od->nbr->name);
> +            build_lrouter_arp_flow(op->od, op, lb_ips_v4_as,
>                                     REG_INPORT_ETH_ADDR,
>                                     match, false, 90, NULL, lflows);
> -            ds_destroy(&load_balancer_ips_v4);
> +            free(lb_ips_v4_as);
>          }
>
>          if (sset_count(&op->od->lb_ips_v6)) {
>              ds_clear(match);
> -            ds_clear(actions);
> -
> -            struct ds load_balancer_ips_v6 = DS_EMPTY_INITIALIZER;
> -
> -            ds_put_cstr(&load_balancer_ips_v6, "{ ");
> -            ds_put_and_free_cstr(&load_balancer_ips_v6,
> -                                 sset_join(&op->od->lb_ips_v6, ", ", " }"));
>
>              if (is_l3dgw_port(op)) {
>                  ds_put_format(match, "is_chassis_resident(%s)",
>                                op->cr_port->json_key);
>              }
> -            build_lrouter_nd_flow(op->od, op, "nd_na",
> -                                  ds_cstr(&load_balancer_ips_v6), NULL,
> +
> +            /* Create a single ND rule for all IPs that are used as VIPs. */
> +            char *lb_ips_v6_as = xasprintf("$%s_lb_ip6", op->od->nbr->name);
> +            build_lrouter_nd_flow(op->od, op, "nd_na", lb_ips_v6_as, NULL,
>                                    REG_INPORT_ETH_ADDR, match, false, 90,
>                                    NULL, lflows, meter_groups);
> -
> -            ds_destroy(&load_balancer_ips_v6);
> +            free(lb_ips_v6_as);
>          }
>
>          if (!op->od->is_gw_router && !op->od->n_l3dgw_ports) {
> @@ -13541,7 +13530,7 @@ sync_address_set(struct northd_context *ctx, const char *name,
>   * in OVN_Northbound, so that the address sets used in Logical_Flows in
>   * OVN_Southbound is checked against the proper set.*/
>  static void
> -sync_address_sets(struct northd_context *ctx)
> +sync_address_sets(struct northd_context *ctx, struct hmap *datapaths)
>  {
>      struct shash sb_address_sets = SHASH_INITIALIZER(&sb_address_sets);
>
> @@ -13588,6 +13577,34 @@ sync_address_sets(struct northd_context *ctx)
>          svec_destroy(&ipv6_addrs);
>      }
>
> +    /* Sync router load balancer VIP generated address sets. */
> +    struct ovn_datapath *od;
> +    HMAP_FOR_EACH (od, key_node, datapaths) {
> +        if (!od->nbr) {
> +            continue;
> +        }
> +
> +        if (sset_count(&od->lb_ips_v4)) {
> +            char *ipv4_addrs_name = xasprintf("%s_lb_ip4", od->nbr->name);
> +            const char **ipv4_addrs = sset_array(&od->lb_ips_v4);
> +
> +            sync_address_set(ctx, ipv4_addrs_name, ipv4_addrs,
> +                             sset_count(&od->lb_ips_v4), &sb_address_sets);
> +            free(ipv4_addrs_name);
> +            free(ipv4_addrs);
> +        }
> +
> +        if (sset_count(&od->lb_ips_v6)) {
> +            char *ipv6_addrs_name = xasprintf("%s_lb_ip6", od->nbr->name);
> +            const char **ipv6_addrs = sset_array(&od->lb_ips_v6);
> +
> +            sync_address_set(ctx, ipv6_addrs_name, ipv6_addrs,
> +                             sset_count(&od->lb_ips_v6), &sb_address_sets);
> +            free(ipv6_addrs_name);
> +            free(ipv6_addrs);
> +        }
> +    }
> +
>      /* sync user defined address sets, which may overwrite port group
>       * generated address sets if same name is used */
>      const struct nbrec_address_set *nb_address_set;
> @@ -14328,7 +14345,7 @@ ovnnb_db_run(struct northd_context *ctx,
>      stopwatch_start(CLEAR_LFLOWS_CTX_STOPWATCH_NAME, time_msec());
>      ovn_update_ipv6_prefix(ports);
>
> -    sync_address_sets(ctx);
> +    sync_address_sets(ctx, datapaths);
>      sync_port_groups(ctx, &port_groups);
>      sync_meters(ctx, &meter_groups);
>      sync_dns_entries(ctx, datapaths);
> diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
> index 5de554455..aece7aba4 100644
> --- a/tests/ovn-northd.at
> +++ b/tests/ovn-northd.at
> @@ -1658,8 +1658,8 @@ ovn-nbctl lb-add lb1 "192.168.2.1:8080" "10.0.0.4:8080"
>  ovn-nbctl lb-add lb2 "192.168.2.4:8080" "10.0.0.5:8080" udp
>  ovn-nbctl lb-add lb3 "192.168.2.5:8080" "10.0.0.6:8080"
>  ovn-nbctl lb-add lb4 "192.168.2.6:8080" "10.0.0.7:8080"
> -ovn-nbctl lb-add lb5 "fe80::200:ff:fe00:101:8080" "fe02::200:ff:fe00:101:8080"
> -ovn-nbctl lb-add lb5 "fe80::200:ff:fe00:102:8080" "fe02::200:ff:fe00:102:8080"
> +ovn-nbctl lb-add lb5 "[[fe80::200:ff:fe00:101]]:8080" "[[fe02::200:ff:fe00:101]]:8080"
> +ovn-nbctl lb-add lb5 "[[fe80::200:ff:fe00:102]]:8080" "[[fe02::200:ff:fe00:102]]:8080"
>
>  ovn-nbctl lr-lb-add lr lb1
>  ovn-nbctl lr-lb-add lr lb2
> @@ -1669,6 +1669,10 @@ ovn-nbctl lr-lb-add lr lb5
>
>  ovn-nbctl --wait=sb sync
>
> +# Check generated VIP address sets.
> +check_column '192.168.2.1 192.168.2.4 192.168.2.5 192.168.2.6' Address_Set addresses name=lr_lb_ip4
> +check_column 'fe80::200:ff:fe00:101 fe80::200:ff:fe00:102' Address_Set addresses name=lr_lb_ip6
> +
>  # Ingress router port ETH address is stored in lr_in_admission.
>  AT_CHECK([ovn-sbctl lflow-list | grep -E "lr_in_admission.*xreg0\[[0..47\]]" | sort], [0], [dnl
>    table=0 (lr_in_admission    ), priority=50   , dnl
> @@ -1685,16 +1689,8 @@ match=(eth.mcast && inport == "lrp-public"), dnl
>  action=(xreg0[[0..47]] = 00:00:00:00:01:00; next;)
>  ])
>
> -# The order of the VIP addresses in the flow table entries doesn't
> -# matter, so just replace each of them with a generic $vip for
> -# testing.  It would be better if we could ensure each one appeared
> -# exactly once, but that's hard with sed.
> -sed_vips() {
> -    sed 's/192\.168\.2\.[[1456]]/$vip/g'
> -}
> -
>  # Ingress router port ETH address is used for ARP reply/NA in lr_in_ip_input.
> -AT_CHECK([ovn-sbctl lflow-list | grep -E "lr_in_ip_input.*priority=90" | grep "arp\|nd" | sed_vips | sort], [0], [dnl
> +AT_CHECK([ovn-sbctl lflow-list | grep -E "lr_in_ip_input.*priority=90" | grep "arp\|nd" | sort], [0], [dnl
>    table=3 (lr_in_ip_input     ), priority=90   , dnl
>  match=(arp.op == 1 && arp.tpa == 43.43.43.150), dnl
>  action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa <-> arp.spa; outport = inport; flags.loopback = 1; output;)
> @@ -1708,28 +1704,28 @@ action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP reply */
>  match=(arp.op == 1 && arp.tpa == 43.43.43.4), dnl
>  action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa <-> arp.spa; outport = inport; flags.loopback = 1; output;)
>    table=3 (lr_in_ip_input     ), priority=90   , dnl
> -match=(inport == "lrp" && arp.op == 1 && arp.tpa == 42.42.42.1 && arp.spa == 42.42.42.0/24), dnl
> +match=(inport == "lrp" && arp.op == 1 && arp.tpa == $lr_lb_ip4), dnl
>  action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa <-> arp.spa; outport = inport; flags.loopback = 1; output;)
>    table=3 (lr_in_ip_input     ), priority=90   , dnl
> -match=(inport == "lrp" && arp.op == 1 && arp.tpa == { $vip, $vip, $vip, $vip }), dnl
> +match=(inport == "lrp" && arp.op == 1 && arp.tpa == 42.42.42.1 && arp.spa == 42.42.42.0/24), dnl
>  action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa <-> arp.spa; outport = inport; flags.loopback = 1; output;)
>    table=3 (lr_in_ip_input     ), priority=90   , dnl
>  match=(inport == "lrp" && ip6.dst == {fe80::200:ff:fe00:1, ff02::1:ff00:1} && nd_ns && nd.target == fe80::200:ff:fe00:1), dnl
>  action=(nd_na_router { eth.src = xreg0[[0..47]]; ip6.src = nd.target; nd.tll = xreg0[[0..47]]; outport = inport; flags.loopback = 1; output; };)
>    table=3 (lr_in_ip_input     ), priority=90   , dnl
> -match=(inport == "lrp" && nd_ns && nd.target == { fe80::200:ff:fe00:101:8080, fe80::200:ff:fe00:102:8080 }), dnl
> +match=(inport == "lrp" && nd_ns && nd.target == $lr_lb_ip6), dnl
>  action=(nd_na { eth.src = xreg0[[0..47]]; ip6.src = nd.target; nd.tll = xreg0[[0..47]]; outport = inport; flags.loopback = 1; output; };)
>    table=3 (lr_in_ip_input     ), priority=90   , dnl
> -match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.1 && arp.spa == 43.43.43.0/24), dnl
> +match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == $lr_lb_ip4), dnl
>  action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa <-> arp.spa; outport = inport; flags.loopback = 1; output;)
>    table=3 (lr_in_ip_input     ), priority=90   , dnl
> -match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == { $vip, $vip, $vip, $vip }), dnl
> +match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.1 && arp.spa == 43.43.43.0/24), dnl
>  action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa <-> arp.spa; outport = inport; flags.loopback = 1; output;)
>    table=3 (lr_in_ip_input     ), priority=90   , dnl
>  match=(inport == "lrp-public" && ip6.dst == {fe80::200:ff:fe00:100, ff02::1:ff00:100} && nd_ns && nd.target == fe80::200:ff:fe00:100), dnl
>  action=(nd_na_router { eth.src = xreg0[[0..47]]; ip6.src = nd.target; nd.tll = xreg0[[0..47]]; outport = inport; flags.loopback = 1; output; };)
>    table=3 (lr_in_ip_input     ), priority=90   , dnl
> -match=(inport == "lrp-public" && nd_ns && nd.target == { fe80::200:ff:fe00:101:8080, fe80::200:ff:fe00:102:8080 }), dnl
> +match=(inport == "lrp-public" && nd_ns && nd.target == $lr_lb_ip6), dnl
>  action=(nd_na { eth.src = xreg0[[0..47]]; ip6.src = nd.target; nd.tll = xreg0[[0..47]]; outport = inport; flags.loopback = 1; output; };)
>  ])
>
> @@ -1763,7 +1759,7 @@ action=(xreg0[[0..47]] = 00:00:00:00:01:00; next;)
>  # Ingress router port is used for ARP reply/NA in lr_in_ip_input.
>  # xxreg0[0..47] is used unless external_mac is set.
>  # Priority 90 flows (per router).
> -AT_CHECK([ovn-sbctl lflow-list | grep -E "lr_in_ip_input.*priority=90" | grep "arp\|nd" | sed_vips | sort], [0], [dnl
> +AT_CHECK([ovn-sbctl lflow-list | grep -E "lr_in_ip_input.*priority=90" | grep "arp\|nd" | sort], [0], [dnl
>    table=3 (lr_in_ip_input     ), priority=90   , dnl
>  match=(arp.op == 1 && arp.tpa == 43.43.43.150), dnl
>  action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa <-> arp.spa; outport = inport; flags.loopback = 1; output;)
> @@ -1777,28 +1773,28 @@ action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP reply */
>  match=(arp.op == 1 && arp.tpa == 43.43.43.4), dnl
>  action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa <-> arp.spa; outport = inport; flags.loopback = 1; output;)
>    table=3 (lr_in_ip_input     ), priority=90   , dnl
> -match=(inport == "lrp" && arp.op == 1 && arp.tpa == 42.42.42.1 && arp.spa == 42.42.42.0/24), dnl
> +match=(inport == "lrp" && arp.op == 1 && arp.tpa == $lr_lb_ip4), dnl
>  action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa <-> arp.spa; outport = inport; flags.loopback = 1; output;)
>    table=3 (lr_in_ip_input     ), priority=90   , dnl
> -match=(inport == "lrp" && arp.op == 1 && arp.tpa == { $vip, $vip, $vip, $vip }), dnl
> +match=(inport == "lrp" && arp.op == 1 && arp.tpa == 42.42.42.1 && arp.spa == 42.42.42.0/24), dnl
>  action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa <-> arp.spa; outport = inport; flags.loopback = 1; output;)
>    table=3 (lr_in_ip_input     ), priority=90   , dnl
>  match=(inport == "lrp" && ip6.dst == {fe80::200:ff:fe00:1, ff02::1:ff00:1} && nd_ns && nd.target == fe80::200:ff:fe00:1), dnl
>  action=(nd_na_router { eth.src = xreg0[[0..47]]; ip6.src = nd.target; nd.tll = xreg0[[0..47]]; outport = inport; flags.loopback = 1; output; };)
>    table=3 (lr_in_ip_input     ), priority=90   , dnl
> -match=(inport == "lrp" && nd_ns && nd.target == { fe80::200:ff:fe00:101:8080, fe80::200:ff:fe00:102:8080 }), dnl
> +match=(inport == "lrp" && nd_ns && nd.target == $lr_lb_ip6), dnl
>  action=(nd_na { eth.src = xreg0[[0..47]]; ip6.src = nd.target; nd.tll = xreg0[[0..47]]; outport = inport; flags.loopback = 1; output; };)
>    table=3 (lr_in_ip_input     ), priority=90   , dnl
> -match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.1 && arp.spa == 43.43.43.0/24), dnl
> +match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == $lr_lb_ip4 && is_chassis_resident("cr-lrp-public")), dnl
>  action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa <-> arp.spa; outport = inport; flags.loopback = 1; output;)
>    table=3 (lr_in_ip_input     ), priority=90   , dnl
> -match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == { $vip, $vip, $vip, $vip } && is_chassis_resident("cr-lrp-public")), dnl
> +match=(inport == "lrp-public" && arp.op == 1 && arp.tpa == 43.43.43.1 && arp.spa == 43.43.43.0/24), dnl
>  action=(eth.dst = eth.src; eth.src = xreg0[[0..47]]; arp.op = 2; /* ARP reply */ arp.tha = arp.sha; arp.sha = xreg0[[0..47]]; arp.tpa <-> arp.spa; outport = inport; flags.loopback = 1; output;)
>    table=3 (lr_in_ip_input     ), priority=90   , dnl
>  match=(inport == "lrp-public" && ip6.dst == {fe80::200:ff:fe00:100, ff02::1:ff00:100} && nd_ns && nd.target == fe80::200:ff:fe00:100 && is_chassis_resident("cr-lrp-public")), dnl
>  action=(nd_na_router { eth.src = xreg0[[0..47]]; ip6.src = nd.target; nd.tll = xreg0[[0..47]]; outport = inport; flags.loopback = 1; output; };)
>    table=3 (lr_in_ip_input     ), priority=90   , dnl
> -match=(inport == "lrp-public" && nd_ns && nd.target == { fe80::200:ff:fe00:101:8080, fe80::200:ff:fe00:102:8080 } && is_chassis_resident("cr-lrp-public")), dnl
> +match=(inport == "lrp-public" && nd_ns && nd.target == $lr_lb_ip6 && is_chassis_resident("cr-lrp-public")), dnl
>  action=(nd_na { eth.src = xreg0[[0..47]]; ip6.src = nd.target; nd.tll = xreg0[[0..47]]; outport = inport; flags.loopback = 1; output; };)
>  ])
>
>
> _______________________________________________
> dev mailing list
> dev at openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>


More information about the dev mailing list