[ovs-dev] [PATCH ovn] northd: Don't merge ARP flood flows for all (un)reachable IPs.
Mark Michelson
mmichels at redhat.com
Fri Jul 30 14:15:31 UTC 2021
I finally got a chance to look through the code.
Acked-by: Mark Michelson <mmichels at redhat.com>
On 7/29/21 5:59 AM, Dumitru Ceara wrote:
> Logical_Flows are immutable, that is, when the match or action change
> ovn-northd will actually remove the old logical flow and add a new one.
>
> Commit ecc941c70f16 added flows to flood ARPs to routers for unreachable
> addresses.
>
> In the following topology (2 switches connected to a router with a load
> balancer attached to it):
>
> S1 --- (IP1/MASK1) R (IP2/MASK2) --- S2
>
> LB1 (applied on R) with N VIPs.
>
> The following logical flows were generated:
>
> S1 (in_l2_lkup): if "arp.op == 1 && arp.tpa == { VIP1, .., VIPn, IP1 }" then ...
> S2 (in_l2_lkup): if "arp.op == 1 && arp.tpa == { VIP1, .., VIPn, IP2 }" then ...
>
> While this seemed a good idea initially because it would reduce the
> total number of logical flows, this approach has a few major downsides
> that severely affect scalability:
>
> 1. When logical datapath grouping is enabled (which is how high scale
> deployments should use OVN) there is no chance that the flows for
> reachable/unreachable router owned IPs are grouped together for all
> datapaths that correspond to the logical switches they are generated
> for.
> 2. Whenever a VIP (VIPn+1) is added to LB1 all these flows will be
> removed and new ones will be added with the only difference being the
> VIPn+1 aded to the flow match. As this is a new logical flow record,
> ovn-controller will have to remove all previously installed openflows
> (for the deleted logical flow) and add new ones. However, for most
> of these openflows, the only change is the cookie value (the first 32
> bits of the logical flow UUID). At scale this unnecessary openflow
> churn will induce significant latency. Moreover, this also creates
> unnecessary jsonrpc traffic from ovn-northd towards OVN_Southbound
> because the old flow needs to be removed and a new one (with a large
> match field) needs to be installed.
>
> To avoid this, we now install individual flows, one per IP, for
> managing ARP/ND traffic from logical switches towards router owned
> IPs that are reachable/unreachable on the logical port connecting
> the switch.
>
> Fixes: ecc941c70f16 ("northd: Flood ARPs to routers for "unreachable" addresses.")
> Signed-off-by: Dumitru Ceara <dceara at redhat.com>
> ---
> On a deployment based on an ovn-kubernetes cluster with 20 nodes, 2500
> load balancer VIPs, 5000 pods, we:
> - measure number of logical flows in the SB DB, size of the SB DB
> - then create an additional 250 pods (logical ports), bind them, and add
> a load balancer VIP (with a single backend) for each of the additional
> pods, and measure how long it takes for all openflows corresponding to
> each of the pods to be installed in OVS.
>
> Results:
> - without fix:
> ------------
> SB lflows: 67976
> SB size (uncompacted): 46 MiB
> SB size (compacted): 40 MiB
>
> after 250 ports + VIPs added:
> SB lflows: 71479
> SB size (uncompacted): 248 MiB
> SB size (compacted): 42 MiB
> Max time to install all relevant openflows for a single pod: ~6sec
>
> - with fix:
> ---------
> SB lflows: 70399
> SB size (uncompacted): 46 MiB
> SB size (compacted): 40 MiB
>
> after 250 ports + VIPs added:
> SB lflows: 74149
> SB size (uncompacted): 165 MiB
> SB size (compacted): 42 MiB
> Max time to install all relevant openflows for a single pod: ~100msec
> ---
> northd/ovn-northd.c | 94 ++++++++++++++++----------------------------
> northd/ovn_northd.dl | 28 ++++++-------
> tests/ovn-northd.at | 8 ++--
> 3 files changed, 50 insertions(+), 80 deletions(-)
>
> diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
> index ebe12cace..4b8554e8b 100644
> --- a/northd/ovn-northd.c
> +++ b/northd/ovn-northd.c
> @@ -6651,7 +6651,7 @@ build_lswitch_rport_arp_req_self_orig_flow(struct ovn_port *op,
> }
>
> static void
> -arp_nd_ns_match(struct ds *ips, int addr_family, struct ds *match)
> +arp_nd_ns_match(const char *ips, int addr_family, struct ds *match)
> {
> /* Packets received from VXLAN tunnels have already been through the
> * router pipeline so we should skip them. Normally this is done by the
> @@ -6661,13 +6661,10 @@ arp_nd_ns_match(struct ds *ips, int addr_family, struct ds *match)
> ds_put_cstr(match, FLAGBIT_NOT_VXLAN " && ");
>
> if (addr_family == AF_INET) {
> - ds_put_cstr(match, "arp.op == 1 && arp.tpa == {");
> + ds_put_format(match, "arp.op == 1 && arp.tpa == %s", ips);
> } else {
> - ds_put_cstr(match, "nd_ns && nd.target == {");
> + ds_put_format(match, "nd_ns && nd.target == %s", ips);
> }
> -
> - ds_put_cstr(match, ds_cstr_ro(ips));
> - ds_put_cstr(match, "}");
> }
>
> /*
> @@ -6676,7 +6673,7 @@ arp_nd_ns_match(struct ds *ips, int addr_family, struct ds *match)
> * switching domain as regular broadcast.
> */
> static void
> -build_lswitch_rport_arp_req_flow_for_reachable_ip(struct ds *ips,
> +build_lswitch_rport_arp_req_flow_for_reachable_ip(const char *ips,
> int addr_family, struct ovn_port *patch_op, struct ovn_datapath *od,
> uint32_t priority, struct hmap *lflows,
> const struct ovsdb_idl_row *stage_hint)
> @@ -6715,7 +6712,7 @@ build_lswitch_rport_arp_req_flow_for_reachable_ip(struct ds *ips,
> * broadcast.
> */
> static void
> -build_lswitch_rport_arp_req_flow_for_unreachable_ip(struct ds *ips,
> +build_lswitch_rport_arp_req_flow_for_unreachable_ip(const char *ips,
> int addr_family, struct ovn_datapath *od, uint32_t priority,
> struct hmap *lflows, const struct ovsdb_idl_row *stage_hint)
> {
> @@ -6757,10 +6754,6 @@ build_lswitch_rport_arp_req_flows(struct ovn_port *op,
> * router port.
> * Priority: 80.
> */
> - struct ds ips_v4_match_reachable = DS_EMPTY_INITIALIZER;
> - struct ds ips_v6_match_reachable = DS_EMPTY_INITIALIZER;
> - struct ds ips_v4_match_unreachable = DS_EMPTY_INITIALIZER;
> - struct ds ips_v6_match_unreachable = DS_EMPTY_INITIALIZER;
>
> const char *ip_addr;
> SSET_FOR_EACH (ip_addr, &op->od->lb_ips_v4) {
> @@ -6771,9 +6764,13 @@ build_lswitch_rport_arp_req_flows(struct ovn_port *op,
> */
> if (ip_parse(ip_addr, &ipv4_addr)) {
> if (lrouter_port_ipv4_reachable(op, ipv4_addr)) {
> - ds_put_format(&ips_v4_match_reachable, "%s, ", ip_addr);
> + build_lswitch_rport_arp_req_flow_for_reachable_ip(
> + ip_addr, AF_INET, sw_op, sw_od, 80, lflows,
> + stage_hint);
> } else {
> - ds_put_format(&ips_v4_match_unreachable, "%s, ", ip_addr);
> + build_lswitch_rport_arp_req_flow_for_unreachable_ip(
> + ip_addr, AF_INET, sw_od, 90, lflows,
> + stage_hint);
> }
> }
> }
> @@ -6785,9 +6782,13 @@ build_lswitch_rport_arp_req_flows(struct ovn_port *op,
> */
> if (ipv6_parse(ip_addr, &ipv6_addr)) {
> if (lrouter_port_ipv6_reachable(op, &ipv6_addr)) {
> - ds_put_format(&ips_v6_match_reachable, "%s, ", ip_addr);
> + build_lswitch_rport_arp_req_flow_for_reachable_ip(
> + ip_addr, AF_INET6, sw_op, sw_od, 80, lflows,
> + stage_hint);
> } else {
> - ds_put_format(&ips_v6_match_unreachable, "%s, ", ip_addr);
> + build_lswitch_rport_arp_req_flow_for_unreachable_ip(
> + ip_addr, AF_INET6, sw_od, 90, lflows,
> + stage_hint);
> }
> }
> }
> @@ -6812,65 +6813,42 @@ build_lswitch_rport_arp_req_flows(struct ovn_port *op,
>
> if (!sset_contains(&op->od->lb_ips_v6, nat->external_ip)) {
> if (lrouter_port_ipv6_reachable(op, addr)) {
> - ds_put_format(&ips_v6_match_reachable, "%s, ",
> - nat->external_ip);
> + build_lswitch_rport_arp_req_flow_for_reachable_ip(
> + nat->external_ip, AF_INET6, sw_op, sw_od, 80, lflows,
> + stage_hint);
> } else {
> - ds_put_format(&ips_v6_match_unreachable, "%s, ",
> - nat->external_ip);
> + build_lswitch_rport_arp_req_flow_for_unreachable_ip(
> + nat->external_ip, AF_INET6, sw_od, 90, lflows,
> + stage_hint);
> }
> }
> } else {
> ovs_be32 addr = nat_entry->ext_addrs.ipv4_addrs[0].addr;
> if (!sset_contains(&op->od->lb_ips_v4, nat->external_ip)) {
> if (lrouter_port_ipv4_reachable(op, addr)) {
> - ds_put_format(&ips_v4_match_reachable, "%s, ",
> - nat->external_ip);
> + build_lswitch_rport_arp_req_flow_for_reachable_ip(
> + nat->external_ip, AF_INET, sw_op, sw_od, 80, lflows,
> + stage_hint);
> } else {
> - ds_put_format(&ips_v4_match_unreachable, "%s, ",
> - nat->external_ip);
> + build_lswitch_rport_arp_req_flow_for_unreachable_ip(
> + nat->external_ip, AF_INET, sw_od, 90, lflows,
> + stage_hint);
> }
> }
> }
> }
>
> for (size_t i = 0; i < op->lrp_networks.n_ipv4_addrs; i++) {
> - ds_put_format(&ips_v4_match_reachable, "%s, ",
> - op->lrp_networks.ipv4_addrs[i].addr_s);
> - }
> - for (size_t i = 0; i < op->lrp_networks.n_ipv6_addrs; i++) {
> - ds_put_format(&ips_v6_match_reachable, "%s, ",
> - op->lrp_networks.ipv6_addrs[i].addr_s);
> - }
> -
> - if (ds_last(&ips_v4_match_reachable) != EOF) {
> - ds_chomp(&ips_v4_match_reachable, ' ');
> - ds_chomp(&ips_v4_match_reachable, ',');
> build_lswitch_rport_arp_req_flow_for_reachable_ip(
> - &ips_v4_match_reachable, AF_INET, sw_op, sw_od, 80, lflows,
> - stage_hint);
> + op->lrp_networks.ipv4_addrs[i].addr_s, AF_INET, sw_op, sw_od, 80,
> + lflows, stage_hint);
> }
> - if (ds_last(&ips_v6_match_reachable) != EOF) {
> - ds_chomp(&ips_v6_match_reachable, ' ');
> - ds_chomp(&ips_v6_match_reachable, ',');
> + for (size_t i = 0; i < op->lrp_networks.n_ipv6_addrs; i++) {
> build_lswitch_rport_arp_req_flow_for_reachable_ip(
> - &ips_v6_match_reachable, AF_INET6, sw_op, sw_od, 80, lflows,
> - stage_hint);
> + op->lrp_networks.ipv6_addrs[i].addr_s, AF_INET6, sw_op, sw_od, 80,
> + lflows, stage_hint);
> }
>
> - if (ds_last(&ips_v4_match_unreachable) != EOF) {
> - ds_chomp(&ips_v4_match_unreachable, ' ');
> - ds_chomp(&ips_v4_match_unreachable, ',');
> - build_lswitch_rport_arp_req_flow_for_unreachable_ip(
> - &ips_v4_match_unreachable, AF_INET, sw_od, 90, lflows,
> - stage_hint);
> - }
> - if (ds_last(&ips_v6_match_unreachable) != EOF) {
> - ds_chomp(&ips_v6_match_unreachable, ' ');
> - ds_chomp(&ips_v6_match_unreachable, ',');
> - build_lswitch_rport_arp_req_flow_for_unreachable_ip(
> - &ips_v6_match_unreachable, AF_INET6, sw_od, 90, lflows,
> - stage_hint);
> - }
> /* Self originated ARP requests/ND need to be flooded as usual.
> *
> * However, if the switch doesn't have any non-router ports we shouldn't
> @@ -6881,10 +6859,6 @@ build_lswitch_rport_arp_req_flows(struct ovn_port *op,
> if (sw_od->n_router_ports != sw_od->nbs->n_ports) {
> build_lswitch_rport_arp_req_self_orig_flow(op, 75, sw_od, lflows);
> }
> - ds_destroy(&ips_v4_match_reachable);
> - ds_destroy(&ips_v6_match_reachable);
> - ds_destroy(&ips_v4_match_unreachable);
> - ds_destroy(&ips_v6_match_unreachable);
> }
>
> static void
> diff --git a/northd/ovn_northd.dl b/northd/ovn_northd.dl
> index 7bfaae992..d82f43cc8 100644
> --- a/northd/ovn_northd.dl
> +++ b/northd/ovn_northd.dl
> @@ -4375,8 +4375,7 @@ Flow(.logical_datapath = sw._uuid,
> .stage = s_SWITCH_IN_L2_LKUP(),
> .priority = 80,
> .__match = fLAGBIT_NOT_VXLAN() ++
> - " && arp.op == 1 && arp.tpa == { " ++
> - ipv4.to_vec().join(", ") ++ "}",
> + " && arp.op == 1 && arp.tpa == " ++ ipv4,
> .actions = if (sw.has_non_router_port) {
> "clone {outport = ${sp.json_name}; output; }; "
> "outport = ${mc_flood_l2}; output;"
> @@ -4386,15 +4385,14 @@ Flow(.logical_datapath = sw._uuid,
> .external_ids = stage_hint(sp.lsp._uuid)) :-
> sp in &SwitchPort(.sw = sw, .peer = Some{rp}),
> rp.is_enabled(),
> - &SwitchPortARPForwards(.port = sp, .reachable_ips_v4 = ipv4),
> - not ipv4.is_empty(),
> + &SwitchPortARPForwards(.port = sp, .reachable_ips_v4 = ips_v4),
> + var ipv4 = FlatMap(ips_v4),
> var mc_flood_l2 = json_string_escape(mC_FLOOD_L2().0).
> Flow(.logical_datapath = sw._uuid,
> .stage = s_SWITCH_IN_L2_LKUP(),
> .priority = 80,
> .__match = fLAGBIT_NOT_VXLAN() ++
> - " && nd_ns && nd.target == { " ++
> - ipv6.to_vec().join(", ") ++ "}",
> + " && nd_ns && nd.target == " ++ ipv6,
> .actions = if (sw.has_non_router_port) {
> "clone {outport = ${sp.json_name}; output; }; "
> "outport = ${mc_flood_l2}; output;"
> @@ -4404,36 +4402,34 @@ Flow(.logical_datapath = sw._uuid,
> .external_ids = stage_hint(sp.lsp._uuid)) :-
> sp in &SwitchPort(.sw = sw, .peer = Some{rp}),
> rp.is_enabled(),
> - &SwitchPortARPForwards(.port = sp, .reachable_ips_v6 = ipv6),
> - not ipv6.is_empty(),
> + &SwitchPortARPForwards(.port = sp, .reachable_ips_v6 = ips_v6),
> + var ipv6 = FlatMap(ips_v6),
> var mc_flood_l2 = json_string_escape(mC_FLOOD_L2().0).
>
> Flow(.logical_datapath = sw._uuid,
> .stage = s_SWITCH_IN_L2_LKUP(),
> .priority = 90,
> .__match = fLAGBIT_NOT_VXLAN() ++
> - " && arp.op == 1 && arp.tpa == {" ++
> - ipv4.to_vec().join(", ") ++ "}",
> + " && arp.op == 1 && arp.tpa == " ++ ipv4,
> .actions = "outport = ${flood}; output;",
> .external_ids = stage_hint(sp.lsp._uuid)) :-
> sp in &SwitchPort(.sw = sw, .peer = Some{rp}),
> rp.is_enabled(),
> - &SwitchPortARPForwards(.port = sp, .unreachable_ips_v4 = ipv4),
> - not ipv4.is_empty(),
> + &SwitchPortARPForwards(.port = sp, .unreachable_ips_v4 = ips_v4),
> + var ipv4 = FlatMap(ips_v4),
> var flood = json_string_escape(mC_FLOOD().0).
>
> Flow(.logical_datapath = sw._uuid,
> .stage = s_SWITCH_IN_L2_LKUP(),
> .priority = 90,
> .__match = fLAGBIT_NOT_VXLAN() ++
> - " && nd_ns && nd.target == {" ++
> - ipv6.to_vec().join(", ") ++ "}",
> + " && nd_ns && nd.target == " ++ ipv6,
> .actions = "outport = ${flood}; output;",
> .external_ids = stage_hint(sp.lsp._uuid)) :-
> sp in &SwitchPort(.sw = sw, .peer = Some{rp}),
> rp.is_enabled(),
> - &SwitchPortARPForwards(.port = sp, .unreachable_ips_v6 = ipv6),
> - not ipv6.is_empty(),
> + &SwitchPortARPForwards(.port = sp, .unreachable_ips_v6 = ips_v6),
> + var ipv6 = FlatMap(ips_v6),
> var flood = json_string_escape(mC_FLOOD().0).
>
> for (SwitchPortNewDynamicAddress(.port = &SwitchPort{.lsp = lsp, .json_name = json_name, .sw = sw},
> diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
> index 94405349e..3df4bf615 100644
> --- a/tests/ovn-northd.at
> +++ b/tests/ovn-northd.at
> @@ -4276,20 +4276,20 @@ AT_CHECK([ovn-sbctl lflow-list sw | grep ls_in_l2_lkup | grep priority=90 -c], [
> ])
>
> # We expect that the installed flows will match the unreachable DNAT addresses only.
> -AT_CHECK([ovn-sbctl lflow-list sw | grep ls_in_l2_lkup | grep priority=90 | grep "arp.tpa == {30.0.0.100}" -c], [0], [dnl
> +AT_CHECK([ovn-sbctl lflow-list sw | grep ls_in_l2_lkup | grep priority=90 | grep "arp.tpa == 30.0.0.100" -c], [0], [dnl
> 1
> ])
>
> -AT_CHECK([ovn-sbctl lflow-list sw | grep ls_in_l2_lkup | grep priority=90 | grep "arp.tpa == {40.0.0.100}" -c], [0], [dnl
> +AT_CHECK([ovn-sbctl lflow-list sw | grep ls_in_l2_lkup | grep priority=90 | grep "arp.tpa == 40.0.0.100" -c], [0], [dnl
> 1
> ])
>
> # Ensure that we do not create flows for SNAT addresses
> -AT_CHECK([ovn-sbctl lflow-list sw | grep ls_in_l2_lkup | grep priority=90 | grep "arp.tpa == {30.0.0.200}" -c], [1], [dnl
> +AT_CHECK([ovn-sbctl lflow-list sw | grep ls_in_l2_lkup | grep priority=90 | grep "arp.tpa == 30.0.0.200" -c], [1], [dnl
> 0
> ])
>
> -AT_CHECK([ovn-sbctl lflow-list sw | grep ls_in_l2_lkup | grep priority=90 | grep "arp.tpa == {40.0.0.200}" -c], [1], [dnl
> +AT_CHECK([ovn-sbctl lflow-list sw | grep ls_in_l2_lkup | grep priority=90 | grep "arp.tpa == 40.0.0.200" -c], [1], [dnl
> 0
> ])
>
>
More information about the dev
mailing list