[ovs-dev] [PATCH ovn] ovn-northd: Don't generate identical flows for same LBs with different protocol.

Mark Gray mark.d.gray at redhat.com
Fri Aug 27 17:05:52 UTC 2021


On 27/08/2021 16:24, Ilya Maximets wrote:
> It's common for CMS (e.g., ovn-kubernetes) to create 3 identical load
> balancers, one per protocol (tcp, udp, sctp).  However, if VIPs of
> these load balancers have no ports specified (!vip_port), northd will
> generate identical logical flows for them.  2 of 3 such flows will be
> just discarded, so it's better to not build them form the beginning.

s/form/from

> 
> For example, in an ovn-heater's 120 node density-heavy scenario we
> have 3 load balancers with 15K VIPs in each.  One for tcp, one for
> udp and one for sctp.  In this case, ovn-northd generates 26M of
> logical flows in total.  ~7.5M of them are flows for a single load
> balancer.  2 * 7.5M = 15M are identical to the first 7.5M and just
> discarded.
> 
> Let's find all these identical load balancers and skip them while building
> logical flows.  With this change, 15M of redundant logical flows are
> not generated saving ~1.5 seconds of the CPU time per run.
> 
> The comparison function and the loop look heavy, but in testing they take
> only a few milliseconds on these large load balancers.
> 
> Signed-off-by: Ilya Maximets <i.maximets at ovn.org>

I see you haven't changed any tests? I am guessing that is because this
doesn't change the logical flows, which is surprising?

Also, as an FYI, I was unable to create load balancers like this using
nbctl directly; it fails on creation of the load balancer. However, you
can modify the NBDB after creation, so I presume this is an allowed
configuration, although it is not very well specified.

`$ ovn-nbctl lb-add lb0 11.0.0.200 192.168.0.2 tcp
ovn-nbctl: Protocol is unnecessary when no port of vip is given.`
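
FWIW, one way to get this configuration into the NBDB (and what the
reproducer further down does) is to create the portless VIP first and then
set the protocol on the existing row, e.g.:

`
$ ovn-nbctl lb-add lb0 11.0.0.200 192.168.0.2
$ ovn-nbctl set Load_Balancer lb0 protocol=tcp
`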

> ---
> 
> The better option might be to allow multiple protocols to be configured
> per load balancer, but this will require big API/schema/etc changes
> and may require re-design of certain northd/ovn-controller logic.
> 
>  lib/lb.c            | 56 +++++++++++++++++++++++++++++++++++++++++++++
>  lib/lb.h            |  4 ++++
>  northd/ovn-northd.c | 35 ++++++++++++++++++++++++++++
>  3 files changed, 95 insertions(+)
> 
> diff --git a/lib/lb.c b/lib/lb.c
> index 7b0ed1abe..fb12c3457 100644
> --- a/lib/lb.c
> +++ b/lib/lb.c
> @@ -168,6 +168,7 @@ ovn_northd_lb_create(const struct nbrec_load_balancer *nbrec_lb)
>      bool is_sctp = nullable_string_is_equal(nbrec_lb->protocol, "sctp");
>      struct ovn_northd_lb *lb = xzalloc(sizeof *lb);
>  
> +    lb->skip_lflow_build = false;
>      lb->nlb = nbrec_lb;
>      lb->proto = is_udp ? "udp" : is_sctp ? "sctp" : "tcp";
>      lb->n_vips = smap_count(&nbrec_lb->vips);
> @@ -238,6 +239,61 @@ ovn_northd_lb_find(struct hmap *lbs, const struct uuid *uuid)
>      return NULL;
>  }
>  
> +/* Compares two load balancers for equality ignoring the 'protocol'. */
> +bool
> +ovn_northd_lb_equal_except_for_proto(const struct ovn_northd_lb *lb1,
> +                                     const struct ovn_northd_lb *lb2)
> +{

I'd have a bit of a concern about maintaining this. For example, if we
add more fields to the load balancer, we would have to remember to update
this comparison as well, but I can't think of a better way of doing it.

> +    /* It's much more convenient to compare Northbound DB configuration. */
> +    const struct nbrec_load_balancer *lb_a = lb1->nlb;
> +    const struct nbrec_load_balancer *lb_b = lb2->nlb;
> +
> +    if (!smap_equal(&lb_a->external_ids, &lb_b->external_ids)) {
> +        return false;
> +    }
> +    if (lb_a->n_health_check != lb_b->n_health_check) {
> +        return false;
> +    }
> +    if (lb_a->n_health_check
> +        && memcmp(lb_a->health_check, lb_b->health_check,
> +                   lb_a->n_health_check * sizeof *lb_a->health_check)) {
> +        return false;
> +    }
> +    if (!smap_equal(&lb_a->ip_port_mappings, &lb_b->ip_port_mappings)) {
> +        return false;
> +    }
> +    if (!smap_equal(&lb_a->options, &lb_b->options)) {
> +        return false;
> +    }
> +    if (lb_a->n_selection_fields != lb_b->n_selection_fields) {
> +        return false;
> +    }
> +    if (lb_a->n_selection_fields &&
> +        memcmp(lb_a->selection_fields, lb_b->selection_fields,
> +               lb_a->n_selection_fields * sizeof *lb_a->selection_fields)) {
> +        return false;
> +    }
> +    if (!smap_equal(&lb_a->vips, &lb_b->vips)) {
> +        return false;
> +    }
> +
> +    /* Below fields are not easily accessible from the Nb DB entry, so
> +     * comparing parsed versions. */
> +    if (lb1->n_nb_ls != lb2->n_nb_ls || lb1->n_nb_lr != lb2->n_nb_lr) {
> +        return false;
> +    }
> +    if (lb1->n_nb_ls
> +        && memcmp(lb1->nb_ls, lb2->nb_ls, lb1->n_nb_ls * sizeof *lb1->nb_ls)) {
> +        return false;
> +    }
> +    if (lb1->n_nb_lr
> +        && memcmp(lb1->nb_lr, lb1->nb_lr, lb1->n_nb_lr * sizeof *lb1->nb_lr)) {
> +        return false;
> +    }
> +
> +    return true;
> +}
> +
>  void
>  ovn_northd_lb_add_lr(struct ovn_northd_lb *lb, struct ovn_datapath *od)
>  {
> diff --git a/lib/lb.h b/lib/lb.h
> index 832ed31fb..8e92338a3 100644
> --- a/lib/lb.h
> +++ b/lib/lb.h
> @@ -50,6 +50,8 @@ struct ovn_northd_lb {
>      size_t n_nb_lr;
>      size_t n_allocated_nb_lr;
>      struct ovn_datapath **nb_lr;
> +
> +    bool skip_lflow_build;
>  };
>  
>  struct ovn_lb_vip {
> @@ -87,6 +89,8 @@ struct ovn_northd_lb_backend {
>  
>  struct ovn_northd_lb *ovn_northd_lb_create(const struct nbrec_load_balancer *);
>  struct ovn_northd_lb * ovn_northd_lb_find(struct hmap *, const struct uuid *);
> +bool ovn_northd_lb_equal_except_for_proto(const struct ovn_northd_lb *,
> +                                          const struct ovn_northd_lb *);
>  void ovn_northd_lb_destroy(struct ovn_northd_lb *);
>  void
>  ovn_northd_lb_add_lr(struct ovn_northd_lb *lb, struct ovn_datapath *od);
> diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
> index af413aba4..d1efc28f4 100644
> --- a/northd/ovn-northd.c
> +++ b/northd/ovn-northd.c
> @@ -3732,6 +3732,35 @@ build_ovn_lbs(struct northd_context *ctx, struct hmap *datapaths,
>              sbrec_datapath_binding_set_load_balancers(od->sb, NULL, 0);
>          }
>      }
> +
> +    /* Northd doesn't use the protocol in case VIPs have no ports specified.
> +     * And it's also common that a user configures several identical load
> +     * balancers, one per protocol. We need to find them and use only one
> +     * for logical flow construction.  Logical flows will be identical,
> +     * so it's faster to just not build them. */
> +    struct ovn_northd_lb *lb2;
> +    HMAP_FOR_EACH (lb, hmap_node, lbs) {
> +        if (lb->skip_lflow_build) {
> +            continue;
> +        }
> +        for (size_t i = 0; i < lb->n_vips; i++) {
> +            if (lb->vips[i].vip_port) {
> +                goto next;
> +            }
> +        }
> +        HMAP_FOR_EACH (lb2, hmap_node, lbs) {
> +            if (lb2 == lb || lb2->skip_lflow_build) {
> +                continue;
> +            }
> +            if (ovn_northd_lb_equal_except_for_proto(lb, lb2)) {
> +                /* Load balancer still referenced from logical switch or
> +                 * router, so we can't destroy it here. */
> +                lb2->skip_lflow_build = true;
> +            }
> +        }

I am not sure how this would scale in the case where there are lots of
load balancers. OVN-K is changing to a model in which there are many
(15k?) load balancers. This would increase this inner loop to 15k x 15k
(225M) iterations.

> +next:;
> +    }
> +
>  }
>  
>  static void
> @@ -12772,6 +12801,9 @@ build_lflows_thread(void *arg)
>                      if (stop_parallel_processing()) {
>                          return NULL;
>                      }
> +                    if (lb->skip_lflow_build) {
> +                        continue;
> +                    }
>                      build_lswitch_arp_nd_service_monitor(lb, lsi->lflows,
>                                                           &lsi->match,
>                                                           &lsi->actions);
> @@ -12943,6 +12975,9 @@ build_lswitch_and_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
>              build_lswitch_and_lrouter_iterate_by_op(op, &lsi);
>          }
>          HMAP_FOR_EACH (lb, hmap_node, lbs) {
> +            if (lb->skip_lflow_build) {
> +                continue;
> +            }
>              build_lswitch_arp_nd_service_monitor(lb, lsi.lflows,
>                                                   &lsi.actions,
>                                                   &lsi.match);
> 

I have found a configuration that causes undefined behaviour. It is the
same without your patch, but it is relevant here: if you define two load
balancers with the protocol specified (but without the port) and attach
different attributes to each, it can cause conflicting logical flows.
Consider the following reproducer (it can be run in the sandbox
environment):

`
# Create the first logical switch with one port
ovn-nbctl ls-add sw0
ovn-nbctl lsp-add sw0 sw0-port1
ovn-nbctl lsp-set-addresses sw0-port1 "50:54:00:00:00:01 192.168.0.2"

# Create the second logical switch with one port
ovn-nbctl ls-add sw1
ovn-nbctl lsp-add sw1 sw1-port1
ovn-nbctl lsp-set-addresses sw1-port1 "50:54:00:00:00:03 11.0.0.2"

# Create a logical router and attach both logical switches
ovn-nbctl lr-add lr0
ovn-nbctl lrp-add lr0 lrp0 00:00:00:00:ff:01 192.168.0.1/24
ovn-nbctl lsp-add sw0 lrp0-attachment
ovn-nbctl lsp-set-type lrp0-attachment router
ovn-nbctl lsp-set-addresses lrp0-attachment 00:00:00:00:ff:01
ovn-nbctl lsp-set-options lrp0-attachment router-port=lrp0
ovn-nbctl lrp-add lr0 lrp1 00:00:00:00:ff:02 11.0.0.1/24
ovn-nbctl lsp-add sw1 lrp1-attachment
ovn-nbctl lsp-set-type lrp1-attachment router
ovn-nbctl lsp-set-addresses lrp1-attachment 00:00:00:00:ff:02
ovn-nbctl lsp-set-options lrp1-attachment router-port=lrp1

ovs-vsctl add-port br-int p1 -- \
    set Interface p1 external_ids:iface-id=sw0-port1
ovs-vsctl add-port br-int p2 -- \
    set Interface p2 external_ids:iface-id=sw1-port1

ovn-nbctl set Logical_Router lr0 options:chassis=hv1
ovn-nbctl lb-add lb0 11.0.0.200 192.168.0.2
ovn-nbctl lb-add lb1 11.0.0.200 192.168.0.2
ovn-nbctl set Load_Balancer lb0 protocol=tcp
ovn-nbctl set Load_Balancer lb0 options=skip_snat=true
ovn-nbctl set Load_Balancer lb1 protocol=udp
ovn-nbctl lr-lb-add lr0 lb0
ovn-nbctl lr-lb-add lr0 lb1
`

Your code gives the following flows:
$ ovn-sbctl dump-flows | grep lr_in_dnat
  table=6 (lr_in_dnat         ), priority=110  , match=(ct.est && ip4 && reg0 == 11.0.0.200 && ct_label.natted == 1), action=(flags.skip_snat_for_lb = 1; next;)
  table=6 (lr_in_dnat         ), priority=110  , match=(ct.est && ip4 && reg0 == 11.0.0.200 && ct_label.natted == 1), action=(next;)
  table=6 (lr_in_dnat         ), priority=110  , match=(ct.new && ip4 && reg0 == 11.0.0.200), action=(ct_lb(backends=192.168.0.2);)
  table=6 (lr_in_dnat         ), priority=110  , match=(ct.new && ip4 && reg0 == 11.0.0.200), action=(flags.skip_snat_for_lb = 1; ct_lb(backends=192.168.0.2);)
  table=6 (lr_in_dnat         ), priority=0    , match=(1), action=(next;)

These should produce different flows for each load balancer, which means
we need to specify the protocol as a match field. I think this prevents
your optimization?
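
(For comparison, nbctl does accept the protocol once the VIP has a port, and
in that case I would expect the L4 port match to already distinguish the two
load balancers, so the ambiguity seems specific to portless VIPs. The lb2
below is just an illustrative example, not part of the reproducer above.)

`
$ ovn-nbctl lb-add lb2 11.0.0.200:8080 192.168.0.2:80 tcp
`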

For reference, I raised a bugzilla bug for this and it is similar to one
I fixed recently.

https://bugzilla.redhat.com/show_bug.cgi?id=1998592


