[ovs-dev] [PATCH] ofproto: Fix resource usage explosion while processing bundled FLOW_MOD.

Mon Nov 22 08:07:44 UTC 2021

Thanks for the patch Ilya!

I’ve tested this with my setup in OVN similar to described in message by link at "Reported-at".
ovs-vswitchd with this patch applied seems working fine. It processed flows and though with high CPU load during ~3-5 seconds after start, it went to normal CPU load about 2-4%.
I’ve tested this with both: OVS v2.13.5 and master branch.

Tested-by: Vladislav Odintsov <odivlad at gmail.com>

Regards,
Vladislav Odintsov

> On 20 Nov 2021, at 02:07, Ilya Maximets <i.maximets at ovn.org> wrote:
> 
> While processing a bundle, OVS will add all new and modified rules
> to classifiers.  Classifiers are using RCU-protected pvector to
> store subtables.  Addition of a new subtable or removal of the old
> one leads to re-allocation and memory copy of the pvector array.
> Old version of that array is given to RCU thread to free it later.
> 
> The problem is that bundle is processed under the mutex without
> entering the quiescent state.  Therefore, memory can not be freed
> until the whole bundle is processed.  So, if a few thousands of
> flows added to the same table in a bundle, pvector will be re-allocated
> while adding each of them.  So, we'll have a few thousands of copies
> of the same array waiting to be freed.
> 
> In case of OVN deployments, there could be hundreds of thousands of
> flows in the same table leading to a fast consumption of a huge
> amount of memory and wasting a lot of CPU cycles on allocations and
> copies.  Below snippet of the 'valgrid --tool=massif' output shows
> ovs-vswitchd consuming 3.5GB of RAM while processing a bundle with
> 65K FLOW_MODs in the OVN deployment.  3.4GB of that memory are
> copies of the same pvector.
> 
> -------------------------------------------------------------------
>   n        time(i)         total(B)   useful-heap(B) extra-heap(B)
> -------------------------------------------------------------------
>  64 109,907,465,404    3,559,987,568    3,546,879,748    13,107,820
> 99.63% (3,546,879,748B) (heap allocation functions) malloc/new/new[]
> ->97.61% (3,474,750,333B) xmalloc__ (util.c:137)
> |->97.61% (3,474,750,333B) xmalloc (util.c:172)
> | ->96.38% (3,431,068,352B) pvector_impl_dup (pvector.c:48)
> || ->96.38% (3,431,067,840B) pvector_insert (pvector.c:138)
> || |->96.38% (3,431,067,840B) classifier_replace (classifier.c:664)
> || | ->96.38% (3,431,067,840B) classifier_insert (classifier.c:695)
> || |  ->96.38% (3,431,067,840B) replace_rule_start (ofproto.c:5563)
> || |   ->96.38% (3,431,067,840B) add_flow_start (ofproto.c:5179)
> || |    ->96.38% (3,431,067,840B) ofproto_flow_mod_start (ofproto.c:8017)
> || |     ->96.38% (3,431,067,744B) do_bundle_commit (ofproto.c:8168)
> || |     |->96.38% (3,431,067,744B) handle_bundle_control (ofproto.c:8309)
> || |     | ->96.38% (3,431,067,744B) handle_single_part_openflow (ofproto.c:8593)
> || |     |  ->96.38% (3,431,067,744B) handle_openflow (ofproto.c:8674)
> || |     |   ->96.38% (3,431,067,744B) ofconn_run (connmgr.c:1329)
> || |     |    ->96.38% (3,431,067,744B) connmgr_run (connmgr.c:356)
> || |     |     ->96.38% (3,431,067,744B) ofproto_run (ofproto.c:1879)
> || |     |      ->96.38% (3,431,067,744B) bridge_run__ (bridge.c:3251)
> || |     |       ->96.38% (3,431,067,744B) bridge_run (bridge.c:3310)
> || |     |        ->96.38% (3,431,067,744B) main (ovs-vswitchd.c:127)
> 
> Fixing that by postponing the publishing of classifier updates,
> so each flow modification can work with the same version of pvector.
> 
> Unfortunately, bundled PACKET_OUT messages requires all previous
> changes to be published before processing, otherwise the packet
> will use wrong version of OF tables.  Publishing all changes before
> processing PACKET_OUT messages to avoid this issue.  Hopefully,
> mixup of a big number of FLOW_MOD and PACKET_OUT messages is not
> a common usecase.
> 
> Reported-by: Vladislav Odintsov <odivlad at gmail.com>
> Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2021-November/389503.html
> Signed-off-by: Ilya Maximets <i.maximets at ovn.org>
> ---
> ofproto/ofproto-provider.h |  1 +
> ofproto/ofproto.c          | 47 ++++++++++++++++++++++++++++++++++++++
> 2 files changed, 48 insertions(+)
> 
> diff --git a/ofproto/ofproto-provider.h b/ofproto/ofproto-provider.h
> index 57c7d17cb..a934a98f2 100644
> --- a/ofproto/ofproto-provider.h
> +++ b/ofproto/ofproto-provider.h
> @@ -1962,6 +1962,7 @@ struct ofproto_flow_mod {
>     bool modify_may_add_flow;
>     bool modify_keep_counts;
>     enum nx_flow_update_event event;
> +    uint8_t table_id;
> 
>     /* These are only used during commit execution.
>      * ofproto_flow_mod_uninit() does NOT clean these up. */
> diff --git a/ofproto/ofproto.c b/ofproto/ofproto.c
> index bd6103b1c..139d6d394 100644
> --- a/ofproto/ofproto.c
> +++ b/ofproto/ofproto.c
> @@ -7967,6 +7967,7 @@ ofproto_flow_mod_init(struct ofproto *ofproto, struct ofproto_flow_mod *ofm,
>     ofm->criteria.version = OVS_VERSION_NOT_REMOVED;
>     ofm->conjs = NULL;
>     ofm->n_conjs = 0;
> +    ofm->table_id = fm->table_id;
> 
>     bool check_buffer_id = false;
> 
> @@ -8104,6 +8105,33 @@ ofproto_flow_mod_finish(struct ofproto *ofproto, struct ofproto_flow_mod *ofm,
>     return error;
> }
> 
> +static void
> +ofproto_table_classifier_defer(struct ofproto *ofproto,
> +                               const struct ofproto_flow_mod *ofm)
> +{
> +    if (check_table_id(ofproto, ofm->table_id)) {
> +        if (ofm->table_id == OFPTT_ALL) {
> +            struct oftable *table;
> +
> +            OFPROTO_FOR_EACH_TABLE (table, ofproto) {
> +                classifier_defer(&table->cls);
> +            }
> +        } else {
> +            classifier_defer(&ofproto->tables[ofm->table_id].cls);
> +        }
> +    }
> +}
> +
> +static void
> +ofproto_publish_classifiers(struct ofproto *ofproto)
> +{
> +    struct oftable *table;
> +
> +    OFPROTO_FOR_EACH_TABLE (table, ofproto) {
> +        classifier_publish(&table->cls);
> +    }
> +}
> +
> /* Commit phases (all while locking ofproto_mutex):
>  *
>  * 1. Begin: Gather resources and make changes visible in the next version.
> @@ -8165,6 +8193,10 @@ do_bundle_commit(struct ofconn *ofconn, uint32_t id, uint16_t flags)
>                     /* Store the version in which the changes should take
>                      * effect. */
>                     be->ofm.version = version;
> +                    /* Publishing of the classifier update for every flow
> +                     * modification in a bundle separately is expensive in
> +                     * CPU time and memory.  Deferring. */
> +                    ofproto_table_classifier_defer(ofproto, &be->ofm);
>                     error = ofproto_flow_mod_start(ofproto, &be->ofm);
>                 } else if (be->type == OFPTYPE_GROUP_MOD) {
>                     /* Store the version in which the changes should take
> @@ -8173,6 +8205,9 @@ do_bundle_commit(struct ofconn *ofconn, uint32_t id, uint16_t flags)
>                     error = ofproto_group_mod_start(ofproto, &be->ogm);
>                 } else if (be->type == OFPTYPE_PACKET_OUT) {
>                     be->opo.version = version;
> +                    /* Need to use current version of flows for packet-out,
> +                     * so publishing all classifiers now. */
> +                    ofproto_publish_classifiers(ofproto);
>                     error = ofproto_packet_out_start(ofproto, &be->opo);
>                 } else {
>                     OVS_NOT_REACHED();
> @@ -8183,6 +8218,9 @@ do_bundle_commit(struct ofconn *ofconn, uint32_t id, uint16_t flags)
>             }
>         }
> 
> +        /* Publishing all changes made to classifiers. */
> +        ofproto_publish_classifiers(ofproto);
> +
>         if (error) {
>             /* Send error referring to the original message. */
>             ofconn_send_error(ofconn, be->msg, error);
> @@ -8191,14 +8229,23 @@ do_bundle_commit(struct ofconn *ofconn, uint32_t id, uint16_t flags)
>             /* 2. Revert.  Undo all the changes made above. */
>             LIST_FOR_EACH_REVERSE_CONTINUE(be, node, &bundle->msg_list) {
>                 if (be->type == OFPTYPE_FLOW_MOD) {
> +                    /* Publishing of the classifier update for every flow
> +                     * modification in a bundle separately is expensive in
> +                     * CPU time and memory.  Deferring. */
> +                    ofproto_table_classifier_defer(ofproto, &be->ofm);
>                     ofproto_flow_mod_revert(ofproto, &be->ofm);
>                 } else if (be->type == OFPTYPE_GROUP_MOD) {
>                     ofproto_group_mod_revert(ofproto, &be->ogm);
>                 } else if (be->type == OFPTYPE_PACKET_OUT) {
> +                    /* Need to use current version of flows for packet-out,
> +                     * so publishing all classifiers now. */
> +                    ofproto_publish_classifiers(ofproto);
>                     ofproto_packet_out_revert(ofproto, &be->opo);
>                 }
>                 /* Nothing needs to be reverted for a port mod. */
>             }
> +            /* Publishing all changes made to classifiers. */
> +            ofproto_publish_classifiers(ofproto);
>         } else {
>             /* 4. Finish. */
>             LIST_FOR_EACH (be, node, &bundle->msg_list) {
> -- 
> 2.31.1
> 
> _______________________________________________
> dev mailing list
> dev at openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev