[ovs-dev] [PATCH] ofproto: Fix resource usage explosion while processing bundled FLOW_MOD.
Vladislav Odintsov
odivlad at gmail.com
Mon Nov 22 08:07:44 UTC 2021
Thanks for the patch Ilya!
I’ve tested this with my setup in OVN similar to described in message by link at "Reported-at".
ovs-vswitchd with this patch applied seems working fine. It processed flows and though with high CPU load during ~3-5 seconds after start, it went to normal CPU load about 2-4%.
I’ve tested this with both: OVS v2.13.5 and master branch.
Tested-by: Vladislav Odintsov <odivlad at gmail.com>
Regards,
Vladislav Odintsov
> On 20 Nov 2021, at 02:07, Ilya Maximets <i.maximets at ovn.org> wrote:
>
> While processing a bundle, OVS will add all new and modified rules
> to classifiers. Classifiers are using RCU-protected pvector to
> store subtables. Addition of a new subtable or removal of the old
> one leads to re-allocation and memory copy of the pvector array.
> Old version of that array is given to RCU thread to free it later.
>
> The problem is that bundle is processed under the mutex without
> entering the quiescent state. Therefore, memory can not be freed
> until the whole bundle is processed. So, if a few thousands of
> flows added to the same table in a bundle, pvector will be re-allocated
> while adding each of them. So, we'll have a few thousands of copies
> of the same array waiting to be freed.
>
> In case of OVN deployments, there could be hundreds of thousands of
> flows in the same table leading to a fast consumption of a huge
> amount of memory and wasting a lot of CPU cycles on allocations and
> copies. Below snippet of the 'valgrid --tool=massif' output shows
> ovs-vswitchd consuming 3.5GB of RAM while processing a bundle with
> 65K FLOW_MODs in the OVN deployment. 3.4GB of that memory are
> copies of the same pvector.
>
> -------------------------------------------------------------------
> n time(i) total(B) useful-heap(B) extra-heap(B)
> -------------------------------------------------------------------
> 64 109,907,465,404 3,559,987,568 3,546,879,748 13,107,820
> 99.63% (3,546,879,748B) (heap allocation functions) malloc/new/new[]
> ->97.61% (3,474,750,333B) xmalloc__ (util.c:137)
> |->97.61% (3,474,750,333B) xmalloc (util.c:172)
> | ->96.38% (3,431,068,352B) pvector_impl_dup (pvector.c:48)
> || ->96.38% (3,431,067,840B) pvector_insert (pvector.c:138)
> || |->96.38% (3,431,067,840B) classifier_replace (classifier.c:664)
> || | ->96.38% (3,431,067,840B) classifier_insert (classifier.c:695)
> || | ->96.38% (3,431,067,840B) replace_rule_start (ofproto.c:5563)
> || | ->96.38% (3,431,067,840B) add_flow_start (ofproto.c:5179)
> || | ->96.38% (3,431,067,840B) ofproto_flow_mod_start (ofproto.c:8017)
> || | ->96.38% (3,431,067,744B) do_bundle_commit (ofproto.c:8168)
> || | |->96.38% (3,431,067,744B) handle_bundle_control (ofproto.c:8309)
> || | | ->96.38% (3,431,067,744B) handle_single_part_openflow (ofproto.c:8593)
> || | | ->96.38% (3,431,067,744B) handle_openflow (ofproto.c:8674)
> || | | ->96.38% (3,431,067,744B) ofconn_run (connmgr.c:1329)
> || | | ->96.38% (3,431,067,744B) connmgr_run (connmgr.c:356)
> || | | ->96.38% (3,431,067,744B) ofproto_run (ofproto.c:1879)
> || | | ->96.38% (3,431,067,744B) bridge_run__ (bridge.c:3251)
> || | | ->96.38% (3,431,067,744B) bridge_run (bridge.c:3310)
> || | | ->96.38% (3,431,067,744B) main (ovs-vswitchd.c:127)
>
> Fixing that by postponing the publishing of classifier updates,
> so each flow modification can work with the same version of pvector.
>
> Unfortunately, bundled PACKET_OUT messages requires all previous
> changes to be published before processing, otherwise the packet
> will use wrong version of OF tables. Publishing all changes before
> processing PACKET_OUT messages to avoid this issue. Hopefully,
> mixup of a big number of FLOW_MOD and PACKET_OUT messages is not
> a common usecase.
>
> Reported-by: Vladislav Odintsov <odivlad at gmail.com>
> Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2021-November/389503.html
> Signed-off-by: Ilya Maximets <i.maximets at ovn.org>
> ---
> ofproto/ofproto-provider.h | 1 +
> ofproto/ofproto.c | 47 ++++++++++++++++++++++++++++++++++++++
> 2 files changed, 48 insertions(+)
>
> diff --git a/ofproto/ofproto-provider.h b/ofproto/ofproto-provider.h
> index 57c7d17cb..a934a98f2 100644
> --- a/ofproto/ofproto-provider.h
> +++ b/ofproto/ofproto-provider.h
> @@ -1962,6 +1962,7 @@ struct ofproto_flow_mod {
> bool modify_may_add_flow;
> bool modify_keep_counts;
> enum nx_flow_update_event event;
> + uint8_t table_id;
>
> /* These are only used during commit execution.
> * ofproto_flow_mod_uninit() does NOT clean these up. */
> diff --git a/ofproto/ofproto.c b/ofproto/ofproto.c
> index bd6103b1c..139d6d394 100644
> --- a/ofproto/ofproto.c
> +++ b/ofproto/ofproto.c
> @@ -7967,6 +7967,7 @@ ofproto_flow_mod_init(struct ofproto *ofproto, struct ofproto_flow_mod *ofm,
> ofm->criteria.version = OVS_VERSION_NOT_REMOVED;
> ofm->conjs = NULL;
> ofm->n_conjs = 0;
> + ofm->table_id = fm->table_id;
>
> bool check_buffer_id = false;
>
> @@ -8104,6 +8105,33 @@ ofproto_flow_mod_finish(struct ofproto *ofproto, struct ofproto_flow_mod *ofm,
> return error;
> }
>
> +static void
> +ofproto_table_classifier_defer(struct ofproto *ofproto,
> + const struct ofproto_flow_mod *ofm)
> +{
> + if (check_table_id(ofproto, ofm->table_id)) {
> + if (ofm->table_id == OFPTT_ALL) {
> + struct oftable *table;
> +
> + OFPROTO_FOR_EACH_TABLE (table, ofproto) {
> + classifier_defer(&table->cls);
> + }
> + } else {
> + classifier_defer(&ofproto->tables[ofm->table_id].cls);
> + }
> + }
> +}
> +
> +static void
> +ofproto_publish_classifiers(struct ofproto *ofproto)
> +{
> + struct oftable *table;
> +
> + OFPROTO_FOR_EACH_TABLE (table, ofproto) {
> + classifier_publish(&table->cls);
> + }
> +}
> +
> /* Commit phases (all while locking ofproto_mutex):
> *
> * 1. Begin: Gather resources and make changes visible in the next version.
> @@ -8165,6 +8193,10 @@ do_bundle_commit(struct ofconn *ofconn, uint32_t id, uint16_t flags)
> /* Store the version in which the changes should take
> * effect. */
> be->ofm.version = version;
> + /* Publishing of the classifier update for every flow
> + * modification in a bundle separately is expensive in
> + * CPU time and memory. Deferring. */
> + ofproto_table_classifier_defer(ofproto, &be->ofm);
> error = ofproto_flow_mod_start(ofproto, &be->ofm);
> } else if (be->type == OFPTYPE_GROUP_MOD) {
> /* Store the version in which the changes should take
> @@ -8173,6 +8205,9 @@ do_bundle_commit(struct ofconn *ofconn, uint32_t id, uint16_t flags)
> error = ofproto_group_mod_start(ofproto, &be->ogm);
> } else if (be->type == OFPTYPE_PACKET_OUT) {
> be->opo.version = version;
> + /* Need to use current version of flows for packet-out,
> + * so publishing all classifiers now. */
> + ofproto_publish_classifiers(ofproto);
> error = ofproto_packet_out_start(ofproto, &be->opo);
> } else {
> OVS_NOT_REACHED();
> @@ -8183,6 +8218,9 @@ do_bundle_commit(struct ofconn *ofconn, uint32_t id, uint16_t flags)
> }
> }
>
> + /* Publishing all changes made to classifiers. */
> + ofproto_publish_classifiers(ofproto);
> +
> if (error) {
> /* Send error referring to the original message. */
> ofconn_send_error(ofconn, be->msg, error);
> @@ -8191,14 +8229,23 @@ do_bundle_commit(struct ofconn *ofconn, uint32_t id, uint16_t flags)
> /* 2. Revert. Undo all the changes made above. */
> LIST_FOR_EACH_REVERSE_CONTINUE(be, node, &bundle->msg_list) {
> if (be->type == OFPTYPE_FLOW_MOD) {
> + /* Publishing of the classifier update for every flow
> + * modification in a bundle separately is expensive in
> + * CPU time and memory. Deferring. */
> + ofproto_table_classifier_defer(ofproto, &be->ofm);
> ofproto_flow_mod_revert(ofproto, &be->ofm);
> } else if (be->type == OFPTYPE_GROUP_MOD) {
> ofproto_group_mod_revert(ofproto, &be->ogm);
> } else if (be->type == OFPTYPE_PACKET_OUT) {
> + /* Need to use current version of flows for packet-out,
> + * so publishing all classifiers now. */
> + ofproto_publish_classifiers(ofproto);
> ofproto_packet_out_revert(ofproto, &be->opo);
> }
> /* Nothing needs to be reverted for a port mod. */
> }
> + /* Publishing all changes made to classifiers. */
> + ofproto_publish_classifiers(ofproto);
> } else {
> /* 4. Finish. */
> LIST_FOR_EACH (be, node, &bundle->msg_list) {
> --
> 2.31.1
>
> _______________________________________________
> dev mailing list
> dev at openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
More information about the dev
mailing list