[ovs-dev] [PATCH] ofproto: Fix resource usage explosion while processing bundled FLOW_MOD.

Ilya Maximets i.maximets at ovn.org
Fri Nov 19 23:07:38 UTC 2021


While processing a bundle, OVS will add all new and modified rules
to classifiers.  Classifiers are using RCU-protected pvector to
store subtables.  Addition of a new subtable or removal of the old
one leads to re-allocation and memory copy of the pvector array.
Old version of that array is given to RCU thread to free it later.

The problem is that bundle is processed under the mutex without
entering the quiescent state.  Therefore, memory can not be freed
until the whole bundle is processed.  So, if a few thousands of
flows added to the same table in a bundle, pvector will be re-allocated
while adding each of them.  So, we'll have a few thousands of copies
of the same array waiting to be freed.

In case of OVN deployments, there could be hundreds of thousands of
flows in the same table leading to a fast consumption of a huge
amount of memory and wasting a lot of CPU cycles on allocations and
copies.  Below snippet of the 'valgrid --tool=massif' output shows
ovs-vswitchd consuming 3.5GB of RAM while processing a bundle with
65K FLOW_MODs in the OVN deployment.  3.4GB of that memory are
copies of the same pvector.

 -------------------------------------------------------------------
   n        time(i)         total(B)   useful-heap(B) extra-heap(B)
 -------------------------------------------------------------------
  64 109,907,465,404    3,559,987,568    3,546,879,748    13,107,820
 99.63% (3,546,879,748B) (heap allocation functions) malloc/new/new[]
 ->97.61% (3,474,750,333B) xmalloc__ (util.c:137)
 |->97.61% (3,474,750,333B) xmalloc (util.c:172)
 | ->96.38% (3,431,068,352B) pvector_impl_dup (pvector.c:48)
 || ->96.38% (3,431,067,840B) pvector_insert (pvector.c:138)
 || |->96.38% (3,431,067,840B) classifier_replace (classifier.c:664)
 || | ->96.38% (3,431,067,840B) classifier_insert (classifier.c:695)
 || |  ->96.38% (3,431,067,840B) replace_rule_start (ofproto.c:5563)
 || |   ->96.38% (3,431,067,840B) add_flow_start (ofproto.c:5179)
 || |    ->96.38% (3,431,067,840B) ofproto_flow_mod_start (ofproto.c:8017)
 || |     ->96.38% (3,431,067,744B) do_bundle_commit (ofproto.c:8168)
 || |     |->96.38% (3,431,067,744B) handle_bundle_control (ofproto.c:8309)
 || |     | ->96.38% (3,431,067,744B) handle_single_part_openflow (ofproto.c:8593)
 || |     |  ->96.38% (3,431,067,744B) handle_openflow (ofproto.c:8674)
 || |     |   ->96.38% (3,431,067,744B) ofconn_run (connmgr.c:1329)
 || |     |    ->96.38% (3,431,067,744B) connmgr_run (connmgr.c:356)
 || |     |     ->96.38% (3,431,067,744B) ofproto_run (ofproto.c:1879)
 || |     |      ->96.38% (3,431,067,744B) bridge_run__ (bridge.c:3251)
 || |     |       ->96.38% (3,431,067,744B) bridge_run (bridge.c:3310)
 || |     |        ->96.38% (3,431,067,744B) main (ovs-vswitchd.c:127)

Fixing that by postponing the publishing of classifier updates,
so each flow modification can work with the same version of pvector.

Unfortunately, bundled PACKET_OUT messages requires all previous
changes to be published before processing, otherwise the packet
will use wrong version of OF tables.  Publishing all changes before
processing PACKET_OUT messages to avoid this issue.  Hopefully,
mixup of a big number of FLOW_MOD and PACKET_OUT messages is not
a common usecase.

Reported-by: Vladislav Odintsov <odivlad at gmail.com>
Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2021-November/389503.html
Signed-off-by: Ilya Maximets <i.maximets at ovn.org>
---
 ofproto/ofproto-provider.h |  1 +
 ofproto/ofproto.c          | 47 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 48 insertions(+)

diff --git a/ofproto/ofproto-provider.h b/ofproto/ofproto-provider.h
index 57c7d17cb..a934a98f2 100644
--- a/ofproto/ofproto-provider.h
+++ b/ofproto/ofproto-provider.h
@@ -1962,6 +1962,7 @@ struct ofproto_flow_mod {
     bool modify_may_add_flow;
     bool modify_keep_counts;
     enum nx_flow_update_event event;
+    uint8_t table_id;
 
     /* These are only used during commit execution.
      * ofproto_flow_mod_uninit() does NOT clean these up. */
diff --git a/ofproto/ofproto.c b/ofproto/ofproto.c
index bd6103b1c..139d6d394 100644
--- a/ofproto/ofproto.c
+++ b/ofproto/ofproto.c
@@ -7967,6 +7967,7 @@ ofproto_flow_mod_init(struct ofproto *ofproto, struct ofproto_flow_mod *ofm,
     ofm->criteria.version = OVS_VERSION_NOT_REMOVED;
     ofm->conjs = NULL;
     ofm->n_conjs = 0;
+    ofm->table_id = fm->table_id;
 
     bool check_buffer_id = false;
 
@@ -8104,6 +8105,33 @@ ofproto_flow_mod_finish(struct ofproto *ofproto, struct ofproto_flow_mod *ofm,
     return error;
 }
 
+static void
+ofproto_table_classifier_defer(struct ofproto *ofproto,
+                               const struct ofproto_flow_mod *ofm)
+{
+    if (check_table_id(ofproto, ofm->table_id)) {
+        if (ofm->table_id == OFPTT_ALL) {
+            struct oftable *table;
+
+            OFPROTO_FOR_EACH_TABLE (table, ofproto) {
+                classifier_defer(&table->cls);
+            }
+        } else {
+            classifier_defer(&ofproto->tables[ofm->table_id].cls);
+        }
+    }
+}
+
+static void
+ofproto_publish_classifiers(struct ofproto *ofproto)
+{
+    struct oftable *table;
+
+    OFPROTO_FOR_EACH_TABLE (table, ofproto) {
+        classifier_publish(&table->cls);
+    }
+}
+
 /* Commit phases (all while locking ofproto_mutex):
  *
  * 1. Begin: Gather resources and make changes visible in the next version.
@@ -8165,6 +8193,10 @@ do_bundle_commit(struct ofconn *ofconn, uint32_t id, uint16_t flags)
                     /* Store the version in which the changes should take
                      * effect. */
                     be->ofm.version = version;
+                    /* Publishing of the classifier update for every flow
+                     * modification in a bundle separately is expensive in
+                     * CPU time and memory.  Deferring. */
+                    ofproto_table_classifier_defer(ofproto, &be->ofm);
                     error = ofproto_flow_mod_start(ofproto, &be->ofm);
                 } else if (be->type == OFPTYPE_GROUP_MOD) {
                     /* Store the version in which the changes should take
@@ -8173,6 +8205,9 @@ do_bundle_commit(struct ofconn *ofconn, uint32_t id, uint16_t flags)
                     error = ofproto_group_mod_start(ofproto, &be->ogm);
                 } else if (be->type == OFPTYPE_PACKET_OUT) {
                     be->opo.version = version;
+                    /* Need to use current version of flows for packet-out,
+                     * so publishing all classifiers now. */
+                    ofproto_publish_classifiers(ofproto);
                     error = ofproto_packet_out_start(ofproto, &be->opo);
                 } else {
                     OVS_NOT_REACHED();
@@ -8183,6 +8218,9 @@ do_bundle_commit(struct ofconn *ofconn, uint32_t id, uint16_t flags)
             }
         }
 
+        /* Publishing all changes made to classifiers. */
+        ofproto_publish_classifiers(ofproto);
+
         if (error) {
             /* Send error referring to the original message. */
             ofconn_send_error(ofconn, be->msg, error);
@@ -8191,14 +8229,23 @@ do_bundle_commit(struct ofconn *ofconn, uint32_t id, uint16_t flags)
             /* 2. Revert.  Undo all the changes made above. */
             LIST_FOR_EACH_REVERSE_CONTINUE(be, node, &bundle->msg_list) {
                 if (be->type == OFPTYPE_FLOW_MOD) {
+                    /* Publishing of the classifier update for every flow
+                     * modification in a bundle separately is expensive in
+                     * CPU time and memory.  Deferring. */
+                    ofproto_table_classifier_defer(ofproto, &be->ofm);
                     ofproto_flow_mod_revert(ofproto, &be->ofm);
                 } else if (be->type == OFPTYPE_GROUP_MOD) {
                     ofproto_group_mod_revert(ofproto, &be->ogm);
                 } else if (be->type == OFPTYPE_PACKET_OUT) {
+                    /* Need to use current version of flows for packet-out,
+                     * so publishing all classifiers now. */
+                    ofproto_publish_classifiers(ofproto);
                     ofproto_packet_out_revert(ofproto, &be->opo);
                 }
                 /* Nothing needs to be reverted for a port mod. */
             }
+            /* Publishing all changes made to classifiers. */
+            ofproto_publish_classifiers(ofproto);
         } else {
             /* 4. Finish. */
             LIST_FOR_EACH (be, node, &bundle->msg_list) {
-- 
2.31.1



More information about the dev mailing list