[ovs-dev] [PATCH v3 7/8] conntrack: Inverse conn and ct lock precedence

Gaetan Rivet grive at u256.net
Tue Jun 15 23:22:52 UTC 2021


The lock priority order is for the global 'ct_lock' to be taken first
and then 'conn->lock'. This is an issue, as multiple operations on
connections are thus blocked between threads contending on the
global 'ct_lock'.

This was previously necessary due to how the expiration lists, timeout
policies and zone limits were managed. They are now using RCU-friendly
structures that allow concurrent readers. The mutual exclusion now only
needs to happen during writes.

This allows reducing the 'ct_lock' precedence, and to only take it
when writing the relevant structures. This will reduce contention on
'ct_lock', which impairs scalability when the connection tracker is
used by many threads.

Signed-off-by: Gaetan Rivet <grive at u256.net>
Reviewed-by: Eli Britstein <elibr at nvidia.com>
---
 lib/conntrack-private.h |  7 ++++--
 lib/conntrack-tp.c      | 30 +---------------------
 lib/conntrack.c         | 56 +++++++++++++++++++++++++----------------
 3 files changed, 41 insertions(+), 52 deletions(-)

diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h
index ea2e7ed4d..bb82252e8 100644
--- a/lib/conntrack-private.h
+++ b/lib/conntrack-private.h
@@ -134,6 +134,9 @@ struct conn {
     struct nat_action_info_t *nat_info;
     char *alg;
     struct conn *nat_conn; /* The NAT 'conn' context, if there is one. */
+    atomic_flag reclaimed; /* False during the lifetime of the connection,
+                            * True as soon as a thread has started freeing
+                            * its memory. */
 
     /* Inserted once by a PMD, then managed by the 'ct_clean' thread. */
     struct conn_expire exp;
@@ -196,8 +199,8 @@ struct conntrack {
 };
 
 /* Lock acquisition order:
- *    1. 'ct_lock'
- *    2. 'conn->lock'
+ *    1. 'conn->lock'
+ *    2. 'ct_lock'
  *    3. 'resources_lock'
  */
 
diff --git a/lib/conntrack-tp.c b/lib/conntrack-tp.c
index 592e10c6f..22363e7fe 100644
--- a/lib/conntrack-tp.c
+++ b/lib/conntrack-tp.c
@@ -245,58 +245,30 @@ conn_schedule_expiration(struct conn *conn, enum ct_timeout tm, long long now,
     ignore(atomic_flag_test_and_set(&conn->exp.reschedule));
 }
 
-static void
-conn_update_expiration__(struct conntrack *ct, struct conn *conn,
-                         enum ct_timeout tm, long long now,
-                         uint32_t tp_value)
-    OVS_REQUIRES(conn->lock)
-{
-    ovs_mutex_unlock(&conn->lock);
-
-    ovs_mutex_lock(&ct->ct_lock);
-    ovs_mutex_lock(&conn->lock);
-    conn_schedule_expiration(conn, tm, now, tp_value);
-    ovs_mutex_unlock(&conn->lock);
-    ovs_mutex_unlock(&ct->ct_lock);
-
-    ovs_mutex_lock(&conn->lock);
-}
-
 /* The conn entry lock must be held on entry and exit. */
 void
 conn_update_expiration(struct conntrack *ct, struct conn *conn,
                        enum ct_timeout tm, long long now)
-    OVS_REQUIRES(conn->lock)
 {
     struct timeout_policy *tp;
     uint32_t val;
 
-    ovs_mutex_unlock(&conn->lock);
-
-    ovs_mutex_lock(&ct->ct_lock);
-    ovs_mutex_lock(&conn->lock);
     tp = timeout_policy_lookup(ct, conn->tp_id);
     if (tp) {
         val = tp->policy.attrs[tm_to_ct_dpif_tp(tm)];
     } else {
         val = ct_dpif_netdev_tp_def[tm_to_ct_dpif_tp(tm)];
     }
-    ovs_mutex_unlock(&conn->lock);
-    ovs_mutex_unlock(&ct->ct_lock);
-
-    ovs_mutex_lock(&conn->lock);
     VLOG_DBG_RL(&rl, "Update timeout %s zone=%u with policy id=%d "
                 "val=%u sec.",
                 ct_timeout_str[tm], conn->key.zone, conn->tp_id, val);
 
-    conn_update_expiration__(ct, conn, tm, now, val);
+    conn_schedule_expiration(conn, tm, now, val);
 }
 
-/* ct_lock must be held. */
 void
 conn_init_expiration(struct conntrack *ct, struct conn *conn,
                      enum ct_timeout tm, long long now)
-    OVS_REQUIRES(ct->ct_lock)
 {
     struct timeout_policy *tp;
     uint32_t val;
diff --git a/lib/conntrack.c b/lib/conntrack.c
index 71f51f3d9..045710e8d 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -465,7 +465,7 @@ zone_limit_delete(struct conntrack *ct, uint16_t zone)
 
 static void
 conn_clean_cmn(struct conntrack *ct, struct conn *conn)
-    OVS_REQUIRES(ct->ct_lock)
+    OVS_REQUIRES(conn->lock, ct->ct_lock)
 {
     if (conn->alg) {
         expectation_clean(ct, &conn->key);
@@ -495,18 +495,29 @@ conn_unref(struct conn *conn)
  * removes the associated nat 'conn' from the lookup datastructures. */
 static void
 conn_clean(struct conntrack *ct, struct conn *conn)
-    OVS_REQUIRES(ct->ct_lock)
+    OVS_EXCLUDED(conn->lock, ct->ct_lock)
 {
     ovs_assert(conn->conn_type == CT_CONN_TYPE_DEFAULT);
 
+    if (atomic_flag_test_and_set(&conn->reclaimed)) {
+        return;
+    }
+
+    ovs_mutex_lock(&conn->lock);
+
+    ovs_mutex_lock(&ct->ct_lock);
     conn_clean_cmn(ct, conn);
     if (conn->nat_conn) {
         uint32_t hash = conn_key_hash(&conn->nat_conn->key, ct->hash_basis);
         cmap_remove(&ct->conns, &conn->nat_conn->cm_node, hash);
     }
+    ovs_mutex_unlock(&ct->ct_lock);
+
     conn->cleaned = true;
     conn_unref(conn);
     atomic_count_dec(&ct->n_conn);
+
+    ovs_mutex_unlock(&conn->lock);
 }
 
 static inline bool
@@ -521,14 +532,25 @@ conn_unref_one(struct conn *conn)
 
 static void
 conn_clean_one(struct conntrack *ct, struct conn *conn)
-    OVS_REQUIRES(ct->ct_lock)
+    OVS_EXCLUDED(conn->lock, ct->ct_lock)
 {
+    if (atomic_flag_test_and_set(&conn->reclaimed)) {
+        return;
+    }
+
+    ovs_mutex_lock(&conn->lock);
+
+    ovs_mutex_lock(&ct->ct_lock);
     conn_clean_cmn(ct, conn);
+    ovs_mutex_unlock(&ct->ct_lock);
+
     if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
         conn->cleaned = true;
         atomic_count_dec(&ct->n_conn);
     }
     conn_unref_one(conn);
+
+    ovs_mutex_unlock(&conn->lock);
 }
 
 /* Destroys the connection tracker 'ct' and frees all the allocated memory.
@@ -542,8 +564,6 @@ conntrack_destroy(struct conntrack *ct)
     pthread_join(ct->clean_thread, NULL);
     latch_destroy(&ct->clean_thread_exit);
 
-    ovs_mutex_lock(&ct->ct_lock);
-
     for (unsigned i = 0; i < N_CT_TM; i++) {
         struct mpsc_queue_node *node;
 
@@ -559,7 +579,6 @@ conntrack_destroy(struct conntrack *ct)
     CMAP_FOR_EACH (conn, cm_node, &ct->conns) {
         conn_clean_one(ct, conn);
     }
-    cmap_destroy(&ct->conns);
 
     struct zone_limit *zl;
     CMAP_FOR_EACH (zl, node, &ct->zone_limits) {
@@ -568,7 +587,6 @@ conntrack_destroy(struct conntrack *ct)
         cmap_remove(&ct->zone_limits, &zl->node, hash);
         ovsrcu_postpone(free, zl);
     }
-    cmap_destroy(&ct->zone_limits);
 
     struct timeout_policy *tp;
     CMAP_FOR_EACH (tp, node, &ct->timeout_policies) {
@@ -577,6 +595,11 @@ conntrack_destroy(struct conntrack *ct)
         cmap_remove(&ct->timeout_policies, &tp->node, hash);
         ovsrcu_postpone(free, tp);
     }
+
+    ovs_mutex_lock(&ct->ct_lock);
+
+    cmap_destroy(&ct->conns);
+    cmap_destroy(&ct->zone_limits);
     cmap_destroy(&ct->timeout_policies);
 
     ovs_mutex_unlock(&ct->ct_lock);
@@ -1034,7 +1057,6 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
                const struct nat_action_info_t *nat_action_info,
                const char *helper, const struct alg_exp_node *alg_exp,
                enum ct_alg_ctl_type ct_alg_ctl, uint32_t tp_id)
-    OVS_REQUIRES(ct->ct_lock)
 {
     struct conn *nc = NULL;
     struct conn *nat_conn = NULL;
@@ -1113,13 +1135,18 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
             nat_conn->alg = NULL;
             nat_conn->nat_conn = NULL;
             uint32_t nat_hash = conn_key_hash(&nat_conn->key, ct->hash_basis);
+            ovs_mutex_lock(&ct->ct_lock);
             cmap_insert(&ct->conns, &nat_conn->cm_node, nat_hash);
+            ovs_mutex_unlock(&ct->ct_lock);
         }
 
         nc->nat_conn = nat_conn;
         ovs_mutex_init_adaptive(&nc->lock);
         nc->conn_type = CT_CONN_TYPE_DEFAULT;
+        atomic_flag_clear(&nc->reclaimed);
+        ovs_mutex_lock(&ct->ct_lock);
         cmap_insert(&ct->conns, &nc->cm_node, ctx->hash);
+        ovs_mutex_unlock(&ct->ct_lock);
         conn_expire_push_back(ct, nc);
         atomic_count_inc(&ct->n_conn);
         ctx->conn = nc; /* For completeness. */
@@ -1180,11 +1207,9 @@ conn_update_state(struct conntrack *ct, struct dp_packet *pkt,
             pkt->md.ct_state = CS_INVALID;
             break;
         case CT_UPDATE_NEW:
-            ovs_mutex_lock(&ct->ct_lock);
             if (conn_lookup(ct, &conn->key, now, NULL, NULL)) {
                 conn_clean(ct, conn);
             }
-            ovs_mutex_unlock(&ct->ct_lock);
             create_new_conn = true;
             break;
         case CT_UPDATE_VALID_NEW:
@@ -1366,11 +1391,9 @@ process_one(struct conntrack *ct, struct dp_packet *pkt,
 
     /* Delete found entry if in wrong direction. 'force' implies commit. */
     if (OVS_UNLIKELY(force && ctx->reply && conn)) {
-        ovs_mutex_lock(&ct->ct_lock);
         if (conn_lookup(ct, &conn->key, now, NULL, NULL)) {
             conn_clean(ct, conn);
         }
-        ovs_mutex_unlock(&ct->ct_lock);
         conn = NULL;
     }
 
@@ -1434,12 +1457,10 @@ process_one(struct conntrack *ct, struct dp_packet *pkt,
         }
         ovs_rwlock_unlock(&ct->resources_lock);
 
-        ovs_mutex_lock(&ct->ct_lock);
         if (!conn_lookup(ct, &ctx->key, now, NULL, NULL)) {
             conn = conn_not_found(ct, pkt, ctx, commit, now, nat_action_info,
                                   helper, alg_exp, ct_alg_ctl, tp_id);
         }
-        ovs_mutex_unlock(&ct->ct_lock);
     }
 
     write_ct_md(pkt, zone, conn, &ctx->key, alg_exp);
@@ -1564,8 +1585,6 @@ ct_sweep(struct conntrack *ct, long long now, size_t limit)
     struct mpsc_queue_node *node;
     size_t count = 0;
 
-    ovs_mutex_lock(&ct->ct_lock);
-
     for (unsigned i = 0; i < N_CT_TM; i++) {
         struct conn *end_of_queue = NULL;
 
@@ -1630,7 +1649,6 @@ ct_sweep(struct conntrack *ct, long long now, size_t limit)
 out:
     VLOG_DBG("conntrack cleanup %"PRIuSIZE" entries in %lld msec", count,
              time_msec() - now);
-    ovs_mutex_unlock(&ct->ct_lock);
     return min_expiration;
 }
 
@@ -2688,13 +2706,11 @@ conntrack_flush(struct conntrack *ct, const uint16_t *zone)
 {
     struct conn *conn;
 
-    ovs_mutex_lock(&ct->ct_lock);
     CMAP_FOR_EACH (conn, cm_node, &ct->conns) {
         if (!zone || *zone == conn->key.zone) {
             conn_clean_one(ct, conn);
         }
     }
-    ovs_mutex_unlock(&ct->ct_lock);
 
     return 0;
 }
@@ -2709,7 +2725,6 @@ conntrack_flush_tuple(struct conntrack *ct, const struct ct_dpif_tuple *tuple,
 
     memset(&key, 0, sizeof(key));
     tuple_to_conn_key(tuple, zone, &key);
-    ovs_mutex_lock(&ct->ct_lock);
     conn_lookup(ct, &key, time_msec(), &conn, NULL);
 
     if (conn && conn->conn_type == CT_CONN_TYPE_DEFAULT) {
@@ -2719,7 +2734,6 @@ conntrack_flush_tuple(struct conntrack *ct, const struct ct_dpif_tuple *tuple,
         error = ENOENT;
     }
 
-    ovs_mutex_unlock(&ct->ct_lock);
     return error;
 }
 
-- 
2.31.1



More information about the dev mailing list