[ovs-dev] [PATCH v1 1/9] conntrack: Use rcu-lists to store conn expirations

Gaetan Rivet grive at u256.net
Wed Feb 17 16:34:35 UTC 2021


Change the connection expiration lists from ovs_list to rculist.
This is a pre-step towards reducing the granularity of 'ct_lock'.

Signed-off-by: Gaetan Rivet <grive at u256.net>
Reviewed-by: Eli Britstein <elibr at nvidia.com>
---
 lib/conntrack-private.h | 76 +++++++++++++++++++++++++++--------------
 lib/conntrack-tp.c      | 60 +++++++++++++++++++++++++-------
 lib/conntrack.c         | 14 ++++----
 3 files changed, 105 insertions(+), 45 deletions(-)

diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h
index e8332bdba..4b6f9eae3 100644
--- a/lib/conntrack-private.h
+++ b/lib/conntrack-private.h
@@ -29,6 +29,7 @@
 #include "openvswitch/list.h"
 #include "openvswitch/types.h"
 #include "packets.h"
+#include "rculist.h"
 #include "unaligned.h"
 #include "dp-packet.h"
 
@@ -86,17 +87,55 @@ struct alg_exp_node {
     bool nat_rpl_dst;
 };
 
+/* Timeouts: all the possible timeout states passed to update_expiration()
+ * are listed here. The name will be prefix by CT_TM_ and the value is in
+ * milliseconds */
+#define CT_TIMEOUTS \
+    CT_TIMEOUT(TCP_FIRST_PACKET) \
+    CT_TIMEOUT(TCP_OPENING) \
+    CT_TIMEOUT(TCP_ESTABLISHED) \
+    CT_TIMEOUT(TCP_CLOSING) \
+    CT_TIMEOUT(TCP_FIN_WAIT) \
+    CT_TIMEOUT(TCP_CLOSED) \
+    CT_TIMEOUT(OTHER_FIRST) \
+    CT_TIMEOUT(OTHER_MULTIPLE) \
+    CT_TIMEOUT(OTHER_BIDIR) \
+    CT_TIMEOUT(ICMP_FIRST) \
+    CT_TIMEOUT(ICMP_REPLY)
+
+enum ct_timeout {
+#define CT_TIMEOUT(NAME) CT_TM_##NAME,
+    CT_TIMEOUTS
+#undef CT_TIMEOUT
+    N_CT_TM
+};
+
 enum OVS_PACKED_ENUM ct_conn_type {
     CT_CONN_TYPE_DEFAULT,
     CT_CONN_TYPE_UN_NAT,
 };
 
+struct conn_expire {
+    /* Set once when initializing the expiration node. */
+    struct conntrack *ct;
+    /* Timeout state of the connection.
+     * It follows the connection state updates.
+     */
+    enum ct_timeout tm;
+    /* Insert and remove the expiration node only once per RCU syncs.
+     * If multiple threads update the connection, its expiration should
+     * be removed only once and added only once to timeout lists.
+     */
+    atomic_flag insert_once;
+    atomic_flag remove_once;
+    struct rculist node;
+};
+
 struct conn {
     /* Immutable data. */
     struct conn_key key;
     struct conn_key rev_key;
     struct conn_key parent_key; /* Only used for orig_tuple support. */
-    struct ovs_list exp_node;
     struct cmap_node cm_node;
     struct nat_action_info_t *nat_info;
     char *alg;
@@ -104,6 +143,7 @@ struct conn {
 
     /* Mutable data. */
     struct ovs_mutex lock; /* Guards all mutable fields. */
+    struct conn_expire exp;
     ovs_u128 label;
     long long expiration;
     uint32_t mark;
@@ -132,33 +172,10 @@ enum ct_update_res {
     CT_UPDATE_VALID_NEW,
 };
 
-/* Timeouts: all the possible timeout states passed to update_expiration()
- * are listed here. The name will be prefix by CT_TM_ and the value is in
- * milliseconds */
-#define CT_TIMEOUTS \
-    CT_TIMEOUT(TCP_FIRST_PACKET) \
-    CT_TIMEOUT(TCP_OPENING) \
-    CT_TIMEOUT(TCP_ESTABLISHED) \
-    CT_TIMEOUT(TCP_CLOSING) \
-    CT_TIMEOUT(TCP_FIN_WAIT) \
-    CT_TIMEOUT(TCP_CLOSED) \
-    CT_TIMEOUT(OTHER_FIRST) \
-    CT_TIMEOUT(OTHER_MULTIPLE) \
-    CT_TIMEOUT(OTHER_BIDIR) \
-    CT_TIMEOUT(ICMP_FIRST) \
-    CT_TIMEOUT(ICMP_REPLY)
-
-enum ct_timeout {
-#define CT_TIMEOUT(NAME) CT_TM_##NAME,
-    CT_TIMEOUTS
-#undef CT_TIMEOUT
-    N_CT_TM
-};
-
 struct conntrack {
     struct ovs_mutex ct_lock; /* Protects 2 following fields. */
     struct cmap conns OVS_GUARDED;
-    struct ovs_list exp_lists[N_CT_TM] OVS_GUARDED;
+    struct rculist exp_lists[N_CT_TM] OVS_GUARDED;
     struct hmap zone_limits OVS_GUARDED;
     struct hmap timeout_policies OVS_GUARDED;
     uint32_t hash_basis; /* Salt for hashing a connection key. */
@@ -204,4 +221,13 @@ struct ct_l4_proto {
                                struct ct_dpif_protoinfo *);
 };
 
+static inline void
+conn_expire_remove(struct conn_expire *exp)
+{
+    if (!atomic_flag_test_and_set(&exp->remove_once)
+        && rculist_next(&exp->node)) {
+        rculist_remove(&exp->node);
+    }
+}
+
 #endif /* conntrack-private.h */
diff --git a/lib/conntrack-tp.c b/lib/conntrack-tp.c
index a586d3a8d..30ba4bda8 100644
--- a/lib/conntrack-tp.c
+++ b/lib/conntrack-tp.c
@@ -230,6 +230,50 @@ tm_to_ct_dpif_tp(enum ct_timeout tm)
     return CT_DPIF_TP_ATTR_MAX;
 }
 
+static void
+conn_expire_init(struct conn *conn, struct conntrack *ct)
+{
+    struct conn_expire *exp = &conn->exp;
+
+    if (exp->ct != NULL) {
+        return;
+    }
+
+    exp->ct = ct;
+    atomic_flag_clear(&exp->insert_once);
+    atomic_flag_clear(&exp->remove_once);
+    /* The expiration is initially unscheduled, flag it as 'removed'. */
+    atomic_flag_test_and_set(&exp->remove_once);
+}
+
+static void
+conn_expire_insert(struct conn *conn)
+{
+    struct conn_expire *exp = &conn->exp;
+
+    ovs_mutex_lock(&exp->ct->ct_lock);
+    ovs_mutex_lock(&conn->lock);
+
+    rculist_push_back(&exp->ct->exp_lists[exp->tm], &exp->node);
+    atomic_flag_clear(&exp->insert_once);
+    atomic_flag_clear(&exp->remove_once);
+
+    ovs_mutex_unlock(&conn->lock);
+    ovs_mutex_unlock(&exp->ct->ct_lock);
+}
+
+static void
+conn_schedule_expiration(struct conntrack *ct, struct conn *conn,
+                         enum ct_timeout tm, long long now, uint32_t tp_value)
+{
+    conn_expire_init(conn, ct);
+    conn->expiration = now + tp_value * 1000;
+    conn->exp.tm = tm;
+    if (!atomic_flag_test_and_set(&conn->exp.insert_once)) {
+        ovsrcu_postpone(conn_expire_insert, conn);
+    }
+}
+
 static void
 conn_update_expiration__(struct conntrack *ct, struct conn *conn,
                          enum ct_timeout tm, long long now,
@@ -241,9 +285,8 @@ conn_update_expiration__(struct conntrack *ct, struct conn *conn,
     ovs_mutex_lock(&ct->ct_lock);
     ovs_mutex_lock(&conn->lock);
     if (!conn->cleaned) {
-        conn->expiration = now + tp_value * 1000;
-        ovs_list_remove(&conn->exp_node);
-        ovs_list_push_back(&ct->exp_lists[tm], &conn->exp_node);
+        conn_expire_remove(&conn->exp);
+        conn_schedule_expiration(ct, conn, tm, now, tp_value);
     }
     ovs_mutex_unlock(&conn->lock);
     ovs_mutex_unlock(&ct->ct_lock);
@@ -281,15 +324,6 @@ conn_update_expiration(struct conntrack *ct, struct conn *conn,
     conn_update_expiration__(ct, conn, tm, now, val);
 }
 
-static void
-conn_init_expiration__(struct conntrack *ct, struct conn *conn,
-                       enum ct_timeout tm, long long now,
-                       uint32_t tp_value)
-{
-    conn->expiration = now + tp_value * 1000;
-    ovs_list_push_back(&ct->exp_lists[tm], &conn->exp_node);
-}
-
 /* ct_lock must be held. */
 void
 conn_init_expiration(struct conntrack *ct, struct conn *conn,
@@ -309,5 +343,5 @@ conn_init_expiration(struct conntrack *ct, struct conn *conn,
     VLOG_DBG_RL(&rl, "Init timeout %s zone=%u with policy id=%d val=%u sec.",
                 ct_timeout_str[tm], conn->key.zone, conn->tp_id, val);
 
-    conn_init_expiration__(ct, conn, tm, now, val);
+    conn_schedule_expiration(ct, conn, tm, now, val);
 }
diff --git a/lib/conntrack.c b/lib/conntrack.c
index 99198a601..ac12f9196 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -301,7 +301,7 @@ conntrack_init(void)
     ovs_mutex_lock(&ct->ct_lock);
     cmap_init(&ct->conns);
     for (unsigned i = 0; i < ARRAY_SIZE(ct->exp_lists); i++) {
-        ovs_list_init(&ct->exp_lists[i]);
+        rculist_init(&ct->exp_lists[i]);
     }
     hmap_init(&ct->zone_limits);
     ct->zone_limit_seq = 0;
@@ -466,7 +466,7 @@ conn_clean(struct conntrack *ct, struct conn *conn)
         uint32_t hash = conn_key_hash(&conn->nat_conn->key, ct->hash_basis);
         cmap_remove(&ct->conns, &conn->nat_conn->cm_node, hash);
     }
-    ovs_list_remove(&conn->exp_node);
+    conn_expire_remove(&conn->exp);
     conn->cleaned = true;
     ovsrcu_postpone(delete_conn, conn);
     atomic_count_dec(&ct->n_conn);
@@ -478,7 +478,7 @@ conn_clean_one(struct conntrack *ct, struct conn *conn)
 {
     conn_clean_cmn(ct, conn);
     if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
-        ovs_list_remove(&conn->exp_node);
+        conn_expire_remove(&conn->exp);
         conn->cleaned = true;
         atomic_count_dec(&ct->n_conn);
     }
@@ -1075,8 +1075,8 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
      * can limit DoS impact. */
 nat_res_exhaustion:
     free(nat_conn);
-    ovs_list_remove(&nc->exp_node);
-    delete_conn_cmn(nc);
+    conn_expire_remove(&nc->exp);
+    ovsrcu_postpone(delete_conn_cmn, nc);
     static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
     VLOG_WARN_RL(&rl, "Unable to NAT due to tuple space exhaustion - "
                  "if DoS attack, use firewalling and/or zone partitioning.");
@@ -1493,14 +1493,14 @@ set_label(struct dp_packet *pkt, struct conn *conn,
 static long long
 ct_sweep(struct conntrack *ct, long long now, size_t limit)
 {
-    struct conn *conn, *next;
+    struct conn *conn;
     long long min_expiration = LLONG_MAX;
     size_t count = 0;
 
     ovs_mutex_lock(&ct->ct_lock);
 
     for (unsigned i = 0; i < N_CT_TM; i++) {
-        LIST_FOR_EACH_SAFE (conn, next, exp_node, &ct->exp_lists[i]) {
+        RCULIST_FOR_EACH (conn, exp.node, &ct->exp_lists[i]) {
             ovs_mutex_lock(&conn->lock);
             if (now < conn->expiration || count >= limit) {
                 min_expiration = MIN(min_expiration, conn->expiration);
-- 
2.30.0



More information about the dev mailing list