[ovs-dev] [PATCH] raft: Reintroduce jsonrpc inactivity probes.

Ilya Maximets i.maximets at ovn.org
Tue Feb 23 13:15:00 UTC 2021


It's not enough to just have heartbeats.

RAFT heartbeats are unidirectional, i.e. leader sends them to followers
but not the other way around.  Missing heartbeats provokes followers to
start election, but if leader will not receive any replies it will not
do anything while there is a quorum, i.e. there are enough other
servers to make decisions.

This leads to situation that while TCP connection is established,
leader will continue to blindly send messages to it.  In our case this
leads to growing send backlog.  Connection will be terminated
eventually due to excessive send backlog, but this this might take a
lot of time and wasted process memory.  At the same time 'candidate'
will continue to send vote requests to the dead connection on its
side.

To fix that we need to reintroduce inactivity probes that will drop
connection if there was no incoming traffic for a long time and remote
server doesn't reply to the "echo" request.  Probe interval might be
chosen based on an election timeout to avoid issues described in commit
db5a066c17bd.

Fixes: db5a066c17bd ("raft: Disable RAFT jsonrpc inactivity probe.")
Signed-off-by: Ilya Maximets <i.maximets at ovn.org>
---
 ovsdb/raft.c | 32 +++++++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/ovsdb/raft.c b/ovsdb/raft.c
index ea91d1fdb..0fb1420fb 100644
--- a/ovsdb/raft.c
+++ b/ovsdb/raft.c
@@ -940,6 +940,34 @@ raft_reset_ping_timer(struct raft *raft)
     raft->ping_timeout = time_msec() + raft->election_timer / 3;
 }
 
+static void
+raft_conn_update_probe_interval(struct raft *raft, struct raft_conn *r_conn)
+{
+    /* Inactivity probe will be sent if connection will remain idle for the
+     * time of an election timeout.  Connection will be dropped if inactivity
+     * will last twice that time.
+     *
+     * It's not enough to just have heartbeats if connection is still
+     * established, but no packets received from the other side.  Without
+     * inactivity probe follower will just try to initiate election
+     * indefinitely staying in 'candidate' role.  And the leader will continue
+     * to send heartbeats to the dead connection thinking that remote server
+     * is still part of the cluster. */
+    int probe_interval = raft->election_timer + ELECTION_RANGE_MSEC;
+
+    jsonrpc_session_set_probe_interval(r_conn->js, probe_interval);
+}
+
+static void
+raft_update_probe_intervals(struct raft *raft)
+{
+    struct raft_conn *r_conn;
+
+    LIST_FOR_EACH (r_conn, list_node, &raft->conns) {
+        raft_conn_update_probe_interval(raft, r_conn);
+    }
+}
+
 static void
 raft_add_conn(struct raft *raft, struct jsonrpc_session *js,
               const struct uuid *sid, bool incoming)
@@ -954,7 +982,7 @@ raft_add_conn(struct raft *raft, struct jsonrpc_session *js,
                                               &conn->sid);
     conn->incoming = incoming;
     conn->js_seqno = jsonrpc_session_get_seqno(conn->js);
-    jsonrpc_session_set_probe_interval(js, 0);
+    raft_conn_update_probe_interval(raft, conn);
     jsonrpc_session_set_backlog_threshold(js, raft->conn_backlog_max_n_msgs,
                                               raft->conn_backlog_max_n_bytes);
 }
@@ -2804,6 +2832,7 @@ raft_update_commit_index(struct raft *raft, uint64_t new_commit_index)
                           raft->election_timer, e->election_timer);
                 raft->election_timer = e->election_timer;
                 raft->election_timer_new = 0;
+                raft_update_probe_intervals(raft);
             }
             if (e->servers) {
                 /* raft_run_reconfigure() can write a new Raft entry, which can
@@ -2820,6 +2849,7 @@ raft_update_commit_index(struct raft *raft, uint64_t new_commit_index)
                 VLOG_INFO("Election timer changed from %"PRIu64" to %"PRIu64,
                           raft->election_timer, e->election_timer);
                 raft->election_timer = e->election_timer;
+                raft_update_probe_intervals(raft);
             }
         }
         /* Check if any pending command can be completed, and complete it.
-- 
2.26.2



More information about the dev mailing list