[ovs-dev] [PATCH 3/4] raft.c: Set candidate_retrying if no leader elected since last election.

Han Zhou zhouhan at gmail.com
Tue Aug 13 16:23:21 UTC 2019


From: Han Zhou <hzhou8 at ebay.com>

candiate_retrying is used to determine if the current node is disconnected
from the cluster when the node is in candiate role. However, a node
can flap between candidate and follower role before a leader is elected
when majority of the cluster is down, so is_connected() will flap, too, which
confuses clients.

This patch avoids the flapping with the help of a new member had_leader,
so that if no leader was elected since last election, we know we are
still retrying, and keep as disconnected from the cluster.

Signed-off-by: Han Zhou <hzhou8 at ebay.com>
---
 ovsdb/raft.c | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/ovsdb/raft.c b/ovsdb/raft.c
index 1c38b3b..63c3081 100644
--- a/ovsdb/raft.c
+++ b/ovsdb/raft.c
@@ -286,8 +286,11 @@ struct raft {
 
     /* Candidates only.  Reinitialized at start of election. */
     int n_votes;                /* Number of votes for me. */
-    bool candidate_retrying;    /* The first round of election timed-out and it
-                                   is now retrying. */
+    bool candidate_retrying;    /* The earlier election timed-out and we are
+                                   now retrying. */
+    bool had_leader;            /* There has been leader elected since last
+                                   election initiated. This is to help setting
+                                   candidate_retrying. */
 };
 
 /* All Raft structures. */
@@ -345,6 +348,7 @@ static bool raft_handle_write_error(struct raft *, struct ovsdb_error *);
 
 static void raft_run_reconfigure(struct raft *);
 
+static void raft_set_leader(struct raft *, const struct uuid *sid);
 static struct raft_server *
 raft_find_server(const struct raft *raft, const struct uuid *sid)
 {
@@ -1616,8 +1620,11 @@ raft_start_election(struct raft *raft, bool leadership_transfer)
     }
 
     ovs_assert(raft->role != RAFT_LEADER);
-    raft->candidate_retrying = (raft->role == RAFT_CANDIDATE);
     raft->role = RAFT_CANDIDATE;
+    /* If there was no leader elected since last election, we know we are
+     * retrying now. */
+    raft->candidate_retrying = !raft->had_leader;
+    raft->had_leader = false;
 
     raft->n_votes = 0;
 
@@ -2450,6 +2457,14 @@ raft_server_init_leader(struct raft *raft, struct raft_server *s)
 }
 
 static void
+raft_set_leader(struct raft *raft, const struct uuid *sid)
+{
+    raft->leader_sid = *sid;
+    raft->had_leader = true;
+    raft->candidate_retrying = false;
+}
+
+static void
 raft_become_leader(struct raft *raft)
 {
     log_all_commands(raft);
@@ -2461,7 +2476,7 @@ raft_become_leader(struct raft *raft)
 
     ovs_assert(raft->role != RAFT_LEADER);
     raft->role = RAFT_LEADER;
-    raft->leader_sid = raft->sid;
+    raft_set_leader(raft, &raft->sid);
     raft->election_timeout = LLONG_MAX;
     raft_reset_ping_timer(raft);
 
@@ -2855,7 +2870,7 @@ raft_update_leader(struct raft *raft, const struct uuid *sid)
                       raft_get_nickname(raft, sid, buf, sizeof buf),
                       raft->term);
         }
-        raft->leader_sid = *sid;
+        raft_set_leader(raft, sid);
 
         /* Record the leader to the log.  This is not used by the algorithm
          * (although it could be, for quick restart), but it is used for
-- 
2.1.0



More information about the dev mailing list