[ovs-dev] [PATCH v3 3/5] raft.c: Stale leader should disconnect from cluster.
Han Zhou
zhouhan at gmail.com
Mon Aug 19 16:29:58 UTC 2019
From: Han Zhou <hzhou8 at ebay.com>
As mentioned in the Raft paper, section 6.2:
Leaders: A server might be in the leader state, but if it isn’t the current
leader, it could be needlessly delaying client requests. For example, suppose a
leader is partitioned from the rest of the cluster, but it can still
communicate with a particular client. Without additional mechanism, it could
delay a request from that client forever, being unable to replicate a log entry
to any other servers. Meanwhile, there might be another leader of a newer term
that is able to communicate with a majority of the cluster and would be able to
commit the client’s request. Thus, a leader in Raft steps down if an election
timeout elapses without a successful round of heartbeats to a majority of its
cluster; this allows clients to retry their requests with another server.
Reported-by: Aliasgar Ginwala <aginwala at ebay.com>
Tested-by: Aliasgar Ginwala <aginwala at ebay.com>
Signed-off-by: Han Zhou <hzhou8 at ebay.com>
---
ovsdb/raft-private.h | 3 ++
ovsdb/raft.c | 43 +++++++++++++++--
tests/ovsdb-cluster.at | 123 +++++++++++++++++++++++++++++--------------------
3 files changed, 116 insertions(+), 53 deletions(-)
diff --git a/ovsdb/raft-private.h b/ovsdb/raft-private.h
index 35a3dd7..fb7e6e3 100644
--- a/ovsdb/raft-private.h
+++ b/ovsdb/raft-private.h
@@ -80,6 +80,9 @@ struct raft_server {
uint64_t next_index; /* Index of next log entry to send this server. */
uint64_t match_index; /* Index of max log entry server known to have. */
enum raft_server_phase phase;
+ bool replied; /* Reply to append_request was received from this
+ node during current election_timeout interval.
+ */
/* For use in adding and removing servers: */
struct uuid requester_sid; /* Nonzero if requested via RPC. */
struct unixctl_conn *requester_conn; /* Only if requested via unixctl. */
diff --git a/ovsdb/raft.c b/ovsdb/raft.c
index 1c38b3b..71c5a7e 100644
--- a/ovsdb/raft.c
+++ b/ovsdb/raft.c
@@ -250,7 +250,6 @@ struct raft {
uint64_t last_applied; /* Max log index applied to state machine. */
struct uuid leader_sid; /* Server ID of leader (zero, if unknown). */
- /* Followers and candidates only. */
#define ELECTION_BASE_MSEC 1024
#define ELECTION_RANGE_MSEC 1024
long long int election_base; /* Time of last heartbeat from leader. */
@@ -1785,7 +1784,43 @@ raft_run(struct raft *raft)
}
if (!raft->joining && time_msec() >= raft->election_timeout) {
- raft_start_election(raft, false);
+ if (raft->role == RAFT_LEADER) {
+ /* Check if majority of followers replied, then reset
+ * election_timeout and reset s->replied. Otherwise, become
+ * follower.
+ *
+ * Raft paper section 6.2: Leaders: A server might be in the leader
+ * state, but if it isn’t the current leader, it could be
+ * needlessly delaying client requests. For example, suppose a
+ * leader is partitioned from the rest of the cluster, but it can
+ * still communicate with a particular client. Without additional
+ * mechanism, it could delay a request from that client forever,
+ * being unable to replicate a log entry to any other servers.
+ * Meanwhile, there might be another leader of a newer term that is
+ * able to communicate with a majority of the cluster and would be
+ * able to commit the client’s request. Thus, a leader in Raft
+ * steps down if an election timeout elapses without a successful
+ * round of heartbeats to a majority of its cluster; this allows
+ * clients to retry their requests with another server. */
+ int count = 0;
+ HMAP_FOR_EACH (server, hmap_node, &raft->servers) {
+ if (server->replied) {
+ count ++;
+ }
+ }
+ if (count >= hmap_count(&raft->servers) / 2) {
+ HMAP_FOR_EACH (server, hmap_node, &raft->servers) {
+ server->replied = false;
+ }
+ raft_reset_election_timer(raft);
+ } else {
+ raft_become_follower(raft);
+ raft_start_election(raft, false);
+ }
+ } else {
+ raft_start_election(raft, false);
+ }
+
}
if (raft->leaving && time_msec() >= raft->leave_timeout) {
@@ -2447,6 +2482,7 @@ raft_server_init_leader(struct raft *raft, struct raft_server *s)
s->next_index = raft->log_end;
s->match_index = 0;
s->phase = RAFT_PHASE_STABLE;
+ s->replied = false;
}
static void
@@ -2462,7 +2498,7 @@ raft_become_leader(struct raft *raft)
ovs_assert(raft->role != RAFT_LEADER);
raft->role = RAFT_LEADER;
raft->leader_sid = raft->sid;
- raft->election_timeout = LLONG_MAX;
+ raft_reset_election_timer(raft);
raft_reset_ping_timer(raft);
struct raft_server *s;
@@ -3192,6 +3228,7 @@ raft_handle_append_reply(struct raft *raft,
}
}
+ s->replied = true;
if (rpy->result == RAFT_APPEND_OK) {
/* Figure 3.1: "If successful, update nextIndex and matchIndex for
* follower (section 3.5)." */
diff --git a/tests/ovsdb-cluster.at b/tests/ovsdb-cluster.at
index 02e9926..0fbb268 100644
--- a/tests/ovsdb-cluster.at
+++ b/tests/ovsdb-cluster.at
@@ -65,59 +65,82 @@ EXECUTION_EXAMPLES
AT_BANNER([OVSDB - disconnect from cluster])
-# When a node is disconnected from the cluster, the IDL should disconnect
-# and retry even if it uses a single remote, because the remote IP can be
-# a VIP on a load-balance.
-AT_SETUP([OVSDB cluster - disconnect from cluster, single remote])
-AT_KEYWORDS([ovsdb server negative unix cluster disconnect])
+OVS_START_SHELL_HELPERS
+# ovsdb_test_cluster_disconnect LEADER_OR_FOLLOWER
+ovsdb_test_cluster_disconnect () {
+ leader_or_follower=$1
+ schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema`
+ ordinal_schema > schema
+ AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr])
+ cid=`ovsdb-tool db-cid s1.db`
+ schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema`
+ for i in `seq 2 3`; do
+ AT_CHECK([ovsdb-tool join-cluster s$i.db $schema_name unix:s$i.raft unix:s1.raft])
+ done
+
+ on_exit 'kill `cat *.pid`'
+ for i in `seq 3`; do
+ AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off --detach --no-chdir --log-file=s$i.log --pidfile=s$i.pid --unixctl=s$i --remote=punix:s$i.ovsdb s$i.db])
+ done
+ for i in `seq 3`; do
+ AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected])
+ done
+
+ AT_CHECK([ovsdb-client transact unix:s1.ovsdb '[["idltest",
+ {"op": "insert",
+ "table": "simple",
+ "row": {"i": 1}}]]'], [0], [ignore], [ignore])
+
+ # When a node is disconnected from the cluster, the IDL should disconnect
+ # and retry even if it uses a single remote, because the remote IP can be
+ # a VIP on a load-balancer. So we use a single remote to test here.
+ if test "$leader_or_follower" = "leader"; then
+ target=1
+ shutdown="2 3"
+ else
+ target=3
+
+ # Shut down the follower before the leader so that there is no chance
+ # for s3 to become the leader during the process.
+ shutdown="2 1"
+ fi
+
+ # Connect to $target. Use "wait" to trigger a non-op transaction so
+ # that test-ovsdb will not quit.
+
+ test-ovsdb '-vPATTERN:console:test-ovsdb|%c|%m' -v -t10 idl unix:s$target.ovsdb '[["idltest",
+ {"op": "wait",
+ "table": "simple",
+ "where": [["i", "==", 1]],
+ "columns": ["i"],
+ "until": "==",
+ "rows": [{"i": 1}]}]]' > test-ovsdb.log 2>&1 &
+ echo $! > test-ovsdb.pid
-schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema`
-ordinal_schema > schema
-AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr])
-cid=`ovsdb-tool db-cid s1.db`
-schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema`
-for i in `seq 2 3`; do
- AT_CHECK([ovsdb-tool join-cluster s$i.db $schema_name unix:s$i.raft unix:s1.raft])
-done
-
-on_exit 'kill `cat *.pid`'
-for i in `seq 3`; do
- AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off --detach --no-chdir --log-file=s$i.log --pidfile=s$i.pid --unixctl=s$i --remote=punix:s$i.ovsdb s$i.db])
-done
-for i in `seq 3`; do
- AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected])
-done
-
-AT_CHECK([ovsdb-client transact unix:s1.ovsdb '[["idltest",
- {"op": "insert",
- "table": "simple",
- "row": {"i": 1}}]]'], [0], [ignore], [ignore])
-
-# Connect to a single remote s3. Use "wait" to trigger a non-op transaction so
-# that test-ovsdb will not quit.
-
-test-ovsdb '-vPATTERN:console:test-ovsdb|%c|%m' -v -t10 idl unix:s3.ovsdb '[["idltest",
- {"op": "wait",
- "table": "simple",
- "where": [["i", "==", 1]],
- "columns": ["i"],
- "until": "==",
- "rows": [{"i": 1}]}]]' > test-ovsdb.log 2>&1 &
-echo $! > test-ovsdb.pid
-
-OVS_WAIT_UNTIL([grep "000: i=1" test-ovsdb.log])
-
-# Shutdown the other 2 servers so that s3 is disconnected from the cluster.
-for i in 2 1; do
- OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s$i], [s$i.pid])
-done
-
-# The test-ovsdb should detect the disconnect and retry.
-OVS_WAIT_UNTIL([grep disconnect test-ovsdb.log])
-
-OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s3], [s3.pid])
+ OVS_WAIT_UNTIL([grep "000: i=1" test-ovsdb.log])
+ # Shutdown the other servers so that $target is disconnected from the cluster.
+ for i in $shutdown; do
+ OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s$i], [s$i.pid])
+ done
+
+ # The test-ovsdb should detect the disconnect and retry.
+ OVS_WAIT_UNTIL([grep disconnect test-ovsdb.log])
+
+ OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s$target], [s$target.pid])
+}
+OVS_END_SHELL_HELPERS
+
+AT_SETUP([OVSDB cluster - follower disconnect from cluster, single remote])
+AT_KEYWORDS([ovsdb server negative unix cluster disconnect])
+ovsdb_test_cluster_disconnect follower
+AT_CLEANUP
+
+AT_SETUP([OVSDB cluster - leader disconnect from cluster, single remote])
+AT_KEYWORDS([ovsdb server negative unix cluster disconnect])
+ovsdb_test_cluster_disconnect leader
AT_CLEANUP
+
OVS_START_SHELL_HELPERS
--
2.1.0
More information about the dev
mailing list