[ovs-dev] [PATCH 4/4] raft.c: Stale leader should disconnect from cluster.
Han Zhou
zhouhan at gmail.com
Tue Aug 13 20:29:30 UTC 2019
On Tue, Aug 13, 2019 at 9:23 AM Han Zhou <zhouhan at gmail.com> wrote:
>
> From: Han Zhou <hzhou8 at ebay.com>
>
> As mentioned in RAFT paper, section 6.2:
>
> Leaders: A server might be in the leader state, but if it isn’t the
current
> leader, it could be needlessly delaying client requests. For example,
suppose a
> leader is partitioned from the rest of the cluster, but it can still
> communicate with a particular client. Without additional mechanism, it
could
> delay a request from that client forever, being unable to replicate a log
entry
> to any other servers. Meanwhile, there might be another leader of a newer
term
> that is able to communicate with a majority of the cluster and would be
able to
> commit the client’s request. Thus, a leader in Raft steps down if an
election
> timeout elapses without a successful round of heartbeats to a majority of
its
> cluster; this allows clients to retry their requests with another server.
>
> Signed-off-by: Han Zhou <hzhou8 at ebay.com>
> ---
> ovsdb/raft-private.h | 3 ++
> ovsdb/raft.c | 42 ++++++++++++++++-
> tests/ovsdb-cluster.at | 123
+++++++++++++++++++++++++++++--------------------
> 3 files changed, 116 insertions(+), 52 deletions(-)
>
> diff --git a/ovsdb/raft-private.h b/ovsdb/raft-private.h
> index 35a3dd7..fb7e6e3 100644
> --- a/ovsdb/raft-private.h
> +++ b/ovsdb/raft-private.h
> @@ -80,6 +80,9 @@ struct raft_server {
> uint64_t next_index; /* Index of next log entry to send this
server. */
> uint64_t match_index; /* Index of max log entry server known to
have. */
> enum raft_server_phase phase;
> + bool replied; /* Reply to append_request was received
from this
> + node during current election_timeout
interval.
> + */
> /* For use in adding and removing servers: */
> struct uuid requester_sid; /* Nonzero if requested via RPC. */
> struct unixctl_conn *requester_conn; /* Only if requested via
unixctl. */
> diff --git a/ovsdb/raft.c b/ovsdb/raft.c
> index 63c3081..9516a6f 100644
> --- a/ovsdb/raft.c
> +++ b/ovsdb/raft.c
> @@ -1792,7 +1792,43 @@ raft_run(struct raft *raft)
> }
>
> if (!raft->joining && time_msec() >= raft->election_timeout) {
> - raft_start_election(raft, false);
> + if (raft->role == RAFT_LEADER) {
> +            /* Check if a majority of followers replied, then reset
> +             * election_timeout and reset s->replied.  Otherwise, step
> +             * down to follower.
> + *
> + * Raft paper section 6.2: Leaders: A server might be in the
leader
> + * state, but if it isn’t the current leader, it could be
> + * needlessly delaying client requests. For example, suppose
a
> + * leader is partitioned from the rest of the cluster, but
it can
> + * still communicate with a particular client. Without
additional
> + * mechanism, it could delay a request from that client
forever,
> + * being unable to replicate a log entry to any other
servers.
> + * Meanwhile, there might be another leader of a newer term
that is
> + * able to communicate with a majority of the cluster and
would be
> + * able to commit the client’s request. Thus, a leader in
Raft
> + * steps down if an election timeout elapses without a
successful
> + * round of heartbeats to a majority of its cluster; this
allows
> + * clients to retry their requests with another server. */
> + int count = 0;
> + HMAP_FOR_EACH (server, hmap_node, &raft->servers) {
> + if (server->replied) {
> + count ++;
> + }
> + }
> + if (count >= hmap_count(&raft->servers) / 2) {
> + HMAP_FOR_EACH (server, hmap_node, &raft->servers) {
> + server->replied = false;
> + }
> + raft_reset_election_timer(raft);
> + } else {
> + raft_become_follower(raft);
> + raft_start_election(raft, false);
> + }
> + } else {
> + raft_start_election(raft, false);
> + }
> +
> }
>
> if (raft->leaving && time_msec() >= raft->leave_timeout) {
> @@ -2454,6 +2490,7 @@ raft_server_init_leader(struct raft *raft, struct
raft_server *s)
> s->next_index = raft->log_end;
> s->match_index = 0;
> s->phase = RAFT_PHASE_STABLE;
> + s->replied = false;
> }
>
> static void
> @@ -2477,7 +2514,7 @@ raft_become_leader(struct raft *raft)
> ovs_assert(raft->role != RAFT_LEADER);
> raft->role = RAFT_LEADER;
> raft_set_leader(raft, &raft->sid);
> - raft->election_timeout = LLONG_MAX;
> + raft_reset_election_timer(raft);
> raft_reset_ping_timer(raft);
>
> struct raft_server *s;
> @@ -3207,6 +3244,7 @@ raft_handle_append_reply(struct raft *raft,
> }
> }
>
> + s->replied = true;
> if (rpy->result == RAFT_APPEND_OK) {
> /* Figure 3.1: "If successful, update nextIndex and matchIndex
for
> * follower (section 3.5)." */
> diff --git a/tests/ovsdb-cluster.at b/tests/ovsdb-cluster.at
> index 6a13843..7146fe6 100644
> --- a/tests/ovsdb-cluster.at
> +++ b/tests/ovsdb-cluster.at
> @@ -65,59 +65,82 @@ EXECUTION_EXAMPLES
>
> AT_BANNER([OVSDB - disconnect from cluster])
>
> -# When a node is disconnected from the cluster, the IDL should disconnect
> -# and retry even if it uses a single remote, because the remote IP can be
> -# a VIP on a load-balance.
> -AT_SETUP([OVSDB cluster - disconnect from cluster, single remote])
> -AT_KEYWORDS([ovsdb server negative unix cluster disconnect])
> +OVS_START_SHELL_HELPERS
> +# ovsdb_test_cluster_disconnect LEADER_OR_FOLLOWER
> +ovsdb_test_cluster_disconnect () {
> + leader_or_follower=$1
> + schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema`
> + ordinal_schema > schema
> + AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster
s1.db $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr])
> + cid=`ovsdb-tool db-cid s1.db`
> + schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema`
> + for i in `seq 2 3`; do
> + AT_CHECK([ovsdb-tool join-cluster s$i.db $schema_name
unix:s$i.raft unix:s1.raft])
> + done
> +
> + on_exit 'kill `cat *.pid`'
> + for i in `seq 3`; do
> + AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off --detach
--no-chdir --log-file=s$i.log --pidfile=s$i.pid --unixctl=s$i
--remote=punix:s$i.ovsdb s$i.db])
> + done
> + for i in `seq 3`; do
> + AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name
connected])
> + done
> +
> + AT_CHECK([ovsdb-client transact unix:s1.ovsdb '[["idltest",
> + {"op": "insert",
> + "table": "simple",
> + "row": {"i": 1}}]]'], [0], [ignore], [ignore])
> +
> +    # When a node is disconnected from the cluster, the IDL should
> +    # disconnect and retry even if it uses a single remote, because the
> +    # remote IP can be a VIP on a load balancer.  So we use a single
> +    # remote to test here.
> + if test $leader_or_follower == "leader"; then
> + target=1
> + shutdown="2 3"
> + else
> + target=3
> +
> +        # Shut down the follower before the leader so that there is no
> +        # chance for s3 to become leader during the process.
> + fi
> +
> + # Connect to $target. Use "wait" to trigger a non-op transaction so
> + # that test-ovsdb will not quit.
> +
> + on_exit 'pkill test-ovsdb'
> + test-ovsdb '-vPATTERN:console:test-ovsdb|%c|%m' -v -t10 idl
unix:s$target.ovsdb '[["idltest",
> + {"op": "wait",
> + "table": "simple",
> + "where": [["i", "==", 1]],
> + "columns": ["i"],
> + "until": "==",
> + "rows": [{"i": 1}]}]]' > test-ovsdb.log 2>&1 &
>
> -schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema`
> -ordinal_schema > schema
> -AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db
$abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr])
> -cid=`ovsdb-tool db-cid s1.db`
> -schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema`
> -for i in `seq 2 3`; do
> - AT_CHECK([ovsdb-tool join-cluster s$i.db $schema_name unix:s$i.raft
unix:s1.raft])
> -done
> -
> -on_exit 'kill `cat *.pid`'
> -for i in `seq 3`; do
> - AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off --detach
--no-chdir --log-file=s$i.log --pidfile=s$i.pid --unixctl=s$i
--remote=punix:s$i.ovsdb s$i.db])
> -done
> -for i in `seq 3`; do
> - AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected])
> -done
> -
> -AT_CHECK([ovsdb-client transact unix:s1.ovsdb '[["idltest",
> - {"op": "insert",
> - "table": "simple",
> - "row": {"i": 1}}]]'], [0], [ignore], [ignore])
> -
> -# Connect to a single remote s3. Use "wait" to trigger a non-op
transaction so
> -# that test-ovsdb will not quit.
> -
> -on_exit 'pkill test-ovsdb'
> -test-ovsdb '-vPATTERN:console:test-ovsdb|%c|%m' -v -t10 idl
unix:s3.ovsdb '[["idltest",
> - {"op": "wait",
> - "table": "simple",
> - "where": [["i", "==", 1]],
> - "columns": ["i"],
> - "until": "==",
> - "rows": [{"i": 1}]}]]' > test-ovsdb.log 2>&1 &
> -
> -OVS_WAIT_UNTIL([grep "000: i=1" test-ovsdb.log])
> -
> -# Shutdown the other 2 servers so that s3 is disconnected from the
cluster.
> -for i in 2 1; do
> - OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s$i], [s$i.pid])
> -done
> -
> -# The test-ovsdb should detect the disconnect and retry.
> -OVS_WAIT_UNTIL([grep disconnect test-ovsdb.log])
> -
> -OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s3], [s3.pid])
> + OVS_WAIT_UNTIL([grep "000: i=1" test-ovsdb.log])
>
> + # Shutdown the other servers so that $target is disconnected from
the cluster.
> + for i in $shutdown; do
> + OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s$i], [s$i.pid])
> + done
> +
> + # The test-ovsdb should detect the disconnect and retry.
> + OVS_WAIT_UNTIL([grep disconnect test-ovsdb.log])
> +
> + OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s$target], [s$target.pid])
> +}
> +OVS_END_SHELL_HELPERS
> +
> +AT_SETUP([OVSDB cluster - follower disconnect from cluster, single
remote])
> +AT_KEYWORDS([ovsdb server negative unix cluster disconnect])
> +ovsdb_test_cluster_disconnect follower
> +AT_CLEANUP
> +
> +AT_SETUP([OVSDB cluster - leader disconnect from cluster, single remote])
> +AT_KEYWORDS([ovsdb server negative unix cluster disconnect])
> +ovsdb_test_cluster_disconnect leader
> AT_CLEANUP
> +
>
>
> OVS_START_SHELL_HELPERS
> --
> 2.1.0
>
Sorry that I forgot to add:
Reported-by: Aliasgar Ginwala <aginwala at ebay.com>
Tested-by: Aliasgar Ginwala <aginwala at ebay.com>
I will wait for comments and add it to v2 (if v2 is needed).
More information about the dev
mailing list