[ovs-dev] [PATCH v2 9/9] ovsdb raft: Fix duplicated transaction execution when leader failover.

Han Zhou zhouhan at gmail.com
Fri Apr 12 23:26:28 UTC 2019


From: Han Zhou <hzhou8 at ebay.com>

When a transaction is submitted from a client connected to a follower,
if the leader crashes after receiving the execute_command_request from
the follower and sending out the append request to the majority of
followers, but before sending the execute_command_reply to the
follower, the transaction would finally get committed by the new
leader. However, with the current implementation the transaction would
be committed twice.

For the root cause, there are two cases:

Case 1, the connected follower becomes the new leader. In this case,
the pending command of the follower will be cancelled during its role
change to leader, so the trigger for the transaction will be retried.

Case 2, another follower becomes the new leader. In this case, since
there is no execute_command_reply from the original leader (which has
crashed), the command will finally time out, causing the trigger for
the transaction to be retried.

In both cases, the transaction will be retried by the server node's
trigger retrying logic. This patch fixes the problem by below changes:

1) A pending command can be completed not only by
execute_command_reply, but also when the eid is committed, if the
execute_command_reply never came.

2) Instead of cancelling all pending commands during role change, let
the commands continue waiting to be completed when the eid is
committed. The timeout is increased to twice the election base time,
so that the commands have a chance to be completed when the leader
crashes.

This patch fixes the two raft failure test cases previously disabled.
See the test case for details of how to reproduce the problem.

Signed-off-by: Han Zhou <hzhou8 at ebay.com>
---
 ovsdb/raft.c           | 76 ++++++++++++++++++++++++++++++++------------------
 tests/ovsdb-cluster.at |  4 ---
 2 files changed, 49 insertions(+), 31 deletions(-)

diff --git a/ovsdb/raft.c b/ovsdb/raft.c
index 02bdf11..77ad365 100644
--- a/ovsdb/raft.c
+++ b/ovsdb/raft.c
@@ -1607,10 +1607,7 @@ raft_start_election(struct raft *raft, bool leadership_transfer)
         return;
     }
 
-    raft_complete_all_commands(raft, RAFT_CMD_LOST_LEADERSHIP);
-
     ovs_assert(raft->role != RAFT_LEADER);
-    ovs_assert(hmap_is_empty(&raft->commands));
     raft->role = RAFT_CANDIDATE;
 
     raft->n_votes = 0;
@@ -1793,17 +1790,22 @@ raft_run(struct raft *raft)
         }
     }
 
-    if (time_msec() >= raft->ping_timeout) {
+    long long int now = time_msec();
+    if (now >= raft->ping_timeout) {
         if (raft->role == RAFT_LEADER) {
             raft_send_heartbeats(raft);
-        } else {
-            long long int now = time_msec();
-            struct raft_command *cmd, *next_cmd;
-            HMAP_FOR_EACH_SAFE (cmd, next_cmd, hmap_node, &raft->commands) {
-                if (cmd->timestamp
-                    && now - cmd->timestamp > ELECTION_BASE_MSEC) {
-                    raft_command_complete(raft, cmd, RAFT_CMD_TIMEOUT);
-                }
+        }
+        /* Check if any commands timeout. Timeout is set to twice the time of
+         * election base time so that commands can complete properly during
+         * leader election. E.g. a leader crashed and current node with pending
+         * commands becomes new leader: the pending commands can still complete
+         * if the crashed leader has replicated the transactions to majority of
+         * followers before it crashed. */
+        struct raft_command *cmd, *next_cmd;
+        HMAP_FOR_EACH_SAFE (cmd, next_cmd, hmap_node, &raft->commands) {
+            if (cmd->timestamp
+                && now - cmd->timestamp > ELECTION_BASE_MSEC * 2) {
+                raft_command_complete(raft, cmd, RAFT_CMD_TIMEOUT);
             }
             raft_reset_ping_timer(raft);
         }
@@ -1974,6 +1976,7 @@ raft_command_initiate(struct raft *raft,
     struct raft_command *cmd = raft_command_create_incomplete(raft, index);
     ovs_assert(eid);
     cmd->eid = *eid;
+    cmd->timestamp = time_msec();
 
     raft_waiter_create(raft, RAFT_W_ENTRY, true)->entry.index = cmd->index;
 
@@ -1996,6 +1999,15 @@ raft_command_initiate(struct raft *raft,
     return cmd;
 }
 
+static void
+log_all_commands(struct raft *raft)
+{
+    struct raft_command *cmd, *next;
+    HMAP_FOR_EACH_SAFE (cmd, next, hmap_node, &raft->commands) {
+        VLOG_DBG("raft command eid: "UUID_FMT, UUID_ARGS(&cmd->eid));
+    }
+}
+
 static struct raft_command * OVS_WARN_UNUSED_RESULT
 raft_command_execute__(struct raft *raft,
                        const struct json *data, const struct json *servers,
@@ -2051,6 +2063,7 @@ raft_command_execute__(struct raft *raft,
         struct raft_command *cmd = raft_command_create_incomplete(raft, 0);
         cmd->timestamp = time_msec();
         cmd->eid = eid;
+        log_all_commands(raft);
         return cmd;
     }
 
@@ -2126,6 +2139,8 @@ raft_command_complete(struct raft *raft,
                       struct raft_command *cmd,
                       enum raft_command_status status)
 {
+    VLOG_DBG("raft_command_complete eid "UUID_FMT" status: %s",
+             UUID_ARGS(&cmd->eid), raft_command_status_to_string(status));
     if (!uuid_is_zero(&cmd->sid)) {
         uint64_t commit_index = status == RAFT_CMD_SUCCESS ? cmd->index : 0;
         raft_send_execute_command_reply(raft, &cmd->sid, &cmd->eid, status,
@@ -2149,19 +2164,6 @@ raft_complete_all_commands(struct raft *raft, enum raft_command_status status)
 }
 
 static struct raft_command *
-raft_find_command_by_index(struct raft *raft, uint64_t index)
-{
-    struct raft_command *cmd;
-
-    HMAP_FOR_EACH_IN_BUCKET (cmd, hmap_node, index, &raft->commands) {
-        if (cmd->index == index) {
-            return cmd;
-        }
-    }
-    return NULL;
-}
-
-static struct raft_command *
 raft_find_command_by_eid(struct raft *raft, const struct uuid *eid)
 {
     struct raft_command *cmd;
@@ -2441,7 +2443,7 @@ raft_server_init_leader(struct raft *raft, struct raft_server *s)
 static void
 raft_become_leader(struct raft *raft)
 {
-    raft_complete_all_commands(raft, RAFT_CMD_LOST_LEADERSHIP);
+    log_all_commands(raft);
 
     static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
     VLOG_INFO_RL(&rl, "term %"PRIu64": elected leader by %d+ of "
@@ -2627,8 +2629,14 @@ raft_update_commit_index(struct raft *raft, uint64_t new_commit_index)
             const struct raft_entry *e = raft_get_entry(raft, index);
             if (e->data) {
                 struct raft_command *cmd
-                    = raft_find_command_by_index(raft, index);
+                    = raft_find_command_by_eid(raft, &e->eid);
                 if (cmd) {
+                    if (!cmd->index) {
+                        VLOG_DBG("Command completed after role change from"
+                                 " follower to leader "UUID_FMT,
+                                 UUID_ARGS(&e->eid));
+                        cmd->index = index;
+                    }
                     raft_command_complete(raft, cmd, RAFT_CMD_SUCCESS);
                 }
             }
@@ -2641,6 +2649,20 @@ raft_update_commit_index(struct raft *raft, uint64_t new_commit_index)
         }
     } else {
         raft->commit_index = new_commit_index;
+        /* Check if any pending command can be completed, and complete it.
+         * This can happen when leader fail-over before sending
+         * execute_command_reply. */
+        const struct uuid *eid = raft_get_eid(raft, new_commit_index);
+        struct raft_command *cmd = raft_find_command_by_eid(raft, eid);
+        if (cmd) {
+            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
+            VLOG_INFO_RL(&rl,
+                         "Command completed without reply (eid: "UUID_FMT", "
+                         "commit index: %"PRIu64")",
+                         UUID_ARGS(eid), new_commit_index);
+            cmd->index = new_commit_index;
+            raft_command_complete(raft, cmd, RAFT_CMD_SUCCESS);
+        }
     }
 
     /* Write the commit index to the log.  The next time we restart, this
diff --git a/tests/ovsdb-cluster.at b/tests/ovsdb-cluster.at
index 4e88766..931da6e 100644
--- a/tests/ovsdb-cluster.at
+++ b/tests/ovsdb-cluster.at
@@ -138,8 +138,6 @@ AT_BANNER([OVSDB - cluster failure with pending transaction])
 
 AT_SETUP([OVSDB cluster - txn on follower-2, leader crash before sending appendReq, follower-2 becomes leader])
 AT_KEYWORDS([ovsdb server negative unix cluster pending-txn])
-# XXX: fix bug before enabling this test
-AT_CHECK([exit 77])
 ovsdb_cluster_failure_test 2 3 1 crash-before-sending-append-request 2
 AT_CLEANUP
 
@@ -157,8 +155,6 @@ AT_CLEANUP
 
 AT_SETUP([OVSDB cluster - txn on follower-2, leader crash before sending execRep, follower-3 becomes leader])
 AT_KEYWORDS([ovsdb server negative unix cluster pending-txn])
-# XXX: fix bug before enabling this test
-AT_CHECK([exit 77])
 ovsdb_cluster_failure_test 2 3 1 crash-before-sending-execute-command-reply 3
 AT_CLEANUP
 
-- 
2.1.0



More information about the dev mailing list