[ovs-dev] [PATCH v2] ovsdb: provide raft and command interfaces with priority

Anton Ivanov anton.ivanov at cambridgegreys.com
Wed Jun 23 10:33:54 UTC 2021


I just sent an updated version:

1. Logic fixed in the outer (remotes) loop. Inner (sessions) was mostly OK.

2. Appropriate measures have been taken to ensure that the "skip to" pointer is always valid.

3. Tested by forcing <10ms timeouts on processing and/or mandating "skip" on every iteration. All tests pass (see 4)

4. I had to disable skipping for replay. They seem to be incompatible.

Brgds,

A.

On 21/06/2021 20:29, Ilya Maximets wrote:
> On 6/11/21 5:42 PM, anton.ivanov at cambridgegreys.com wrote:
>> From: Anton Ivanov <anton.ivanov at cambridgegreys.com>
>>
>> Set a soft time limit of "raft election timer"/2 on ovsdb
>> processing.
>>
>> This improves behaviour in large heavily loaded clusters.
>> While it cannot fully eliminate spurious raft elections
>> under heavy load, it significantly decreases their number.
>>
>> Processing is (to the extent possible) restarted where it
>> stopped on the previous iteration to ensure that sessions
>> towards the tail of the session list are not starved.
>>
>> Signed-off-by: Anton Ivanov <anton.ivanov at cambridgegreys.com>
>> ---
> Hey, Anton.  Thanks for the patch!
> This is not a complete review, but a few things that I noticed.
> See inline.
>
> Best regards, Ilya Maximets.
>
>>   ovsdb/jsonrpc-server.c | 80 +++++++++++++++++++++++++++++++++++++++---
>>   ovsdb/jsonrpc-server.h |  2 +-
>>   ovsdb/ovsdb-server.c   | 15 +++++++-
>>   ovsdb/raft.c           |  5 +++
>>   ovsdb/raft.h           |  3 ++
>>   ovsdb/storage.c        |  8 +++++
>>   ovsdb/storage.h        |  2 ++
>>   7 files changed, 109 insertions(+), 6 deletions(-)
>>
>> diff --git a/ovsdb/jsonrpc-server.c b/ovsdb/jsonrpc-server.c
>> index 4e2dfc3d7..84e0f69b5 100644
>> --- a/ovsdb/jsonrpc-server.c
>> +++ b/ovsdb/jsonrpc-server.c
>> @@ -60,7 +60,8 @@ static struct ovsdb_jsonrpc_session *ovsdb_jsonrpc_session_create(
>>       struct ovsdb_jsonrpc_remote *, struct jsonrpc_session *, bool);
>>   static void ovsdb_jsonrpc_session_preremove_db(struct ovsdb_jsonrpc_remote *,
>>                                                  struct ovsdb *);
>> -static void ovsdb_jsonrpc_session_run_all(struct ovsdb_jsonrpc_remote *);
>> +static void ovsdb_jsonrpc_session_run_all(struct ovsdb_jsonrpc_remote *,
>> +                                          uint64_t limit);
>>   static void ovsdb_jsonrpc_session_wait_all(struct ovsdb_jsonrpc_remote *);
>>   static void ovsdb_jsonrpc_session_get_memory_usage_all(
>>       const struct ovsdb_jsonrpc_remote *, struct simap *usage);
>> @@ -128,6 +129,8 @@ struct ovsdb_jsonrpc_server {
>>       bool read_only;            /* This server is does not accept any
>>                                     transactions that can modify the database. */
>>       struct shash remotes;      /* Contains "struct ovsdb_jsonrpc_remote *"s. */
>> +    struct ovsdb_jsonrpc_remote *skip_to;
>> +    bool yield_immediately;
> 'yield' doesn't seem to be a right word here.  Maybe 'wake_up' or something
> similar?
>
> Also, both fields above needs a comment.
>
> OTOH, do we really need this filed?  I mean, if we didn't process some session,
> shouldn't next session_wait() wake us up?
>
>>   };
>>   
>>   /* A configured remote.  This is either a passive stream listener plus a list
>> @@ -137,6 +140,7 @@ struct ovsdb_jsonrpc_remote {
>>       struct ovsdb_jsonrpc_server *server;
>>       struct pstream *listener;   /* Listener, if passive. */
>>       struct ovs_list sessions;   /* List of "struct ovsdb_jsonrpc_session"s. */
>> +    struct ovsdb_jsonrpc_session *skip_to;
>>       uint8_t dscp;
>>       bool read_only;
>>       char *role;
>> @@ -159,6 +163,8 @@ ovsdb_jsonrpc_server_create(bool read_only)
>>       ovsdb_server_init(&server->up);
>>       shash_init(&server->remotes);
>>       server->read_only = read_only;
>> +    server->yield_immediately = false;
>> +    server->skip_to = NULL;
>>       return server;
>>   }
>>   
>> @@ -279,6 +285,7 @@ ovsdb_jsonrpc_server_add_remote(struct ovsdb_jsonrpc_server *svr,
>>       remote->dscp = options->dscp;
>>       remote->read_only = options->read_only;
>>       remote->role = nullable_xstrdup(options->role);
>> +    remote->skip_to = NULL;
>>       shash_add(&svr->remotes, name, remote);
>>   
>>       if (!listener) {
>> @@ -378,12 +385,26 @@ ovsdb_jsonrpc_server_set_read_only(struct ovsdb_jsonrpc_server *svr,
>>   }
>>   
>>   void
>> -ovsdb_jsonrpc_server_run(struct ovsdb_jsonrpc_server *svr)
>> +ovsdb_jsonrpc_server_run(struct ovsdb_jsonrpc_server *svr, uint64_t limit)
>>   {
>>       struct shash_node *node;
>> +    uint64_t elapsed = 0, start_time = 0;
>> +
>> +    if (limit) {
>> +        start_time = time_now();
> Why this function uses time_now() while others are using time_msec() ?
> time_now() returns seconds while 'limit' is in milliseconds.
>
>> +    }
>> +
>> +    svr->yield_immediately = false;
>>   
>>       SHASH_FOR_EACH (node, &svr->remotes) {
>>           struct ovsdb_jsonrpc_remote *remote = node->data;
>> +        if (svr->skip_to) {
>> +            if (remote != svr->skip_to) {
>> +                continue;
> What if 'skip_to' is already removed from the list?  We will, probably,
> never process any remotes again.
>
> Also, we didn't process first N remotes here and we're not setting
> 'yield_immediately'.  This is inconsistent, at least.  But, yes,
> it's unclear if 'yield_immediately' needed at all.
>
>> +            } else {
>> +                svr->skip_to = NULL;
>> +            }
>> +        }
>>   
>>           if (remote->listener) {
>>               struct stream *stream;
>> @@ -403,7 +424,16 @@ ovsdb_jsonrpc_server_run(struct ovsdb_jsonrpc_server *svr)
>>               }
>>           }
>>   
>> -        ovsdb_jsonrpc_session_run_all(remote);
>> +        ovsdb_jsonrpc_session_run_all(remote, limit - elapsed);
>> +
>> +        if (limit) {
>> +            elapsed = start_time - time_now();
> 'start_time' is always less than time_now(), so 'elapsed' is either
> a zero or a huge unsigned integer.  So, limit here will never work.
>
>> +            if (elapsed >= limit) {
>> +                svr->yield_immediately = true;
>> +                svr->skip_to = remote;
>> +                break;
>> +            }
>> +        }
>>       }
>>   }
>>   
>> @@ -412,6 +442,12 @@ ovsdb_jsonrpc_server_wait(struct ovsdb_jsonrpc_server *svr)
>>   {
>>       struct shash_node *node;
>>   
>> +    if (svr->yield_immediately) {
>> +        poll_immediate_wake();
>> +        svr->yield_immediately = false;
>> +        return;
>> +    }
>> +
>>       SHASH_FOR_EACH (node, &svr->remotes) {
>>           struct ovsdb_jsonrpc_remote *remote = node->data;
>>   
>> @@ -583,15 +619,51 @@ ovsdb_jsonrpc_session_set_options(struct ovsdb_jsonrpc_session *session,
>>   }
>>   
>>   static void
>> -ovsdb_jsonrpc_session_run_all(struct ovsdb_jsonrpc_remote *remote)
>> +ovsdb_jsonrpc_session_run_all(struct ovsdb_jsonrpc_remote *remote,
>> +                              uint64_t limit)
>>   {
>>       struct ovsdb_jsonrpc_session *s, *next;
>> +    uint64_t start_time;
>> +
>> +    if (limit) {
>> +        start_time = time_msec();
>> +    }
>>   
>>       LIST_FOR_EACH_SAFE (s, next, node, &remote->sessions) {
>> +        if ((remote->skip_to) && (s != remote->skip_to)) {
>> +            /* processing was interrupted, we skip to the point
>> +             * where we had to interrupt it
>> +             * we cannot use the _CONTINUE macro as it is not safe
>> +             * if the list has been changed in the meantime.
>> +             */
>> +            continue;
> Same comment here.  If 'skip_to' is already removed from the list, we
> will never process any session again.
>
>> +        }
>> +
>> +        /* set next as skip point if we need to restart processing */
>> +        remote->skip_to = next;
>> +
>>           int error = ovsdb_jsonrpc_session_run(s);
>> +
>>           if (error) {
>>               ovsdb_jsonrpc_session_close(s);
>>           }
>> +
>> +        if (limit) {
>> +            if (time_msec() - start_time > limit) {
>> +                /* we bail leaving skip_to set. Next processing iteration
>> +                 * will skip everything up to skip_to
>> +                 */
>> +                break;
>> +            } else {
>> +                /* We are within time constraints, zero
>> +                 * the session we need to skip to in order to
>> +                 * restart processing.
>> +                 */
>> +                remote->skip_to = NULL;
>> +            }
>> +        } else {
>> +            remote->skip_to = NULL;
>> +        }
>>       }
>>   }
>>   
>> diff --git a/ovsdb/jsonrpc-server.h b/ovsdb/jsonrpc-server.h
>> index e0653aa39..218152e9d 100644
>> --- a/ovsdb/jsonrpc-server.h
>> +++ b/ovsdb/jsonrpc-server.h
>> @@ -67,7 +67,7 @@ void ovsdb_jsonrpc_server_free_remote_status(
>>   void ovsdb_jsonrpc_server_reconnect(struct ovsdb_jsonrpc_server *, bool force,
>>                                       char *comment);
>>   
>> -void ovsdb_jsonrpc_server_run(struct ovsdb_jsonrpc_server *);
>> +void ovsdb_jsonrpc_server_run(struct ovsdb_jsonrpc_server *, uint64_t limit);
>>   void ovsdb_jsonrpc_server_wait(struct ovsdb_jsonrpc_server *);
>>   
>>   void ovsdb_jsonrpc_server_set_read_only(struct ovsdb_jsonrpc_server *,
>> diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c
>> index b09232c65..97bfa05b2 100644
>> --- a/ovsdb/ovsdb-server.c
>> +++ b/ovsdb/ovsdb-server.c
>> @@ -184,6 +184,7 @@ main_loop(struct server_config *config,
>>       char *remotes_error, *ssl_error;
>>       struct shash_node *node;
>>       long long int status_timer = LLONG_MIN;
>> +    uint64_t limit = 0;
> I think, it might save a few lines of code if initialized with UINT64_MAX.
>
>>   
>>       *exiting = false;
>>       ssl_error = NULL;
>> @@ -215,7 +216,7 @@ main_loop(struct server_config *config,
>>               reconfigure_remotes(jsonrpc, all_dbs, remotes),
>>               &remotes_error);
>>           report_error_if_changed(reconfigure_ssl(all_dbs), &ssl_error);
>> -        ovsdb_jsonrpc_server_run(jsonrpc);
>> +        ovsdb_jsonrpc_server_run(jsonrpc, limit);
>>   
>>           if (*is_backup) {
>>               replication_run();
>> @@ -228,8 +229,20 @@ main_loop(struct server_config *config,
>>           struct shash_node *next;
>>           SHASH_FOR_EACH_SAFE (node, next, all_dbs) {
>>               struct db *db = node->data;
>> +            uint64_t db_limit = 0;
>> +
>>               ovsdb_txn_history_run(db->db);
>>               ovsdb_storage_run(db->db->storage);
>> +
>> +            db_limit = max_processing_time(db->db->storage);
>> +            if (db_limit) {
>> +                if (!limit) {
>> +                    limit = db_limit;
>> +                } else {
>> +                    limit = MIN(db_limit, limit);
>> +                }
>> +            }
> It doesn't sound right that we're calculating a limit here inside the
> loop.  If we have more than one database, we need to calculate a minimum
> for all databases and use it later, otherwise the database with higher
> election timer might take more time than a limit for another database.
>
> Also, limit is global and never reset.  If election timer will be changed
> to higher value, limit will stay low.
>
>> +
>>               read_db(config, db);
>>               /* Run triggers after storage_run and read_db to make sure new raft
>>                * updates are utilized in current iteration. */
>> diff --git a/ovsdb/raft.c b/ovsdb/raft.c
>> index e06c1f1ab..c473e5d32 100644
>> --- a/ovsdb/raft.c
>> +++ b/ovsdb/raft.c
>> @@ -407,6 +407,11 @@ raft_make_address_passive(const char *address_)
>>       }
>>   }
>>   
>> +uint64_t raft_election_timer_value(const struct raft *raft)
>> +{
>> +    return raft->election_timer;
>> +}
>> +
>>   static struct raft *
>>   raft_alloc(void)
>>   {
>> diff --git a/ovsdb/raft.h b/ovsdb/raft.h
>> index 3545c41c2..1d270ec0c 100644
>> --- a/ovsdb/raft.h
>> +++ b/ovsdb/raft.h
>> @@ -188,4 +188,7 @@ void raft_take_leadership(struct raft *);
>>   void raft_transfer_leadership(struct raft *, const char *reason);
>>   
>>   const struct uuid *raft_current_eid(const struct raft *);
>> +
>> +uint64_t raft_election_timer_value(const struct raft *);
>> +
>>   #endif /* lib/raft.h */
>> diff --git a/ovsdb/storage.c b/ovsdb/storage.c
>> index 40415fcf6..a2142851a 100644
>> --- a/ovsdb/storage.c
>> +++ b/ovsdb/storage.c
>> @@ -640,3 +640,11 @@ ovsdb_storage_peek_last_eid(struct ovsdb_storage *storage)
>>       }
>>       return raft_current_eid(storage->raft);
>>   }
>> +
>> +uint64_t max_processing_time(struct ovsdb_storage *storage)
> Name of the function should start from a new line since it's not
> just a prototype.
>
>> +{
>> +    if (!storage->raft) {
>> +        return UINT64_MAX;
>> +    }
>> +    return raft_election_timer_value(storage->raft) / 2;
>> +}
>> diff --git a/ovsdb/storage.h b/ovsdb/storage.h
>> index 02b6e7e6c..9e195bbe8 100644
>> --- a/ovsdb/storage.h
>> +++ b/ovsdb/storage.h
>> @@ -97,4 +97,6 @@ struct ovsdb_schema *ovsdb_storage_read_schema(struct ovsdb_storage *);
>>   
>>   const struct uuid *ovsdb_storage_peek_last_eid(struct ovsdb_storage *);
>>   
>> +uint64_t max_processing_time(struct ovsdb_storage *);
>> +
>>   #endif /* ovsdb/storage.h */
>>
>
-- 
Anton R. Ivanov
Cambridgegreys Limited. Registered in England. Company Number 10273661
https://www.cambridgegreys.com/



More information about the dev mailing list