[ovs-dev] [PATCH v2] ovsdb: provide raft and command interfaces with priority

Anton Ivanov anton.ivanov at cambridgegreys.com
Tue Jun 22 07:33:14 UTC 2021


On 21/06/2021 20:29, Ilya Maximets wrote:
> On 6/11/21 5:42 PM, anton.ivanov at cambridgegreys.com wrote:
>> From: Anton Ivanov <anton.ivanov at cambridgegreys.com>
>>
>> Set a soft time limit of "raft election timer"/2 on ovsdb
>> processing.
>>
>> This improves behaviour in large heavily loaded clusters.
>> While it cannot fully eliminate spurious raft elections
>> under heavy load, it significantly decreases their number.
>>
>> Processing is (to the extent possible) restarted where it
>> stopped on the previous iteration to ensure that sessions
>> towards the tail of the session list are not starved.
>>
>> Signed-off-by: Anton Ivanov <anton.ivanov at cambridgegreys.com>
>> ---
> Hey, Anton.  Thanks for the patch!
> This is not a complete review, but a few things that I noticed.
> See inline.
>
> Best regards, Ilya Maximets.
>
>>   ovsdb/jsonrpc-server.c | 80 +++++++++++++++++++++++++++++++++++++++---
>>   ovsdb/jsonrpc-server.h |  2 +-
>>   ovsdb/ovsdb-server.c   | 15 +++++++-
>>   ovsdb/raft.c           |  5 +++
>>   ovsdb/raft.h           |  3 ++
>>   ovsdb/storage.c        |  8 +++++
>>   ovsdb/storage.h        |  2 ++
>>   7 files changed, 109 insertions(+), 6 deletions(-)
>>
>> diff --git a/ovsdb/jsonrpc-server.c b/ovsdb/jsonrpc-server.c
>> index 4e2dfc3d7..84e0f69b5 100644
>> --- a/ovsdb/jsonrpc-server.c
>> +++ b/ovsdb/jsonrpc-server.c
>> @@ -60,7 +60,8 @@ static struct ovsdb_jsonrpc_session *ovsdb_jsonrpc_session_create(
>>       struct ovsdb_jsonrpc_remote *, struct jsonrpc_session *, bool);
>>   static void ovsdb_jsonrpc_session_preremove_db(struct ovsdb_jsonrpc_remote *,
>>                                                  struct ovsdb *);
>> -static void ovsdb_jsonrpc_session_run_all(struct ovsdb_jsonrpc_remote *);
>> +static void ovsdb_jsonrpc_session_run_all(struct ovsdb_jsonrpc_remote *,
>> +                                          uint64_t limit);
>>   static void ovsdb_jsonrpc_session_wait_all(struct ovsdb_jsonrpc_remote *);
>>   static void ovsdb_jsonrpc_session_get_memory_usage_all(
>>       const struct ovsdb_jsonrpc_remote *, struct simap *usage);
>> @@ -128,6 +129,8 @@ struct ovsdb_jsonrpc_server {
>>       bool read_only;            /* This server is does not accept any
>>                                     transactions that can modify the database. */
>>       struct shash remotes;      /* Contains "struct ovsdb_jsonrpc_remote *"s. */
>> +    struct ovsdb_jsonrpc_remote *skip_to;
>> +    bool yield_immediately;
> 'yield' doesn't seem to be a right word here.  Maybe 'wake_up' or something
> similar?
>
> Also, both fields above needs a comment.
>
> OTOH, do we really need this filed?  I mean, if we didn't process some session,
> shouldn't next session_wait() wake us up?
>
>>   };
>>   
>>   /* A configured remote.  This is either a passive stream listener plus a list
>> @@ -137,6 +140,7 @@ struct ovsdb_jsonrpc_remote {
>>       struct ovsdb_jsonrpc_server *server;
>>       struct pstream *listener;   /* Listener, if passive. */
>>       struct ovs_list sessions;   /* List of "struct ovsdb_jsonrpc_session"s. */
>> +    struct ovsdb_jsonrpc_session *skip_to;
>>       uint8_t dscp;
>>       bool read_only;
>>       char *role;
>> @@ -159,6 +163,8 @@ ovsdb_jsonrpc_server_create(bool read_only)
>>       ovsdb_server_init(&server->up);
>>       shash_init(&server->remotes);
>>       server->read_only = read_only;
>> +    server->yield_immediately = false;
>> +    server->skip_to = NULL;
>>       return server;
>>   }
>>   
>> @@ -279,6 +285,7 @@ ovsdb_jsonrpc_server_add_remote(struct ovsdb_jsonrpc_server *svr,
>>       remote->dscp = options->dscp;
>>       remote->read_only = options->read_only;
>>       remote->role = nullable_xstrdup(options->role);
>> +    remote->skip_to = NULL;
>>       shash_add(&svr->remotes, name, remote);
>>   
>>       if (!listener) {
>> @@ -378,12 +385,26 @@ ovsdb_jsonrpc_server_set_read_only(struct ovsdb_jsonrpc_server *svr,
>>   }
>>   
>>   void
>> -ovsdb_jsonrpc_server_run(struct ovsdb_jsonrpc_server *svr)
>> +ovsdb_jsonrpc_server_run(struct ovsdb_jsonrpc_server *svr, uint64_t limit)
>>   {
>>       struct shash_node *node;
>> +    uint64_t elapsed = 0, start_time = 0;
>> +
>> +    if (limit) {
>> +        start_time = time_now();
> Why this function uses time_now() while others are using time_msec() ?
> time_now() returns seconds while 'limit' is in milliseconds.

That was daft.

Revised patch coming up shortly.

>
>> +    }
>> +
>> +    svr->yield_immediately = false;
>>   
>>       SHASH_FOR_EACH (node, &svr->remotes) {
>>           struct ovsdb_jsonrpc_remote *remote = node->data;
>> +        if (svr->skip_to) {
>> +            if (remote != svr->skip_to) {
>> +                continue;
> What if 'skip_to' is already removed from the list?  We will, probably,
> never process any remotes again.

I tried to replicate the handling for list here which is correct (see below) and failed. Same as above actually - the loop in the session_list uses time_msec()

A revised patch with fixed handling coming up shortly.

>
> Also, we didn't process first N remotes here and we're not setting
> 'yield_immediately'.  This is inconsistent, at least.  But, yes,
> it's unclear if 'yield_immediately' needed at all.
>
>> +            } else {
>> +                svr->skip_to = NULL;
>> +            }
>> +        }
>>   
>>           if (remote->listener) {
>>               struct stream *stream;
>> @@ -403,7 +424,16 @@ ovsdb_jsonrpc_server_run(struct ovsdb_jsonrpc_server *svr)
>>               }
>>           }
>>   
>> -        ovsdb_jsonrpc_session_run_all(remote);
>> +        ovsdb_jsonrpc_session_run_all(remote, limit - elapsed);
>> +
>> +        if (limit) {
>> +            elapsed = start_time - time_now();
> 'start_time' is always less than time_now(), so 'elapsed' is either
> a zero or a huge unsigned integer.  So, limit here will never work.
>
>> +            if (elapsed >= limit) {
>> +                svr->yield_immediately = true;
>> +                svr->skip_to = remote;
>> +                break;
>> +            }
>> +        }
>>       }
>>   }
>>   
>> @@ -412,6 +442,12 @@ ovsdb_jsonrpc_server_wait(struct ovsdb_jsonrpc_server *svr)
>>   {
>>       struct shash_node *node;
>>   
>> +    if (svr->yield_immediately) {
>> +        poll_immediate_wake();
>> +        svr->yield_immediately = false;
>> +        return;
>> +    }
>> +
>>       SHASH_FOR_EACH (node, &svr->remotes) {
>>           struct ovsdb_jsonrpc_remote *remote = node->data;
>>   
>> @@ -583,15 +619,51 @@ ovsdb_jsonrpc_session_set_options(struct ovsdb_jsonrpc_session *session,
>>   }
>>   
>>   static void
>> -ovsdb_jsonrpc_session_run_all(struct ovsdb_jsonrpc_remote *remote)
>> +ovsdb_jsonrpc_session_run_all(struct ovsdb_jsonrpc_remote *remote,
>> +                              uint64_t limit)
>>   {
>>       struct ovsdb_jsonrpc_session *s, *next;
>> +    uint64_t start_time;
>> +
>> +    if (limit) {
>> +        start_time = time_msec();
>> +    }
>>   
>>       LIST_FOR_EACH_SAFE (s, next, node, &remote->sessions) {
>> +        if ((remote->skip_to) && (s != remote->skip_to)) {
>> +            /* processing was interrupted, we skip to the point
>> +             * where we had to interrupt it
>> +             * we cannot use the _CONTINUE macro as it is not safe
>> +             * if the list has been changed in the meantime.
>> +             */
>> +            continue;
> Same comment here.  If 'skip_to' is already removed from the list, we
> will never process any session again.

Not the case.

Skip-to is set to ->next before the error check which may lead to removal.

If the FOR_EACH list loop completes successfully it is reset to NULL. If the loop encounters the time limit, it bails with the value equals to ->next.

So even if the current session encounters an error, the pointer stored in skip_to will always be valid.

The only corner case which is not handled at present is a bail at the last session - this will lead to all sessions for this remote being re-run instead of skipping to the next remote in the remotes hash. I decided not to try to handle it.

I am happy to add some comments on the handling.

>> +        }
>> +
>> +        /* set next as skip point if we need to restart processing */
>> +        remote->skip_to = next;
>> +
>>           int error = ovsdb_jsonrpc_session_run(s);
>> +
>>           if (error) {
>>               ovsdb_jsonrpc_session_close(s);
>>           }
>> +
>> +        if (limit) {
>> +            if (time_msec() - start_time > limit) {
>> +                /* we bail leaving skip_to set. Next processing iteration
>> +                 * will skip everything up to skip_to
>> +                 */
>> +                break;
>> +            } else {
>> +                /* We are within time constraints, zero
>> +                 * the session we need to skip to in order to
>> +                 * restart processing.
>> +                 */
>> +                remote->skip_to = NULL;
>> +            }
>> +        } else {
>> +            remote->skip_to = NULL;
>> +        }
>>       }
>>   }
>>   
>> diff --git a/ovsdb/jsonrpc-server.h b/ovsdb/jsonrpc-server.h
>> index e0653aa39..218152e9d 100644
>> --- a/ovsdb/jsonrpc-server.h
>> +++ b/ovsdb/jsonrpc-server.h
>> @@ -67,7 +67,7 @@ void ovsdb_jsonrpc_server_free_remote_status(
>>   void ovsdb_jsonrpc_server_reconnect(struct ovsdb_jsonrpc_server *, bool force,
>>                                       char *comment);
>>   
>> -void ovsdb_jsonrpc_server_run(struct ovsdb_jsonrpc_server *);
>> +void ovsdb_jsonrpc_server_run(struct ovsdb_jsonrpc_server *, uint64_t limit);
>>   void ovsdb_jsonrpc_server_wait(struct ovsdb_jsonrpc_server *);
>>   
>>   void ovsdb_jsonrpc_server_set_read_only(struct ovsdb_jsonrpc_server *,
>> diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c
>> index b09232c65..97bfa05b2 100644
>> --- a/ovsdb/ovsdb-server.c
>> +++ b/ovsdb/ovsdb-server.c
>> @@ -184,6 +184,7 @@ main_loop(struct server_config *config,
>>       char *remotes_error, *ssl_error;
>>       struct shash_node *node;
>>       long long int status_timer = LLONG_MIN;
>> +    uint64_t limit = 0;
> I think, it might save a few lines of code if initialized with UINT64_MAX.
>
>>   
>>       *exiting = false;
>>       ssl_error = NULL;
>> @@ -215,7 +216,7 @@ main_loop(struct server_config *config,
>>               reconfigure_remotes(jsonrpc, all_dbs, remotes),
>>               &remotes_error);
>>           report_error_if_changed(reconfigure_ssl(all_dbs), &ssl_error);
>> -        ovsdb_jsonrpc_server_run(jsonrpc);
>> +        ovsdb_jsonrpc_server_run(jsonrpc, limit);
>>   
>>           if (*is_backup) {
>>               replication_run();
>> @@ -228,8 +229,20 @@ main_loop(struct server_config *config,
>>           struct shash_node *next;
>>           SHASH_FOR_EACH_SAFE (node, next, all_dbs) {
>>               struct db *db = node->data;
>> +            uint64_t db_limit = 0;
>> +
>>               ovsdb_txn_history_run(db->db);
>>               ovsdb_storage_run(db->db->storage);
>> +
>> +            db_limit = max_processing_time(db->db->storage);
>> +            if (db_limit) {
>> +                if (!limit) {
>> +                    limit = db_limit;
>> +                } else {
>> +                    limit = MIN(db_limit, limit);
>> +                }
>> +            }
> It doesn't sound right that we're calculating a limit here inside the
> loop.  If we have more than one database, we need to calculate a minimum
> for all databases and use it later, otherwise the database with higher
> election timer might take more time than a limit for another database.
>
> Also, limit is global and never reset.  If election timer will be changed
> to higher value, limit will stay low.
>
>> +
>>               read_db(config, db);
>>               /* Run triggers after storage_run and read_db to make sure new raft
>>                * updates are utilized in current iteration. */
>> diff --git a/ovsdb/raft.c b/ovsdb/raft.c
>> index e06c1f1ab..c473e5d32 100644
>> --- a/ovsdb/raft.c
>> +++ b/ovsdb/raft.c
>> @@ -407,6 +407,11 @@ raft_make_address_passive(const char *address_)
>>       }
>>   }
>>   
>> +uint64_t raft_election_timer_value(const struct raft *raft)
>> +{
>> +    return raft->election_timer;
>> +}
>> +
>>   static struct raft *
>>   raft_alloc(void)
>>   {
>> diff --git a/ovsdb/raft.h b/ovsdb/raft.h
>> index 3545c41c2..1d270ec0c 100644
>> --- a/ovsdb/raft.h
>> +++ b/ovsdb/raft.h
>> @@ -188,4 +188,7 @@ void raft_take_leadership(struct raft *);
>>   void raft_transfer_leadership(struct raft *, const char *reason);
>>   
>>   const struct uuid *raft_current_eid(const struct raft *);
>> +
>> +uint64_t raft_election_timer_value(const struct raft *);
>> +
>>   #endif /* lib/raft.h */
>> diff --git a/ovsdb/storage.c b/ovsdb/storage.c
>> index 40415fcf6..a2142851a 100644
>> --- a/ovsdb/storage.c
>> +++ b/ovsdb/storage.c
>> @@ -640,3 +640,11 @@ ovsdb_storage_peek_last_eid(struct ovsdb_storage *storage)
>>       }
>>       return raft_current_eid(storage->raft);
>>   }
>> +
>> +uint64_t max_processing_time(struct ovsdb_storage *storage)
> Name of the function should start from a new line since it's not
> just a prototype.
>
>> +{
>> +    if (!storage->raft) {
>> +        return UINT64_MAX;
>> +    }
>> +    return raft_election_timer_value(storage->raft) / 2;
>> +}
>> diff --git a/ovsdb/storage.h b/ovsdb/storage.h
>> index 02b6e7e6c..9e195bbe8 100644
>> --- a/ovsdb/storage.h
>> +++ b/ovsdb/storage.h
>> @@ -97,4 +97,6 @@ struct ovsdb_schema *ovsdb_storage_read_schema(struct ovsdb_storage *);
>>   
>>   const struct uuid *ovsdb_storage_peek_last_eid(struct ovsdb_storage *);
>>   
>> +uint64_t max_processing_time(struct ovsdb_storage *);
>> +
>>   #endif /* ovsdb/storage.h */
>>
>
-- 
Anton R. Ivanov
Cambridgegreys Limited. Registered in England. Company Number 10273661
https://www.cambridgegreys.com/



More information about the dev mailing list