[ovs-dev] [PATCH 3/5] raft: Set threshold on backlog for raft connections.

Dumitru Ceara dceara at redhat.com
Wed Oct 28 10:49:04 UTC 2020


On 10/26/20 2:42 AM, Ilya Maximets wrote:
> RAFT messages could be fairly big.  If something abnormal happens to
> one of the servers in a cluster it may not be able to process all the
> incoming messages in a timely manner.  This results in jsonrpc backlog
> growth on the sender's side.  For example if follower gets many new
> clients at once that it needs to serve, or it decides to take a
> snapshot in a period of high number of database changes.
> If backlog grows large enough it becomes harder and harder for follower
> to process incoming raft messages, it sends outdated replies and
> starts receiving snapshots and the whole raft log from the leader.
> Sometimes backlog grows too high (60GB in this example):
> 
>       jsonrpc|INFO|excessive sending backlog, jsonrpc: ssl:<ip>,
>                    num of msgs: 15370, backlog: 61731060773.
> 
> In this case OS might actually decide to kill the sender to free some
> memory.  Anyway, It could take a lot of time for such a server to catch
> up with the rest of the cluster if it has so much data to receive and
> process.
> 
> Introducing backlog thresholds for jsonrpc connections.
> If sending backlog will exceed particular values (500 messages or
> 4GB in size), connection will be dropped and re-created.  This will
> allow to drop all the current backlog and start over increasing
> chances of cluster recovery.
> 
> Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=1888829
> Signed-off-by: Ilya Maximets <i.maximets at ovn.org>
> ---
>  NEWS          |  2 ++
>  lib/jsonrpc.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++-
>  lib/jsonrpc.h |  6 ++++++
>  ovsdb/raft.c  |  5 +++++
>  4 files changed, 69 insertions(+), 1 deletion(-)
> 
> diff --git a/NEWS b/NEWS
> index 2860a8e9c..ebdf8758b 100644
> --- a/NEWS
> +++ b/NEWS
> @@ -6,6 +6,8 @@ Post-v2.14.0
>       * New unixctl command 'ovsdb-server/memory-trim-on-compaction on|off'.
>         If turned on, ovsdb-server will try to reclaim all the unused memory
>         after every DB compaction back to OS.  Disabled by default.
> +     * Maximum backlog on RAFT connections limited to 500 messages or 4GB.
> +       Once threshold reached, connection is dropped (and re-established).
>     - DPDK:
>       * Removed support for vhost-user dequeue zero-copy.
>     - The environment variable OVS_UNBOUND_CONF, if set, is now used
> diff --git a/lib/jsonrpc.c b/lib/jsonrpc.c
> index ecbc939fe..435824844 100644
> --- a/lib/jsonrpc.c
> +++ b/lib/jsonrpc.c
> @@ -50,6 +50,10 @@ struct jsonrpc {
>      struct ovs_list output;     /* Contains "struct ofpbuf"s. */
>      size_t output_count;        /* Number of elements in "output". */
>      size_t backlog;
> +
> +    /* Limits. */
> +    size_t max_output;          /* 'output_count' disconnection threshold. */
> +    size_t max_backlog;         /* 'backlog' disconnection threshold. */
>  };
>  
>  /* Rate limit for error messages. */
> @@ -178,6 +182,17 @@ jsonrpc_get_backlog(const struct jsonrpc *rpc)
>      return rpc->status ? 0 : rpc->backlog;
>  }
>  
> +/* Sets thresholds for send backlog.  If send backlog contains more than
> + * 'max_n_msgs' messages or larger than 'max_backlog_bytes' bytes, connection
> + * will be dropped. */
> +void
> +jsonrpc_set_backlog_threshold(struct jsonrpc *rpc,
> +                              size_t max_n_msgs, size_t max_backlog_bytes)
> +{
> +    rpc->max_output = max_n_msgs;
> +    rpc->max_backlog = max_backlog_bytes;
> +}
> +
>  /* Returns the number of bytes that have been received on 'rpc''s underlying
>   * stream.  (The value wraps around if it exceeds UINT_MAX.) */
>  unsigned int
> @@ -261,9 +276,26 @@ jsonrpc_send(struct jsonrpc *rpc, struct jsonrpc_msg *msg)
>      rpc->backlog += length;
>  
>      if (rpc->output_count >= 50) {
> -        VLOG_INFO_RL(&rl, "excessive sending backlog, jsonrpc: %s, num of"
> +        static struct vlog_rate_limit bl_rl = VLOG_RATE_LIMIT_INIT(5, 5);
> +        bool disconnect = false;
> +
> +        VLOG_INFO_RL(&bl_rl, "excessive sending backlog, jsonrpc: %s, num of"
>                       " msgs: %"PRIuSIZE", backlog: %"PRIuSIZE".", rpc->name,
>                       rpc->output_count, rpc->backlog);
> +        if (rpc->max_output && rpc->output_count > rpc->max_output) {
> +            disconnect = true;
> +            VLOG_WARN("sending backlog exceeded maximum number of messages (%"
> +                      PRIuSIZE" > %"PRIuSIZE"), disconnecting, jsonrpc: %s.",
> +                      rpc->output_count, rpc->max_output, rpc->name);
> +        } else if (rpc->max_backlog && rpc->backlog > rpc->max_backlog) {
> +            disconnect = true;
> +            VLOG_WARN("sending backlog exceeded maximum size (%"PRIuSIZE" > %"
> +                      PRIuSIZE" bytes), disconnecting, jsonrpc: %s.",
> +                      rpc->backlog, rpc->max_backlog, rpc->name);
> +        }
> +        if (disconnect) {
> +            jsonrpc_error(rpc, E2BIG);
> +        }
>      }
>  
>      if (rpc->backlog == length) {
> @@ -787,6 +819,10 @@ struct jsonrpc_session {
>      int last_error;
>      unsigned int seqno;
>      uint8_t dscp;
> +
> +    /* Limits for jsonrpc. */
> +    size_t max_n_msgs;
> +    size_t max_backlog_bytes;

Hi Ilya,

If I'm not wrong, we end up using these uninitialized in non-clustered mode.

>  };
>  
>  static void
> @@ -970,6 +1006,8 @@ jsonrpc_session_run(struct jsonrpc_session *s)
>              }
>              reconnect_connected(s->reconnect, time_msec());
>              s->rpc = jsonrpc_open(stream);
> +            jsonrpc_set_backlog_threshold(s->rpc, s->max_n_msgs,
> +                                                  s->max_backlog_bytes);

Indent is a bit odd here.

However, I don't see anything in the coding guidelines about this and it does
seem more readable to do it your way.  So I don't know :)

Thanks,
Dumitru

>              s->seqno++;
>          } else if (error != EAGAIN) {
>              reconnect_listen_error(s->reconnect, time_msec(), error);
> @@ -1010,6 +1048,8 @@ jsonrpc_session_run(struct jsonrpc_session *s)
>          if (!error) {
>              reconnect_connected(s->reconnect, time_msec());
>              s->rpc = jsonrpc_open(s->stream);
> +            jsonrpc_set_backlog_threshold(s->rpc, s->max_n_msgs,
> +                                                  s->max_backlog_bytes);
>              s->stream = NULL;
>              s->seqno++;
>          } else if (error != EAGAIN) {
> @@ -1250,3 +1290,18 @@ jsonrpc_session_set_dscp(struct jsonrpc_session *s, uint8_t dscp)
>          jsonrpc_session_force_reconnect(s);
>      }
>  }
> +
> +/* Sets thresholds for send backlog.  If send backlog contains more than
> + * 'max_n_msgs' messages or larger than 'max_backlog_bytes' bytes, connection
> + * will be closed (then reconnected, if that feature is enabled). */
> +void
> +jsonrpc_session_set_backlog_threshold(struct jsonrpc_session *s,
> +                                      size_t max_n_msgs,
> +                                      size_t max_backlog_bytes)
> +{
> +    s->max_n_msgs = max_n_msgs;
> +    s->max_backlog_bytes = max_backlog_bytes;
> +    if (s->rpc) {
> +        jsonrpc_set_backlog_threshold(s->rpc, max_n_msgs, max_backlog_bytes);
> +    }
> +}
> diff --git a/lib/jsonrpc.h b/lib/jsonrpc.h
> index a44114e8d..d75d66b86 100644
> --- a/lib/jsonrpc.h
> +++ b/lib/jsonrpc.h
> @@ -51,6 +51,9 @@ void jsonrpc_wait(struct jsonrpc *);
>  
>  int jsonrpc_get_status(const struct jsonrpc *);
>  size_t jsonrpc_get_backlog(const struct jsonrpc *);
> +void jsonrpc_set_backlog_threshold(struct jsonrpc *, size_t max_n_msgs,
> +                                                     size_t max_backlog_bytes);
> +
>  unsigned int jsonrpc_get_received_bytes(const struct jsonrpc *);
>  const char *jsonrpc_get_name(const struct jsonrpc *);
>  
> @@ -140,6 +143,9 @@ void jsonrpc_session_set_probe_interval(struct jsonrpc_session *,
>                                          int probe_interval);
>  void jsonrpc_session_set_dscp(struct jsonrpc_session *,
>                                uint8_t dscp);
> +void jsonrpc_session_set_backlog_threshold(struct jsonrpc_session *,
> +                                           size_t max_n_msgs,
> +                                           size_t max_backlog_bytes);
>  const char *jsonrpc_session_get_id(const struct jsonrpc_session *);
>  
>  #endif /* jsonrpc.h */
> diff --git a/ovsdb/raft.c b/ovsdb/raft.c
> index ac85c6b67..92a099896 100644
> --- a/ovsdb/raft.c
> +++ b/ovsdb/raft.c
> @@ -925,6 +925,9 @@ raft_reset_ping_timer(struct raft *raft)
>      raft->ping_timeout = time_msec() + raft->election_timer / 3;
>  }
>  
> +#define RAFT_MAX_BACKLOG_N_MSGS    500
> +#define RAFT_MAX_BACKLOG_BYTES     UINT32_MAX
> +
>  static void
>  raft_add_conn(struct raft *raft, struct jsonrpc_session *js,
>                const struct uuid *sid, bool incoming)
> @@ -940,6 +943,8 @@ raft_add_conn(struct raft *raft, struct jsonrpc_session *js,
>      conn->incoming = incoming;
>      conn->js_seqno = jsonrpc_session_get_seqno(conn->js);
>      jsonrpc_session_set_probe_interval(js, 0);
> +    jsonrpc_session_set_backlog_threshold(js, RAFT_MAX_BACKLOG_N_MSGS,
> +                                              RAFT_MAX_BACKLOG_BYTES);
>  }
>  
>  /* Starts the local server in an existing Raft cluster, using the local copy of
> 



More information about the dev mailing list