[ovs-dev] [PATCH ovn] Add northd and ovn-controller cluster status reset commands.
Mark Michelson
mmichels at redhat.com
Wed Jun 10 19:22:33 UTC 2020
I have merged this to master now.
On 5/4/20 12:18 PM, Han Zhou wrote:
> Acked-by: Han Zhou <hzhou at ovn.org <mailto:hzhou at ovn.org>>
>
> On Fri, May 1, 2020 at 12:15 PM Mark Michelson <mmichels at redhat.com
> <mailto:mmichels at redhat.com>> wrote:
> >
> > This patch is dependent on OVS commit
> >
> https://patchwork.ozlabs.org/project/openvswitch/patch/20200501191308.94486-1-mmichels@redhat.com/
> >
> > Please do not merge this until the OVS commit has been merged as well.
> >
> > On 5/1/20 3:13 PM, Mark Michelson wrote:
> > > During the course of debugging a clustered DB environment, all members
> > > of the southbound database cluster were destroyed (i.e. the .db files
> > > were removed from disk) and then restarted. Once this happened,
> > > ovn-northd and ovn-controller could not interact with the southbound
> > > database because they both detected all members of the cluster as
> having
> > > "stale" data. The only course of action was to reset ovn-northd and all
> > > ovn-controllers. It is possible to have this happen with the northbound
> > > database as well if it is clustered.
> > >
> > > This patch offers new ovn-appctl commands for ovn-northd and
> > > ovn-controller that allows for it to reset its clustered status. This
> > > allows for it to interact with the database successfully after a
> cluster
> > > teardown and restart.
> > >
> > > Signed-off-by: Mark Michelson <mmichels at redhat.com
> <mailto:mmichels at redhat.com>>
> > > Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=1829109
> > > ---
> > > controller/ovn-controller.8.xml | 16 ++++++++++++++++
> > > controller/ovn-controller.c | 30 ++++++++++++++++++++++++++---
> > > northd/ovn-northd.8.xml | 28 +++++++++++++++++++++++++++
> > > northd/ovn-northd.c | 34
> +++++++++++++++++++++++++++++++++
> > > 4 files changed, 105 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/controller/ovn-controller.8.xml
> b/controller/ovn-controller.8.xml
> > > index 76bbbdc5f..fe62163fa 100644
> > > --- a/controller/ovn-controller.8.xml
> > > +++ b/controller/ovn-controller.8.xml
> > > @@ -491,6 +491,22 @@
> > > recomputes are cpu intensive.
> > > </p>
> > > </dd>
> > > +
> > > + <dt><code>sb-cluster-state-reset</code></dt>
> > > + <dd>
> > > + <p>
> > > + Reset southbound database cluster status when databases
> are destroyed
> > > + and rebuilt.
> > > + </p>
> > > + <p>
> > > + If all databases in a clustered southbound database are
> removed from
> > > + disk, then the stored index of all databases will be reset
> to zero.
> > > + This will cause ovn-controller to be unable to read or
> write to the
> > > + southbound database, because it will always detect the
> data as stale.
> > > + In such a case, run this command so that ovn-controller
> will reset its
> > > + local index so that it can interact with the southbound
> database again.
> > > + </p>
> > > + </dd>
> > > </dl>
> > > </p>
> > >
> > > diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c
> > > index 6ff897325..1442accd7 100644
> > > --- a/controller/ovn-controller.c
> > > +++ b/controller/ovn-controller.c
> > > @@ -73,6 +73,7 @@ static unixctl_cb_func extend_table_list;
> > > static unixctl_cb_func inject_pkt;
> > > static unixctl_cb_func ovn_controller_conn_show;
> > > static unixctl_cb_func engine_recompute_cmd;
> > > +static unixctl_cb_func cluster_state_reset_cmd;
> > >
> > > #define DEFAULT_BRIDGE_NAME "br-int"
> > > #define DEFAULT_PROBE_INTERVAL_MSEC 5000
> > > @@ -446,7 +447,7 @@ get_ofctrl_probe_interval(struct ovsdb_idl
> *ovs_idl)
> > > * updates 'sbdb_idl' with that pointer. */
> > > static void
> > > update_sb_db(struct ovsdb_idl *ovs_idl, struct ovsdb_idl *ovnsb_idl,
> > > - bool *monitor_all_p)
> > > + bool *monitor_all_p, bool *reset_ovnsb_idl_min_index)
> > > {
> > > const struct ovsrec_open_vswitch *cfg =
> ovsrec_open_vswitch_first(ovs_idl);
> > > if (!cfg) {
> > > @@ -476,6 +477,12 @@ update_sb_db(struct ovsdb_idl *ovs_idl, struct
> ovsdb_idl *ovnsb_idl,
> > > if (monitor_all_p) {
> > > *monitor_all_p = monitor_all;
> > > }
> > > + if (*reset_ovnsb_idl_min_index) {
> > > + VLOG_INFO("Resetting southbound database cluster state");
> > > + engine_set_force_recompute(true);
> > > + ovsdb_idl_reset_min_index(ovnsb_idl);
> > > + *reset_ovnsb_idl_min_index = false;
> > > + }
> > > }
> > >
> > > static void
> > > @@ -1936,6 +1943,11 @@ main(int argc, char *argv[])
> > > unixctl_command_register("recompute", "", 0, 0,
> engine_recompute_cmd,
> > > NULL);
> > >
> > > + bool reset_ovnsb_idl_min_index = false;
> > > + unixctl_command_register("sb-cluster-state-reset", "", 0, 0,
> > > + cluster_state_reset_cmd,
> > > + &reset_ovnsb_idl_min_index);
> > > +
> > > unsigned int ovs_cond_seqno = UINT_MAX;
> > > unsigned int ovnsb_cond_seqno = UINT_MAX;
> > >
> > > @@ -1957,7 +1969,8 @@ main(int argc, char *argv[])
> > > ovs_cond_seqno = new_ovs_cond_seqno;
> > > }
> > >
> > > - update_sb_db(ovs_idl_loop.idl, ovnsb_idl_loop.idl,
> &sb_monitor_all);
> > > + update_sb_db(ovs_idl_loop.idl, ovnsb_idl_loop.idl,
> &sb_monitor_all,
> > > + &reset_ovnsb_idl_min_index);
> > > update_ssl_config(ovsrec_ssl_table_get(ovs_idl_loop.idl));
> > >
> ofctrl_set_probe_interval(get_ofctrl_probe_interval(ovs_idl_loop.idl));
> > >
> > > @@ -2207,7 +2220,7 @@ main(int argc, char *argv[])
> > > if (!restart) {
> > > bool done =
> !ovsdb_idl_has_ever_connected(ovnsb_idl_loop.idl);
> > > while (!done) {
> > > - update_sb_db(ovs_idl_loop.idl, ovnsb_idl_loop.idl, NULL);
> > > + update_sb_db(ovs_idl_loop.idl, ovnsb_idl_loop.idl,
> NULL, false);
> > >
> update_ssl_config(ovsrec_ssl_table_get(ovs_idl_loop.idl));
> > >
> > > struct ovsdb_idl_txn *ovs_idl_txn
> > > @@ -2441,3 +2454,14 @@ engine_recompute_cmd(struct unixctl_conn
> *conn OVS_UNUSED, int argc OVS_UNUSED,
> > > poll_immediate_wake();
> > > unixctl_command_reply(conn, NULL);
> > > }
> > > +
> > > +static void
> > > +cluster_state_reset_cmd(struct unixctl_conn *conn, int argc
> OVS_UNUSED,
> > > + const char *argv[] OVS_UNUSED, void *idl_reset_)
> > > +{
> > > + bool *idl_reset = idl_reset_;
> > > +
> > > + *idl_reset = true;
> > > + poll_immediate_wake();
> > > + unixctl_command_reply(conn, NULL);
> > > +}
> > > diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
> > > index 1f817423a..ddb296aaa 100644
> > > --- a/northd/ovn-northd.8.xml
> > > +++ b/northd/ovn-northd.8.xml
> > > @@ -96,6 +96,34 @@
> > > acquired OVSDB lock on SB DB, "standby" if it has not or
> "paused" if
> > > this instance is paused.
> > > </dd>
> > > +
> > > + <dt><code>sb-cluster-state-reset</code></dt>
> > > + <dd>
> > > + <p>
> > > + Reset southbound database cluster status when databases
> are destroyed
> > > + and rebuilt.
> > > + </p>
> > > + <p>
> > > + If all databases in a clustered southbound database are
> removed from
> > > + disk, then the stored index of all databases will be reset
> to zero.
> > > + This will cause ovn-northd to be unable to read or write
> to the
> > > + southbound database, because it will always detect the
> data as stale.
> > > + In such a case, run this command so that ovn-northd will
> reset its
> > > + local index so that it can interact with the southbound
> database again.
> > > + </p>
> > > + </dd>
> > > +
> > > + <dt><code>nb-cluster-state-reset</code></dt>
> > > + <dd>
> > > + <p>
> > > + Reset northbound database cluster status when databases
> are destroyed
> > > + and rebuilt.
> > > + </p>
> > > + <p>
> > > + This performs the same task as
> <code>sb-cluster-state-reset</code>
> > > + except for the northbound database client.
> > > + </p>
> > > + </dd>
> > > </dl>
> > > </p>
> > >
> > > diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
> > > index 742aad85e..3714488bb 100644
> > > --- a/northd/ovn-northd.c
> > > +++ b/northd/ovn-northd.c
> > > @@ -56,6 +56,7 @@ static unixctl_cb_func ovn_northd_pause;
> > > static unixctl_cb_func ovn_northd_resume;
> > > static unixctl_cb_func ovn_northd_is_paused;
> > > static unixctl_cb_func ovn_northd_status;
> > > +static unixctl_cb_func cluster_state_reset_cmd;
> > >
> > > struct northd_context {
> > > struct ovsdb_idl *ovnnb_idl;
> > > @@ -11783,6 +11784,16 @@ main(int argc, char *argv[])
> > > &state);
> > > unixctl_command_register("status", "", 0, 0,
> ovn_northd_status, &state);
> > >
> > > + bool reset_ovnsb_idl_min_index = false;
> > > + unixctl_command_register("sb-cluster-state-reset", "", 0, 0,
> > > + cluster_state_reset_cmd,
> > > + &reset_ovnsb_idl_min_index);
> > > +
> > > + bool reset_ovnnb_idl_min_index = false;
> > > + unixctl_command_register("nb-cluster-state-reset", "", 0, 0,
> > > + cluster_state_reset_cmd,
> > > + &reset_ovnnb_idl_min_index);
> > > +
> > > daemonize_complete();
> > >
> > > /* We want to detect (almost) all changes to the ovn-nb db. */
> > > @@ -12068,6 +12079,18 @@ main(int argc, char *argv[])
> > > ovsdb_idl_set_probe_interval(ovnnb_idl_loop.idl,
> northd_probe_interval);
> > > ovsdb_idl_set_probe_interval(ovnsb_idl_loop.idl,
> northd_probe_interval);
> > >
> > > + if (reset_ovnsb_idl_min_index) {
> > > + VLOG_INFO("Resetting southbound database cluster state");
> > > + ovsdb_idl_reset_min_index(ovnsb_idl_loop.idl);
> > > + reset_ovnsb_idl_min_index = false;
> > > + }
> > > +
> > > + if (reset_ovnnb_idl_min_index) {
> > > + VLOG_INFO("Resetting northbound database cluster state");
> > > + ovsdb_idl_reset_min_index(ovnnb_idl_loop.idl);
> > > + reset_ovnnb_idl_min_index = false;
> > > + }
> > > +
> > > poll_block();
> > > if (should_service_stop()) {
> > > exiting = true;
> > > @@ -12146,3 +12169,14 @@ ovn_northd_status(struct unixctl_conn
> *conn, int argc OVS_UNUSED,
> > > unixctl_command_reply(conn, ds_cstr(&s));
> > > ds_destroy(&s);
> > > }
> > > +
> > > +static void
> > > +cluster_state_reset_cmd(struct unixctl_conn *conn, int argc
> OVS_UNUSED,
> > > + const char *argv[] OVS_UNUSED, void *idl_reset_)
> > > +{
> > > + bool *idl_reset = idl_reset_;
> > > +
> > > + *idl_reset = true;
> > > + poll_immediate_wake();
> > > + unixctl_command_reply(conn, NULL);
> > > +}
> > >
> >
> > _______________________________________________
> > dev mailing list
> > dev at openvswitch.org <mailto:dev at openvswitch.org>
> > https://mail.openvswitch.org/mailman/listinfo/ovs-dev
More information about the dev
mailing list