[ovs-dev] [PATCH v2] ovn-controller: eliminate stall in ofctrl state machine

Lance Richardson lrichard at redhat.com
Fri Jul 8 00:31:08 UTC 2016


The "ovn -- 2 HVs, 3 LRs connected via LS, static routes"
test case currently exhibits frequent failures. These failures
occur because, at the time that the test packets are sent to
verify forwarding, no flows have been installed in the vswitch
for one of the hypervisors.

Investigation shows that, in the failing case, the ofctrl state
machine has not yet transitioned to the S_UPDATE_FLOWS state.
This occurrs when ofctrl_run() is called and:
   1) The state is S_TLV_TABLE_MOD_SENT.
   2) An OFPTYPE_NXT_TLV_TABLE_REPLY message is queued for reception.
   3) No event (other than SB probe timer expiration) is expected
      that would unblock poll_block() in the main ovn-controller
      loop.

In this scenario, ofctrl_run() will move state to S_CLEAR_FLOWS
and return, without having executed run_S_CLEAR_FLOWS() which
would have immediately transitioned the state to S_UPDATE_FLOWS
which is needed in order for ovn-controller to configure flows
in ovs-vswitchd. After a delay of about 5 seconds (the default
SB probe timer interval), ofctrl_run() would be called again
to make the transition to S_UPDATE_FLOWS, but by this time
the test case has already failed.

Fix by expanding the state machine's "while state != old_state"
loop to include processing of receive events, with a maximum
iteration limit to prevent excessive looping in pathological
cases. Without this fix, around 40 failures are seen out of
100 attempts, with this fix no failures have been observed in
several hundred attempts.

Signed-off-by: Lance Richardson <lrichard at redhat.com>
---
 v2: Added maximum iteration limit to state machine loop.

 ovn/controller/ofctrl.c | 50 ++++++++++++++++++++++++++-----------------------
 1 file changed, 27 insertions(+), 23 deletions(-)

diff --git a/ovn/controller/ofctrl.c b/ovn/controller/ofctrl.c
index 4c410da..fa23af0 100644
--- a/ovn/controller/ofctrl.c
+++ b/ovn/controller/ofctrl.c
@@ -88,6 +88,9 @@ enum ofctrl_state {
 /* Current state. */
 static enum ofctrl_state state;
 
+/* Maximum number of state machine iterations per invocation*/
+#define OFCTRL_SM_ITER_MAX 10
+
 /* Transaction IDs for messages in flight to the switch. */
 static ovs_be32 xid, xid2;
 
@@ -401,6 +404,7 @@ ofctrl_run(const struct ovsrec_bridge *br_int)
     }
 
     enum ofctrl_state old_state;
+    int count = 0;
     do {
         old_state = state;
         switch (state) {
@@ -410,36 +414,36 @@ ofctrl_run(const struct ovsrec_bridge *br_int)
         default:
             OVS_NOT_REACHED();
         }
-    } while (state != old_state);
 
-    for (int i = 0; state == old_state && i < 50; i++) {
-        struct ofpbuf *msg = rconn_recv(swconn);
-        if (!msg) {
-            break;
-        }
+        for (int i = 0; state == old_state && i < 50; i++) {
+            struct ofpbuf *msg = rconn_recv(swconn);
+            if (!msg) {
+                break;
+            }
 
-        const struct ofp_header *oh = msg->data;
-        enum ofptype type;
-        enum ofperr error;
+            const struct ofp_header *oh = msg->data;
+            enum ofptype type;
+            enum ofperr error;
 
-        error = ofptype_decode(&type, oh);
-        if (!error) {
-            switch (state) {
+            error = ofptype_decode(&type, oh);
+            if (!error) {
+                switch (state) {
 #define STATE(NAME) case NAME: recv_##NAME(oh, type); break;
-                STATES
+                    STATES
 #undef STATE
-            default:
-                OVS_NOT_REACHED();
+                default:
+                    OVS_NOT_REACHED();
+                }
+            } else {
+                char *s = ofp_to_string(oh, ntohs(oh->length), 1);
+                VLOG_WARN("could not decode OpenFlow message (%s): %s",
+                          ofperr_to_string(error), s);
+                free(s);
             }
-        } else {
-            char *s = ofp_to_string(oh, ntohs(oh->length), 1);
-            VLOG_WARN("could not decode OpenFlow message (%s): %s",
-                      ofperr_to_string(error), s);
-            free(s);
-        }
 
-        ofpbuf_delete(msg);
-    }
+            ofpbuf_delete(msg);
+        }
+    } while (state != old_state && count++ < OFCTRL_SM_ITER_MAX);
 
     return (state == S_CLEAR_FLOWS || state == S_UPDATE_FLOWS
             ? mff_ovn_geneve : 0);
-- 
2.5.5




More information about the dev mailing list