[ovs-dev] [PATCH] Use TCP keepalives, where available, for OpenFlow and OVSDB connections.

Ben Pfaff blp at nicira.com
Tue Apr 19 21:00:51 UTC 2011


Until now Open vSwitch has always used application-level keepalive
mechanisms, that is, OFPT_ECHO_REQUEST messages for OpenFlow and echo
requests for JSON-RPC.  This is an end-to-end mechanism, which gives the
highest assurance that the connection is really up.  However, it
has the disadvantage that if the remote process is busy for a few seconds
then the connection can drop and reconnect.  The reconnection then often
causes more work for the remote process, which can cause other connections
to drop, and then the problem snowballs.

This commit instead switches to TCP keepalives where they are
available.  TCP keepalives have the advantage that they do not require the
remote process to respond, only the transport layer underlying the remote
process, that is, the kernel in our case.  This is much less likely to
cause spurious timeouts.

It remains to be seen whether the lack of end-to-end keepalive probing is
a real problem in practice.

The Python changes in this commit ensure that the "reconnect" module tests
still pass for the Python implementation.  No actual Python
implementation of TCP keepalives is included because the Python version of
the OVSDB modules currently only supports Unix domain socket connections.

CC: Alex Yip <alex at nicira.com>
CC: Martin Casado <casado at nicira.com>
---
 lib/jsonrpc.c           |   38 +++++++-
 lib/rconn.c             |   68 ++++++++++---
 lib/reconnect.c         |  121 +++++++++++++++++++----
 lib/reconnect.h         |    7 +-
 lib/socket-util.c       |   89 +++++++++++++++++
 lib/socket-util.h       |    3 +
 lib/stream-fd.c         |   26 +++++-
 lib/stream-fd.h         |    4 +-
 lib/stream-provider.h   |   31 ++++++-
 lib/stream-ssl.c        |   18 ++++-
 lib/stream-tcp.c        |    6 +-
 lib/stream-unix.c       |    8 +-
 lib/stream.c            |   36 +++++++
 lib/stream.h            |    3 +
 lib/vconn-provider.h    |   33 ++++++-
 lib/vconn-stream.c      |   18 ++++-
 lib/vconn.c             |   38 +++++++-
 lib/vconn.h             |    5 +-
 ovsdb/ovsdb-client.c    |    1 -
 python/ovs/reconnect.py |  125 ++++++++++++++++++------
 tests/reconnect.at      |  243 +++++++++++++++++++++++++++++++++++++++++++++++
 tests/test-reconnect.c  |   22 ++++-
 tests/test-reconnect.py |   16 +++-
 vswitchd/vswitch.xml    |   45 ++++++---
 24 files changed, 899 insertions(+), 105 deletions(-)

diff --git a/lib/jsonrpc.c b/lib/jsonrpc.c
index afcc520..148f650 100644
--- a/lib/jsonrpc.c
+++ b/lib/jsonrpc.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2009, 2010 Nicira Networks.
+ * Copyright (c) 2009, 2010, 2011 Nicira Networks.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -674,6 +674,8 @@ struct jsonrpc_session {
     unsigned int seqno;
 };
 
+static void jsonrpc_session_set_keepalive__(struct jsonrpc_session *);
+
 /* Creates and returns a jsonrpc_session to 'name', which should be a string
  * acceptable to stream_open() or pstream_open().
  *
@@ -726,6 +728,8 @@ jsonrpc_session_open_unreliably(struct jsonrpc *jsonrpc)
     s->pstream = NULL;
     s->seqno = 0;
 
+    jsonrpc_session_set_keepalive__(s);
+
     return s;
 }
 
@@ -775,7 +779,9 @@ jsonrpc_session_connect(struct jsonrpc_session *s)
         }
     }
 
-    if (error) {
+    if (!error) {
+        jsonrpc_session_set_keepalive__(s);
+    } else {
         reconnect_connect_failed(s->reconnect, time_msec(), error);
     }
     s->seqno++;
@@ -852,6 +858,13 @@ jsonrpc_session_run(struct jsonrpc_session *s)
             jsonrpc_send(s->rpc, request);
         }
         break;
+
+    case RECONNECT_CHECK_IDLE:
+        if (s->rpc && s->rpc->stream) {
+            reconnect_set_transport_idle(s->reconnect, time_msec(),
+                                         stream_is_idle(s->rpc->stream));
+        }
+        break;
     }
 }
 
@@ -978,5 +991,24 @@ void
 jsonrpc_session_set_probe_interval(struct jsonrpc_session *s,
                                    int probe_interval)
 {
-    reconnect_set_probe_interval(s->reconnect, probe_interval);
+    if (probe_interval != reconnect_get_probe_interval(s->reconnect)) {
+        reconnect_set_probe_interval(s->reconnect, probe_interval);
+        jsonrpc_session_set_keepalive__(s);
+    }
+}
+
+/* If 's' has an underlying stream, sets the keepalive probe interval on it to
+ * correspond with the settings in the reconnect object, and then notifies the
+ * reconnect object of whether that worked. */
+static void
+jsonrpc_session_set_keepalive__(struct jsonrpc_session *s)
+{
+    struct stream *stream;
+
+    stream = s->stream ? s->stream : s->rpc ? s->rpc->stream : NULL;
+    if (stream) {
+        int probe_interval = reconnect_get_probe_interval(s->reconnect);
+        int error = stream_set_keepalive(stream, probe_interval / 1000);
+        reconnect_set_app_probe(s->reconnect, error != 0);
+    }
 }
diff --git a/lib/rconn.c b/lib/rconn.c
index 0e18ab4..2ced6ca 100644
--- a/lib/rconn.c
+++ b/lib/rconn.c
@@ -104,11 +104,11 @@ struct rconn {
     time_t creation_time;
     unsigned long int total_time_connected;
 
-    /* Throughout this file, "probe" is shorthand for "inactivity probe".
-     * When nothing has been received from the peer for a while, we send out
-     * an echo request as an inactivity probe packet.  We should receive back
-     * a response. */
+    /* Keepalives for inactivity probing.  An rconn can implement keepalives at
+     * application level by sending OFPT_ECHO_REQUEST messages and waiting for
+     * a reply, or at the transport level if the vconn supports that. */
     int probe_interval;         /* Secs of inactivity before sending probe. */
+    bool app_probe;             /* Are we using OFPT_ECHO_REQUEST to probe? */
 
     /* When we create a vconn we obtain these values, to save them past the end
      * of the vconn's lifetime.  Otherwise, in-band control will only allow
@@ -143,14 +143,14 @@ static void copy_to_monitor(struct rconn *, const struct ofpbuf *);
 static bool is_connected_state(enum state);
 static bool is_admitted_msg(const struct ofpbuf *);
 static bool rconn_logging_connection_attempts__(const struct rconn *);
+static void rconn_set_keepalive__(struct rconn *);
 
 /* Creates and returns a new rconn.
  *
- * 'probe_interval' is a number of seconds.  If the interval passes once
- * without an OpenFlow message being received from the peer, the rconn sends
- * out an "echo request" message.  If the interval passes again without a
- * message being received, the rconn disconnects and re-connects to the peer.
- * Setting 'probe_interval' to 0 disables this behavior.
+ * The rconn will ensure that if the connection is idle for 'probe_interval'
+ * seconds, then it will check that the connection is still up, in an
+ * implementation-specific way.  Setting 'probe_interval' to 0 disables this
+ * behavior.
  *
  * 'max_backoff' is the maximum number of seconds between attempts to connect
  * to the peer.  The actual interval starts at 1 second and doubles on each
@@ -221,7 +221,10 @@ rconn_get_max_backoff(const struct rconn *rc)
 void
 rconn_set_probe_interval(struct rconn *rc, int probe_interval)
 {
-    rc->probe_interval = probe_interval ? MAX(5, probe_interval) : 0;
+    if (probe_interval != rc->probe_interval) {
+        rc->probe_interval = probe_interval ? MAX(5, probe_interval) : 0;
+        rconn_set_keepalive__(rc);
+    }
 }
 
 int
@@ -337,6 +340,9 @@ reconnect(struct rconn *rc)
     rc->n_attempted_connections++;
     retval = vconn_open(rc->target, OFP_VERSION, &rc->vconn);
     if (!retval) {
+        if (rc->probe_interval) {
+            rconn_set_keepalive__(rc);
+        }
         rc->remote_ip = vconn_get_remote_ip(rc->vconn);
         rc->local_ip = vconn_get_local_ip(rc->vconn);
         rc->remote_port = vconn_get_remote_port(rc->vconn);
@@ -416,6 +422,13 @@ timeout_ACTIVE(const struct rconn *rc)
     if (rc->probe_interval) {
         unsigned int base = MAX(rc->last_received, rc->state_entered);
         unsigned int arg = base + rc->probe_interval - rc->state_entered;
+        if (!rc->app_probe) {
+            /* With keepalives, run_ACTIVE() might not transition to IDLE when
+             * the timeout expires.  Prevent using 100% CPU by waking up no
+             * more than once per second.  'arg' is relative to state entry. */
+            time_t now = time_now();
+            arg = MAX(arg, now - rc->state_entered + 1);
+        }
         return arg;
     }
     return UINT_MAX;
@@ -424,7 +437,13 @@ timeout_ACTIVE(const struct rconn *rc)
 static void
 run_ACTIVE(struct rconn *rc)
 {
-    if (timed_out(rc)) {
+    if (!rc->app_probe) {
+        if (vconn_is_idle(rc->vconn)) {
+            VLOG_DBG("%s: sent transport-level keepalive", rc->name);
+            state_transition(rc, S_IDLE);
+            return;
+        }
+    } else if (timed_out(rc)) {
         unsigned int base = MAX(rc->last_received, rc->state_entered);
         VLOG_DBG("%s: idle %u seconds, sending inactivity probe",
                  rc->name, (unsigned int) (time_now() - base));
@@ -443,20 +462,30 @@ run_ACTIVE(struct rconn *rc)
 static unsigned int
 timeout_IDLE(const struct rconn *rc)
 {
-    return rc->probe_interval;
+    if (!rc->app_probe) {
+        /* Wake up once per second to check for recovery. */
+        return time_now() - rc->state_entered + 1;
+    } else {
+        return rc->probe_interval;
+    }
 }
 
 static void
 run_IDLE(struct rconn *rc)
 {
-    if (timed_out(rc)) {
+    if (!rc->app_probe) {
+        if (!vconn_is_idle(rc->vconn)) {
+            VLOG_DBG("%s: received transport-level keepalive reply", rc->name);
+            state_transition(rc, S_ACTIVE);
+        }
+    } else if (timed_out(rc)) {
         VLOG_ERR("%s: no response to inactivity probe after %u "
                  "seconds, disconnecting",
                  rc->name, elapsed_in_this_state(rc));
         disconnect(rc, ETIMEDOUT);
-    } else {
-        do_tx_work(rc);
+        return;
     }
+    do_tx_work(rc);
 }
 
 /* Performs whatever activities are necessary to maintain 'rc': if 'rc' is
@@ -1106,3 +1135,12 @@ rconn_logging_connection_attempts__(const struct rconn *rc)
 {
     return rc->backoff < rc->max_backoff;
 }
+
+static void
+rconn_set_keepalive__(struct rconn *rc)
+{
+    if (rc->vconn) {
+        int error = vconn_set_keepalive(rc->vconn, rc->probe_interval);
+        rc->app_probe = error != 0;
+    }
+}
diff --git a/lib/reconnect.c b/lib/reconnect.c
index c169016..2844c1c 100644
--- a/lib/reconnect.c
+++ b/lib/reconnect.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008, 2009, 2010 Nicira Networks.
+ * Copyright (c) 2008, 2009, 2010, 2011 Nicira Networks.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -51,6 +51,7 @@ struct reconnect {
     int min_backoff;
     int max_backoff;
     int probe_interval;
+    bool app_probe;             /* Probing implemented at application level? */
     bool passive;
     enum vlog_level info;       /* Used for informational messages. */
 
@@ -62,6 +63,7 @@ struct reconnect {
     long long int last_connected;
     long long int last_disconnected;
     unsigned int max_tries;
+    long long int next_transport_idle_update;
 
     /* These values are simply for statistics reporting, not otherwise used
      * directly by anything internal. */
@@ -99,6 +101,7 @@ reconnect_create(long long int now)
     fsm->min_backoff = RECONNECT_DEFAULT_MIN_BACKOFF;
     fsm->max_backoff = RECONNECT_DEFAULT_MAX_BACKOFF;
     fsm->probe_interval = RECONNECT_DEFAULT_PROBE_INTERVAL;
+    fsm->app_probe = true;
     fsm->passive = false;
     fsm->info = VLL_INFO;
 
@@ -109,6 +112,7 @@ reconnect_create(long long int now)
     fsm->last_connected = LLONG_MAX;
     fsm->last_disconnected = LLONG_MAX;
     fsm->max_tries = UINT_MAX;
+    fsm->next_transport_idle_update = now;
     fsm->creation_time = now;
 
     return fsm;
@@ -177,15 +181,40 @@ reconnect_get_max_backoff(const struct reconnect *fsm)
 /* Returns the "probe interval" for 'fsm' in milliseconds.  If this is zero, it
  * disables the connection keepalive feature.  If it is nonzero, then if the
  * interval passes while 'fsm' is connected and without reconnect_received()
- * being called for 'fsm', reconnect_run() returns RECONNECT_PROBE.  If the
- * interval passes again without reconnect_received() being called,
- * reconnect_run() returns RECONNECT_DISCONNECT for 'fsm'. */
+ * being called for 'fsm', then reconnect_run()'s behavior depends on the type
+ * of configured probing (see reconnect_set_app_probe()):
+ *
+ *   - If application-level probing is configured, reconnect_run() returns
+ *     RECONNECT_PROBE.  If the interval passes again without
+ *     reconnect_received() being called, reconnect_run() returns
+ *     RECONNECT_DISCONNECT for 'fsm'.
+ *
+ *   - If transport-level probing is configured, reconnect_run() returns
+ *     RECONNECT_CHECK_IDLE.  The transport itself is responsible for deciding
+ *     whether the connection has timed out in this case.
+ */
 int
 reconnect_get_probe_interval(const struct reconnect *fsm)
 {
     return fsm->probe_interval;
 }
 
+/* Returns how 'fsm' expects keepalive probing to work.  If the return value is
+ * true, then the application performs keepalive probing, e.g. by sending "echo
+ * request" OpenFlow or JSON-RPC messages.  If the return value is false, then
+ * the underlying transport performs keepalive probing, e.g. via the TCP
+ * keepalive mechanism.
+ *
+ * This value has no effect if the keepalive feature is disabled, that is, if
+ * reconnect_get_probe_interval() returns 0.
+ *
+ * See reconnect_get_probe_interval() for more information. */
+bool
+reconnect_is_app_probe(const struct reconnect *fsm)
+{
+    return fsm->app_probe;
+}
+
 /* Limits the maximum number of times that 'fsm' will ask the client to try to
  * reconnect to 'max_tries'.  UINT_MAX (the default) means an unlimited number
  * of tries.
@@ -232,19 +261,31 @@ reconnect_set_backoff(struct reconnect *fsm, int min_backoff, int max_backoff)
 
 /* Sets the "probe interval" for 'fsm' to 'probe_interval', in milliseconds.
  * If this is zero, it disables the connection keepalive feature.  If it is
- * nonzero, then if the interval passes while 'fsm' is connected and without
- * reconnect_received() being called for 'fsm', reconnect_run() returns
- * RECONNECT_PROBE.  If the interval passes again without reconnect_received()
- * being called, reconnect_run() returns RECONNECT_DISCONNECT for 'fsm'.
+ * nonzero, then it will be forced to a value of at least 1000 ms.
  *
- * If 'probe_interval' is nonzero, then it will be forced to a value of at
- * least 1000 ms. */
+ * See reconnect_get_probe_interval() for more information. */
 void
 reconnect_set_probe_interval(struct reconnect *fsm, int probe_interval)
 {
     fsm->probe_interval = probe_interval ? MAX(1000, probe_interval) : 0;
 }
 
+/* Configures how 'fsm' expects keepalive probing to work.  If 'app_probe' is
+ * true, then the application performs keepalive probing, e.g. by sending "echo
+ * request" OpenFlow or JSON-RPC messages.  If 'app_probe' is false, then the
+ * underlying transport performs keepalive probing, e.g. via the TCP keepalive
+ * mechanism.
+ *
+ * This setting has no effect if the keepalive feature is disabled, that is, if
+ * reconnect_get_probe_interval() returns 0.
+ *
+ * See reconnect_get_probe_interval() for more information. */
+void
+reconnect_set_app_probe(struct reconnect *fsm, bool app_probe)
+{
+    fsm->app_probe = app_probe;
+}
+
 /* Returns true if 'fsm' is in passive mode, false if 'fsm' is in active mode
  * (the default). */
 bool
@@ -478,6 +519,23 @@ reconnect_received(struct reconnect *fsm, long long int now)
     fsm->last_received = now;
 }
 
+/* Tell 'fsm' whether the transport layer is currently probing whether an idle
+ * connection is still connected.  'idle' should be true if this is known to be
+ * the case, or false if it is known not to be the case or if the answer cannot
+ * be determined. */
+void
+reconnect_set_transport_idle(struct reconnect *fsm, long long int now,
+                             bool idle)
+{
+    if (fsm->state == S_ACTIVE && idle) {
+        VLOG_DBG("%s: sent transport-level keepalive", fsm->name);
+        reconnect_transition__(fsm, now, S_IDLE);
+    } else if (fsm->state == S_IDLE && !idle) {
+        VLOG_DBG("%s: received transport-level keepalive reply", fsm->name);
+        reconnect_transition__(fsm, now, S_ACTIVE);
+    }
+}
+
 static void
 reconnect_transition__(struct reconnect *fsm, long long int now,
                        enum state state)
@@ -518,12 +576,15 @@ reconnect_deadline__(const struct reconnect *fsm)
     case S_ACTIVE:
         if (fsm->probe_interval) {
             long long int base = MAX(fsm->last_received, fsm->state_entered);
-            return base + fsm->probe_interval;
+            return MAX(base + fsm->probe_interval,
+                       fsm->next_transport_idle_update);
         }
         return LLONG_MAX;
 
     case S_IDLE:
-        return fsm->state_entered + fsm->probe_interval;
+        return (fsm->app_probe
+                ? fsm->state_entered + fsm->probe_interval
+                : fsm->next_transport_idle_update);
 
     case S_RECONNECT:
         return fsm->state_entered;
@@ -566,7 +627,14 @@ reconnect_deadline__(const struct reconnect *fsm)
  *     - RECONNECT_PROBE: The client should send some kind of request to the
  *       peer that will elicit a response, to ensure that the connection is
  *       indeed in working order.  (This will only be returned if the "probe
- *       interval" is nonzero--see reconnect_set_probe_interval()).
+ *       interval" is nonzero and application-level probing is configured.  See
+ *       reconnect_set_probe_interval() and reconnect_set_app_probe().)
+ *
+ *     - RECONNECT_CHECK_IDLE: The client should query the transport layer to
+ *       determine whether idle connection probing is taking place and call
+ *       reconnect_set_transport_idle() to report the result.  (This will only
+ *       be returned if the "probe interval" is nonzero and transport-level
+ *       probing is configured.  See reconnect_set_app_probe().)
  */
 enum reconnect_action
 reconnect_run(struct reconnect *fsm, long long int now)
@@ -583,16 +651,27 @@ reconnect_run(struct reconnect *fsm, long long int now)
             return RECONNECT_DISCONNECT;
 
         case S_ACTIVE:
-            VLOG_DBG("%s: idle %lld ms, sending inactivity probe", fsm->name,
-                     now - MAX(fsm->last_received, fsm->state_entered));
-            reconnect_transition__(fsm, now, S_IDLE);
-            return RECONNECT_PROBE;
+            if (fsm->app_probe) {
+                VLOG_DBG("%s: idle %lld ms, sending inactivity probe",
+                         fsm->name,
+                         now - MAX(fsm->last_received, fsm->state_entered));
+                reconnect_transition__(fsm, now, S_IDLE);
+                return RECONNECT_PROBE;
+            } else {
+                fsm->next_transport_idle_update = now + 1000;
+                return RECONNECT_CHECK_IDLE;
+            }
 
         case S_IDLE:
-            VLOG_ERR("%s: no response to inactivity probe after %.3g "
-                     "seconds, disconnecting",
-                     fsm->name, (now - fsm->state_entered) / 1000.0);
-            return RECONNECT_DISCONNECT;
+            if (fsm->app_probe) {
+                VLOG_ERR("%s: no response to inactivity probe after %.3g "
+                         "seconds, disconnecting",
+                         fsm->name, (now - fsm->state_entered) / 1000.0);
+                return RECONNECT_DISCONNECT;
+            } else {
+                fsm->next_transport_idle_update = now + 1000;
+                return RECONNECT_CHECK_IDLE;
+            }
 
         case S_RECONNECT:
             return RECONNECT_DISCONNECT;
diff --git a/lib/reconnect.h b/lib/reconnect.h
index 997a03f..c34b973 100644
--- a/lib/reconnect.h
+++ b/lib/reconnect.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2009, 2010 Nicira Networks.
+ * Copyright (c) 2009, 2010, 2011 Nicira Networks.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -48,6 +48,7 @@ void reconnect_set_name(struct reconnect *, const char *name);
 int reconnect_get_min_backoff(const struct reconnect *);
 int reconnect_get_max_backoff(const struct reconnect *);
 int reconnect_get_probe_interval(const struct reconnect *);
+bool reconnect_is_app_probe(const struct reconnect *);
 
 void reconnect_set_max_tries(struct reconnect *, unsigned int max_tries);
 unsigned int reconnect_get_max_tries(struct reconnect *);
@@ -55,6 +56,7 @@ unsigned int reconnect_get_max_tries(struct reconnect *);
 void reconnect_set_backoff(struct reconnect *,
                            int min_backoff, int max_backoff);
 void reconnect_set_probe_interval(struct reconnect *, int probe_interval);
+void reconnect_set_app_probe(struct reconnect *, bool app_probe);
 
 bool reconnect_is_passive(const struct reconnect *);
 void reconnect_set_passive(struct reconnect *, bool passive,
@@ -80,11 +82,14 @@ void reconnect_connected(struct reconnect *, long long int now);
 void reconnect_connect_failed(struct reconnect *, long long int now,
                               int error);
 void reconnect_received(struct reconnect *, long long int now);
+void reconnect_set_transport_idle(struct reconnect *, long long int now,
+                                  bool idle);
 
 enum reconnect_action {
     RECONNECT_CONNECT = 1,
     RECONNECT_DISCONNECT,
     RECONNECT_PROBE,
+    RECONNECT_CHECK_IDLE
 };
 enum reconnect_action reconnect_run(struct reconnect *, long long int now);
 void reconnect_wait(struct reconnect *, long long int now);
diff --git a/lib/socket-util.c b/lib/socket-util.c
index 7e4b8be..2515f88 100644
--- a/lib/socket-util.c
+++ b/lib/socket-util.c
@@ -20,6 +20,7 @@
 #include <errno.h>
 #include <fcntl.h>
 #include <netdb.h>
+#include <netinet/tcp.h>
 #include <poll.h>
 #include <stddef.h>
 #include <stdio.h>
@@ -791,3 +792,91 @@ xpipe(int fds[2])
         VLOG_FATAL("failed to create pipe (%s)", strerror(errno));
     }
 }
+
+#ifdef __linux__
+/* Linux setsockopt() enforces these maximum values for the TCP keepalive
+ * parameters.  They are in the kernel's own <net/tcp.h> but not exported into
+ * public header files.  They have not changed since at least 2.6.12. */
+#define MAX_TCP_KEEPIDLE	32767
+#define MAX_TCP_KEEPINTVL	32767
+#define MAX_TCP_KEEPCNT		127
+
+static int
+setsockopt_int(int fd, int level, int optname, int value)
+{
+    return setsockopt(fd, level, optname, &value, sizeof(value));
+}
+
+/* Attempts to configure TCP keepalive probes on TCP socket 'fd'.  Returns 0
+ * if successful, otherwise a positive errno value.
+ *
+ * For 'max_idle' > 0, the expected behavior should approximate sending a
+ * keepalive probe after 'max_idle' seconds and disconnecting after an
+ * additional 'max_idle' seconds if no response is received.  A 'max_idle' of 0
+ * requests disabling keepalives.
+ *
+ * SO_KEEPALIVE is standard POSIX, but this function also needs per-connection
+ * keepalive settings that only Linux implements (AFAIK).  It returns
+ * EOPNOTSUPP if the operating system lacks them; the caller should then fall
+ * back to application-level probes (e.g. OFPT_ECHO_REQUEST). */
+int
+set_tcp_keepalive(int fd, int max_idle)
+{
+    int error = 0;
+    if (max_idle > 0) {
+        /* Probe once per second after 'keepidle' idle seconds. */
+        int keepidle = MIN(max_idle, MAX_TCP_KEEPIDLE);
+        int keepintvl = 1;
+        int keepcnt = MIN(max_idle, MAX_TCP_KEEPCNT);
+        error = (setsockopt_int(fd, IPPROTO_TCP, TCP_KEEPIDLE, keepidle)
+                 || setsockopt_int(fd, IPPROTO_TCP, TCP_KEEPINTVL, keepintvl)
+                 || setsockopt_int(fd, IPPROTO_TCP, TCP_KEEPCNT, keepcnt));
+    }
+    if (!error) {
+        error = setsockopt_int(fd, SOL_SOCKET, SO_KEEPALIVE, max_idle > 0);
+    }
+    if (error) {
+        error = errno;          /* Save errno before logging can clobber it. */
+        VLOG_WARN("failed to set %ds TCP keepalive (%s)",
+                  max_idle, strerror(error));
+    }
+    return error;
+}
+
+/* Returns true if TCP socket 'fd' is known to be in the process of verifying
+ * that the connection is alive, e.g. a TCP keepalive probe has been sent but
+ * no response has been received.  Returns false otherwise, including in the
+ * case where the TCP implementation does not provide a way to query the
+ * keepalive probing status.
+ *
+ * This function assumes that TCP keepalives have been enabled by a successful
+ * call to set_tcp_keepalive() with a positive 'max_idle' value. */
+bool
+is_tcp_idle(int fd)
+{
+    struct tcp_info tcp;
+    socklen_t len;
+
+    len = sizeof(tcp);
+    if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &tcp, &len)) {
+        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
+
+        VLOG_WARN_RL(&rl, "failed to retrieve tcp_info (%s)", strerror(errno));
+        return false;
+    }
+
+    return tcp.tcpi_probes > 0;
+}
+#else  /* !__linux__ */
+int
+set_tcp_keepalive(int fd, int max_idle)
+{
+    return EOPNOTSUPP;
+}
+
+bool
+is_tcp_idle(int fd)
+{
+    return false;
+}
+#endif  /* !__linux__ */
diff --git a/lib/socket-util.h b/lib/socket-util.h
index 8c5af39..6806edf 100644
--- a/lib/socket-util.h
+++ b/lib/socket-util.h
@@ -48,6 +48,9 @@ int write_fully(int fd, const void *, size_t, size_t *bytes_written);
 int fsync_parent_dir(const char *file_name);
 int get_mtime(const char *file_name, struct timespec *mtime);
 
+int set_tcp_keepalive(int fd, int max_idle);
+bool is_tcp_idle(int fd);
+
 void xpipe(int fds[2]);
 
 #endif /* socket-util.h */
diff --git a/lib/stream-fd.c b/lib/stream-fd.c
index 2026db6..f51b9dd 100644
--- a/lib/stream-fd.c
+++ b/lib/stream-fd.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008, 2009, 2010 Nicira Networks.
+ * Copyright (c) 2008, 2009, 2010, 2011 Nicira Networks.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -43,6 +43,7 @@ struct stream_fd
     struct stream stream;
     int fd;
     char *unlink_path;
+    bool is_tcp;
 };
 
 static struct stream_class stream_fd_class;
@@ -55,6 +56,8 @@ static void maybe_unlink_and_free(char *path);
  * and stores a pointer to the stream in '*streamp'.  Initial connection status
  * 'connect_status' is interpreted as described for stream_init().
  *
+ * Specify 'is_tcp' as true if 'fd' is a TCP socket, false otherwise.
+ *
  * When '*streamp' is closed, then 'unlink_path' (if nonnull) will be passed to
  * fatal_signal_unlink_file_now() and then freed with free().
  *
@@ -62,7 +65,7 @@ static void maybe_unlink_and_free(char *path);
  * implementation never fails.) */
 int
 new_fd_stream(const char *name, int fd, int connect_status,
-              char *unlink_path, struct stream **streamp)
+              char *unlink_path, bool is_tcp, struct stream **streamp)
 {
     struct stream_fd *s;
 
@@ -70,12 +73,13 @@ new_fd_stream(const char *name, int fd, int connect_status,
     stream_init(&s->stream, &stream_fd_class, connect_status, name);
     s->fd = fd;
     s->unlink_path = unlink_path;
+    s->is_tcp = is_tcp;
     *streamp = &s->stream;
     return 0;
 }
 
 static struct stream_fd *
-stream_fd_cast(struct stream *stream)
+stream_fd_cast(const struct stream *stream)
 {
     stream_assert_class(stream, &stream_fd_class);
     return CONTAINER_OF(stream, struct stream_fd, stream);
@@ -154,6 +158,20 @@ fd_wait(struct stream *stream, enum stream_wait_type wait)
     }
 }
 
+static int
+fd_set_keepalive(struct stream *stream, int max_idle)
+{
+    struct stream_fd *s = stream_fd_cast(stream);
+    return s->is_tcp ? set_tcp_keepalive(s->fd, max_idle) : EOPNOTSUPP;
+}
+
+static bool
+fd_is_idle(const struct stream *stream)
+{
+    const struct stream_fd *s = stream_fd_cast(stream);
+    return s->is_tcp ? is_tcp_idle(s->fd) : false;
+}
+
 static struct stream_class stream_fd_class = {
     "fd",                       /* name */
     NULL,                       /* open */
@@ -164,6 +182,8 @@ static struct stream_class stream_fd_class = {
     NULL,                       /* run */
     NULL,                       /* run_wait */
     fd_wait,                    /* wait */
+    fd_set_keepalive,           /* set_keepalive */
+    fd_is_idle,                 /* is_idle */
 };
 
 /* Passive file descriptor stream. */
diff --git a/lib/stream-fd.h b/lib/stream-fd.h
index d2a34eb..abe461a 100644
--- a/lib/stream-fd.h
+++ b/lib/stream-fd.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008, 2009 Nicira Networks.
+ * Copyright (c) 2008, 2009, 2011 Nicira Networks.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -26,7 +26,7 @@ struct pstream;
 struct sockaddr;
 
 int new_fd_stream(const char *name, int fd, int connect_status,
-                      char *unlink_path, struct stream **streamp);
+                  char *unlink_path, bool is_tcp, struct stream **streamp);
 int new_fd_pstream(const char *name, int fd,
                    int (*accept_cb)(int fd, const struct sockaddr *,
                                     size_t sa_len, struct stream **),
diff --git a/lib/stream-provider.h b/lib/stream-provider.h
index 2b8ca69..92a1bb4 100644
--- a/lib/stream-provider.h
+++ b/lib/stream-provider.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2009, 2010 Nicira Networks.
+ * Copyright (c) 2009, 2010, 2011 Nicira Networks.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -124,6 +124,35 @@ struct stream_class {
     /* Arranges for the poll loop to wake up when 'stream' is ready to take an
      * action of the given 'type'. */
     void (*wait)(struct stream *stream, enum stream_wait_type type);
+
+    /* Attempts to configure keepalive probes on 'stream'.  For TCP sockets,
+     * for example, this would be TCP keepalives.  Returns 0 if successful,
+     * otherwise a positive errno value.
+     *
+     * For 'max_idle' > 0, the expected behavior should approximate sending a
+     * keepalive probe after 'max_idle' seconds and disconnecting after an
+     * additional 'max_idle' seconds if no response is received.  A 'max_idle'
+     * of 0 requests disabling keepalives.
+     *
+     * Support for keepalives varies by the type of channel and the operating
+     * system.  The caller should fall back to application-level probes if
+     * keepalives are not available.  This function should return EOPNOTSUPP if
+     * keepalives are not available.  It may be a null pointer if keepalives
+     * are known at compile time not to be supported. */
+    int (*set_keepalive)(struct stream *stream, int max_idle);
+
+    /* Returns true if 'stream' is known to be in the process of verifying that
+     * the connection is alive, e.g. a keepalive probe has been sent but no
+     * response has been received.  Returns false otherwise, including in the
+     * case where 'stream' has no way to query the keepalive probing status.
+     *
+     * This function may assume that keepalives have been enabled by a
+     * successful call to 'set_keepalive' with a positive 'max_idle' value.
+     *
+     * May be a null pointer if it is known at compile time either that
+     * keepalives are not supported or that 'stream' cannot query whether
+     * keepalive probes are in progress. */
+    bool (*is_idle)(const struct stream *stream);
 };
 
 /* Passive listener for incoming stream connections.
diff --git a/lib/stream-ssl.c b/lib/stream-ssl.c
index 977c5ba..a04c522 100644
--- a/lib/stream-ssl.c
+++ b/lib/stream-ssl.c
@@ -323,7 +323,7 @@ error:
 }
 
 static struct ssl_stream *
-ssl_stream_cast(struct stream *stream)
+ssl_stream_cast(const struct stream *stream)
 {
     stream_assert_class(stream, &ssl_stream_class);
     return CONTAINER_OF(stream, struct ssl_stream, stream);
@@ -844,6 +844,20 @@ ssl_wait(struct stream *stream, enum stream_wait_type wait)
     }
 }
 
+static int
+ssl_set_keepalive(struct stream *stream, int max_idle)
+{
+    struct ssl_stream *sslv = ssl_stream_cast(stream);
+    return set_tcp_keepalive(sslv->fd, max_idle);
+}
+
+static bool
+ssl_is_idle(const struct stream *stream)
+{
+    const struct ssl_stream *sslv = ssl_stream_cast(stream);
+    return is_tcp_idle(sslv->fd);
+}
+
 struct stream_class ssl_stream_class = {
     "ssl",                      /* name */
     ssl_open,                   /* open */
@@ -854,6 +868,8 @@ struct stream_class ssl_stream_class = {
     ssl_run,                    /* run */
     ssl_run_wait,               /* run_wait */
     ssl_wait,                   /* wait */
+    ssl_set_keepalive,          /* set_keepalive */
+    ssl_is_idle,                /* is_idle */
 };
 
 /* Passive SSL. */
diff --git a/lib/stream-tcp.c b/lib/stream-tcp.c
index d92fe3a..0bf5c80 100644
--- a/lib/stream-tcp.c
+++ b/lib/stream-tcp.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008, 2009, 2010 Nicira Networks.
+ * Copyright (c) 2008, 2009, 2010, 2011 Nicira Networks.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -58,7 +58,7 @@ new_tcp_stream(const char *name, int fd, int connect_status,
         return errno;
     }
 
-    retval = new_fd_stream(name, fd, connect_status, NULL, streamp);
+    retval = new_fd_stream(name, fd, connect_status, NULL, true, streamp);
     if (!retval) {
         struct stream *stream = *streamp;
         stream_set_remote_ip(stream, remote->sin_addr.s_addr);
@@ -94,6 +94,8 @@ struct stream_class tcp_stream_class = {
     NULL,                       /* run */
     NULL,                       /* run_wait */
     NULL,                       /* wait */
+    NULL,                       /* set_keepalive */
+    NULL,                       /* is_idle */
 };
 
 /* Passive TCP. */
diff --git a/lib/stream-unix.c b/lib/stream-unix.c
index 955414d..8d662a1 100644
--- a/lib/stream-unix.c
+++ b/lib/stream-unix.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008, 2009, 2010 Nicira Networks.
+ * Copyright (c) 2008, 2009, 2010, 2011 Nicira Networks.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -60,7 +60,7 @@ unix_open(const char *name, char *suffix, struct stream **streamp)
     }
 
     return new_fd_stream(name, fd, check_connection_completion(fd),
-                         bind_path, streamp);
+                         bind_path, false, streamp);
 }
 
 struct stream_class unix_stream_class = {
@@ -73,6 +73,8 @@ struct stream_class unix_stream_class = {
     NULL,                       /* run */
     NULL,                       /* run_wait */
     NULL,                       /* wait */
+    NULL,                       /* set_keepalive */
+    NULL,                       /* is_idle */
 };
 
 /* Passive UNIX socket. */
@@ -116,7 +118,7 @@ punix_accept(int fd, const struct sockaddr *sa, size_t sa_len,
     } else {
         strcpy(name, "unix");
     }
-    return new_fd_stream(name, fd, 0, NULL, streamp);
+    return new_fd_stream(name, fd, 0, NULL, false, streamp);
 }
 
 struct pstream_class punix_pstream_class = {
diff --git a/lib/stream.c b/lib/stream.c
index 37b6110..60df2df 100644
--- a/lib/stream.c
+++ b/lib/stream.c
@@ -414,6 +414,42 @@ stream_run_wait(struct stream *stream)
     }
 }
 
+/* Attempts to configure keepalive probes on 'stream'.  For TCP sockets, for
+ * example, this would be TCP keepalives.  Returns 0 if successful, otherwise a
+ * positive errno value.
+ *
+ * For 'max_idle' > 0, the keepalive behavior will approximate sending a
+ * keepalive probe after 'max_idle' seconds and disconnecting after an
+ * additional 'max_idle' seconds if no response is received.  A 'max_idle' of 0
+ * requests disabling keepalives.
+ *
+ * Support for keepalives varies by the type of channel and the operating
+ * system.  The caller should fall back to application-level probes if
+ * keepalives are not available.  This function returns EOPNOTSUPP if
+ * keepalives are not available. */
+int
+stream_set_keepalive(struct stream *stream, int max_idle)
+{
+    return (stream->class->set_keepalive
+            ? (stream->class->set_keepalive)(stream, max_idle)
+            : EOPNOTSUPP);
+}
+
+/* Returns true if 'stream' is known to be in the process of verifying that the
+ * connection is alive, e.g. a keepalive probe has been sent but no response
+ * has been received.  Returns false otherwise, including in the case where
+ * 'stream' cannot query the keepalive probing status.
+ *
+ * The caller should have previously enabled keepalives through a successful
+ * call to 'stream_set_keepalive' with a positive 'max_idle' value. */
+bool
+stream_is_idle(const struct stream *stream)
+{
+    return (stream->class->is_idle
+            ? (stream->class->is_idle)(stream)
+            : false);
+}
+
 /* Arranges for the poll loop to wake up when 'stream' is ready to take an
  * action of the given 'type'. */
 void
diff --git a/lib/stream.h b/lib/stream.h
index 51a7656..459a013 100644
--- a/lib/stream.h
+++ b/lib/stream.h
@@ -46,6 +46,9 @@ int stream_send(struct stream *, const void *buffer, size_t n);
 void stream_run(struct stream *);
 void stream_run_wait(struct stream *);
 
+int stream_set_keepalive(struct stream *, int max_idle);
+bool stream_is_idle(const struct stream *);
+
 enum stream_wait_type {
     STREAM_CONNECT,
     STREAM_RECV,
diff --git a/lib/vconn-provider.h b/lib/vconn-provider.h
index 682a593..d9c89e5 100644
--- a/lib/vconn-provider.h
+++ b/lib/vconn-provider.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008, 2009, 2010 Nicira Networks.
+ * Copyright (c) 2008, 2009, 2010, 2011 Nicira Networks.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -123,6 +123,37 @@ struct vconn_class {
     /* Arranges for the poll loop to wake up when 'vconn' is ready to take an
      * action of the given 'type'. */
     void (*wait)(struct vconn *vconn, enum vconn_wait_type type);
+
+    /* Attempts to configure keepalive probes on the channel underlying
+     * 'vconn'.  For TCP sockets, for example, this would be TCP keepalives.
+     * Returns 0 if successful, otherwise a positive errno value.
+     *
+     * For 'max_idle' > 0, the expected behavior should approximate sending a
+     * keepalive probe after 'max_idle' seconds and disconnecting after an
+     * additional 'max_idle' seconds if no response is received.  A 'max_idle'
+     * of 0 requests disabling keepalives.
+     *
+     * Support for keepalives varies by the type of channel and the operating
+     * system.  The caller should fall back to application-level probes
+     * (e.g. OFPT_ECHO_REQUEST) if keepalives are not available.  This function
+     * should return EOPNOTSUPP if keepalives are not available.  It may be a
+     * null pointer if keepalives are known at compile time not to be
+     * supported. */
+    int (*set_keepalive)(struct vconn *vconn, int max_idle);
+
+    /* Returns true if 'vconn' is known to be in the process of verifying that
+     * the connection is alive, e.g. a keepalive probe has been sent but no
+     * response has been received.  Returns false otherwise, including in the
+     * case where the underlying channel does not provide a way to query
+     * the keepalive probing status.
+     *
+     * This function may assume that keepalives have been enabled by a
+     * successful call to 'set_keepalive' with a positive 'max_idle' value.
+     *
+     * May be a null pointer if it is known at compile time either that
+     * keepalives are not supported or that the underlying channel does not
+     * provide a way to query whether keepalive probes are in progress. */
+    bool (*is_idle)(const struct vconn *vconn);
 };
 
 /* Passive virtual connection to an OpenFlow device.
diff --git a/lib/vconn-stream.c b/lib/vconn-stream.c
index 39f1c94..adf6c8a 100644
--- a/lib/vconn-stream.c
+++ b/lib/vconn-stream.c
@@ -98,7 +98,7 @@ vconn_stream_open(const char *name, char *suffix OVS_UNUSED,
 }
 
 static struct vconn_stream *
-vconn_stream_cast(struct vconn *vconn)
+vconn_stream_cast(const struct vconn *vconn)
 {
     return CONTAINER_OF(vconn, struct vconn_stream, vconn);
 }
@@ -285,6 +285,20 @@ vconn_stream_wait(struct vconn *vconn, enum vconn_wait_type wait)
         NOT_REACHED();
     }
 }
+
+static int
+vconn_stream_set_keepalive(struct vconn *vconn, int max_idle)
+{
+    struct vconn_stream *s = vconn_stream_cast(vconn);
+    return stream_set_keepalive(s->stream, max_idle);
+}
+
+static bool
+vconn_stream_is_idle(const struct vconn *vconn)
+{
+    const struct vconn_stream *s = vconn_stream_cast(vconn);
+    return stream_is_idle(s->stream);
+}
 
 /* Passive stream socket vconn. */
 
@@ -376,6 +390,8 @@ pvconn_pstream_wait(struct pvconn *pvconn)
             vconn_stream_run,                       \
             vconn_stream_run_wait,                  \
             vconn_stream_wait,                      \
+            vconn_stream_set_keepalive,             \
+            vconn_stream_is_idle,                   \
     }
 
 #define PSTREAM_INIT(NAME)                          \
diff --git a/lib/vconn.c b/lib/vconn.c
index 6ea9366..205a386 100644
--- a/lib/vconn.c
+++ b/lib/vconn.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008, 2009, 2010 Nicira Networks.
+ * Copyright (c) 2008, 2009, 2010, 2011 Nicira Networks.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -273,6 +273,42 @@ vconn_run_wait(struct vconn *vconn)
     }
 }
 
+/* Attempts to configure keepalive probes on the channel underlying 'vconn'.
+ * For TCP sockets, for example, this would be TCP keepalives.  Returns 0 if
+ * successful, otherwise a positive errno value.
+ *
+ * For 'max_idle' > 0, the keepalive behavior will approximate sending a
+ * keepalive probe after 'max_idle' seconds and disconnecting after an
+ * additional 'max_idle' seconds if no response is received.  A 'max_idle' of 0
+ * requests disabling keepalives.
+ *
+ * Support for keepalives varies by the type of channel and the operating
+ * system.  The caller should fall back to application-level probes
+ * (e.g. OFPT_ECHO_REQUEST) if keepalives are not available.  This function
+ * returns EOPNOTSUPP if keepalives are not available. */
+int
+vconn_set_keepalive(struct vconn *vconn, int max_idle)
+{
+    return (vconn->class->set_keepalive
+            ? (vconn->class->set_keepalive)(vconn, max_idle)
+            : EOPNOTSUPP);
+}
+
+/* Returns true if the channel underlying 'vconn' is known to be in the process
+ * of verifying that the connection is alive, e.g. a keepalive probe has been
+ * sent but no response has been received.  Returns false otherwise, including
+ * in the case where 'vconn' cannot query the keepalive probing status.
+ *
+ * The caller should have previously enabled keepalives through a successful
+ * call to 'vconn_set_keepalive' with a positive 'max_idle' value. */
+bool
+vconn_is_idle(const struct vconn *vconn)
+{
+    return (vconn->class->is_idle
+            ? (vconn->class->is_idle)(vconn)
+            : false);
+}
+
 int
 vconn_open_block(const char *name, int min_version, struct vconn **vconnp)
 {
diff --git a/lib/vconn.h b/lib/vconn.h
index 8e321b2..ace8d42 100644
--- a/lib/vconn.h
+++ b/lib/vconn.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008, 2009, 2010 Nicira Networks.
+ * Copyright (c) 2008, 2009, 2010, 2011 Nicira Networks.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -60,6 +60,9 @@ int vconn_open_block(const char *name, int min_version, struct vconn **);
 int vconn_send_block(struct vconn *, struct ofpbuf *);
 int vconn_recv_block(struct vconn *, struct ofpbuf **);
 
+int vconn_set_keepalive(struct vconn *, int max_idle);
+bool vconn_is_idle(const struct vconn *);
+
 enum vconn_wait_type {
     WAIT_CONNECT,
     WAIT_RECV,
diff --git a/ovsdb/ovsdb-client.c b/ovsdb/ovsdb-client.c
index a66b013..60c26a5 100644
--- a/ovsdb/ovsdb-client.c
+++ b/ovsdb/ovsdb-client.c
@@ -271,7 +271,6 @@ fetch_schema(const char *server, const char *database)
     return schema;
 }
 
-
 static void
 do_list_dbs(int argc OVS_UNUSED, char *argv[])
 {
diff --git a/python/ovs/reconnect.py b/python/ovs/reconnect.py
index 5fc96bc..b5ed443 100644
--- a/python/ovs/reconnect.py
+++ b/python/ovs/reconnect.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2010 Nicira Networks
+# Copyright (c) 2010, 2011 Nicira Networks
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@ import os
 CONNECT = 'connect'
 DISCONNECT = 'disconnect'
 PROBE = 'probe'
+CHECK_IDLE = 'check-idle'
 
 EOF = -1
 
@@ -91,16 +92,21 @@ class Reconnect(object):
         def deadline(fsm):
             if fsm.probe_interval:
                 base = max(fsm.last_received, fsm.state_entered)
-                return base + fsm.probe_interval
+                return max(base + fsm.probe_interval,
+                           fsm.next_transport_idle_update)
             return None
 
         @staticmethod
         def run(fsm, now):
-            logging.debug("%s: idle %d ms, sending inactivity probe"
-                          % (fsm.name,
-                             now - max(fsm.last_received, fsm.state_entered)))
-            fsm._transition(now, Reconnect.Idle)
-            return PROBE
+            if fsm.app_probe:
+                logging.debug("%s: idle %d ms, sending inactivity probe"
+                              % (fsm.name, now - max(fsm.last_received,
+                                                     fsm.state_entered)))
+                fsm._transition(now, Reconnect.Idle)
+                return PROBE
+            else:
+                fsm.next_transport_idle_update = now + 1000
+                return CHECK_IDLE
 
     class Idle(object):
         name = "IDLE"
@@ -108,14 +114,21 @@ class Reconnect(object):
 
         @staticmethod
         def deadline(fsm):
-            return fsm.state_entered + fsm.probe_interval
+            if fsm.app_probe:
+                return fsm.state_entered + fsm.probe_interval
+            else:
+                return fsm.next_transport_idle_update
 
         @staticmethod
         def run(fsm, now):
-            logging.error("%s: no response to inactivity probe after %.3g "
-                          "seconds, disconnecting"
-                          % (fsm.name, (now - fsm.state_entered) / 1000.0))
-            return DISCONNECT
+            if fsm.app_probe:
+                logging.error("%s: no response to inactivity probe after %.3g "
+                              "seconds, disconnecting"
+                              % (fsm.name, (now - fsm.state_entered) / 1000.0))
+                return DISCONNECT
+            else:
+                fsm.next_transport_idle_update = now + 1000
+                return CHECK_IDLE
 
     class Reconnect:
         name = "RECONNECT"
@@ -138,6 +151,7 @@ class Reconnect(object):
         self.min_backoff = 1000
         self.max_backoff = 8000
         self.probe_interval = 5000
+        self.app_probe = True
         self.passive = False
         self.info_level = logging.info
 
@@ -148,6 +162,7 @@ class Reconnect(object):
         self.last_connected = None
         self.last_disconnected = None
         self.max_tries = None
+        self.next_transport_idle_update = now
 
         self.creation_time = now
         self.n_attempted_connections = 0
@@ -198,11 +213,32 @@ class Reconnect(object):
         """Returns the "probe interval" in milliseconds.  If this is zero, it
         disables the connection keepalive feature.  If it is nonzero, then if
         the interval passes while the FSM is connected and without
-        self.received() being called, self.run() returns ovs.reconnect.PROBE.
-        If the interval passes again without self.received() being called,
-        self.run() returns ovs.reconnect.DISCONNECT."""
+        self.received() being called, then self.run()'s behavior depends on the
+        type of configured probing (see set_app_probe()):
+
+          - If application-level probing is configured, self.run() returns
+            ovs.reconnect.PROBE.  If the interval passes again without
+            self.received() being called, self.run() returns
+            ovs.reconnect.DISCONNECT.
+
+          - If transport-level probing is configured, self.run() returns
+            ovs.reconnect.CHECK_IDLE.  The transport itself is responsible for
+            deciding whether the connection has timed out in this case."""
         return self.probe_interval
 
+    def is_app_probe(self):
+        """Returns how the FSM expects keepalive probing to work.  If the
+        return value is true, then the application performs keepalive probing,
+        e.g. by sending "echo request" OpenFlow or JSON-RPC messages.  If the
+        return value is false, then the underlying transport performs keepalive
+        probing, e.g. via the TCP keepalive mechanism.
+
+        This value has no effect if the keepalive feature is disabled, that is,
+        if self.get_probe_interval() returns 0.
+
+        See get_probe_interval() for more information."""
+        return self.app_probe
+
     def set_max_tries(self, max_tries):
         """Limits the maximum number of times that this object will ask the
         client to try to reconnect to 'max_tries'.  None (the default) means an
@@ -237,24 +273,33 @@ class Reconnect(object):
                 self.backoff = self.max_backoff
         
     def set_probe_interval(self, probe_interval):
-        """Sets the "probe interval" to 'probe_interval', in milliseconds.  If
-        this is zero, it disables the connection keepalive feature.  If it is
-        nonzero, then if the interval passes while this FSM is connected and
-        without self.received() being called, self.run() returns
-        ovs.reconnect.PROBE.  If the interval passes again without
-        self.received() being called, self.run() returns
-        ovs.reconnect.DISCONNECT.
-
-        If 'probe_interval' is nonzero, then it will be forced to a value of at
-        least 1000 ms."""
+        """Sets the "probe interval" for the FSM to 'probe_interval', in
+        milliseconds.  If this is zero, it disables the connection keepalive
+        feature.  If it is nonzero, then it will be forced to a value of at
+        least 1000 ms.
+
+        See get_probe_interval() for more information."""
         if probe_interval:
             self.probe_interval = max(1000, probe_interval)
         else:
             self.probe_interval = 0
 
+    def set_app_probe(self, app_probe):
+        """Configures how the FSM expects keepalive probing to work.  If
+        'app_probe' is true, then the application performs keepalive probing,
+        e.g. by sending "echo request" OpenFlow or JSON-RPC messages.  If
+        'app_probe' is false, then the underlying transport performs keepalive
+        probing, e.g. via the TCP keepalive mechanism.
+
+        This setting has no effect if the keepalive feature is disabled, that
+        is, if self.get_probe_interval() returns 0.
+
+        See get_probe_interval() for more information."""
+        self.app_probe = app_probe
+
     def is_passive(self):
-        """Returns true if 'fsm' is in passive mode, false if 'fsm' is in
-        active mode (the default)."""
+        """Returns true if the FSM is in passive mode, false if it is in active
+        mode (the default)."""
         return self.passive
 
     def set_passive(self, passive, now):
@@ -444,6 +489,19 @@ class Reconnect(object):
             self._transition(now, Reconnect.Active)
         self.last_received = now
 
+    def set_transport_idle(self, now, idle):
+        """Tell this FSM whether the transport layer is currently probing
+        whether an idle connection is still connected.  'idle' should be True
+        if this is known to be the case, or False if it is known not to be the
+        case or if the answer cannot be determined."""
+        if self.state == Reconnect.Active and idle:
+            logging.debug("%s: sent transport-level keepalive" % self.name)
+            self._transition(now, Reconnect.Idle)
+        elif self.state == Reconnect.Idle and not idle:
+            logging.debug("%s: received transport-level keepalive reply"
+                          % self.name)
+            self._transition(now, Reconnect.Active)
+
     def _transition(self, now, state):
         if self.state == Reconnect.ConnectInProgress:
             self.n_attempted_connections += 1
@@ -493,12 +551,19 @@ class Reconnect(object):
             - ovs.reconnect.DISCONNECT: The client should abort the current
               connection or connection attempt or listen attempt and call
               self.disconnected() or self.connect_failed() to indicate it.
-        
+
             - ovs.reconnect.PROBE: The client should send some kind of request
               to the peer that will elicit a response, to ensure that the
               connection is indeed in working order.  (This will only be
-              returned if the "probe interval" is nonzero--see
-              self.set_probe_interval())."""
+              returned if the "probe interval" is nonzero and application-level
+              probing is configured.  See set_probe_interval().)
+
+            - ovs.reconnect.CHECK_IDLE: The client should query the transport
+              layer to determine whether idle connection probing is taking
+              place and call self.set_transport_idle() to report the result.
+              (This will only be returned if the "probe interval" is nonzero
+              and transport-level probing is configured.  See
+              set_probe_interval() and set_app_probe().)"""
         if now >= self.state.deadline(self):
             return self.state.run(self, now)
         else:
diff --git a/tests/reconnect.at b/tests/reconnect.at
index bb37170..13274bd 100644
--- a/tests/reconnect.at
+++ b/tests/reconnect.at
@@ -1221,3 +1221,246 @@ run
 listening
   in LISTENING for 0 ms (0 ms backoff)
 ])
+
+######################################################################
+RECONNECT_CHECK([transport idle with recovery], [dnl
+enable
+set-app-probe 0
+
+# Connection succeeds.
+run
+connected
+
+# Nothing received at app level...
+timeout
+run
+
+# ...so reconnect asks to check transport-level idle status.
+# But at transport level we're not idle.
+timeout
+run
+set-transport-idle 0
+
+# Reconnect asks to check transport-level idle status again 1s later.
+# Still not idle
+timeout
+run
+set-transport-idle 0
+
+# Again 1s later.  Now we're idle.
+timeout
+run
+set-transport-idle 1
+
+# Now that we're idle reconnect should check back with us once a second.
+# Check that once...
+timeout
+run
+set-transport-idle 1
+
+# Twice...
+timeout
+run
+set-transport-idle 1
+
+# Now transport idle recovers.
+timeout
+run
+set-transport-idle 0
+], [dnl
+### t=1000 ###
+enable
+  in BACKOFF for 0 ms (0 ms backoff)
+set-app-probe 0
+
+# Connection succeeds.
+run
+  should connect
+connected
+  in ACTIVE for 0 ms (0 ms backoff)
+  created 1000, last received 1000, last connected 1000
+  1 successful connections out of 1 attempts, seqno 1
+  connected
+  last connected 0 ms ago, connected 0 ms total
+
+# Nothing received at app level...
+timeout
+  advance 5000 ms
+
+### t=6000 ###
+  in ACTIVE for 5000 ms (0 ms backoff)
+run
+  should call reconnect_set_transport_idle()
+
+# ...so reconnect asks to check transport-level idle status.
+# But at transport level we're not idle.
+timeout
+  advance 1000 ms
+
+### t=7000 ###
+  in ACTIVE for 6000 ms (0 ms backoff)
+run
+  should call reconnect_set_transport_idle()
+set-transport-idle 0
+
+# Reconnect asks to check transport-level idle status again 1s later.
+# Still not idle
+timeout
+  advance 1000 ms
+
+### t=8000 ###
+  in ACTIVE for 7000 ms (0 ms backoff)
+run
+  should call reconnect_set_transport_idle()
+set-transport-idle 0
+
+# Again 1s later.  Now we're idle.
+timeout
+  advance 1000 ms
+
+### t=9000 ###
+  in ACTIVE for 8000 ms (0 ms backoff)
+run
+  should call reconnect_set_transport_idle()
+set-transport-idle 1
+  in IDLE for 0 ms (0 ms backoff)
+
+# Now that we're idle reconnect should check back with us once a second.
+# Check that once...
+timeout
+  advance 1000 ms
+
+### t=10000 ###
+  in IDLE for 1000 ms (0 ms backoff)
+run
+  should call reconnect_set_transport_idle()
+set-transport-idle 1
+
+# Twice...
+timeout
+  advance 1000 ms
+
+### t=11000 ###
+  in IDLE for 2000 ms (0 ms backoff)
+run
+  should call reconnect_set_transport_idle()
+set-transport-idle 1
+
+# Now transport idle recovers.
+timeout
+  advance 1000 ms
+
+### t=12000 ###
+  in IDLE for 3000 ms (0 ms backoff)
+run
+  should call reconnect_set_transport_idle()
+set-transport-idle 0
+  in ACTIVE for 0 ms (0 ms backoff)
+])
+
+######################################################################
+RECONNECT_CHECK([transport idle with failure], [dnl
+enable
+set-app-probe 0
+
+# Connection succeeds.
+run
+connected
+
+# Nothing received at app level...
+timeout
+run
+
+# ...so reconnect asks to check transport-level idle status.
+# At transport level we're idle too.
+timeout
+run
+set-transport-idle 1
+
+# Now that we're idle reconnect should check back with us once a second.
+# Check that once...
+timeout
+run
+set-transport-idle 1
+
+# Twice...
+timeout
+run
+set-transport-idle 1
+
+# Now transport idle times us out.
+timeout
+run
+disconnected ETIMEDOUT
+], [dnl
+### t=1000 ###
+enable
+  in BACKOFF for 0 ms (0 ms backoff)
+set-app-probe 0
+
+# Connection succeeds.
+run
+  should connect
+connected
+  in ACTIVE for 0 ms (0 ms backoff)
+  created 1000, last received 1000, last connected 1000
+  1 successful connections out of 1 attempts, seqno 1
+  connected
+  last connected 0 ms ago, connected 0 ms total
+
+# Nothing received at app level...
+timeout
+  advance 5000 ms
+
+### t=6000 ###
+  in ACTIVE for 5000 ms (0 ms backoff)
+run
+  should call reconnect_set_transport_idle()
+
+# ...so reconnect asks to check transport-level idle status.
+# At transport level we're idle too.
+timeout
+  advance 1000 ms
+
+### t=7000 ###
+  in ACTIVE for 6000 ms (0 ms backoff)
+run
+  should call reconnect_set_transport_idle()
+set-transport-idle 1
+  in IDLE for 0 ms (0 ms backoff)
+
+# Now that we're idle reconnect should check back with us once a second.
+# Check that once...
+timeout
+  advance 1000 ms
+
+### t=8000 ###
+  in IDLE for 1000 ms (0 ms backoff)
+run
+  should call reconnect_set_transport_idle()
+set-transport-idle 1
+
+# Twice...
+timeout
+  advance 1000 ms
+
+### t=9000 ###
+  in IDLE for 2000 ms (0 ms backoff)
+run
+  should call reconnect_set_transport_idle()
+set-transport-idle 1
+
+# Now transport idle times us out.
+timeout
+  advance 1000 ms
+
+### t=10000 ###
+  in IDLE for 3000 ms (0 ms backoff)
+run
+  should call reconnect_set_transport_idle()
+disconnected ETIMEDOUT
+  in BACKOFF for 0 ms (1000 ms backoff)
+  1 successful connections out of 1 attempts, seqno 2
+  disconnected
+  disconnected at 10000 ms (0 ms ago)
+])
diff --git a/tests/test-reconnect.c b/tests/test-reconnect.c
index fae0f17..29153f8 100644
--- a/tests/test-reconnect.c
+++ b/tests/test-reconnect.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2009, 2010 Nicira Networks.
+ * Copyright (c) 2009, 2010, 2011 Nicira Networks.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -118,6 +118,8 @@ error_from_string(const char *s)
         return ECONNREFUSED;
     } else if (!strcmp(s, "EOF")) {
         return EOF;
+    } else if (!strcmp(s, "ETIMEDOUT")) {
+        return ETIMEDOUT;
     } else {
         ovs_fatal(0, "unknown error '%s'", s);
     }
@@ -181,6 +183,10 @@ do_run(int argc, char *argv[])
     case RECONNECT_PROBE:
         printf("  should send probe\n");
         break;
+
+    case RECONNECT_CHECK_IDLE:
+        printf("  should call reconnect_set_transport_idle()\n");
+        break;
     }
 }
 
@@ -272,6 +278,18 @@ do_listen_error(int argc OVS_UNUSED, char *argv[])
     reconnect_listen_error(reconnect, now, atoi(argv[1]));
 }
 
+static void
+do_set_app_probe(int argc OVS_UNUSED, char *argv[])
+{
+    reconnect_set_app_probe(reconnect, atoi(argv[1]));
+}
+
+static void
+do_set_transport_idle(int argc OVS_UNUSED, char *argv[])
+{
+    reconnect_set_transport_idle(reconnect, now, atoi(argv[1]));
+}
+
 static const struct command commands[] = {
     { "enable", 0, 0, do_enable },
     { "disable", 0, 0, do_disable },
@@ -288,5 +306,7 @@ static const struct command commands[] = {
     { "passive", 0, 0, do_set_passive },
     { "listening", 0, 0, do_listening },
     { "listen-error", 1, 1, do_listen_error },
+    { "set-app-probe", 1, 1, do_set_app_probe },
+    { "set-transport-idle", 1, 1, do_set_transport_idle },
     { NULL, 0, 0, NULL },
 };
diff --git a/tests/test-reconnect.py b/tests/test-reconnect.py
index a4cc9b7..4d9ebc7 100644
--- a/tests/test-reconnect.py
+++ b/tests/test-reconnect.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2009, 2010 Nicira Networks.
+# Copyright (c) 2009, 2010, 2011 Nicira Networks.
 # 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -34,6 +34,8 @@ def error_from_string(s):
         return errno.ECONNREFUSED
     elif s == "EOF":
         return EOF
+    elif s == "ETIMEDOUT":
+        return errno.ETIMEDOUT
     else:
         sys.stderr.write("unknown error '%s'\n" % s)
         sys.exit(1)
@@ -67,6 +69,8 @@ def do_run(arg):
         print "  should disconnect"
     elif action == ovs.reconnect.PROBE:
         print "  should send probe"
+    elif action == ovs.reconnect.CHECK_IDLE:
+        print "  should call reconnect_set_transport_idle()"
     else:
         assert False
 
@@ -137,6 +141,12 @@ def do_listening(arg):
 def do_listen_error(arg):
     r.listen_error(now, int(arg))
 
+def do_set_app_probe(arg):
+    r.set_app_probe(int(arg) != 0)
+
+def do_set_transport_idle(arg):
+    r.set_transport_idle(now, int(arg) != 0)
+
 def main():
     commands = {
         "enable": do_enable,
@@ -153,7 +163,9 @@ def main():
         "set-max-tries": do_set_max_tries,
         "passive": do_set_passive,
         "listening": do_listening,
-        "listen-error": do_listen_error
+        "listen-error": do_listen_error,
+        "set-app-probe": do_set_app_probe,
+        "set-transport-idle": do_set_transport_idle
     }
 
     logging.basicConfig(level=logging.CRITICAL)
diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
index 6070a1c..b46e0af 100644
--- a/vswitchd/vswitch.xml
+++ b/vswitchd/vswitch.xml
@@ -1800,14 +1800,21 @@
       </column>
 
       <column name="inactivity_probe">
-        Maximum number of milliseconds of idle time on connection to
-        controller before sending an inactivity probe message.  If Open
-        vSwitch does not communicate with the controller for the specified
-        number of seconds, it will send a probe.  If a response is not
-        received for the same additional amount of time, Open vSwitch
-        assumes the connection has been broken and attempts to reconnect.
-        Default is implementation-specific.  A value of 0 disables
-        inactivity probes.
+        <p>
+          Maximum number of milliseconds of idle time on connection to
+          controller before sending an inactivity probe message.  If Open
+          vSwitch does not communicate with the controller for the specified
+          number of milliseconds, it will send a probe.  If a response is not
+          received for the same additional amount of time, Open vSwitch assumes
+          the connection has been broken and attempts to reconnect.  Default is
+          implementation-specific.  A value of 0 disables inactivity probes.
+        </p>
+        <p>
+          When the operating system supports it, inactivity probes on TCP and
+          SSL connections are implemented using the TCP keepalive feature.  In
+          other cases, they are implemented with OpenFlow ``echo request''
+          messages.
+        </p>
       </column>
     </group>
 
@@ -2054,13 +2061,21 @@
       </column>
 
       <column name="inactivity_probe">
-        Maximum number of milliseconds of idle time on connection to the client
-        before sending an inactivity probe message.  If Open vSwitch does not
-        communicate with the client for the specified number of seconds, it
-        will send a probe.  If a response is not received for the same
-        additional amount of time, Open vSwitch assumes the connection has been
-        broken and attempts to reconnect.  Default is implementation-specific.
-        A value of 0 disables inactivity probes.
+        <p>
+          Maximum number of milliseconds of idle time on connection to the
+          client before sending an inactivity probe message.  If Open vSwitch
+          does not communicate with the client for the specified number of
+          milliseconds, it will send a probe.  If a response is not received
+          for the same additional amount of time, Open vSwitch assumes the
+          connection has been broken and attempts to reconnect.  Default is
+          implementation-specific.  A value of 0 disables inactivity probes.
+        </p>
+        <p>
+          When the operating system supports it, inactivity probes on TCP and
+          SSL connections are implemented using the TCP keepalive feature.  In
+          other cases, they are implemented with OVSDB ``echo request''
+          messages.
+        </p>
       </column>
     </group>
 
-- 
1.7.1




More information about the dev mailing list