[ovs-dev] [PATCH] Granular link health statistics for cfm.

Mehak Mahajan mmahajan at nicira.com
Wed Apr 4 18:38:42 UTC 2012


This change exposes the cfm_health of an interface.  Each remote MP's
health is tracked as an exponentially weighted moving average, and the
cfm_health is the average of that value over all remote_mpids.  The value
ranges from 0 to 100, where 100 is perfectly healthy and 0 is unhealthy.

Feature #10363
Requested-by: Ethan Jackson <ethan at nicira.com>
Signed-off-by: Mehak Mahajan <mmahajan at nicira.com>
---
 NEWS                       |    2 +
 lib/cfm.c                  |   83 +++++++++++++++++++++++++++++++++++++++++++-
 lib/cfm.h                  |    1 +
 ofproto/ofproto-dpif.c     |    9 +++++
 ofproto/ofproto-provider.h |   10 +++++
 ofproto/ofproto.c          |   13 +++++++
 ofproto/ofproto.h          |    3 +-
 vswitchd/bridge.c          |   11 ++++++
 vswitchd/vswitch.ovsschema |    9 ++++-
 vswitchd/vswitch.xml       |   24 +++++++++++++
 10 files changed, 161 insertions(+), 4 deletions(-)

diff --git a/NEWS b/NEWS
index a466f92..ed3fc88 100644
--- a/NEWS
+++ b/NEWS
@@ -6,6 +6,8 @@ post-v1.6.0
     - Added ability to configure dscp setting for manager and controller
       connections.  By default, these connections have a DSCP value of
       Internetwork Control (0xc0).
+    - Added the granular link health statistics, 'cfm_health', to an
+      interface.
 
 
 v1.6.0 - xx xxx xxxx
diff --git a/lib/cfm.c b/lib/cfm.c
index 8b9e5bc..a85be4f 100644
--- a/lib/cfm.c
+++ b/lib/cfm.c
@@ -60,6 +60,7 @@ static const uint8_t eth_addr_ccm_x[6] = {
 #define CCM_MAID_LEN 48
 #define CCM_OPCODE 1 /* CFM message opcode meaning CCM. */
 #define CCM_RDI_MASK 0x80
+#define CFM_HEALTH_INTERVAL 2
 struct ccm {
     uint8_t  mdlevel_version; /* MD Level and Version */
     uint8_t  opcode;
@@ -111,6 +112,10 @@ struct cfm {
      * avoid flapping. */
     uint64_t *rmps_array;     /* Cache of remote_mps. */
     size_t rmps_array_len;    /* Number of rmps in 'rmps_array'. */
+
+    int health;               /* Average health over all remote_mps */
+    int health_interval;      /* Num of fault_intervals used to compute the
+                               * health. */
 };
 
 /* Remote MPs represent foreign network entities that are configured to have
@@ -124,6 +129,10 @@ struct remote_mp {
                             receiving CCMs that it's expecting to. */
     bool opup;           /* Operational State. */
     uint32_t seq;        /* Most recently received sequence number. */
+    uint8_t num_health_ccm; /* Number of received ccm frames per
+                               fault_interval. */
+    int health;          /* Exponentially weighted moving average of link
+                            health */
 };
 
 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(20, 30);
@@ -290,6 +299,7 @@ cfm_create(const char *name)
     hmap_insert(&all_cfms, &cfm->hmap_node, hash_string(cfm->name, 0));
     cfm->remote_opup = true;
     cfm->fault_override = -1;
+    cfm->health = 100;
     return cfm;
 }
 
@@ -314,6 +324,13 @@ cfm_destroy(struct cfm *cfm)
     free(cfm);
 }
 
+/* Returns the health of 'rmp' as a percentage. */
+static int
+cfm_get_rmp_health_stats(struct remote_mp *rmp)
+{
+    return rmp->health;
+}
+
 /* Should be run periodically to update fault statistics messages. */
 void
 cfm_run(struct cfm *cfm)
@@ -332,8 +349,48 @@ cfm_run(struct cfm *cfm)
                                   sizeof *cfm->rmps_array);
 
         cfm->remote_opup = true;
-        HMAP_FOR_EACH_SAFE (rmp, rmp_next, node, &cfm->remote_mps) {
+        if (cfm->health_interval == CFM_HEALTH_INTERVAL) {
+            int cfm_health = 0;
+
+            HMAP_FOR_EACH(rmp, node, &cfm->remote_mps) {
+                int lost, exp_ccm_recvd;
+
+                exp_ccm_recvd = (CFM_HEALTH_INTERVAL * 3) +
+                    (CFM_HEALTH_INTERVAL / 2);
+                /* Calculate the exponentially weighted moving average.  A
+                 * weight of 1/2 causes the older data to eventually decay to
+                 * <1% if there is no packet loss.  Since the 'fault_interval'
+                 * is (3.5 * cfm_interval), and a CCM frame is expected to be
+                 * received every cfm_interval, 'exp_ccm_recvd' CCM frames are
+                 * expected over 'CFM_HEALTH_INTERVAL' fault_intervals.  The
+                 * cfm_health is computed only once per 'CFM_HEALTH_INTERVAL'
+                 * fault_intervals, rather than every 'fault_interval', so
+                 * that the expected number of CCM frames is an integer. */
+
+                lost = ((exp_ccm_recvd - rmp->num_health_ccm) * 100) /
+                    exp_ccm_recvd;
+                /* When associating a weight of 1/2 with the older data, it is
+                 * possible that on recovering from loss, the health of
+                 * the interface never reaches 100% even though there is 0
+                 * packet loss. (Eg: (int) (99 + 100) / 2 can not be 100).
+                 * To account for this, the value of rmp->health is always
+                 * rounded up to the nearest integer. */
+                rmp->health = DIV_ROUND_UP((rmp->health + (100 - lost)), 2);
+                rmp->health = MIN(rmp->health, 100);
+                assert(rmp->health >= 0 && rmp->health <= 100);
+                cfm_health += rmp->health;
+                rmp->num_health_ccm = 0;
+            }
 
+            /* Calculate the average cfm health. */
+            cfm->health = hmap_is_empty(&cfm->remote_mps)
+                          ? 0
+                          : cfm_health / hmap_count(&cfm->remote_mps);
+            assert(cfm->health >= 0 && cfm->health <= 100);
+            cfm->health_interval = 0;
+        }
+        cfm->health_interval++;
+        HMAP_FOR_EACH_SAFE (rmp, rmp_next, node, &cfm->remote_mps) {
             if (!rmp->recv) {
                 VLOG_DBG("%s: no CCM from RMP %"PRIu64" in the last %lldms",
                          cfm->name, rmp->mpid, interval);
@@ -535,6 +592,7 @@ cfm_process_heartbeat(struct cfm *cfm, const struct ofpbuf *p)
         uint64_t ccm_mpid;
         uint32_t ccm_seq;
         bool ccm_opdown;
+        bool fault = false;
 
         if (cfm->extended) {
             ccm_mpid = ntohll(ccm->mpid64);
@@ -549,6 +607,7 @@ cfm_process_heartbeat(struct cfm *cfm, const struct ofpbuf *p)
             VLOG_WARN_RL(&rl, "%s: received a CCM with an invalid interval"
                          " (%"PRIu8") from RMP %"PRIu64, cfm->name,
                          ccm_interval, ccm_mpid);
+            fault = true;
         }
 
         if (cfm->extended && ccm_interval == 0
@@ -556,6 +615,7 @@ cfm_process_heartbeat(struct cfm *cfm, const struct ofpbuf *p)
             VLOG_WARN_RL(&rl, "%s: received a CCM with an invalid extended"
                          " interval (%"PRIu16"ms) from RMP %"PRIu64, cfm->name,
                          ccm_interval_ms_x, ccm_mpid);
+            fault = true;
         }
 
         rmp = lookup_remote_mp(cfm, ccm_mpid);
@@ -563,12 +623,15 @@ cfm_process_heartbeat(struct cfm *cfm, const struct ofpbuf *p)
             if (hmap_count(&cfm->remote_mps) < CFM_MAX_RMPS) {
                 rmp = xzalloc(sizeof *rmp);
                 hmap_insert(&cfm->remote_mps, &rmp->node, hash_mpid(ccm_mpid));
+                rmp->num_health_ccm = 0;
+                rmp->health = 100;
             } else {
                 cfm->recv_fault |= CFM_FAULT_OVERFLOW;
                 VLOG_WARN_RL(&rl,
                              "%s: dropped CCM with MPID %"PRIu64" from MAC "
                              ETH_ADDR_FMT, cfm->name, ccm_mpid,
                              ETH_ADDR_ARGS(eth->eth_src));
+                fault = true;
             }
         }
 
@@ -576,16 +639,23 @@ cfm_process_heartbeat(struct cfm *cfm, const struct ofpbuf *p)
                  " (interval %"PRIu8") (RDI %s)", cfm->name, ccm_seq,
                  ccm_mpid, ccm_interval, ccm_rdi ? "true" : "false");
 
+        if (ccm_rdi) {
+	    fault = true;
+        }
         if (rmp) {
             if (rmp->seq && ccm_seq != (rmp->seq + 1)) {
                 VLOG_WARN_RL(&rl, "%s: (mpid %"PRIu64") detected sequence"
                              " numbers which indicate possible connectivity"
                              " problems (previous %"PRIu32") (current %"PRIu32
                              ")", cfm->name, ccm_mpid, rmp->seq, ccm_seq);
+                fault = true;
             }
 
             rmp->mpid = ccm_mpid;
             rmp->recv = true;
+            if (!fault) {
+                rmp->num_health_ccm++;
+            }
             rmp->seq = ccm_seq;
             rmp->rdi = ccm_rdi;
             rmp->opup = !ccm_opdown;
@@ -605,6 +675,15 @@ cfm_get_fault(const struct cfm *cfm)
     return cfm->fault;
 }
 
+/* Gets the health of 'cfm'.  Returns an integer between 0 and 100 indicating
+ * the health of the link as a percentage which is calculated as an average of
+ * the health of all remote_mps. */
+int
+cfm_get_health(const struct cfm *cfm)
+{
+    return cfm->health;
+}
+
 /* Gets the operational state of 'cfm'.  'cfm' is considered operationally down
  * if it has received a CCM with the operationally down bit set from any of its
  * remote maintenance points. Returns true if 'cfm' is operationally up. False
@@ -656,6 +735,7 @@ cfm_print_details(struct ds *ds, const struct cfm *cfm)
         ds_put_cstr(ds, "\n");
     }
 
+    ds_put_format(ds, "\taverage health: %d\n", cfm->health);
     ds_put_format(ds, "\topstate: %s\n", cfm->opup ? "up" : "down");
     ds_put_format(ds, "\tremote_opstate: %s\n",
                   cfm->remote_opup ? "up" : "down");
@@ -672,6 +752,7 @@ cfm_print_details(struct ds *ds, const struct cfm *cfm)
         ds_put_format(ds, "\trecv since check: %s\n",
                       rmp->recv ? "true" : "false");
         ds_put_format(ds, "\topstate: %s\n", rmp->opup? "up" : "down");
+        ds_put_format(ds, "\tlink health: %d\n", cfm_get_rmp_health_stats(rmp));
     }
 }
 
diff --git a/lib/cfm.h b/lib/cfm.h
index 2556a32..2b4f888 100644
--- a/lib/cfm.h
+++ b/lib/cfm.h
@@ -69,6 +69,7 @@ bool cfm_configure(struct cfm *, const struct cfm_settings *);
 bool cfm_should_process_flow(const struct cfm *cfm, const struct flow *);
 void cfm_process_heartbeat(struct cfm *, const struct ofpbuf *packet);
 int cfm_get_fault(const struct cfm *);
+int cfm_get_health(const struct cfm *);
 bool cfm_get_opup(const struct cfm *);
 void cfm_get_remote_mpids(const struct cfm *, const uint64_t **rmps,
                           size_t *n_rmps);
diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c
index 51b847f..a42d09e 100644
--- a/ofproto/ofproto-dpif.c
+++ b/ofproto/ofproto-dpif.c
@@ -1119,6 +1119,14 @@ get_cfm_remote_mpids(const struct ofport *ofport_, const uint64_t **rmps,
         return -1;
     }
 }
+
+static int
+get_cfm_health(const struct ofport *ofport_)
+{
+    struct ofport_dpif *ofport = ofport_dpif_cast(ofport_);
+
+    return ofport->cfm ? cfm_get_health(ofport->cfm) : -1;
+}
 
 /* Spanning Tree. */
 
@@ -6491,6 +6499,7 @@ const struct ofproto_class ofproto_dpif_class = {
     set_cfm,
     get_cfm_fault,
     get_cfm_remote_mpids,
+    get_cfm_health,
     set_stp,
     get_stp_status,
     set_stp_port,
diff --git a/ofproto/ofproto-provider.h b/ofproto/ofproto-provider.h
index 26904ef..26d3693 100644
--- a/ofproto/ofproto-provider.h
+++ b/ofproto/ofproto-provider.h
@@ -980,6 +980,16 @@ struct ofproto_class {
     int (*get_cfm_remote_mpids)(const struct ofport *ofport,
                                 const uint64_t **rmps, size_t *n_rmps);
 
+    /* Checks the health of CFM configured on 'ofport'.  Returns an integer
+     * to indicate the health percentage of the 'ofport' which is an average of
+     * the health of all the remote_mps.  Returns an integer between 0 and 100
+     * where 0 means that the 'ofport' is very unhealthy and 100 means the
+     * 'ofport' is perfectly healthy.
+     *
+     * This function may be a null pointer if the ofproto implementation does
+     * not support CFM. */
+    int (*get_cfm_health)(const struct ofport *ofport);
+
     /* Configures spanning tree protocol (STP) on 'ofproto' using the
      * settings defined in 's'.
      *
diff --git a/ofproto/ofproto.c b/ofproto/ofproto.c
index e7e0401..f934306 100644
--- a/ofproto/ofproto.c
+++ b/ofproto/ofproto.c
@@ -2481,6 +2481,19 @@ ofproto_port_get_cfm_remote_mpids(const struct ofproto *ofproto,
             : -1);
 }
 
+/* Checks the health of the CFM for 'ofp_port' within 'ofproto'.  Returns an
+ * integer value between 0 and 100 to indicate the health of the port as a
+ * percentage which is the average of cfm health of all the remote_mpids or
+ * returns -1 if CFM is not enabled on 'ofp_port'. */
+int
+ofproto_port_get_cfm_health(const struct ofproto *ofproto, uint16_t ofp_port)
+{
+    struct ofport *ofport = ofproto_get_port(ofproto, ofp_port);
+    return (ofport && ofproto->ofproto_class->get_cfm_health
+            ? ofproto->ofproto_class->get_cfm_health(ofport)
+            : -1);
+}
+
 static enum ofperr
 handle_aggregate_stats_request(struct ofconn *ofconn,
                                const struct ofp_stats_msg *osm)
diff --git a/ofproto/ofproto.h b/ofproto/ofproto.h
index 6172f29..c40f5d3 100644
--- a/ofproto/ofproto.h
+++ b/ofproto/ofproto.h
@@ -348,7 +348,8 @@ int ofproto_port_get_cfm_fault(const struct ofproto *, uint16_t ofp_port);
 int ofproto_port_get_cfm_remote_mpids(const struct ofproto *,
                                       uint16_t ofp_port, const uint64_t **rmps,
                                       size_t *n_rmps);
-
+int ofproto_port_get_cfm_health(const struct ofproto *ofproto,
+                                uint16_t ofp_port);
 void ofproto_get_ofproto_controller_info(const struct ofproto *, struct shash *);
 void ofproto_free_ofproto_controller_info(struct shash *);
 
diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c
index adc3b47..37093d8 100644
--- a/vswitchd/bridge.c
+++ b/vswitchd/bridge.c
@@ -1547,6 +1547,8 @@ iface_refresh_cfm_stats(struct iface *iface)
     int fault, error;
     const uint64_t *rmps;
     size_t n_rmps;
+    int health;
+    int64_t cfm_health = 0;
 
     if (iface_is_synthetic(iface)) {
         return;
@@ -1582,6 +1584,15 @@ iface_refresh_cfm_stats(struct iface *iface)
     } else {
         ovsrec_interface_set_cfm_remote_mpids(cfg, NULL, 0);
     }
+
+    health = ofproto_port_get_cfm_health(iface->port->bridge->ofproto,
+                                        iface->ofp_port);
+    if (health >= 0) {
+        cfm_health = (int64_t) health;
+        ovsrec_interface_set_cfm_health(cfg, &cfm_health, 1);
+    } else {
+        ovsrec_interface_set_cfm_health(cfg, NULL, 0);
+    }
 }
 
 static void
diff --git a/vswitchd/vswitch.ovsschema b/vswitchd/vswitch.ovsschema
index a3847e7..c7e1ac9 100644
--- a/vswitchd/vswitch.ovsschema
+++ b/vswitchd/vswitch.ovsschema
@@ -1,6 +1,6 @@
 {"name": "Open_vSwitch",
- "version": "6.8.0",
- "cksum": "4106006492 16485",
+ "version": "6.9.0",
+ "cksum": "617116616 16682",
  "tables": {
    "Open_vSwitch": {
      "columns": {
@@ -197,6 +197,11 @@
          "ephemeral": true},
        "cfm_fault_status": {
          "type": {"key": "string", "min": 0, "max": "unlimited"}},
+       "cfm_health": {
+         "type": {"key": {"type": "integer",
+                          "minInteger": 0,
+                          "maxInteger": 100},
+                  "min": 0, "max": 1}},
        "lacp_current": {
          "type": {"key": {"type": "boolean"},
                   "min": 0, "max": 1},
diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
index f3ea338..520d344 100644
--- a/vswitchd/vswitch.xml
+++ b/vswitchd/vswitch.xml
@@ -1726,6 +1726,30 @@
         an <code>ovs-appctl</code> command.
       </column>
 
+      <column name="cfm_health">
+        <p>
+          Indicates the health of the interface as a percentage value.  The
+          percentage value is an average of the health of links to all
+          remote_mpids.  The cfm_health indicated here is an exponentially
+          weighted moving average.  The health of an interface will decrease
+          if healthy heartbeats are not received at a rate of 3.5 per
+          fault_interval, and will gradually improve as healthy heartbeats
+          are received at the desired rate.  The health of the cfm interface
+          is refreshed every 2 seconds.
+        </p>
+        <p>
+          As mentioned above, the faults can be triggered for several reasons.
+          The link health will deteriorate even if heartbeats are received but
+          they are reported to be unhealthy.  An unhealthy heartbeat in this
+          context is a heartbeat for which some fault is set or which is out
+          of sequence.  The health of an interface can vary from 0 to 100.
+          The interface health can be 100 only on receiving healthy heartbeats
+          at the desired rate.  If either the heartbeats are not received at
+          the desired rate or some of the heartbeats are unhealthy, the
+          interface health will be less than 100.
+        </p>
+      </column>
+
       <column name="cfm_remote_mpids">
         When CFM is properly configured, Open vSwitch will occasionally
         receive CCM broadcasts.  These broadcasts contain the MPID of the
-- 
1.7.2.5




More information about the dev mailing list