[ovs-dev] [PATCH] Granular link health statistics for cfm.
Mehak Mahajan
mmahajan at nicira.com
Wed Apr 4 18:38:42 UTC 2012
This patch adds a 'cfm_health' statistic to each interface. The link
health of every remote_mpid is tracked as an exponentially weighted moving
average, and cfm_health is the average of those values over all
remote_mpids. The value can vary from 0 to 100, 100 being very healthy
and 0 being unhealthy.
Feature #10363
Requested-by: Ethan Jackson <ethan at nicira.com>
Signed-off-by: Mehak Mahajan <mmahajan at nicira.com>
---
NEWS | 2 +
lib/cfm.c | 83 +++++++++++++++++++++++++++++++++++++++++++-
lib/cfm.h | 1 +
ofproto/ofproto-dpif.c | 9 +++++
ofproto/ofproto-provider.h | 10 +++++
ofproto/ofproto.c | 13 +++++++
ofproto/ofproto.h | 3 +-
vswitchd/bridge.c | 11 ++++++
vswitchd/vswitch.ovsschema | 9 ++++-
vswitchd/vswitch.xml | 24 +++++++++++++
10 files changed, 161 insertions(+), 4 deletions(-)
diff --git a/NEWS b/NEWS
index a466f92..ed3fc88 100644
--- a/NEWS
+++ b/NEWS
@@ -6,6 +6,8 @@ post-v1.6.0
- Added ability to configure dscp setting for manager and controller
connections. By default, these connections have a DSCP value of
Internetwork Control (0xc0).
+ - Added a granular link health statistic, 'cfm_health', to
+ interfaces.
v1.6.0 - xx xxx xxxx
diff --git a/lib/cfm.c b/lib/cfm.c
index 8b9e5bc..a85be4f 100644
--- a/lib/cfm.c
+++ b/lib/cfm.c
@@ -60,6 +60,7 @@ static const uint8_t eth_addr_ccm_x[6] = {
#define CCM_MAID_LEN 48
#define CCM_OPCODE 1 /* CFM message opcode meaning CCM. */
#define CCM_RDI_MASK 0x80
+#define CFM_HEALTH_INTERVAL 2
struct ccm {
uint8_t mdlevel_version; /* MD Level and Version */
uint8_t opcode;
@@ -111,6 +112,10 @@ struct cfm {
* avoid flapping. */
uint64_t *rmps_array; /* Cache of remote_mps. */
size_t rmps_array_len; /* Number of rmps in 'rmps_array'. */
+
+ int health; /* Average health over all remote_mps */
+ int health_interval; /* Num of fault_intervals used to compute the
+ * health. */
};
/* Remote MPs represent foreign network entities that are configured to have
@@ -124,6 +129,10 @@ struct remote_mp {
receiving CCMs that it's expecting to. */
bool opup; /* Operational State. */
uint32_t seq; /* Most recently received sequence number. */
+ uint8_t num_health_ccm; /* Number of fault-free ccm frames received
+ since health was last computed. */
+ int health; /* Exponentially weighted moving average of link
+ health */
};
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(20, 30);
@@ -290,6 +299,7 @@ cfm_create(const char *name)
hmap_insert(&all_cfms, &cfm->hmap_node, hash_string(cfm->name, 0));
cfm->remote_opup = true;
cfm->fault_override = -1;
+ cfm->health = 100;
return cfm;
}
@@ -314,6 +324,13 @@ cfm_destroy(struct cfm *cfm)
free(cfm);
}
+/* Returns the health of 'rmp' as a percentage: its exponentially weighted
+ * moving average of link health, which cfm_run() keeps between 0 and 100. */
+static int
+cfm_get_rmp_health_stats(struct remote_mp *rmp)
+{
+ return rmp->health;
+}
+
/* Should be run periodically to update fault statistics messages. */
void
cfm_run(struct cfm *cfm)
@@ -332,8 +349,48 @@ cfm_run(struct cfm *cfm)
sizeof *cfm->rmps_array);
cfm->remote_opup = true;
- HMAP_FOR_EACH_SAFE (rmp, rmp_next, node, &cfm->remote_mps) {
+ if (cfm->health_interval == CFM_HEALTH_INTERVAL) {
+ int cfm_health = 0;
+
+ HMAP_FOR_EACH(rmp, node, &cfm->remote_mps) {
+ int lost, exp_ccm_recvd;
+
+ exp_ccm_recvd = (CFM_HEALTH_INTERVAL * 3) +
+ (CFM_HEALTH_INTERVAL / 2);
+ /* Calculate the exponentially weighted moving average. A
+ * weight of 1/2 causes the older data to eventually decay to
+ * <1% if there is no packet loss. Since the 'fault_interval'
+ * is (3.5 * cfm_interval), and a CCM frame is expected to be
+ * received every cfm_interval, 'exp_ccm_recvd' CCM frames are
+ * expected over 'CFM_HEALTH_INTERVAL' 'fault_interval's. The
+ * cfm_health is computed only once every 'CFM_HEALTH_INTERVAL'
+ * 'fault_interval's, rather than every 'fault_interval', so
+ * that the expected number of CCM frames is an integer. */
+
+ lost = ((exp_ccm_recvd - rmp->num_health_ccm) * 100) /
+ exp_ccm_recvd;
+ /* When associating a weight of 1/2 with the older data, it is
+ * possible that on recovering from loss, the health of
+ * the interface never reaches 100% even though there is 0
+ * packet loss. (Eg: (int) (99 + 100) / 2 can not be 100).
+ * To account for this, the value of rmp->health is always
+ * rounded up to the nearest integer. */
+ rmp->health = DIV_ROUND_UP((rmp->health + (100 - lost)), 2);
+ rmp->health = MIN(rmp->health, 100);
+ assert(rmp->health >= 0 && rmp->health <= 100);
+ cfm_health += rmp->health;
+ rmp->num_health_ccm = 0;
+ }
+ /* Calculate the average cfm health. */
+ cfm->health = hmap_is_empty(&cfm->remote_mps)
+ ? 0
+ : cfm_health / hmap_count(&cfm->remote_mps);
+ assert(cfm->health >= 0 && cfm->health <= 100);
+ cfm->health_interval = 0;
+ }
+ cfm->health_interval++;
+ HMAP_FOR_EACH_SAFE (rmp, rmp_next, node, &cfm->remote_mps) {
if (!rmp->recv) {
VLOG_DBG("%s: no CCM from RMP %"PRIu64" in the last %lldms",
cfm->name, rmp->mpid, interval);
@@ -535,6 +592,7 @@ cfm_process_heartbeat(struct cfm *cfm, const struct ofpbuf *p)
uint64_t ccm_mpid;
uint32_t ccm_seq;
bool ccm_opdown;
+ bool fault = false;
if (cfm->extended) {
ccm_mpid = ntohll(ccm->mpid64);
@@ -549,6 +607,7 @@ cfm_process_heartbeat(struct cfm *cfm, const struct ofpbuf *p)
VLOG_WARN_RL(&rl, "%s: received a CCM with an invalid interval"
" (%"PRIu8") from RMP %"PRIu64, cfm->name,
ccm_interval, ccm_mpid);
+ fault = true;
}
if (cfm->extended && ccm_interval == 0
@@ -556,6 +615,7 @@ cfm_process_heartbeat(struct cfm *cfm, const struct ofpbuf *p)
VLOG_WARN_RL(&rl, "%s: received a CCM with an invalid extended"
" interval (%"PRIu16"ms) from RMP %"PRIu64, cfm->name,
ccm_interval_ms_x, ccm_mpid);
+ fault = true;
}
rmp = lookup_remote_mp(cfm, ccm_mpid);
@@ -563,12 +623,15 @@ cfm_process_heartbeat(struct cfm *cfm, const struct ofpbuf *p)
if (hmap_count(&cfm->remote_mps) < CFM_MAX_RMPS) {
rmp = xzalloc(sizeof *rmp);
hmap_insert(&cfm->remote_mps, &rmp->node, hash_mpid(ccm_mpid));
+ rmp->num_health_ccm = 0;
+ rmp->health = 100;
} else {
cfm->recv_fault |= CFM_FAULT_OVERFLOW;
VLOG_WARN_RL(&rl,
"%s: dropped CCM with MPID %"PRIu64" from MAC "
ETH_ADDR_FMT, cfm->name, ccm_mpid,
ETH_ADDR_ARGS(eth->eth_src));
+ fault = true;
}
}
@@ -576,16 +639,23 @@ cfm_process_heartbeat(struct cfm *cfm, const struct ofpbuf *p)
" (interval %"PRIu8") (RDI %s)", cfm->name, ccm_seq,
ccm_mpid, ccm_interval, ccm_rdi ? "true" : "false");
+ if (ccm_rdi) {
+ fault = true;
+ }
if (rmp) {
if (rmp->seq && ccm_seq != (rmp->seq + 1)) {
VLOG_WARN_RL(&rl, "%s: (mpid %"PRIu64") detected sequence"
" numbers which indicate possible connectivity"
" problems (previous %"PRIu32") (current %"PRIu32
")", cfm->name, ccm_mpid, rmp->seq, ccm_seq);
+ fault = true;
}
rmp->mpid = ccm_mpid;
rmp->recv = true;
+ if (!fault) {
+ rmp->num_health_ccm++;
+ }
rmp->seq = ccm_seq;
rmp->rdi = ccm_rdi;
rmp->opup = !ccm_opdown;
@@ -605,6 +675,15 @@ cfm_get_fault(const struct cfm *cfm)
return cfm->fault;
}
+/* Gets the health of 'cfm'. Returns an integer between 0 and 100 indicating
+ * the health of the link as a percentage which is calculated as an average of
+ * the health of all remote_mps. The value starts at 100 when 'cfm' is
+ * created, and is 0 if 'cfm' had no remote_mps when its health was last
+ * computed. */
+int
+cfm_get_health(const struct cfm *cfm)
+{
+ return cfm->health;
+}
+
/* Gets the operational state of 'cfm'. 'cfm' is considered operationally down
* if it has received a CCM with the operationally down bit set from any of its
* remote maintenance points. Returns true if 'cfm' is operationally up. False
@@ -656,6 +735,7 @@ cfm_print_details(struct ds *ds, const struct cfm *cfm)
ds_put_cstr(ds, "\n");
}
+ ds_put_format(ds, "\taverage health: %d\n", cfm->health);
ds_put_format(ds, "\topstate: %s\n", cfm->opup ? "up" : "down");
ds_put_format(ds, "\tremote_opstate: %s\n",
cfm->remote_opup ? "up" : "down");
@@ -672,6 +752,7 @@ cfm_print_details(struct ds *ds, const struct cfm *cfm)
ds_put_format(ds, "\trecv since check: %s\n",
rmp->recv ? "true" : "false");
ds_put_format(ds, "\topstate: %s\n", rmp->opup? "up" : "down");
+ ds_put_format(ds, "\tlink health: %d\n", cfm_get_rmp_health_stats(rmp));
}
}
diff --git a/lib/cfm.h b/lib/cfm.h
index 2556a32..2b4f888 100644
--- a/lib/cfm.h
+++ b/lib/cfm.h
@@ -69,6 +69,7 @@ bool cfm_configure(struct cfm *, const struct cfm_settings *);
bool cfm_should_process_flow(const struct cfm *cfm, const struct flow *);
void cfm_process_heartbeat(struct cfm *, const struct ofpbuf *packet);
int cfm_get_fault(const struct cfm *);
+int cfm_get_health(const struct cfm *);
bool cfm_get_opup(const struct cfm *);
void cfm_get_remote_mpids(const struct cfm *, const uint64_t **rmps,
size_t *n_rmps);
diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c
index 51b847f..a42d09e 100644
--- a/ofproto/ofproto-dpif.c
+++ b/ofproto/ofproto-dpif.c
@@ -1119,6 +1119,14 @@ get_cfm_remote_mpids(const struct ofport *ofport_, const uint64_t **rmps,
return -1;
}
}
+
+static int
+get_cfm_health(const struct ofport *ofport_)
+{
+ struct ofport_dpif *ofport = ofport_dpif_cast(ofport_);
+
+ return ofport->cfm ? cfm_get_health(ofport->cfm) : -1; /* -1: no CFM. */
+}
/* Spanning Tree. */
@@ -6491,6 +6499,7 @@ const struct ofproto_class ofproto_dpif_class = {
set_cfm,
get_cfm_fault,
get_cfm_remote_mpids,
+ get_cfm_health,
set_stp,
get_stp_status,
set_stp_port,
diff --git a/ofproto/ofproto-provider.h b/ofproto/ofproto-provider.h
index 26904ef..26d3693 100644
--- a/ofproto/ofproto-provider.h
+++ b/ofproto/ofproto-provider.h
@@ -980,6 +980,16 @@ struct ofproto_class {
int (*get_cfm_remote_mpids)(const struct ofport *ofport,
const uint64_t **rmps, size_t *n_rmps);
+ /* Checks the health of CFM configured on 'ofport'. Returns an integer
+ * between 0 and 100 indicating the health of 'ofport' as a percentage,
+ * computed as the average of the health of all the remote_mps, where 0
+ * means that the 'ofport' is very unhealthy and 100 means the 'ofport'
+ * is perfectly healthy. Returns a negative value if CFM is not
+ * configured on 'ofport'.
+ *
+ * This function may be a null pointer if the ofproto implementation does
+ * not support CFM. */
+ int (*get_cfm_health)(const struct ofport *ofport);
+
/* Configures spanning tree protocol (STP) on 'ofproto' using the
* settings defined in 's'.
*
diff --git a/ofproto/ofproto.c b/ofproto/ofproto.c
index e7e0401..f934306 100644
--- a/ofproto/ofproto.c
+++ b/ofproto/ofproto.c
@@ -2481,6 +2481,19 @@ ofproto_port_get_cfm_remote_mpids(const struct ofproto *ofproto,
: -1);
}
+/* Checks the health of the CFM for 'ofp_port' within 'ofproto'. Returns an
+ * integer value between 0 and 100 to indicate the health of the port as a
+ * percentage which is the average of cfm health of all the remote_mpids.
+ *
+ * Returns -1 if ofproto_get_port() finds no port for 'ofp_port' or if the
+ * ofproto implementation does not provide a get_cfm_health function.
+ * Otherwise returns whatever the implementation returns; ofproto-dpif
+ * returns -1 when CFM is not configured on the port. */
+int
+ofproto_port_get_cfm_health(const struct ofproto *ofproto, uint16_t ofp_port)
+{
+ struct ofport *ofport = ofproto_get_port(ofproto, ofp_port);
+ return (ofport && ofproto->ofproto_class->get_cfm_health
+ ? ofproto->ofproto_class->get_cfm_health(ofport)
+ : -1);
+}
+
static enum ofperr
handle_aggregate_stats_request(struct ofconn *ofconn,
const struct ofp_stats_msg *osm)
diff --git a/ofproto/ofproto.h b/ofproto/ofproto.h
index 6172f29..c40f5d3 100644
--- a/ofproto/ofproto.h
+++ b/ofproto/ofproto.h
@@ -348,7 +348,8 @@ int ofproto_port_get_cfm_fault(const struct ofproto *, uint16_t ofp_port);
int ofproto_port_get_cfm_remote_mpids(const struct ofproto *,
uint16_t ofp_port, const uint64_t **rmps,
size_t *n_rmps);
-
+int ofproto_port_get_cfm_health(const struct ofproto *ofproto,
+ uint16_t ofp_port);
void ofproto_get_ofproto_controller_info(const struct ofproto *, struct shash *);
void ofproto_free_ofproto_controller_info(struct shash *);
diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c
index adc3b47..37093d8 100644
--- a/vswitchd/bridge.c
+++ b/vswitchd/bridge.c
@@ -1547,6 +1547,8 @@ iface_refresh_cfm_stats(struct iface *iface)
int fault, error;
const uint64_t *rmps;
size_t n_rmps;
+ int health;
+ int64_t cfm_health = 0;
if (iface_is_synthetic(iface)) {
return;
@@ -1582,6 +1584,15 @@ iface_refresh_cfm_stats(struct iface *iface)
} else {
ovsrec_interface_set_cfm_remote_mpids(cfg, NULL, 0);
}
+
+ health = ofproto_port_get_cfm_health(iface->port->bridge->ofproto,
+ iface->ofp_port);
+ if (health >= 0) {
+ cfm_health = (int64_t) health;
+ ovsrec_interface_set_cfm_health(cfg, &cfm_health, 1);
+ } else {
+ ovsrec_interface_set_cfm_health(cfg, NULL, 0);
+ }
}
static void
diff --git a/vswitchd/vswitch.ovsschema b/vswitchd/vswitch.ovsschema
index a3847e7..c7e1ac9 100644
--- a/vswitchd/vswitch.ovsschema
+++ b/vswitchd/vswitch.ovsschema
@@ -1,6 +1,6 @@
{"name": "Open_vSwitch",
- "version": "6.8.0",
- "cksum": "4106006492 16485",
+ "version": "6.9.0",
+ "cksum": "617116616 16682",
"tables": {
"Open_vSwitch": {
"columns": {
@@ -197,6 +197,11 @@
"ephemeral": true},
"cfm_fault_status": {
"type": {"key": "string", "min": 0, "max": "unlimited"}},
+ "cfm_health": {
+ "type": {"key": {"type": "integer",
+ "minInteger": 0,
+ "maxInteger": 100},
+ "min": 0, "max": 1}},
"lacp_current": {
"type": {"key": {"type": "boolean"},
"min": 0, "max": 1},
diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
index f3ea338..520d344 100644
--- a/vswitchd/vswitch.xml
+++ b/vswitchd/vswitch.xml
@@ -1726,6 +1726,30 @@
an <code>ovs-appctl</code> command.
</column>
+ <column name="cfm_health">
+ <p>
+ Indicates the health of the interface as a percentage value. The
+ percentage value is an average of the health of links to all
+ remote_mpids. The cfm_health indicated here is an exponentially
+ weighted moving average. The health of an interface will reduce if
+ healthy heartbeats are not received at a rate of 3.5 per
+ 'fault_interval', and will gradually improve as healthy heartbeats are
+ received at the desired rate. The health of the cfm interface is
+ refreshed every 2 seconds.
+ </p>
+ <p>
+ As mentioned above, the faults can be triggered for several reasons.
+ The link health will deteriorate even if heartbeats are received but
+ they are reported to be unhealthy. An unhealthy heartbeat in this
+ context is a heartbeat that either has some fault set or is out of
+ sequence. The health of an interface can vary from 0 to 100.
+ The interface health can be 100 only on receiving healthy heartbeats
+ at the desired rate. If either the heartbeats are not received at
+ the desired rate or some of the heartbeats are unhealthy, the
+ interface health will be less than 100.
+ </p>
+ </column>
+
<column name="cfm_remote_mpids">
When CFM is properly configured, Open vSwitch will occasionally
receive CCM broadcasts. These broadcasts contain the MPID of the
--
1.7.2.5
More information about the dev
mailing list