[ovs-dev] [RFC PATCH 17/21] keepalive: Check the packet statistics as part of PMD health checks.

Bhanuprakash Bodireddy bhanuprakash.bodireddy at intel.com
Wed Jun 7 16:15:13 UTC 2017


This commit adds support for checking the packet statistics of the ports
polled by a PMD thread. If packets aren't processed because the PMD thread
has stalled or deadlocked, the statistics won't update, and the monitoring
framework can use this to confirm the PMD failure.

This mechanism has a limitation when multiqueue (MQ) is enabled. In some
cases the queues of a DPDK port are polled by different PMD threads. Even
if one PMD thread stalls, the port statistics will still be incremented by
the queues that the other PMDs process. In this case the function can
return the active state because of those processed packets.

Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy at intel.com>
---
 lib/dpif-netdev.c |  13 ++++++-
 lib/keepalive.c   | 111 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 lib/keepalive.h   |   6 +++
 3 files changed, 128 insertions(+), 2 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index b7689e3..1d98c0b 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -974,8 +974,9 @@ sorted_poll_thread_list(struct dp_netdev *dp,
 static void
 pmd_health_check(struct dp_netdev_pmd_thread *pmd)
 {
-    int port_link_status = 0;
     struct rxq_poll *poll;
+    int port_link_status = 0;
+    int port_stats = 0;
 
     struct svec pmd_poll_list;
     svec_init(&pmd_poll_list);
@@ -998,6 +999,12 @@ pmd_health_check(struct dp_netdev_pmd_thread *pmd)
                 ka_shm_update_port_status(netdev_rxq_get_name(poll->rxq->rx),
                                   netdev_rxq_get_queue_id(poll->rxq->rx),
                                   link_state, pmd->core_id, i);
+
+                if (!strcmp(link_state, "up")) {
+                    ka_shm_update_port_statistics(poll->rxq->port->netdev,
+                                                    pmd->core_id, i);
+                }
+
                 break;
             }
         }
@@ -1005,8 +1012,10 @@ pmd_health_check(struct dp_netdev_pmd_thread *pmd)
     svec_destroy(&pmd_poll_list);
 
     port_link_status = ka_get_polled_ports_status(pmd->core_id);
+    port_stats = ka_get_polled_ports_stats(pmd->core_id);
 
-    if (port_link_status == ACTIVE_RUN_STATE) {
+    if (port_link_status == ACTIVE_RUN_STATE &&
+        port_stats == ACTIVE_RUN_STATE ) {
         ka_set_pmd_state(pmd->core_id, KA_STATE_ALIVE);
     } else {
         ka_set_pmd_state(pmd->core_id, KA_STATE_CHECK);
diff --git a/lib/keepalive.c b/lib/keepalive.c
index d1858ac..f513921 100644
--- a/lib/keepalive.c
+++ b/lib/keepalive.c
@@ -24,6 +24,7 @@
 #include "dpdk.h"
 #include "keepalive.h"
 #include "lib/vswitch-idl.h"
+#include "netdev-dpdk.h"
 #include "openvswitch/dynamic-string.h"
 #include "openvswitch/vlog.h"
 #include "ovs-thread.h"
@@ -159,6 +160,18 @@ ka_set_pmd_state(unsigned core_id, enum keepalive_state state)
     ka_shm->core_state[core_id] = state;
 }
 
+static inline int
+ka_get_pmd_state(unsigned core_id)
+{   /* Return the state stored for core_id in the keepalive shm block. */
+    struct keepalive_shm *ka_shm = get_ka_shm();
+    if (!ka_shm) {
+        VLOG_ERR_RL(&rl, "KeepAlive: Invalid shared memory block.");
+        return KA_STATE_UNUSED; /* No shm block: nothing to read. */
+    }
+
+    return ka_shm->core_state[core_id]; /* core_id is not range-checked. */
+}
+
 /* Retrieve and return the keepalive timer interval from OVSDB. */
 static uint32_t
 get_ka_timer_interval(const struct smap *ovs_other_config OVS_UNUSED)
@@ -348,6 +361,33 @@ enum pmdhealth_status ka_get_polled_ports_status(unsigned core_id)
     }
 }
 
+enum pmdhealth_status ka_get_polled_ports_stats(unsigned core_id)
+{   /* Summarize the PORT_STATS_CHECK results recorded for core_id. */
+    struct keepalive_shm *ka_shm = get_ka_shm();
+    if (!ka_shm) {
+        VLOG_ERR_RL(&rl, "KeepAlive: Invalid shared memory block.");
+        return -1; /* NOTE(review): is -1 a valid pmdhealth_status? */
+    }
+
+    int failed = 0;
+    int n_ports = ka_shm->ext_stats[core_id].num_poll_ports;
+    for (int i = 0; i < n_ports; i++) { /* Index n_ports is not read. */
+        int state;
+        state =
+          ka_shm->ext_stats[core_id].port_stats[i].state[PORT_STATS_CHECK];
+        if (state == FAILURE_STATE) {
+            failed = 1; /* One failed port is enough to fail the core. */
+            break;
+        }
+    }
+
+    if (!failed) { /* Also taken when the loop never runs. */
+        return ACTIVE_RUN_STATE;
+    } else {
+        return FAILURE_STATE;
+    }
+}
+
 void
 ka_shm_update_port_status(const char *port, int qid, char *link_state,
                           int core_id, int idx)
@@ -386,6 +426,77 @@ ka_shm_update_port_status(const char *port, int qid, char *link_state,
                                                                state;
 }
 
+void
+ka_shm_update_port_statistics(const struct netdev *netdev,
+                              int core_id, int idx)
+{   /* Store the PORT_STATS_CHECK state for port slot idx of core_id. */
+    int error;
+    int state = 0; /* Stays 0 if the CHECK-mode stats read fails. */
+    struct keepalive_shm *ka_shm = get_ka_shm();
+    if (!ka_shm) {
+        VLOG_ERR_RL(&rl, "KeepAlive: Invalid shared memory block.");
+        return;
+    }
+
+    ka_shm->ext_stats[core_id].num_poll_ports = idx; /* Or idx + 1? Confirm. */
+
+    int pmd_state = ka_get_pmd_state(core_id);
+    if (pmd_state == KA_STATE_CHECK) { /* Compare against the baseline. */
+        struct netdev_stats temp_stats; /* Local; baseline untouched. */
+        VLOG_DBG_RL(&rl, "KeepAlive: HEALTH CHECKS ENABLED.");
+
+        error = netdev_get_stats(netdev, &temp_stats);
+        if (!error) { /* On error, state keeps its initial 0. */
+            uint64_t tx_pkts_cnt = 0;
+            uint64_t rx_pkts_cnt = 0;
+            int skip_tx_check = 0, skip_rx_check = 0;
+
+            struct netdev_stats *prev_stats =
+                   &ka_shm->ext_stats[core_id].port_stats[idx].stats;
+
+            if (!temp_stats.tx_packets && !prev_stats->tx_packets) {
+                VLOG_DBG_RL(&rl, "KeepAlive: No packets transmitted");
+                skip_tx_check = 1; /* Both tx counters are zero. */
+            } else {
+                tx_pkts_cnt = temp_stats.tx_packets -
+                                         prev_stats->tx_packets;
+            }
+
+            if (!temp_stats.rx_packets && !prev_stats->rx_packets) {
+                VLOG_DBG_RL(&rl, "KeepAlive: No packets received");
+                skip_rx_check = 1; /* Both rx counters are zero. */
+            } else {
+                rx_pkts_cnt = temp_stats.rx_packets -
+                                         prev_stats->rx_packets;
+            }
+
+            if (skip_tx_check && skip_rx_check) {
+                VLOG_DBG_RL(&rl, "KeepAlive: No active traffic");
+                state = ACTIVE_RUN_STATE;
+            } else if ((!skip_tx_check && tx_pkts_cnt) ||
+                      (!skip_rx_check && rx_pkts_cnt)) {
+                VLOG_DBG_RL(&rl, "KeepAlive: Stats updated");
+                state = ACTIVE_RUN_STATE;
+            } else { /* No checked counter advanced. */
+                VLOG_DBG("PMD failure");
+                state = FAILURE_STATE;
+            }
+        }
+    } else { /* Not in CHECK: refresh the saved baseline. */
+        struct netdev_stats *stats;
+        stats = &ka_shm->ext_stats[core_id].port_stats[idx].stats;
+        error = netdev_get_stats(netdev, stats);
+        if (error) {
+            VLOG_ERR("Couldn't retrieve stats (%s)", ovs_strerror(error));
+        }
+
+        state = ACTIVE_RUN_STATE; /* Set even if the read failed. */
+    }
+
+    ka_shm->ext_stats[core_id].port_stats[idx].state[PORT_STATS_CHECK] =
+                                                        state;
+}
+
 static void
 ka_unixctl_pmd_health_show(struct unixctl_conn *conn, int argc OVS_UNUSED,
                        const char *argv[] OVS_UNUSED, void *ka_shm_)
diff --git a/lib/keepalive.h b/lib/keepalive.h
index 2de50f3..f9bdf12 100644
--- a/lib/keepalive.h
+++ b/lib/keepalive.h
@@ -18,6 +18,7 @@
 #define KEEPALIVE_H
 
 #include <stdint.h>
+
 #ifdef DPDK_NETDEV
 #include <rte_keepalive.h>
 #define KEEPALIVE_MAXCORES RTE_KEEPALIVE_MAXCORES
@@ -25,6 +26,8 @@
 #define KEEPALIVE_MAXCORES 128
 #endif /* DPDK_NETDEV */
 
+#include "netdev.h"
+
 #define MAX_POLL_PORTS 20
 
 struct smap;
@@ -51,6 +54,7 @@ struct poll_port_stats {
     char *link_state;
     int qid;
     int state[PORT_NUM_CHECKS];
+    struct netdev_stats stats;
 };
 
 struct pmd_extended_stats {
@@ -103,5 +107,7 @@ struct smap *ka_stats_run(void);
 
 void ka_shm_update_port_status(const char *,int,char *,int,int);
 enum pmdhealth_status ka_get_polled_ports_status(unsigned);
+void ka_shm_update_port_statistics(const struct netdev *,int,int);
+enum pmdhealth_status ka_get_polled_ports_stats(unsigned);
 
 #endif /* keepalive.h */
-- 
2.4.11



More information about the dev mailing list