[ovs-dev] [RFC PATCH v2 17/19] netdev-dpdk: Enable PMD health checks on heartbeat failure.

Bhanuprakash Bodireddy bhanuprakash.bodireddy at intel.com
Mon Jun 12 16:49:45 UTC 2017


The keepalive thread sends heartbeats to PMD thread and when PMD fails to
respond to successive heartbeats the PMD is potentially stalled. The PMD
state transition is as below:

ALIVE -> MISSING -> DEAD -> GONE

This commit enables PMD healthchecks when PMD doesn't respond to
heartbeats. This is needed to handle false negatives. With this commit
the new state transition is as below:

ALIVE -> MISSING -> DEAD -> CHECK -> GONE

PMD Health checking state is introduced and will immediately kickin when
the PMD gets in to DEAD state. As part of this below are considered.

  - Link status of the ports polled by PMD thread.
  - Statistics of the ports polled by PMD thread.
  - PMD polling and processing cycles.

Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy at intel.com>
---
 lib/keepalive.h   |  3 +++
 lib/netdev-dpdk.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 56 insertions(+), 2 deletions(-)

diff --git a/lib/keepalive.h b/lib/keepalive.h
index 7501065..36789ee 100644
--- a/lib/keepalive.h
+++ b/lib/keepalive.h
@@ -71,6 +71,9 @@ struct keepalive_shm {
     /* Last seen timestamp of the core */
     uint64_t core_last_seen_times[KEEPALIVE_MAXCORES];
 
+    /* Number of PMD failures */
+    uint32_t core_failures[KEEPALIVE_MAXCORES];
+
     /* Store pmd thread tid */
     pid_t thread_id[KEEPALIVE_MAXCORES];
 
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 24a87bb..15c8c68 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -606,6 +606,51 @@ dpdk_failcore_cb(void *ptr_data, const int core_id)
     }
 }
 
+static void
+dpdk_ka_handle_failure(enum keepalive_state fail_state, const int core_id,
+               const enum rte_keepalive_state core_state,
+               uint64_t last_alive, struct keepalive_shm *ka_shm)
+{
+    if (fail_state == KA_STATE_DEAD) {
+        /* If process is in DEFUNC/UNINTERRUPTIBLE/TRACED state it is inactive
+         * and no additional health checks are needed. */
+        uint32_t tid = ka_get_tid(core_id);
+        if (process_is_active(tid)) {
+           /* Enable PMD health check only when PMD is in 'RUNNING' state and
+            * still doesn't respond to heartbeats. Health checks are needed to
+            * analyze other stats as we are in penultimate state of declaring
+            * PMD as failed. */
+            ka_enable_pmd_health_check(core_id);
+        }
+        ka_set_pmd_state_ts(core_id, KA_STATE_DEAD, last_alive);
+    }
+
+    if (fail_state == KA_STATE_GONE) {
+        int pmd_hc_state = ka_get_pmd_health_check_state(core_id);
+
+        switch (pmd_hc_state) {
+        case PMD_HC_ENABLE:
+            break;
+        case PMD_HC_DISABLE:
+            VLOG_DBG_RL(&rl, "Health check disabled for PMD core:%d", core_id);
+            break;
+        case PMD_HC_PROGRESS:
+            ka_set_pmd_state_ts(core_id, KA_STATE_CHECK, last_alive);
+            break;
+
+        case PMD_HC_COMPLETE:
+            ka_shm->core_failures[core_id]++;
+            ka_set_pmd_state_ts(core_id, core_state, last_alive);
+            ka_disable_pmd_health_check(core_id);
+            break;
+
+        default:
+            VLOG_DBG_RL(&rl, "Unknown health check state %d", pmd_hc_state);
+            OVS_NOT_REACHED();
+        }
+    }
+}
+
 /* Update the core state in shared memory.
  *
  * This function shall be invoked periodically to write the core status and
@@ -631,10 +676,16 @@ dpdk_ka_update_core_state(void *ptr_data, const int core_id,
     case RTE_KA_STATE_MISSING:
         ka_set_pmd_state_ts(core_id, KA_STATE_ALIVE, last_alive);
         break;
-    case RTE_KA_STATE_DOZING:
-    case RTE_KA_STATE_SLEEP:
     case RTE_KA_STATE_DEAD:
+        dpdk_ka_handle_failure(KA_STATE_DEAD, core_id, core_state,
+                               last_alive, ka_shm);
+        break;
     case RTE_KA_STATE_GONE:
+        dpdk_ka_handle_failure(KA_STATE_GONE, core_id, core_state,
+                               last_alive, ka_shm);
+        break;
+    case RTE_KA_STATE_DOZING:
+    case RTE_KA_STATE_SLEEP:
         ka_set_pmd_state_ts(core_id, core_state, last_alive);
         break;
     case RTE_KA_STATE_UNUSED:
-- 
2.4.11



More information about the dev mailing list