[ovs-dev] [RFC PATCH 19/21] netdev-dpdk: Enable PMD health checks on heartbeat failure.

Bhanuprakash Bodireddy bhanuprakash.bodireddy at intel.com
Wed Jun 7 16:15:15 UTC 2017


The keepalive thread sends heartbeats to each PMD thread. When a PMD fails
to respond to successive heartbeats, it is potentially stalled. The PMD
state transition is as below:

ALIVE -> MISSING -> DEAD -> GONE

This commit enables PMD health checks when a PMD doesn't respond to
heartbeats. This is needed to handle false negatives. With this commit
the new state transition is as below:

ALIVE -> MISSING -> DEAD -> CHECK -> GONE

A PMD health checking state is introduced and will kick in immediately when
the PMD gets into the DEAD state. The following are considered as part of it:

  - Link status of the ports polled by PMD thread.
  - Statistics of the ports polled by PMD thread.
  - PMD polling and processing cycles.

Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy at intel.com>
---
 lib/keepalive.h   |  3 +++
 lib/netdev-dpdk.c | 39 +++++++++++++++++++++++++++++++++++++--
 2 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/lib/keepalive.h b/lib/keepalive.h
index d8e55d5..6d1733c 100644
--- a/lib/keepalive.h
+++ b/lib/keepalive.h
@@ -71,6 +71,9 @@ struct keepalive_shm {
     /* Last seen timestamp of the core */
     uint64_t core_last_seen_times[KEEPALIVE_MAXCORES];
 
+    /* Number of PMD failures */
+    uint32_t core_failures[KEEPALIVE_MAXCORES];
+
     /* Store pmd thread tid */
     pid_t thread_id[KEEPALIVE_MAXCORES];
 
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 74c1ab1..bb93fd8 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -606,6 +606,32 @@ dpdk_failcore_cb(void *ptr_data, const int core_id)
     }
 }
 
+static void
+dpdk_ka_handle_failure(struct keepalive_shm *ka_shm, const int core_id,
+               const enum rte_keepalive_state core_state, uint64_t last_alive)
+{
+    if ((ka_shm->core_state[core_id] != KA_STATE_CHECK) &&
+        (ka_shm->core_state[core_id] == KA_STATE_DEAD)) {
+        ka_set_pmd_state(core_id, KA_STATE_CHECK);
+    } else {
+        /* Reached whenever the recorded state is not KA_STATE_DEAD.  Count
+         * the failure only if that state is KA_STATE_CHECK, i.e. on the
+         * CHECK -> GONE transition, which is meant to occur only once. */
+        if (ka_shm->core_state[core_id] == KA_STATE_CHECK) {
+            ka_shm->core_failures[core_id]++;
+        }
+
+        /* Record the state passed in; the visible caller passes GONE. */
+        ka_set_pmd_state(core_id, core_state);
+
+        if (ka_is_pmdhealth_check_needed(core_id)) {
+            ka_disable_pmd_health_check(core_id);
+        }
+    }
+
+    ka_shm->core_last_seen_times[core_id] = last_alive;
+}
+
 /* Update the core state in shared memory.
  *
  * This function shall be invoked periodically to write the core status and
@@ -632,10 +658,19 @@ dpdk_ka_update_core_state(void *ptr_data, const int core_id,
         ka_shm->core_state[core_id] = KA_STATE_ALIVE;
         ka_shm->core_last_seen_times[core_id] = last_alive;
         break;
-    case RTE_KA_STATE_DOZING:
-    case RTE_KA_STATE_SLEEP:
     case RTE_KA_STATE_DEAD:
+        /* Enable PMD health check here, as we are in penultimate state
+         * of declaring PMD as failed. */
+        ka_enable_pmd_health_check(core_id);
+
+        ka_shm->core_state[core_id] = core_state;
+        ka_shm->core_last_seen_times[core_id] = last_alive;
+        break;
     case RTE_KA_STATE_GONE:
+        dpdk_ka_handle_failure(ka_shm, core_id, core_state, last_alive);
+        break;
+    case RTE_KA_STATE_DOZING:
+    case RTE_KA_STATE_SLEEP:
         ka_shm->core_state[core_id] = core_state;
         ka_shm->core_last_seen_times[core_id] = last_alive;
         break;
-- 
2.4.11



More information about the dev mailing list