[ovs-dev] [PATCH v4 3/7] dpif-netdev: Register packet processing cores to KA framework.
Aaron Conole
aconole at redhat.com
Wed Sep 6 21:15:24 UTC 2017
Bhanuprakash Bodireddy <bhanuprakash.bodireddy at intel.com> writes:
> This commit registers the packet processing PMD cores to keepalive
> framework. Only PMDs that have rxqs mapped will be registered and
> actively monitored by KA framework.
>
> This commit spawns a keepalive thread that will dispatch heartbeats to
> PMD cores. The pmd threads respond to heartbeats by marking themselves
> alive. As long as PMD responds to heartbeats it is considered 'healthy'.
>
> Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy at intel.com>
> ---
I'm really confused with this patch. I've stopped reviewing the series.
It seems like there's a mix of 'track by core id' and 'track by thread
id'.
I don't think it's possible to do anything by core id. We can never
know what else has been scheduled on those cores, and we cannot be sure
that a taskset or other scheduler provisioning call will move the
threads.
> lib/dpif-netdev.c | 70 +++++++++++++++++++++++++
> lib/keepalive.c | 153 ++++++++++++++++++++++++++++++++++++++++++++++++++----
> lib/keepalive.h | 17 ++++++
> lib/util.c | 23 ++++++++
> lib/util.h | 2 +
> 5 files changed, 254 insertions(+), 11 deletions(-)
>
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index e2cd931..84c7ffd 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -49,6 +49,7 @@
> #include "flow.h"
> #include "hmapx.h"
> #include "id-pool.h"
> +#include "keepalive.h"
> #include "latch.h"
> #include "netdev.h"
> #include "netdev-vport.h"
> @@ -978,6 +979,63 @@ sorted_poll_thread_list(struct dp_netdev *dp,
> *n = k;
> }
>
> +static void *
> +ovs_keepalive(void *f_ OVS_UNUSED)
> +{
> + pthread_detach(pthread_self());
> +
> + for (;;) {
> + xusleep(get_ka_interval() * 1000);
> + }
> +
> + return NULL;
> +}
> +
> +static void
> +ka_thread_start(struct dp_netdev *dp)
> +{
> + static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
> +
> + if (ovsthread_once_start(&once)) {
> + ovs_thread_create("ovs_keepalive", ovs_keepalive, dp);
> +
> + ovsthread_once_done(&once);
> + }
> +}
> +
> +static void
> +ka_register_datapath_threads(struct dp_netdev *dp)
> +{
> + int ka_init = get_ka_init_status();
> + VLOG_DBG("Keepalive: Was initialization successful? [%s]",
> + ka_init ? "Success" : "Failure");
> + if (!ka_init) {
> + return;
> + }
> +
> + ka_thread_start(dp);
> +
> + struct dp_netdev_pmd_thread *pmd;
> + CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
> + /* Register only PMD threads. */
> + if (pmd->core_id != NON_PMD_CORE_ID) {
> + int tid = ka_get_pmd_tid(pmd->core_id);
> +
> + /* Skip PMD thread with no rxqs mapping. */
why skip these pmds? we should still watch them, and then we can
correlated interesting events (for instance... when an rxq gets added
whats the change in utilization, etc).
> + if (OVS_UNLIKELY(!hmap_count(&pmd->poll_list))) {
> + /* rxq mapping changes due to reconfiguration,
> + * if there are no rxqs mapped to PMD, unregister it. */
> + ka_unregister_thread(tid, true);
> + continue;
> + }
> +
> + ka_register_thread(tid, true);
> + VLOG_INFO("Registered PMD thread [%d] on Core [%d] to KA framework",
> + tid, pmd->core_id);
> + }
> + }
> +}
> +
> static void
> dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
> void *aux)
> @@ -3625,6 +3683,9 @@ reconfigure_datapath(struct dp_netdev *dp)
>
> /* Reload affected pmd threads. */
> reload_affected_pmds(dp);
> +
> + /* Register datapath threads to KA monitoring. */
> + ka_register_datapath_threads(dp);
> }
>
> /* Returns true if one of the netdevs in 'dp' requires a reconfiguration */
> @@ -3824,6 +3885,8 @@ pmd_thread_main(void *f_)
>
> poll_list = NULL;
>
> + ka_store_pmd_id(pmd->core_id);
> +
> /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
> ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
> ovs_numa_thread_setaffinity_core(pmd->core_id);
> @@ -3859,6 +3922,9 @@ reload:
> : PMD_CYCLES_IDLE);
> }
>
> + /* Mark PMD thread alive. */
> + ka_mark_pmd_thread_alive(pmd->core_id);
> +
> if (lc++ > 1024) {
> bool reload;
>
> @@ -3892,6 +3958,10 @@ reload:
> }
>
> emc_cache_uninit(&pmd->flow_cache);
> +
> + int tid = ka_get_pmd_tid(pmd->core_id);
> + ka_unregister_thread(tid, true);
> +
> free(poll_list);
> pmd_free_cached_ports(pmd);
> return NULL;
> diff --git a/lib/keepalive.c b/lib/keepalive.c
> index ac73a42..dfafaeb 100644
> --- a/lib/keepalive.c
> +++ b/lib/keepalive.c
> @@ -23,6 +23,7 @@
> #include "keepalive.h"
> #include "lib/vswitch-idl.h"
> #include "openvswitch/vlog.h"
> +#include "process.h"
> #include "timeval.h"
>
> VLOG_DEFINE_THIS_MODULE(keepalive);
> @@ -48,6 +49,134 @@ ka_get_pmd_tid(unsigned core_idx)
> return -EINVAL;
> }
>
> +/* Return the Keepalive timer interval. */
> +inline uint32_t
Same as before w.r.t. inline in c files.
> +get_ka_interval(void)
> +{
> + return keepalive_timer_interval;
> +}
> +
> +int
> +get_ka_init_status(void)
> +{
> + return ka_init_status;
> +}
> +
> +void
> +ka_store_pmd_id(unsigned core_idx)
> +{
> + int tid = gettid();
> +
> + if (ka_is_enabled()) {
> + ovs_assert(tid > 0);
If someone changes pid_max to be a larger number than a 32-bit integer,
won't this assert trigger?
> + ka_info->thread_id[core_idx] = tid;
> + }
> +}
> +
> +/* Register thread to KA framework. */
> +void
> +ka_register_thread(int tid, bool thread_is_pmd)
> +{
> + if (ka_is_enabled()) {
> + struct ka_process_info *ka_pinfo;
> + int core_num = -1;
> + char proc_name[18] = "UNDEFINED";
> +
> + struct process_info pinfo;
> + int success = get_process_info(tid, &pinfo);
> + if (success) {
> + core_num = pinfo.core_id;
> + ovs_strlcpy(proc_name, pinfo.name, sizeof proc_name);
> + }
> +
> + ovs_assert(core_num >= 0);
> +
> + uint32_t hash = hash_int(tid, 0);
> + ovs_mutex_lock(&ka_info->proclist_mutex);
> + HMAP_FOR_EACH_WITH_HASH (ka_pinfo, node,
> + hash, &ka_info->process_list) {
> + /* PMD thread is already registered. */
> + if (ka_pinfo->tid == tid) {
> + goto out;
> + }
> + }
> +
> + ka_pinfo = xmalloc(sizeof *ka_pinfo);
> + ka_pinfo->tid = tid;
> + ka_pinfo->heartbeats = true;
> + ka_pinfo->core_id = core_num;
> + ovs_strlcpy(ka_pinfo->name, proc_name, sizeof ka_pinfo->name);
> +
> + hmap_insert(&ka_info->process_list, &ka_pinfo->node, hash);
> +
> + if (thread_is_pmd) {
Why treat these differently? Get rid of the thread_is_pmd.
> + ka_info->active_cores[core_num] = KA_STATE_ALIVE;
> + ka_info->last_alive[core_num] = time_wall_msec();
As noted in the previous patch, there is no way to properly infer core
usage from this, so get rid of all the "core based" tracking.
> + ka_info->pmd_cnt++; /* Increment PMD count. */
> + } else {
> + ka_info->nonpmd_cnt++; /* Increment non-pmd thread count. */
> + }
> +out:
> + ovs_mutex_unlock(&ka_info->proclist_mutex);
> + }
> +}
> +
> +/* Unregister thread from KA framework. */
> +void
> +ka_unregister_thread(int tid, bool thread_is_pmd)
> +{
> + if (ka_is_enabled()) {
> + struct ka_process_info *ka_pinfo;
> +
> + int core_num = -1;
> + struct process_info pinfo;
> + if (get_process_info(tid, &pinfo)) {
> + core_num = pinfo.core_id;
> + }
> +
> + ovs_assert(core_num >= 0);
> +
> + ovs_mutex_lock(&ka_info->proclist_mutex);
> + HMAP_FOR_EACH_WITH_HASH (ka_pinfo, node, hash_int(tid, 0),
> + &ka_info->process_list) {
> + /* If PMD thread is registered, remove it from the list */
> + if (ka_pinfo->tid == tid) {
> + hmap_remove(&ka_info->process_list, &ka_pinfo->node);
> + free(ka_pinfo);
> +
> + if (thread_is_pmd) {
> + ka_info->active_cores[core_num] = KA_STATE_UNUSED;
> + ka_info->pmd_cnt--; /* Decrement PMD count. */
> + } else {
> + ka_info->nonpmd_cnt--; /* Decrement non-pmd thread cnt. */
> + }
> +
> + break;
> + }
> + }
> +
> + ovs_mutex_unlock(&ka_info->proclist_mutex);
> + }
> +}
> +
> +/* Mark packet processing core alive. */
> +void
> +ka_mark_pmd_thread_alive(unsigned core_id)
> +{
> + if (ka_is_enabled()) {
> + ka_info->state_flags[core_id] = KA_STATE_ALIVE;
shouldn't this be done by tid not coreid?
> + }
> +}
> +
> +/* Mark packet processing core as idle. */
> +inline void
And again.
> +ka_mark_pmd_thread_sleep(unsigned core_id)
> +{
> + if (ka_is_enabled()) {
> + ka_info->state_flags[core_id] = KA_STATE_DOZING;
> + }
> +}
> +
> void
> ka_set_pmd_state_ts(unsigned core_id, enum keepalive_state state,
> uint64_t last_alive)
> @@ -166,18 +295,20 @@ ka_init(const struct smap *ovs_other_config)
This whole diff section is inappropriate. Please don't add code in one
patch and then remove it in the next.
> void
> ka_destroy(void)
> {
> - if (ka_info) {
> - struct ka_process_info *pinfo;
> -
> - ovs_mutex_lock(&ka_info->proclist_mutex);
> - HMAP_FOR_EACH_POP (pinfo, node, &ka_info->process_list) {
> - free(pinfo);
> - }
> - ovs_mutex_unlock(&ka_info->proclist_mutex);
> + if (!ka_info) {
> + return;
> + }
>
> - hmap_destroy(&ka_info->process_list);
> - ovs_mutex_destroy(&ka_info->proclist_mutex);
> + struct ka_process_info *pinfo;
>
> - free(ka_info);
> + ovs_mutex_lock(&ka_info->proclist_mutex);
> + HMAP_FOR_EACH_POP (pinfo, node, &ka_info->process_list) {
> + free(pinfo);
> }
> + ovs_mutex_unlock(&ka_info->proclist_mutex);
> +
> + hmap_destroy(&ka_info->process_list);
> + ovs_mutex_destroy(&ka_info->proclist_mutex);
> +
> + free(ka_info);
> }
> diff --git a/lib/keepalive.h b/lib/keepalive.h
> index d133398..607ee3b 100644
> --- a/lib/keepalive.h
> +++ b/lib/keepalive.h
> @@ -38,8 +38,10 @@ enum keepalive_state {
> typedef void (*ka_relay_cb)(int, enum keepalive_state, uint64_t, void *);
>
> struct ka_process_info {
> + char name[16];
> int tid;
> int core_id;
> + bool heartbeats;
> enum keepalive_state core_state;
> uint64_t core_last_seen_times;
> struct hmap_node node;
> @@ -52,6 +54,10 @@ struct keepalive_info {
> /* List of process/threads monitored by KA framework. */
> struct hmap process_list OVS_GUARDED;
>
> + /* count of threads registered to KA framework. */
> + uint32_t pmd_cnt;
> + uint32_t nonpmd_cnt;
> +
What are these counts for?
> /* Store Datapath threads 'tid'.
> * In case of DPDK there can be max of KA_DP_MAXCORES threads. */
> pid_t thread_id[KA_DP_MAXCORES];
> @@ -84,4 +90,15 @@ void ka_update_core_state(const int, const enum keepalive_state,
>
> int ka_get_pmd_tid(unsigned core);
> bool ka_is_enabled(void);
> +void ka_register_thread(int, bool);
> +void ka_unregister_thread(int, bool);
> +void ka_mark_pmd_thread_alive(unsigned);
> +void ka_mark_pmd_thread_sleep(unsigned);
> +
> +void ka_store_pmd_id(unsigned core);
> +uint32_t get_ka_interval(void);
> +int get_ka_init_status(void);
> +int ka_alloc_portstats(unsigned, int);
> +void ka_destroy_portstats(void);
> +
> #endif /* keepalive.h */
> diff --git a/lib/util.c b/lib/util.c
> index 36e3731..83f7e0a 100644
> --- a/lib/util.c
> +++ b/lib/util.c
> @@ -26,6 +26,9 @@
> #include <stdlib.h>
> #include <string.h>
> #include <sys/stat.h>
> +#ifdef __linux__
> +#include <sys/syscall.h>
> +#endif
> #include <unistd.h>
> #include "bitmap.h"
> #include "byte-order.h"
> @@ -568,6 +571,16 @@ get_page_size(void)
> return cached;
> }
>
> +int
> +gettid(void)
> +{
> +#ifdef __linux__
> + return (int)syscall(SYS_gettid);
This returns a pid_t type. Don't cast it to int, please.
Also, call it ovs_gettid()
> +#endif
> +
> + return -EINVAL;
> +}
> +
> /* Returns the time at which the system booted, as the number of milliseconds
> * since the epoch, or 0 if the time of boot cannot be determined. */
> long long int
> @@ -2197,6 +2210,16 @@ xsleep(unsigned int seconds)
> ovsrcu_quiesce_end();
> }
>
> +void
> +xusleep(unsigned int microseconds)
> +{
> + ovsrcu_quiesce_start();
> +#ifdef __linux__
> + usleep(microseconds);
I commented before on this not being put in a generic place since
there's no equivalent.
Further, according to the manpages:
POSIX.1-2001 declares this function obsolete;
use nanosleep(2) instead. POSIX.1-2008 removes the specification of
usleep().
Please don't expose this through utils - ONLY linux will provide it, and
it's possible at some future point that glibc removes it.
> +#endif
> + ovsrcu_quiesce_end();
> +}
> +
> /* Determine whether standard output is a tty or not. This is useful to decide
> * whether to use color output or not when --color option for utilities is set
> * to `auto`.
> diff --git a/lib/util.h b/lib/util.h
> index 764e0a0..f3cb61a 100644
> --- a/lib/util.h
> +++ b/lib/util.h
> @@ -120,6 +120,7 @@ const char *get_subprogram_name(void);
>
> unsigned int get_page_size(void);
> long long int get_boot_time(void);
> +int gettid(void);
>
> void ovs_print_version(uint8_t min_ofp, uint8_t max_ofp);
>
> @@ -489,6 +490,7 @@ ovs_u128_and(const ovs_u128 a, const ovs_u128 b)
> }
>
> void xsleep(unsigned int seconds);
> +void xusleep(unsigned int microseconds);
>
> bool is_stdout_a_tty(void);
More information about the dev
mailing list