[ovs-dev] [PATCH v4 3/7] dpif-netdev: Register packet processing cores to KA framework.

Aaron Conole aconole at redhat.com
Wed Sep 6 21:15:24 UTC 2017


Bhanuprakash Bodireddy <bhanuprakash.bodireddy at intel.com> writes:

> This commit registers the packet processing PMD cores to keepalive
> framework. Only PMDs that have rxqs mapped will be registered and
> actively monitored by KA framework.
>
> This commit spawns a keepalive thread that will dispatch heartbeats to
> PMD cores. The pmd threads respond to heartbeats by marking themselves
> alive. As long as PMD responds to heartbeats it is considered 'healthy'.
>
> Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy at intel.com>
> ---

I'm really confused with this patch.  I've stopped reviewing the series.

It seems like there's a mix of 'track by core id' and 'track by thread
id'.

I don't think it's possible to do anything by core id.  We can never
know what else has been scheduled on those cores, and we cannot be sure
that a taskset or other scheduler provisioning call will move the
threads.

>  lib/dpif-netdev.c |  70 +++++++++++++++++++++++++
>  lib/keepalive.c   | 153 ++++++++++++++++++++++++++++++++++++++++++++++++++----
>  lib/keepalive.h   |  17 ++++++
>  lib/util.c        |  23 ++++++++
>  lib/util.h        |   2 +
>  5 files changed, 254 insertions(+), 11 deletions(-)
>
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index e2cd931..84c7ffd 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -49,6 +49,7 @@
>  #include "flow.h"
>  #include "hmapx.h"
>  #include "id-pool.h"
> +#include "keepalive.h"
>  #include "latch.h"
>  #include "netdev.h"
>  #include "netdev-vport.h"
> @@ -978,6 +979,63 @@ sorted_poll_thread_list(struct dp_netdev *dp,
>      *n = k;
>  }
>  
> +static void *
> +ovs_keepalive(void *f_ OVS_UNUSED)
> +{
> +    pthread_detach(pthread_self());
> +
> +    for (;;) {
> +        xusleep(get_ka_interval() * 1000);
> +    }
> +
> +    return NULL;
> +}
> +
> +static void
> +ka_thread_start(struct dp_netdev *dp)
> +{
> +    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
> +
> +    if (ovsthread_once_start(&once)) {
> +        ovs_thread_create("ovs_keepalive", ovs_keepalive, dp);
> +
> +        ovsthread_once_done(&once);
> +    }
> +}
> +
> +static void
> +ka_register_datapath_threads(struct dp_netdev *dp)
> +{
> +    int ka_init = get_ka_init_status();
> +    VLOG_DBG("Keepalive: Was initialization successful? [%s]",
> +                ka_init ? "Success" : "Failure");
> +    if (!ka_init) {
> +        return;
> +    }
> +
> +    ka_thread_start(dp);
> +
> +    struct dp_netdev_pmd_thread *pmd;
> +    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
> +        /*  Register only PMD threads. */
> +        if (pmd->core_id != NON_PMD_CORE_ID) {
> +            int tid = ka_get_pmd_tid(pmd->core_id);
> +
> +            /* Skip PMD thread with no rxqs mapping. */

why skip these pmds?  we should still watch them, and then we can
correlated interesting events (for instance... when an rxq gets added
whats the change in utilization, etc).

> +            if (OVS_UNLIKELY(!hmap_count(&pmd->poll_list))) {
> +                /* rxq mapping changes due to reconfiguration,
> +                 * if there are no rxqs mapped to PMD, unregister it. */
> +                ka_unregister_thread(tid, true);
> +                continue;
> +            }
> +
> +            ka_register_thread(tid, true);
> +            VLOG_INFO("Registered PMD thread [%d] on Core [%d] to KA framework",
> +                      tid, pmd->core_id);
> +        }
> +    }
> +}
> +
>  static void
>  dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
>                       void *aux)
> @@ -3625,6 +3683,9 @@ reconfigure_datapath(struct dp_netdev *dp)
>  
>      /* Reload affected pmd threads. */
>      reload_affected_pmds(dp);
> +
> +    /* Register datapath threads to KA monitoring. */
> +    ka_register_datapath_threads(dp);
>  }
>  
>  /* Returns true if one of the netdevs in 'dp' requires a reconfiguration */
> @@ -3824,6 +3885,8 @@ pmd_thread_main(void *f_)
>  
>      poll_list = NULL;
>  
> +    ka_store_pmd_id(pmd->core_id);
> +
>      /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
>      ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
>      ovs_numa_thread_setaffinity_core(pmd->core_id);
> @@ -3859,6 +3922,9 @@ reload:
>                                                        : PMD_CYCLES_IDLE);
>          }
>  
> +        /* Mark PMD thread alive. */
> +        ka_mark_pmd_thread_alive(pmd->core_id);
> +
>          if (lc++ > 1024) {
>              bool reload;
>  
> @@ -3892,6 +3958,10 @@ reload:
>      }
>  
>      emc_cache_uninit(&pmd->flow_cache);
> +
> +    int tid = ka_get_pmd_tid(pmd->core_id);
> +    ka_unregister_thread(tid, true);
> +
>      free(poll_list);
>      pmd_free_cached_ports(pmd);
>      return NULL;
> diff --git a/lib/keepalive.c b/lib/keepalive.c
> index ac73a42..dfafaeb 100644
> --- a/lib/keepalive.c
> +++ b/lib/keepalive.c
> @@ -23,6 +23,7 @@
>  #include "keepalive.h"
>  #include "lib/vswitch-idl.h"
>  #include "openvswitch/vlog.h"
> +#include "process.h"
>  #include "timeval.h"
>  
>  VLOG_DEFINE_THIS_MODULE(keepalive);
> @@ -48,6 +49,134 @@ ka_get_pmd_tid(unsigned core_idx)
>      return -EINVAL;
>  }
>  
> +/* Return the Keepalive timer interval. */
> +inline uint32_t

Same as before w.r.t. inline in c files.

> +get_ka_interval(void)
> +{
> +    return keepalive_timer_interval;
> +}
> +
> +int
> +get_ka_init_status(void)
> +{
> +    return ka_init_status;
> +}
> +
> +void
> +ka_store_pmd_id(unsigned core_idx)
> +{
> +    int tid = gettid();
> +
> +    if (ka_is_enabled()) {
> +        ovs_assert(tid > 0);

If someone changes pid_max to be a larger number than a 32-bit integer,
won't this assert trigger?

> +        ka_info->thread_id[core_idx] = tid;
> +    }
> +}
> +
> +/* Register thread to KA framework. */
> +void
> +ka_register_thread(int tid, bool thread_is_pmd)
> +{
> +    if (ka_is_enabled()) {
> +        struct ka_process_info *ka_pinfo;
> +        int core_num = -1;
> +        char proc_name[18] = "UNDEFINED";
> +
> +        struct process_info pinfo;
> +        int success = get_process_info(tid, &pinfo);
> +        if (success) {
> +            core_num = pinfo.core_id;
> +            ovs_strlcpy(proc_name, pinfo.name, sizeof proc_name);
> +        }
> +
> +        ovs_assert(core_num >= 0);
> +
> +        uint32_t hash = hash_int(tid, 0);
> +        ovs_mutex_lock(&ka_info->proclist_mutex);
> +        HMAP_FOR_EACH_WITH_HASH (ka_pinfo, node,
> +                                 hash, &ka_info->process_list) {
> +            /* PMD thread is already registered. */
> +            if (ka_pinfo->tid == tid) {
> +                goto out;
> +            }
> +        }
> +
> +        ka_pinfo = xmalloc(sizeof *ka_pinfo);
> +        ka_pinfo->tid = tid;
> +        ka_pinfo->heartbeats = true;
> +        ka_pinfo->core_id = core_num;
> +        ovs_strlcpy(ka_pinfo->name, proc_name, sizeof ka_pinfo->name);
> +
> +        hmap_insert(&ka_info->process_list, &ka_pinfo->node, hash);
> +
> +        if (thread_is_pmd) {

Why treat these differently?  Get rid of the thread_is_pmd.

> +            ka_info->active_cores[core_num] = KA_STATE_ALIVE;
> +            ka_info->last_alive[core_num] = time_wall_msec();

As noted in the previous patch, there is no way to properly infer core
usage from this, so get rid of all the "core based" tracking.

> +            ka_info->pmd_cnt++;  /* Increment PMD count. */
> +        } else {
> +            ka_info->nonpmd_cnt++;  /* Increment non-pmd thread count. */
> +        }
> +out:
> +        ovs_mutex_unlock(&ka_info->proclist_mutex);
> +    }
> +}
> +
> +/* Unregister thread from KA framework. */
> +void
> +ka_unregister_thread(int tid, bool thread_is_pmd)
> +{
> +    if (ka_is_enabled()) {
> +        struct ka_process_info *ka_pinfo;
> +
> +        int core_num = -1;
> +        struct process_info pinfo;
> +        if (get_process_info(tid, &pinfo)) {
> +            core_num = pinfo.core_id;
> +        }
> +
> +        ovs_assert(core_num >= 0);
> +
> +        ovs_mutex_lock(&ka_info->proclist_mutex);
> +        HMAP_FOR_EACH_WITH_HASH (ka_pinfo, node, hash_int(tid, 0),
> +                                 &ka_info->process_list) {
> +            /* If PMD thread is registered, remove it from the list */
> +            if (ka_pinfo->tid == tid) {
> +                hmap_remove(&ka_info->process_list, &ka_pinfo->node);
> +                free(ka_pinfo);
> +
> +                if (thread_is_pmd) {
> +                    ka_info->active_cores[core_num] = KA_STATE_UNUSED;
> +                    ka_info->pmd_cnt--;  /* Decrement PMD count. */
> +                } else {
> +                    ka_info->nonpmd_cnt--;  /* Decrement non-pmd thread cnt. */
> +                }
> +
> +                break;
> +            }
> +        }
> +
> +        ovs_mutex_unlock(&ka_info->proclist_mutex);
> +    }
> +}
> +
> +/* Mark packet processing core alive. */
> +void
> +ka_mark_pmd_thread_alive(unsigned core_id)
> +{
> +    if (ka_is_enabled()) {
> +        ka_info->state_flags[core_id] = KA_STATE_ALIVE;

shouldn't this be done by tid not coreid?

> +    }
> +}
> +
> +/* Mark packet processing core as idle. */
> +inline void

And again.

> +ka_mark_pmd_thread_sleep(unsigned core_id)
> +{
> +    if (ka_is_enabled()) {
> +        ka_info->state_flags[core_id] = KA_STATE_DOZING;
> +    }
> +}
> +
>  void
>  ka_set_pmd_state_ts(unsigned core_id, enum keepalive_state state,
>                      uint64_t last_alive)
> @@ -166,18 +295,20 @@ ka_init(const struct smap *ovs_other_config)

This whole diff section is inappropriate.  Please don't add code in one
patch and then remove it in the next.

>  void
>  ka_destroy(void)
>  {
> -    if (ka_info) {
> -        struct ka_process_info *pinfo;
> -
> -        ovs_mutex_lock(&ka_info->proclist_mutex);
> -        HMAP_FOR_EACH_POP (pinfo, node, &ka_info->process_list) {
> -            free(pinfo);
> -        }
> -        ovs_mutex_unlock(&ka_info->proclist_mutex);
> +    if (!ka_info) {
> +        return;
> +    }
>  
> -        hmap_destroy(&ka_info->process_list);
> -        ovs_mutex_destroy(&ka_info->proclist_mutex);
> +    struct ka_process_info *pinfo;
>  
> -        free(ka_info);
> +    ovs_mutex_lock(&ka_info->proclist_mutex);
> +    HMAP_FOR_EACH_POP (pinfo, node, &ka_info->process_list) {
> +        free(pinfo);
>      }
> +    ovs_mutex_unlock(&ka_info->proclist_mutex);
> +
> +    hmap_destroy(&ka_info->process_list);
> +    ovs_mutex_destroy(&ka_info->proclist_mutex);
> +
> +    free(ka_info);
>  }
> diff --git a/lib/keepalive.h b/lib/keepalive.h
> index d133398..607ee3b 100644
> --- a/lib/keepalive.h
> +++ b/lib/keepalive.h
> @@ -38,8 +38,10 @@ enum keepalive_state {
>  typedef void (*ka_relay_cb)(int, enum keepalive_state, uint64_t, void *);
>  
>  struct ka_process_info {
> +    char name[16];
>      int tid;
>      int core_id;
> +    bool heartbeats;
>      enum keepalive_state core_state;
>      uint64_t core_last_seen_times;
>      struct hmap_node node;
> @@ -52,6 +54,10 @@ struct keepalive_info {
>      /* List of process/threads monitored by KA framework. */
>      struct hmap process_list OVS_GUARDED;
>  
> +    /* count of threads registered to KA framework. */
> +    uint32_t pmd_cnt;
> +    uint32_t nonpmd_cnt;
> +

What are these counts for?

>      /* Store Datapath threads 'tid'.
>       * In case of DPDK there can be max of KA_DP_MAXCORES threads. */
>      pid_t thread_id[KA_DP_MAXCORES];
> @@ -84,4 +90,15 @@ void ka_update_core_state(const int, const enum keepalive_state,
>  
>  int ka_get_pmd_tid(unsigned core);
>  bool ka_is_enabled(void);
> +void ka_register_thread(int, bool);
> +void ka_unregister_thread(int, bool);
> +void ka_mark_pmd_thread_alive(unsigned);
> +void ka_mark_pmd_thread_sleep(unsigned);
> +
> +void ka_store_pmd_id(unsigned core);
> +uint32_t get_ka_interval(void);
> +int get_ka_init_status(void);
> +int ka_alloc_portstats(unsigned, int);
> +void ka_destroy_portstats(void);
> +
>  #endif /* keepalive.h */
> diff --git a/lib/util.c b/lib/util.c
> index 36e3731..83f7e0a 100644
> --- a/lib/util.c
> +++ b/lib/util.c
> @@ -26,6 +26,9 @@
>  #include <stdlib.h>
>  #include <string.h>
>  #include <sys/stat.h>
> +#ifdef __linux__
> +#include <sys/syscall.h>
> +#endif
>  #include <unistd.h>
>  #include "bitmap.h"
>  #include "byte-order.h"
> @@ -568,6 +571,16 @@ get_page_size(void)
>      return cached;
>  }
>  
> +int
> +gettid(void)
> +{
> +#ifdef __linux__
> +    return (int)syscall(SYS_gettid);

This returns a pid_t type.  Don't cast it to int, please.

Also, call it ovs_gettid()

> +#endif
> +
> +    return -EINVAL;
> +}
> +
>  /* Returns the time at which the system booted, as the number of milliseconds
>   * since the epoch, or 0 if the time of boot cannot be determined. */
>  long long int
> @@ -2197,6 +2210,16 @@ xsleep(unsigned int seconds)
>      ovsrcu_quiesce_end();
>  }
>  
> +void
> +xusleep(unsigned int microseconds)
> +{
> +    ovsrcu_quiesce_start();
> +#ifdef __linux__
> +    usleep(microseconds);

I commented before on this not being put in a generic place since
there's no equivalent.

Further, according to the manpages:

POSIX.1-2001  declares this function obsolete;
use nanosleep(2) instead.  POSIX.1-2008 removes  the  specification  of
usleep().

Please don't expose this through utils - ONLY linux will provide it, and
it's possible at some future point that glibc removes it.

> +#endif
> +    ovsrcu_quiesce_end();
> +}
> +
>  /* Determine whether standard output is a tty or not. This is useful to decide
>   * whether to use color output or not when --color option for utilities is set
>   * to `auto`.
> diff --git a/lib/util.h b/lib/util.h
> index 764e0a0..f3cb61a 100644
> --- a/lib/util.h
> +++ b/lib/util.h
> @@ -120,6 +120,7 @@ const char *get_subprogram_name(void);
>  
>  unsigned int get_page_size(void);
>  long long int get_boot_time(void);
> +int gettid(void);
>  
>  void ovs_print_version(uint8_t min_ofp, uint8_t max_ofp);
>  
> @@ -489,6 +490,7 @@ ovs_u128_and(const ovs_u128 a, const ovs_u128 b)
>  }
>  
>  void xsleep(unsigned int seconds);
> +void xusleep(unsigned int microseconds);
>  
>  bool is_stdout_a_tty(void);


More information about the dev mailing list