[ovs-dev] [PATCH v5 3/3] dpif-netlink: Introduce per-cpu upcall dispatch
Aaron Conole
aconole at redhat.com
Thu Jul 15 20:46:55 UTC 2021
Mark Gray <mark.d.gray at redhat.com> writes:
> The Open vSwitch kernel module uses the upcall mechanism to send
> packets from kernel space to user space when it misses in the kernel
> space flow table. The upcall sends packets via a Netlink socket.
> Currently, a Netlink socket is created for every vport. In this way,
> there is a 1:1 mapping between a vport and a Netlink socket.
> When a packet is received by a vport, if it needs to be sent to
> user space, it is sent via the corresponding Netlink socket.
>
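[For context, the 1:1 mapping described above reduces to a direct lookup
from the receiving port to its dedicated socket. A minimal sketch follows;
the structure and function names are illustrative only, not the actual OVS
data structures:

    #include <stdint.h>

    struct vport_channel {
        uint32_t port_no;       /* Datapath port number. */
        uint32_t netlink_pid;   /* PID of the Netlink socket created for
                                 * this vport; the kernel unicasts the
                                 * vport's upcalls to this PID. */
    };

    /* Per-vport dispatch: the upcall socket is fully determined by the
     * port that received the packet, so N vports imply N sockets. */
    static uint32_t
    upcall_pid_for_vport(const struct vport_channel *channels,
                         uint32_t port_no)
    {
        return channels[port_no].netlink_pid;
    }
]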
> This mechanism, with various iterations of the corresponding user
> space code, has seen some limitations and issues:
>
> * On systems with a large number of vports, there is a correspondingly
> large number of Netlink sockets, which can limit scaling.
> (https://bugzilla.redhat.com/show_bug.cgi?id=1526306)
> * Packet reordering on upcalls.
> (https://bugzilla.redhat.com/show_bug.cgi?id=1844576)
> * A thundering herd issue.
> (https://bugzilla.redhat.com/show_bug.cgi?id=1834444)
>
> This patch introduces an alternative, feature-negotiated, upcall
> mode using a per-cpu dispatch rather than a per-vport dispatch.
>
> In this mode, the Netlink socket to be used for the upcall is
> selected based on the CPU of the thread that is executing the upcall.
> In this way, it resolves the issues above as follows:
>
> a) The number of Netlink sockets scales with the number of CPUs
> rather than the number of vports.
> b) Per-flow ordering is maintained because packets of a given flow
> are steered to a single CPU by mechanisms such as RSS, and each
> flow is then handled by a single user space thread.
> c) Packets from a flow can only wake up one user space thread.
>
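[To make (a)-(c) concrete, here is a minimal sketch of the per-cpu
selection, assuming the kernel holds an array of handler-socket PIDs (the
idea behind OVS_DP_ATTR_PER_CPU_PIDS in the diff below) indexed by the CPU
executing the upcall; the names are illustrative, not the kernel's actual
helpers:

    #include <stdint.h>

    /* Per-cpu dispatch: the upcall socket is determined by the CPU, so
     * the socket count scales with CPUs, not vports (point a). */
    static uint32_t
    upcall_pid_for_cpu(const uint32_t *upcall_pids, uint32_t n_pids,
                       uint32_t cpu_id)
    {
        /* Wrap around when there are more CPUs than handler threads, so
         * that every CPU still maps to exactly one socket. */
        return upcall_pids[cpu_id % n_pids];
    }

Since RSS steers all packets of a flow to one CPU, they always map to the
same PID: per-flow ordering is preserved (point b) and only one handler
thread is ever woken for that flow (point c).]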
> Reported-at: https://bugzilla.redhat.com/1844576
> Signed-off-by: Mark Gray <mark.d.gray at redhat.com>
> ---
>
> Notes:
> v1 - Reworked based on Flavio's comments:
> * change DISPATCH_MODE_PER_CPU() to inline function
> * add `ovs-appctl` command to check dispatch mode for datapaths
> * fixed issue with userspace actions (tested using `ovs-ofctl monitor br0 65534 -P nxt_packet_in`)
> * update documentation as requested
> v2 - Reworked based on Flavio's comments:
> * Used dpif_netlink_upcall_per_cpu() for check in dpif_netlink_set_handler_pids()
> * Added macro for (ignored) Netlink PID
> * Fixed indentation issue
> * Added NEWS entry
> * Added section to ovs-vswitchd.8 man page
> v4 - Reworked based on Flavio's comments:
> * Cleaned up log message when dispatch mode is set
> v5 - Reworked based on Flavio's comments:
> * Added macros to remove functions from the Windows build
> Reworked based on David's comments:
> * Updated the NEWS file
>
> NEWS | 6 +
> .../linux/compat/include/linux/openvswitch.h | 7 +
> lib/automake.mk | 1 +
> lib/dpif-netdev.c | 1 +
> lib/dpif-netlink-unixctl.man | 6 +
> lib/dpif-netlink.c | 463 ++++++++++++++++--
> lib/dpif-provider.h | 32 +-
> lib/dpif.c | 17 +
> lib/dpif.h | 1 +
> ofproto/ofproto-dpif-upcall.c | 51 +-
> ofproto/ofproto.c | 12 -
> vswitchd/ovs-vswitchd.8.in | 1 +
> vswitchd/vswitch.xml | 23 +-
> 13 files changed, 526 insertions(+), 95 deletions(-)
> create mode 100644 lib/dpif-netlink-unixctl.man
>
<SNIP>
> diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
> index f92905dd83fd..255aab8ee513 100644
> --- a/lib/dpif-netlink.c
> +++ b/lib/dpif-netlink.c
> @@ -84,6 +84,9 @@ enum { MAX_PORTS = USHRT_MAX };
> #define EPOLLEXCLUSIVE (1u << 28)
> #endif
>
> +/* This PID is not used by the kernel datapath when using dispatch per CPU,
> + * but it is required to be set (not zero). */
> +#define DPIF_NETLINK_PER_CPU_PID UINT32_MAX
> struct dpif_netlink_dp {
> /* Generic Netlink header. */
> uint8_t cmd;
> @@ -98,6 +101,8 @@ struct dpif_netlink_dp {
> const struct ovs_dp_stats *stats; /* OVS_DP_ATTR_STATS. */
> const struct ovs_dp_megaflow_stats *megaflow_stats;
> /* OVS_DP_ATTR_MEGAFLOW_STATS.*/
> + const uint32_t *upcall_pids; /* OVS_DP_ATTR_PER_CPU_PIDS */
> + uint32_t n_upcall_pids;
> };
>
> static void dpif_netlink_dp_init(struct dpif_netlink_dp *);
> @@ -113,6 +118,10 @@ static int dpif_netlink_dp_get(const struct dpif *,
> static int
> dpif_netlink_set_features(struct dpif *dpif_, uint32_t new_features);
>
> +static void
> +dpif_netlink_unixctl_dispatch_mode(struct unixctl_conn *conn, int argc,
> + const char *argv[], void *aux);
> +
> struct dpif_netlink_flow {
> /* Generic Netlink header. */
> uint8_t cmd;
> @@ -178,11 +187,16 @@ struct dpif_windows_vport_sock {
> #endif
>
> struct dpif_handler {
> + /* per-vport dispatch mode */
> struct epoll_event *epoll_events;
> int epoll_fd; /* epoll fd that includes channel socks. */
> int n_events; /* Num events returned by epoll_wait(). */
> int event_offset; /* Offset into 'epoll_events'. */
>
> + /* per-cpu dispatch mode */
> + struct nl_sock *sock; /* Each handler thread holds one netlink
> + socket. */
> +
> #ifdef _WIN32
> /* Pool of sockets. */
> struct dpif_windows_vport_sock *vport_sock_pool;
> @@ -201,6 +215,8 @@ struct dpif_netlink {
> struct fat_rwlock upcall_lock;
> struct dpif_handler *handlers;
> uint32_t n_handlers; /* Num of upcall handlers. */
> +
> + /* Per-vport dispatch mode */
> struct dpif_channel *channels; /* Array of channels for each port. */
> int uc_array_size; /* Size of 'handler->channels' and */
> /* 'handler->epoll_events'. */
> @@ -241,8 +257,12 @@ static int open_dpif(const struct dpif_netlink_dp *, struct dpif **);
> static uint32_t dpif_netlink_port_get_pid(const struct dpif *,
> odp_port_t port_no);
> static void dpif_netlink_handler_uninit(struct dpif_handler *handler);
> -static int dpif_netlink_refresh_channels(struct dpif_netlink *,
> - uint32_t n_handlers);
> +static int dpif_netlink_refresh_handlers_vport_dispatch(struct dpif_netlink *,
> + uint32_t n_handlers);
> +static void destroy_all_channels(struct dpif_netlink *);
> +static int dpif_netlink_refresh_handlers_cpu_dispatch(struct dpif_netlink *);
> +static void destroy_all_handlers(struct dpif_netlink *);
> +
> static void dpif_netlink_vport_to_ofpbuf(const struct dpif_netlink_vport *,
> struct ofpbuf *);
> static int dpif_netlink_vport_from_ofpbuf(struct dpif_netlink_vport *,
> @@ -294,6 +314,11 @@ dpif_netlink_cast(const struct dpif *dpif)
> return CONTAINER_OF(dpif, struct dpif_netlink, dpif);
> }
>
> +static inline bool
> +dpif_netlink_upcall_per_cpu(const struct dpif_netlink *dpif) {
> + return !!((dpif)->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU);
> +}
> +
> static int
> dpif_netlink_enumerate(struct sset *all_dps,
> const struct dpif_class *dpif_class OVS_UNUSED)
> @@ -357,11 +382,39 @@ dpif_netlink_open(const struct dpif_class *class OVS_UNUSED, const char *name,
> dp_request.cmd = OVS_DP_CMD_SET;
> }
>
> + /* The Open vSwitch kernel module has two modes for dispatching upcalls:
> + * per-vport and per-cpu.
> + *
> + * When dispatching upcalls per-vport, the kernel will
> + * send the upcall via a Netlink socket that has been selected based on the
> + * vport that received the packet that is causing the upcall.
> + *
> + * When dispatching upcall per-cpu, the kernel will send the upcall via
> + * a Netlink socket that has been selected based on the cpu that received
> + * the packet that is causing the upcall.
> + *
> + * First we test to see if the kernel module supports per-cpu dispatching
> + * (the preferred method). If it does not support per-cpu dispatching, we
> + * fall back to the per-vport dispatch mode.
> + */
> dp_request.user_features |= OVS_DP_F_UNALIGNED;
> - dp_request.user_features |= OVS_DP_F_VPORT_PIDS;
> + dp_request.user_features &= ~OVS_DP_F_VPORT_PIDS;
> + dp_request.user_features |= OVS_DP_F_DISPATCH_UPCALL_PER_CPU;
> error = dpif_netlink_dp_transact(&dp_request, &dp, &buf);
> if (error) {
> - return error;
> + dp_request.user_features &= ~OVS_DP_F_DISPATCH_UPCALL_PER_CPU;
> + dp_request.user_features |= OVS_DP_F_VPORT_PIDS;
> + error = dpif_netlink_dp_transact(&dp_request, &dp, &buf);
Just a nit:
> + if (error) {
> + return error;
> + }
> + if (create) {
> + VLOG_INFO("Datapath dispatch mode: per-vport");
> + }
> + } else {
> + if (create) {
> + VLOG_INFO("Datapath dispatch mode: per-cpu");
> + }
> }
This could probably be written something like:
    }
    if (error) {
        return error;
    } else if (create) {
        VLOG_INFO("Datapath dispatch mode: %s",
                  (dp_request.user_features & OVS_DP_F_VPORT_PIDS) ?
                  "per-vport" : "per-cpu");
    }
And then we trim the code a bit. It doesn't really eliminate any
branches (well, in truth it might add one), but it's a bit easier to
read for me - but not a big deal.
> error = open_dpif(&dp, dpifp);
> @@ -609,6 +662,24 @@ destroy_all_channels(struct dpif_netlink *dpif)
> dpif->uc_array_size = 0;
> }
Otherwise, LGTM. I didn't test with the kernel module changes.
I also haven't checked for any differences in thread allocation for
userspace handler threads. I don't anticipate any impact there, but
I'll keep going with testing. If I see something I'll holler.
Acked-by: Aaron Conole <aconole at redhat.com>