[ovs-dev] [RFC net-next] openvswitch: add trace points

Aaron Conole aconole at redhat.com
Mon Jun 14 13:28:50 UTC 2021


Eelco Chaudron <echaudro at redhat.com> writes:

> On 27 May 2021, at 21:15, Aaron Conole wrote:
>
>  This makes openvswitch module use the event tracing framework 
>  to log the upcall interface and action execution pipeline. When 
>  using openvswitch as the packet forwarding engine, some types of 
>  debugging are made possible simply by using the ovs-vswitchd's 
>  ofproto/trace command. However, such a command has some 
>  limitations:
>
>  1. When trying to trace packets that go through the CT action, 
>  the state of the packet can't be determined, and probably 
>  would be potentially wrong.
>
>  2. Deducing problem packets can sometimes be difficult as well 
>  even if many of the flows are known
>
>  3. It's possible to use the openvswitch module even without 
>  the ovs-vswitchd (although, not common use).
>
>  Introduce the event tracing points here to make it possible for 
>  working through these problems in kernel space. The style is 
>  copied from the mac80211 driver-trace / trace code for 
>  consistency.
>
> Thanks for doing this Aaron, it will definitely help when trying to debug some customer issues.

Thanks for the review!

> Just to be sure, I did some tests to make sure these changes do not impact performance, and it looks fine! See some
> small nits/comments below, but other than that I would say please re-send as an official patch.
>
> Cheers,
>
> Eelco
>
>  Signed-off-by: Aaron Conole <aconole at redhat.com> 
>  --- 
>  net/openvswitch/Makefile | 3 + 
>  net/openvswitch/actions.c | 4 + 
>  net/openvswitch/datapath.c | 7 ++ 
>  net/openvswitch/openvswitch_trace.c | 10 ++ 
>  net/openvswitch/openvswitch_trace.h | 152 ++++++++++++++++++++++++++++ 
>  5 files changed, 176 insertions(+) 
>  create mode 100644 net/openvswitch/openvswitch_trace.c 
>  create mode 100644 net/openvswitch/openvswitch_trace.h
>
>  diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile 
>  index 41109c326f3a..28982630bef3 100644 
>  --- a/net/openvswitch/Makefile 
>  +++ b/net/openvswitch/Makefile 
>  @@ -13,6 +13,7 @@ openvswitch-y := \ 
>  flow_netlink.o \ 
>  flow_table.o \ 
>  meter.o \ 
>  + openvswitch_trace.o \ 
>  vport.o \ 
>  vport-internal_dev.o \ 
>  vport-netdev.o 
>  @@ -24,3 +25,5 @@ endif 
>  obj-$(CONFIG_OPENVSWITCH_VXLAN)+= vport-vxlan.o 
>  obj-$(CONFIG_OPENVSWITCH_GENEVE)+= vport-geneve.o 
>  obj-$(CONFIG_OPENVSWITCH_GRE) += vport-gre.o 
>  + 
>  +CFLAGS_openvswitch_trace.o = -I$(src) 
>  diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c 
>  index d858ea580e43..62285453ca79 100644 
>  --- a/net/openvswitch/actions.c 
>  +++ b/net/openvswitch/actions.c 
>  @@ -30,6 +30,7 @@ 
>  #include "conntrack.h" 
>  #include "vport.h" 
>  #include "flow_netlink.h" 
>  +#include "openvswitch_trace.h"
>
>  struct deferred_action { 
>  struct sk_buff *skb; 
>  @@ -1242,6 +1243,9 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, 
>  a = nla_next(a, &rem)) { 
>  int err = 0;
>
>  + if (trace_openvswitch_probe_action_enabled()) 
>  + trace_openvswitch_probe_action(dp, skb, key, a, rem); 
>  + 
>  switch (nla_type(a)) { 
>  case OVS_ACTION_ATTR_OUTPUT: { 
>  int port = nla_get_u32(a); 
>  diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c 
>  index 9d6ef6cb9b26..63f19a6ed472 100644 
>  --- a/net/openvswitch/datapath.c 
>  +++ b/net/openvswitch/datapath.c 
>  @@ -43,6 +43,7 @@ 
>  #include "flow_table.h" 
>  #include "flow_netlink.h" 
>  #include "meter.h" 
>  +#include "openvswitch_trace.h" 
>  #include "vport-internal_dev.h" 
>  #include "vport-netdev.h"
>
>  @@ -275,6 +276,12 @@ int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb, 
>  struct dp_stats_percpu *stats; 
>  int err;
>
>  + if (trace_openvswitch_probe_userspace_enabled()) { 
>  + struct sw_flow_key ukey = *key; 
>  + 
>  + trace_openvswitch_probe_userspace(dp, skb, &ukey, upcall_info); 
>  + }
>
> Rather than work around the const here, can we not fix this in the trace include?

Done.

>     TP_PROTO(struct datapath *dp, struct sk_buff *skb,
>
> *            struct sw_flow_key *key,
>
> *            const struct sw_flow_key *key,
>            const struct dp_upcall_info *upcall_info),
>
> @@ -97,7 +97,7 @@ TRACE_EVENT(openvswitch_probe_userspace,
> __field( u16, gso_type )
> __field( u32, ovs_flow_hash )
> __field( u32, recirc_id )
>
> *           __field(        void *,         keyaddr                 )
>
> *           __field(        const void *,   keyaddr                 )
>           __field(        u16,            key_eth_type            )
>
>  + 
>  if (upcall_info->portid == 0) { 
>  err = -ENOTCONN; 
>  goto err; 
>  diff --git a/net/openvswitch/openvswitch_trace.c b/net/openvswitch/openvswitch_trace.c 
>  new file mode 100644 
>  index 000000000000..62c5f7d6f023 
>  --- /dev/null 
>  +++ b/net/openvswitch/openvswitch_trace.c 
>  @@ -0,0 +1,10 @@ 
>  +// SPDX-License-Identifier: GPL-2.0 
>  +/* bug in tracepoint.h, it should include this */ 
>  +#include <linux/module.h> 
>  + 
>  +/* sparse isn't too happy with all macros... */ 
>  +#ifndef __CHECKER__ 
>  +#define CREATE_TRACE_POINTS 
>  +#include "openvswitch_trace.h" 
>  + 
>  +#endif 
>  diff --git a/net/openvswitch/openvswitch_trace.h b/net/openvswitch/openvswitch_trace.h 
>  new file mode 100644 
>  index 000000000000..1b350306f622 
>  --- /dev/null 
>  +++ b/net/openvswitch/openvswitch_trace.h 
>  @@ -0,0 +1,152 @@ 
>  +/* SPDX-License-Identifier: GPL-2.0 */ 
>  +#undef TRACE_SYSTEM 
>  +#define TRACE_SYSTEM openvswitch 
>  + 
>  +#if !defined(_TRACE_OPENVSWITCH_H) || defined(TRACE_HEADER_MULTI_READ) 
>  +#define _TRACE_OPENVSWITCH_H 
>  + 
>  +#include <linux/tracepoint.h> 
>  + 
>  +#include "datapath.h" 
>  +
>
> I guess the naming of the events can be a long debate :) But I think just adding the word “probe” doesn’t help much.
>
> Maybe for just for function entry/exit, we could do something like <subsys>_<function_name>_entry/exit. I’ve seen
> this used in other components. For example:
>
> openvswitch_probe_userspace -> openvswitch_ovs_dp_upcall_entry
>
> For none entry/exit trace macros, we could either use a specific name, as we have:
>
> openvswitch_probe_action -> openvswitch_execute_action
>
> Or maybe even prepended by the function name to easily locate it. For example:
>
> openvswitch_probe_action -> openvswitch_do_execute_actions__execute_action
>
> I like the double underscore to differentiate, so it might even be nice for the __entry, so
> openvswitch_ovs_dp_upcall__entry.
>
> Maybe we should also change openvswitch to ovs for both system and events? Or is this too short and might cause
> confusion?

I agree that we can make these more descriptive, and that 'probe'
doesn't add much.  OTOH, if I use
"openvswitch_do_execute_actions__execute_action" or the corresponding
upcall trace point name, the line lengths balloon.  I don't see that
it's as useful to say both 'do_execute_actions' and '__execute_action' -
I don't think there will ever be a reason to add an additional
tracepoint here since the bulk of processing happens during the action
loop.  But maybe something like:

  'openvswitch_execute_action'

and

  'openvswitch_dp_upcall'

?

Just want to get this part right.

>  +TRACE_EVENT(openvswitch_probe_action, 
>  + 
>  + TP_PROTO(struct datapath *dp, struct sk_buff *skb, 
>  + struct sw_flow_key *key, const struct nlattr *a, int rem), 
>  + 
>  + TP_ARGS(dp, skb, key, a, rem), 
>  + 
>  + TP_STRUCT__entry( 
>  + __field( void *, dpaddr ) 
>  + __string( dp_name, ovs_dp_name(dp) ) 
>  + __string( dev_name, skb->dev->name ) 
>  + __field( void *, skbaddr ) 
>  + __field( unsigned int, len ) 
>  + __field( unsigned int, data_len ) 
>  + __field( unsigned int, truesize ) 
>  + __field( u8, nr_frags ) 
>  + __field( u16, gso_size ) 
>  + __field( u16, gso_type ) 
>  + __field( u32, ovs_flow_hash ) 
>  + __field( u32, recirc_id ) 
>  + __field( void *, keyaddr ) 
>  + __field( u16, key_eth_type ) 
>  + __field( u8, key_ct_state ) 
>  + __field( u8, key_ct_orig_proto ) 
>  + __field( unsigned int, flow_key_valid ) 
>  + __field( u8, action_type ) 
>  + __field( unsigned int, action_len ) 
>  + __field( void *, action_data ) 
>  + __field( u8, is_last ) 
>  + ), 
>  + 
>  + TP_fast_assign( 
>  + __entry->dpaddr = dp; 
>  + __assign_str(dp_name, ovs_dp_name(dp)); 
>  + __assign_str(dev_name, skb->dev->name); 
>  + __entry->skbaddr = skb; 
>  + __entry->len = skb->len; 
>  + __entry->data_len = skb->data_len; 
>  + __entry->truesize = skb->truesize; 
>  + __entry->nr_frags = skb_shinfo(skb)->nr_frags; 
>  + __entry->gso_size = skb_shinfo(skb)->gso_size; 
>  + __entry->gso_type = skb_shinfo(skb)->gso_type; 
>  + __entry->ovs_flow_hash = key->ovs_flow_hash; 
>  + __entry->recirc_id = key->recirc_id; 
>  + __entry->keyaddr = key; 
>  + __entry->key_eth_type = key->eth.type; 
>  + __entry->key_ct_state = key->ct_state; 
>  + __entry->key_ct_orig_proto = key->ct_orig_proto; 
>  + __entry->flow_key_valid = !(key->mac_proto & SW_FLOW_KEY_INVALID); 
>  + __entry->action_type = nla_type(a); 
>  + __entry->action_len = nla_len(a); 
>  + __entry->action_data = nla_data(a); 
>  + __entry->is_last = nla_is_last(a, rem); 
>  + ), 
>  + 
>  + TP_printk("dpaddr=%p dp_name=%s dev=%s skbaddr=%p len=%u data_len=%u truesize=%u nr_frags=%d
>  gso_size=%d gso_type=%#x ovs_flow_hash=0x%08x recirc_id=0x%08x keyaddr=%p eth_type=0x%04x
>  ct_state=%02x ct_orig_proto=%02x flow_key_valid=%d action_type=%u action_len=%u action_data=%p
>  is_last=%d",
>
> I guess %p will be good enough here, as I see no other trace modules use %px. But I guess, for some of the values
> %*ph might be useful, as it will dump the bytes, i.e., action_data.

The issue with using %*ph for action_data is in the cases where a
netlink attribute doesn't have data.  In that case, we don't have any
bytes to print, and I think the behavior would be undefined (see
ct_clear action, or pop_eth action, etc).

>  + __entry->dpaddr, __get_str(dp_name), __get_str(dev_name), 
>  + __entry->skbaddr, __entry->len, __entry->data_len, 
>  + __entry->truesize, __entry->nr_frags, __entry->gso_size, 
>  + __entry->gso_type, __entry->ovs_flow_hash, 
>  + __entry->recirc_id, __entry->keyaddr, __entry->key_eth_type, 
>  + __entry->key_ct_state, __entry->key_ct_orig_proto, 
>  + __entry->flow_key_valid, 
>  + __entry->action_type, __entry->action_len, 
>  + __entry->action_data, __entry->is_last) 
>  +); 
>  + 
>  +TRACE_EVENT(openvswitch_probe_userspace, 
>  + 
>  + TP_PROTO(struct datapath *dp, struct sk_buff *skb, 
>  + struct sw_flow_key *key, 
>  + const struct dp_upcall_info *upcall_info), 
>  + 
>  + TP_ARGS(dp, skb, key, upcall_info), 
>  + 
>  + TP_STRUCT__entry( 
>  + __field( void *, dpaddr ) 
>  + __string( dp_name, ovs_dp_name(dp) ) 
>  + __string( dev_name, skb->dev->name ) 
>  + __field( void *, skbaddr ) 
>  + __field( unsigned int, len ) 
>  + __field( unsigned int, data_len ) 
>  + __field( unsigned int, truesize ) 
>  + __field( u8, nr_frags ) 
>  + __field( u16, gso_size ) 
>  + __field( u16, gso_type ) 
>  + __field( u32, ovs_flow_hash ) 
>  + __field( u32, recirc_id ) 
>  + __field( void *, keyaddr ) 
>  + __field( u16, key_eth_type ) 
>  + __field( u8, key_ct_state ) 
>  + __field( u8, key_ct_orig_proto ) 
>  + __field( unsigned int, flow_key_valid ) 
>  + __field( u8, upcall_cmd ) 
>  + __field( u32, upcall_port ) 
>  + __field( u16, upcall_mru ) 
>  + ), 
>  + 
>  + TP_fast_assign( 
>  + __entry->dpaddr = dp; 
>  + __assign_str(dp_name, ovs_dp_name(dp)); 
>  + __assign_str(dev_name, skb->dev->name); 
>  + __entry->skbaddr = skb; 
>  + __entry->len = skb->len; 
>  + __entry->data_len = skb->data_len; 
>  + __entry->truesize = skb->truesize; 
>  + __entry->nr_frags = skb_shinfo(skb)->nr_frags; 
>  + __entry->gso_size = skb_shinfo(skb)->gso_size; 
>  + __entry->gso_type = skb_shinfo(skb)->gso_type; 
>  + __entry->ovs_flow_hash = key->ovs_flow_hash; 
>  + __entry->recirc_id = key->recirc_id; 
>  + __entry->keyaddr = key; 
>  + __entry->key_eth_type = key->eth.type; 
>  + __entry->key_ct_state = key->ct_state; 
>  + __entry->key_ct_orig_proto = key->ct_orig_proto; 
>  + __entry->flow_key_valid = !(key->mac_proto & SW_FLOW_KEY_INVALID); 
>  + __entry->upcall_cmd = upcall_info->cmd; 
>  + __entry->upcall_port = upcall_info->portid; 
>  + __entry->upcall_mru = upcall_info->mru; 
>  + ), 
>  + 
>  + TP_printk("dpaddr=%p dp_name=%s dev=%s skbaddr=%p len=%u data_len=%u truesize=%u nr_frags=%d
>  gso_size=%d gso_type=%#x ovs_flow_hash=0x%08x recirc_id=0x%08x keyaddr=%p eth_type=0x%04x
>  ct_state=%02x ct_orig_proto=%02x flow_key_valid=%d upcall_cmd=%u upcall_port=%u upcall_mru=%u",
>
> See %p comment above.

Same applies.

>  + __entry->dpaddr, __get_str(dp_name), __get_str(dev_name), 
>  + __entry->skbaddr, __entry->len, __entry->data_len, 
>  + __entry->truesize, __entry->nr_frags, __entry->gso_size, 
>  + __entry->gso_type, __entry->ovs_flow_hash, 
>  + __entry->recirc_id, __entry->keyaddr, __entry->key_eth_type, 
>  + __entry->key_ct_state, __entry->key_ct_orig_proto, 
>  + __entry->flow_key_valid, 
>  + __entry->upcall_cmd, __entry->upcall_port, 
>  + __entry->upcall_mru) 
>  +); 
>  + 
>  +#endif /* _TRACE_OPENVSWITCH_H */ 
>  + 
>  +/* This part must be outside protection */ 
>  +#undef TRACE_INCLUDE_PATH 
>  +#define TRACE_INCLUDE_PATH . 
>  +#undef TRACE_INCLUDE_FILE 
>  +#define TRACE_INCLUDE_FILE openvswitch_trace 
>  +#include <trace/define_trace.h> 
>  -- 
>  2.31.1



More information about the dev mailing list