[ovs-dev] [PATCH 1/4] datapath: Action for returning packets to the kernel

Chris Luke chrisy at flirble.org
Thu Jan 2 16:42:00 UTC 2014


This provides the 'back_to_kernel' datapath action which has one
of two effects:

- For a packet that originated from the kernel, sets a flag that
  tells the kernel receive hook to hand the packet back to the kernel.
- For a packet that came from userspace, re-inserts the packet
  into the kernel input queue.

Signed-off-by: Chris Luke <chris_luke at cable.comcast.com>
---
 datapath/actions.c          |   40 ++++++++++++++++++
 datapath/datapath.c         |   23 +++++++++--
 datapath/datapath.h         |    9 +++-
 datapath/flow_netlink.c     |    6 ++-
 datapath/vport-netdev.c     |   96 +++++++++++++++++++++++++++++++++++++------
 datapath/vport.c            |   26 +++++++++++-
 datapath/vport.h            |    4 +-
 include/linux/openvswitch.h |    1 +
 lib/dpif-netdev.c           |    1 +
 lib/dpif.c                  |    1 +
 lib/odp-execute.c           |    1 +
 lib/odp-util.c              |   12 ++++++
 tests/odp.at                |    1 +
 13 files changed, 201 insertions(+), 20 deletions(-)

diff --git a/datapath/actions.c b/datapath/actions.c
index 30ea1d2..2476af9 100644
--- a/datapath/actions.c
+++ b/datapath/actions.c
@@ -405,6 +405,34 @@ static int do_output(struct datapath *dp, struct sk_buff *skb, int out_port)
 	return 0;
 }
 
+/* 'back_to_kernel' action.  The caller keeps ownership of @skb throughout. */
+static int do_insert(struct datapath *dp, struct sk_buff *skb)
+{
+	struct sk_buff *clone;
+	struct vport *vport;
+
+	if (unlikely(!skb))
+		return -ENOMEM;
+
+	if (likely(OVS_CB(skb)->pkt_from_kernel)) {
+		/* The kernel handed us this packet, so flagging it is
+		 * enough for it to be returned to the kernel. */
+		OVS_CB(skb)->return_pkt_to_kernel = true;
+		return 0;
+	}
+
+	/* Userspace packet: queue a clone on the network input path.  Do
+	 * not free @skb here; the caller keeps using it after we return. */
+	vport = ovs_vport_rcu(dp, OVS_CB(skb)->flow->key.phy.in_port);
+	if (unlikely(!vport))
+		return -ENODEV;
+	clone = skb_clone(skb, GFP_ATOMIC);
+	if (unlikely(!clone))
+		return -ENOMEM;
+
+	return ovs_vport_insert(vport, clone);
+}
+
 static int output_userspace(struct datapath *dp, struct sk_buff *skb,
 			    const struct nlattr *attr)
 {
@@ -552,6 +580,18 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
 
 		case OVS_ACTION_ATTR_SAMPLE:
 			err = sample(dp, skb, a);
+			/* If a sampled output action has us send packets
+			 * back to the kernel, we need to keep the skb around. */
+			if (OVS_CB(skb)->return_pkt_to_kernel)
+				keep_skb = true;
+			break;
+
+		case OVS_ACTION_ATTR_BACK_TO_KERNEL:
+			do_insert(dp, skb);
+			/* If we need to return the packet to
+			 * the kernel, keep the skb for it. */
+			if (OVS_CB(skb)->return_pkt_to_kernel)
+				keep_skb = true;
 			break;
 		}
 
diff --git a/datapath/datapath.c b/datapath/datapath.c
index b42fd8b..6649045 100644
--- a/datapath/datapath.c
+++ b/datapath/datapath.c
@@ -215,7 +215,7 @@ void ovs_dp_detach_port(struct vport *p)
 }
 
 /* Must be called with rcu_read_lock. */
-void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb)
+int ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb)
 {
 	struct datapath *dp = p->dp;
 	struct sw_flow *flow;
@@ -231,7 +231,7 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb)
 	error = ovs_flow_extract(skb, p->port_no, &key);
 	if (unlikely(error)) {
 		kfree_skb(skb);
-		return;
+		return error;
 	}
 
 	/* Look up flow. */
@@ -253,7 +253,7 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb)
 	OVS_CB(skb)->pkt_key = &key;
 
 	ovs_flow_stats_update(OVS_CB(skb)->flow, skb);
-	ovs_execute_actions(dp, skb);
+	error = ovs_execute_actions(dp, skb);
 	stats_counter = &stats->n_hit;
 
 out:
@@ -262,6 +262,8 @@ out:
 	(*stats_counter)++;
 	stats->n_mask_hit += n_mask_hit;
 	u64_stats_update_end(&stats->sync);
+
+	return error;
 }
 
 static struct genl_family dp_packet_genl_family = {
@@ -544,6 +546,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
 	OVS_CB(packet)->pkt_key = &flow->key;
 	packet->priority = flow->key.phy.priority;
 	packet->mark = flow->key.phy.skb_mark;
+	OVS_CB(packet)->pkt_from_kernel = false;
 
 	rcu_read_lock();
 	dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
@@ -551,6 +554,20 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
 	if (!dp)
 		goto err_unlock;
 
+	/* Work out the dev of the original input port */
+	if (flow->key.phy.in_port != DP_MAX_PORTS &&
+			flow->key.phy.in_port != OVSP_LOCAL) {
+		struct vport *vport;
+		vport = ovs_vport_rcu(dp, flow->key.phy.in_port);
+		if (vport != NULL)
+			packet->dev = dev_get_by_name(sock_net(skb->sk),
+				vport->ops->get_name(vport));
+		if (packet->dev != NULL) {
+			packet->skb_iif = packet->dev->ifindex;
+			packet->protocol = eth_type_trans(packet, packet->dev);
+		}
+	}
+
 	local_bh_disable();
 	err = ovs_execute_actions(dp, packet);
 	local_bh_enable();
diff --git a/datapath/datapath.h b/datapath/datapath.h
index b3ae7cd..c012d2a 100644
--- a/datapath/datapath.h
+++ b/datapath/datapath.h
@@ -100,11 +100,18 @@ struct datapath {
  * @pkt_key: The flow information extracted from the packet.  Must be nonnull.
  * @tun_key: Key for the tunnel that encapsulated this packet. NULL if the
  * packet is not being tunneled.
+ * @pkt_from_kernel: True if this packet was handed to us by the kernel, false
+ * if it came from userspace.
+ * @return_pkt_to_kernel: True if the action of a matching flow wants us to
+ * give this packet back to the kernel. Only relevant if it came from the
+ * kernel.
  */
 struct ovs_skb_cb {
 	struct sw_flow		*flow;
 	struct sw_flow_key	*pkt_key;
 	struct ovs_key_ipv4_tunnel  *tun_key;
+	bool			pkt_from_kernel;
+	bool			return_pkt_to_kernel;
 };
 #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb)
 
@@ -186,7 +193,7 @@ static inline struct vport *ovs_vport_ovsl(const struct datapath *dp, int port_n
 extern struct notifier_block ovs_dp_device_notifier;
 extern struct genl_multicast_group ovs_dp_vport_multicast_group;
 
-void ovs_dp_process_received_packet(struct vport *, struct sk_buff *);
+int ovs_dp_process_received_packet(struct vport *, struct sk_buff *);
 void ovs_dp_detach_port(struct vport *);
 int ovs_dp_upcall(struct datapath *, struct sk_buff *,
 		  const struct dp_upcall_info *);
diff --git a/datapath/flow_netlink.c b/datapath/flow_netlink.c
index 9b26528..7d3519a 100644
--- a/datapath/flow_netlink.c
+++ b/datapath/flow_netlink.c
@@ -1515,7 +1515,8 @@ int ovs_nla_copy_actions(const struct nlattr *attr,
 			[OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan),
 			[OVS_ACTION_ATTR_POP_VLAN] = 0,
 			[OVS_ACTION_ATTR_SET] = (u32)-1,
-			[OVS_ACTION_ATTR_SAMPLE] = (u32)-1
+			[OVS_ACTION_ATTR_SAMPLE] = (u32)-1,
+			[OVS_ACTION_ATTR_BACK_TO_KERNEL] = 0,
 		};
 		const struct ovs_action_push_vlan *vlan;
 		int type = nla_type(a);
@@ -1567,6 +1568,9 @@ int ovs_nla_copy_actions(const struct nlattr *attr,
 			skip_copy = true;
 			break;
 
+		case OVS_ACTION_ATTR_BACK_TO_KERNEL:
+			break;
+
 		default:
 			return -EINVAL;
 		}
diff --git a/datapath/vport-netdev.c b/datapath/vport-netdev.c
index c15923b..196c851 100644
--- a/datapath/vport-netdev.c
+++ b/datapath/vport-netdev.c
@@ -27,6 +27,10 @@
 #include <linux/skbuff.h>
 #include <linux/openvswitch.h>
 
+#ifdef CONFIG_NET_CLS_ACT
+#include <uapi/linux/pkt_cls.h>
+#endif
+
 #include <net/llc.h>
 
 #include "datapath.h"
@@ -34,7 +38,7 @@
 #include "vport-internal_dev.h"
 #include "vport-netdev.h"
 
-static void netdev_port_receive(struct vport *vport, struct sk_buff *skb);
+static bool netdev_port_receive(struct vport *vport, struct sk_buff *skb);
 
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39)
 /* Called with rcu_read_lock and bottom-halves disabled. */
@@ -48,7 +52,10 @@ static rx_handler_result_t netdev_frame_hook(struct sk_buff **pskb)
 
 	vport = ovs_netdev_get_vport(skb->dev);
 
-	netdev_port_receive(vport, skb);
+	if (netdev_port_receive(vport, skb)) {
+		/* Tell the kernel we didn't want it. */
+		return RX_HANDLER_PASS;
+	}
 
 	return RX_HANDLER_CONSUMED;
 }
@@ -64,7 +71,10 @@ static struct sk_buff *netdev_frame_hook(struct sk_buff *skb)
 
 	vport = ovs_netdev_get_vport(skb->dev);
 
-	netdev_port_receive(vport, skb);
+	if (netdev_port_receive(vport, skb)) {
+		/* Tell the kernel we didn't want it. */
+		return skb;
+	}
 
 	return NULL;
 }
@@ -189,31 +199,51 @@ const char *ovs_netdev_get_name(const struct vport *vport)
 	return netdev_vport->dev->name;
 }
 
-/* Must be called with rcu_read_lock. */
-static void netdev_port_receive(struct vport *vport, struct sk_buff *skb)
+/* Must be called with rcu_read_lock.
+ * Returns true if we want the hook to return the packet to the kernel.
+ */
+static bool netdev_port_receive(struct vport *vport, struct sk_buff *skb)
 {
+	int error;
+
 	if (unlikely(!vport))
 		goto error;
 
 	if (unlikely(skb_warn_if_lro(skb)))
 		goto error;
 
-	/* Make our own copy of the packet.  Otherwise we will mangle the
-	 * packet for anyone who came before us (e.g. tcpdump via AF_PACKET).
-	 * (No one comes after us, since we tell handle_bridge() that we took
-	 * the packet.) */
+	/* Make a clone of the skb if someone else has a reference to it so
+	 * nothing we do with it interferes with anyone higher up the chain.
+	 */
 	skb = skb_share_check(skb, GFP_ATOMIC);
 	if (unlikely(!skb))
-		return;
+		return false;
 
 	skb_push(skb, ETH_HLEN);
 	ovs_skb_postpush_rcsum(skb, skb->data, ETH_HLEN);
 
-	ovs_vport_receive(vport, skb, NULL);
-	return;
+	OVS_CB(skb)->pkt_from_kernel = true;
+	OVS_CB(skb)->return_pkt_to_kernel = false;
+
+	error = ovs_vport_receive(vport, skb, NULL);
+	if (unlikely(error)) {
+		/* On error the skb has already been freed, so it must
+		 * not be touched again.  NOTE(review): confirm the skb
+		 * is still valid on the success path read below. */
+		return false;
+	}
+
+	if (OVS_CB(skb)->return_pkt_to_kernel) {
+		/* Clean up the skb */
+		skb->protocol = eth_type_trans(skb, skb->dev);
+		return true;
+	}
+
+	return false;
 
 error:
 	kfree_skb(skb);
+	return false;
 }
 
 static unsigned int packet_length(const struct sk_buff *skb)
@@ -250,6 +280,47 @@ drop:
 	return 0;
 }
 
+/* Queues @skb on the kernel's input path; @skb is consumed on every path.
+ * Must be called with rcu_read_lock. */
+static int netdev_insert(struct sk_buff *skb)
+{
+	int len, ret;
+
+	if (unlikely(skb == NULL || skb->dev == NULL)) {
+		kfree_skb(skb);		/* No-op for a NULL skb. */
+		return -EINVAL;
+	}
+
+	/* The skb will have gone when we want this later. */
+	len = skb->len;
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36)
+#ifdef CONFIG_NET_CLS_ACT
+	/* This flag will skip the top half of the
+	 * code in __netif_receive_skb(). */
+	skb->tc_verd = TC_NCLS;
+#else
+#warning Without kernel option CONFIG_NET_CLS_ACT some 'back_to_kernel' \
+		packets may deliver twice in AF_PACKET 'ALL' listeners, \
+		such as 'tcpdump' or 'lldpd'.
+#endif /* CONFIG_NET_CLS_ACT */
+
+	/* Send it! */
+	ret = netif_rx_ni(skb);
+
+#else
+	/* Unsupported kernel version. */
+	kfree_skb(skb);
+	return -EINVAL;
+
+#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36) */
+
+	if (likely(ret == NET_RX_SUCCESS))
+		return len;
+
+	return ret == NET_RX_DROP ? 0 : -EINVAL;
+}
+
 /* Returns null if this device is not attached to a datapath. */
 struct vport *ovs_netdev_get_vport(struct net_device *dev)
 {
@@ -278,6 +349,7 @@ const struct vport_ops ovs_netdev_vport_ops = {
 	.destroy	= netdev_destroy,
 	.get_name	= ovs_netdev_get_name,
 	.send		= netdev_send,
+	.insert		= netdev_insert,
 };
 
 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,36) && \
diff --git a/datapath/vport.c b/datapath/vport.c
index 7f12acc..f5dfcd6 100644
--- a/datapath/vport.c
+++ b/datapath/vport.c
@@ -358,8 +358,9 @@ int ovs_vport_get_options(const struct vport *vport, struct sk_buff *skb)
  * Must be called with rcu_read_lock.  The packet cannot be shared and
  * skb->data should point to the Ethernet header.  The caller must have already
  * called compute_ip_summed() to initialize the checksumming fields.
+ * Returns 0 on success, or -errno otherwise.
  */
-void ovs_vport_receive(struct vport *vport, struct sk_buff *skb,
+int ovs_vport_receive(struct vport *vport, struct sk_buff *skb,
 		       struct ovs_key_ipv4_tunnel *tun_key)
 {
 	struct pcpu_tstats *stats;
@@ -371,7 +372,7 @@ void ovs_vport_receive(struct vport *vport, struct sk_buff *skb,
 	u64_stats_update_end(&stats->syncp);
 
 	OVS_CB(skb)->tun_key = tun_key;
-	ovs_dp_process_received_packet(vport, skb);
+	return ovs_dp_process_received_packet(vport, skb);
 }
 
 /**
@@ -406,6 +407,27 @@ int ovs_vport_send(struct vport *vport, struct sk_buff *skb)
 }
 
 /**
+ *	ovs_vport_insert - re-inject a packet into the kernel input queue
+ *
+ * @vport: vport whose insert operation is used
+ * @skb: skb to insert; freed here if it cannot be handed to the insert op
+ *
+ * Returns bytes inserted, 0 if dropped, -errno on error.  Needs rcu_read_lock.
+ */
+int ovs_vport_insert(struct vport *vport, struct sk_buff *skb)
+{
+	if (unlikely(!skb))
+		return -ENOMEM;
+	if (unlikely(!vport->ops->insert || !skb->dev)) {
+		int err = vport->ops->insert ? -ENODEV : -EINVAL;
+
+		kfree_skb(skb);
+		return err;
+	}
+	return vport->ops->insert(skb);
+}
+
+/**
  *	ovs_vport_record_error - indicate device error to generic stats layer
  *
  * @vport: vport that encountered the error
diff --git a/datapath/vport.h b/datapath/vport.h
index 2cf2b18..34303dd 100644
--- a/datapath/vport.h
+++ b/datapath/vport.h
@@ -51,6 +51,7 @@ int ovs_vport_set_options(struct vport *, struct nlattr *options);
 int ovs_vport_get_options(const struct vport *, struct sk_buff *);
 
 int ovs_vport_send(struct vport *, struct sk_buff *);
+int ovs_vport_insert(struct vport *, struct sk_buff *);
 
 /* The following definitions are for implementers of vport devices: */
 
@@ -146,6 +147,7 @@ struct vport_ops {
 	const char *(*get_name)(const struct vport *);
 
 	int (*send)(struct vport *, struct sk_buff *);
+	int (*insert)(struct sk_buff *);
 };
 
 enum vport_err_type {
@@ -191,7 +193,7 @@ static inline struct vport *vport_from_priv(const void *priv)
 	return (struct vport *)(priv - ALIGN(sizeof(struct vport), VPORT_ALIGN));
 }
 
-void ovs_vport_receive(struct vport *, struct sk_buff *,
+int ovs_vport_receive(struct vport *, struct sk_buff *,
 		       struct ovs_key_ipv4_tunnel *);
 
 /* List of statically compiled vport implementations.  Don't forget to also
diff --git a/include/linux/openvswitch.h b/include/linux/openvswitch.h
index 5137c2f..5530f91 100644
--- a/include/linux/openvswitch.h
+++ b/include/linux/openvswitch.h
@@ -569,6 +569,7 @@ enum ovs_action_attr {
 	OVS_ACTION_ATTR_SAMPLE,       /* Nested OVS_SAMPLE_ATTR_*. */
 	OVS_ACTION_ATTR_PUSH_MPLS,    /* struct ovs_action_push_mpls. */
 	OVS_ACTION_ATTR_POP_MPLS,     /* __be16 ethertype. */
+	OVS_ACTION_ATTR_BACK_TO_KERNEL, /* No argument. */
 	__OVS_ACTION_ATTR_MAX
 };
 
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index cb64bdc..d9ad2b6 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -1804,6 +1804,7 @@ dp_execute_cb(void *aux_, struct ofpbuf *packet,
     case OVS_ACTION_ATTR_POP_MPLS:
     case OVS_ACTION_ATTR_SET:
     case OVS_ACTION_ATTR_SAMPLE:
+    case OVS_ACTION_ATTR_BACK_TO_KERNEL:
     case OVS_ACTION_ATTR_UNSPEC:
     case __OVS_ACTION_ATTR_MAX:
         OVS_NOT_REACHED();
diff --git a/lib/dpif.c b/lib/dpif.c
index 2b79a6e..7b3b97c 100644
--- a/lib/dpif.c
+++ b/lib/dpif.c
@@ -1078,6 +1078,7 @@ dpif_execute_helper_cb(void *aux_, struct ofpbuf *packet,
     switch ((enum ovs_action_attr)type) {
     case OVS_ACTION_ATTR_OUTPUT:
     case OVS_ACTION_ATTR_USERSPACE:
+    case OVS_ACTION_ATTR_BACK_TO_KERNEL:
         execute.actions = action;
         execute.actions_len = NLA_ALIGN(action->nla_len);
         execute.packet = packet;
diff --git a/lib/odp-execute.c b/lib/odp-execute.c
index 5b77fa9..c98cfb4 100644
--- a/lib/odp-execute.c
+++ b/lib/odp-execute.c
@@ -195,6 +195,7 @@ odp_execute_actions__(void *dp, struct ofpbuf *packet, struct pkt_metadata *md,
             /* These only make sense in the context of a datapath. */
         case OVS_ACTION_ATTR_OUTPUT:
         case OVS_ACTION_ATTR_USERSPACE:
+        case OVS_ACTION_ATTR_BACK_TO_KERNEL:
             if (dp_execute_action) {
                 /* Allow 'dp_execute_action' to steal the packet data if we do
                  * not need it any more. */
diff --git a/lib/odp-util.c b/lib/odp-util.c
index 873e05a..be25613 100644
--- a/lib/odp-util.c
+++ b/lib/odp-util.c
@@ -81,6 +81,7 @@ odp_action_len(uint16_t type)
     case OVS_ACTION_ATTR_POP_MPLS: return sizeof(ovs_be16);
     case OVS_ACTION_ATTR_SET: return -2;
     case OVS_ACTION_ATTR_SAMPLE: return -2;
+    case OVS_ACTION_ATTR_BACK_TO_KERNEL: return 0;
 
     case OVS_ACTION_ATTR_UNSPEC:
     case __OVS_ACTION_ATTR_MAX:
@@ -424,6 +425,9 @@ format_odp_action(struct ds *ds, const struct nlattr *a)
     case OVS_ACTION_ATTR_SAMPLE:
         format_odp_sample_action(ds, a);
         break;
+    case OVS_ACTION_ATTR_BACK_TO_KERNEL:
+        ds_put_format(ds, "back_to_kernel");
+        break;
     case OVS_ACTION_ATTR_UNSPEC:
     case __OVS_ACTION_ATTR_MAX:
     default:
@@ -665,6 +669,14 @@ parse_odp_action(const char *s, const struct simap *port_names,
         }
     }
 
+    {   /* Accept only the complete word, not a prefix or an empty token. */
+        size_t len = strcspn(s, delimiters);
+        if (len == strlen("back_to_kernel")
+            && !strncmp(s, "back_to_kernel", len)) {
+            nl_msg_put_flag(actions, OVS_ACTION_ATTR_BACK_TO_KERNEL);
+            return len;
+        }
+    }
     return -EINVAL;
 }
 
diff --git a/tests/odp.at b/tests/odp.at
index b505345..3c4d9fe 100644
--- a/tests/odp.at
+++ b/tests/odp.at
@@ -243,6 +243,7 @@ pop_vlan
 sample(sample=9.7%,actions(1,2,3,push_vlan(vid=1,pcp=2)))
 set(tunnel(tun_id=0xabcdef1234567890,src=1.1.1.1,dst=2.2.2.2,tos=0x0,ttl=64,flags(df,csum,key)))
 set(tunnel(tun_id=0xabcdef1234567890,src=1.1.1.1,dst=2.2.2.2,tos=0x0,ttl=64,flags(key)))
+back_to_kernel
 ])
 AT_CHECK_UNQUOTED([test-odp parse-actions < actions.txt], [0],
   [`cat actions.txt`
-- 
1.7.9.5




More information about the dev mailing list