[ovs-dev] [PATCH] Implement new fragment handling policy.

Ben Pfaff blp at nicira.com
Thu Oct 20 05:56:21 UTC 2011


On Wed, Oct 19, 2011 at 06:09:30PM -0700, Jesse Gross wrote:
> >     switch (wc->tos_frag_mask & FLOW_FRAG_MASK) {
> > -    case FLOW_FRAG_ANY | FLOW_FRAG_FIRST:
> > +    case FLOW_FRAG_ANY | FLOW_FRAG_LATER:
> >         ds_put_format(s, "frag=%s,",
> >                       f->tos_frag & FLOW_FRAG_ANY
> > -                      ? (f->tos_frag & FLOW_FRAG_FIRST ? "first" : "later")
> > -                      : (f->tos_frag & FLOW_FRAG_FIRST ? "<error>" : "no"));
> > +                      ? (f->tos_frag & FLOW_FRAG_LATER ? "later" : "first")
> > +                      : (f->tos_frag & FLOW_FRAG_LATER ? "no" : "<error>"));
>
> I think the last condition is reversed because !FLOW_FRAG_ANY and
> FLOW_FRAG_LATER is still an error.

Oops.  Applied:

diff --git a/lib/classifier.c b/lib/classifier.c
index 0335f13..869029f 100644
--- a/lib/classifier.c
+++ b/lib/classifier.c
@@ -618,7 +618,7 @@ cls_rule_format(const struct cls_rule *rule, struct ds *s)
         ds_put_format(s, "frag=%s,",
                       f->tos_frag & FLOW_FRAG_ANY
                       ? (f->tos_frag & FLOW_FRAG_LATER ? "later" : "first")
-                      : (f->tos_frag & FLOW_FRAG_LATER ? "no" : "<error>"));
+                      : (f->tos_frag & FLOW_FRAG_LATER ? "<error>" : "no"));
         break;

     case FLOW_FRAG_ANY:

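For reference, FLOW_FRAG_ANY means "this packet is a fragment" and
FLOW_FRAG_LATER means "it has a nonzero offset", so only three of the
four bit combinations are valid.  Here is a hypothetical helper (not in
the tree) restating the table that cls_rule_format() implements:

static const char *
frag_to_string(uint8_t tos_frag)
{
    switch (tos_frag & FLOW_FRAG_MASK) {
    case 0:                               return "no";      /* not a fragment */
    case FLOW_FRAG_ANY:                   return "first";   /* fragment, offset 0 */
    case FLOW_FRAG_ANY | FLOW_FRAG_LATER: return "later";   /* fragment, offset > 0 */
    default:                              return "<error>"; /* LATER without ANY */
    }
}
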
> > diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c
> > index 58fc18e..536a213 100644
> > --- a/ofproto/ofproto-dpif.c
> > +++ b/ofproto/ofproto-dpif.c
> > @@ -2954,7 +2954,7 @@ rule_dpif_lookup(struct ofproto_dpif *ofproto, const struct flow *flow,
> >     }
> >
> >     cls = &ofproto->up.tables[table_id];
> > -    if (flow->tos_frag & FLOW_FRAG_FIRST
> > +    if ((flow->tos_frag & FLOW_FRAG_MASK) == FLOW_FRAG_ANY
>
> Is there a reason for this to not be (flow->tos_frag & FLOW_FRAG_ANY)?
>  I realize that it does the masking out for later frags as well, which
> is not strictly necessary although I'm not sure that it matters all
> that much.

Over-optimization: I wanted to avoid the "struct flow" copy in that
case.

I changed it:

--- a/ofproto/ofproto-dpif.c
+++ b/ofproto/ofproto-dpif.c
@@ -2954,7 +2954,7 @@ rule_dpif_lookup(struct ofproto_dpif *ofproto, const struct flow *flow,
     }

     cls = &ofproto->up.tables[table_id];
-    if ((flow->tos_frag & FLOW_FRAG_MASK) == FLOW_FRAG_ANY
+    if (flow->tos_frag & FLOW_FRAG_ANY
         && ofproto->up.frag_handling == OFPC_FRAG_NORMAL) {
         /* For OFPC_NORMAL frag_handling, we must pretend that transport ports
          * are unavailable. */

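For the record, the two tests disagree only on later fragments:
(tos_frag & FLOW_FRAG_MASK) == FLOW_FRAG_ANY is true only for first
fragments, while tos_frag & FLOW_FRAG_ANY is also true for later
fragments.  Since later fragments carry no transport header anyway,
taking the copy-and-mask path for them is harmless.  As hypothetical
helpers (assuming the FLOW_FRAG_* definitions from lib/flow.h):

static bool
is_first_frag(uint8_t tos_frag)
{
    /* Fragment with offset 0 only. */
    return (tos_frag & FLOW_FRAG_MASK) == FLOW_FRAG_ANY;
}

static bool
is_any_frag(uint8_t tos_frag)
{
    /* Any fragment, first or later. */
    return (tos_frag & FLOW_FRAG_ANY) != 0;
}
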
> Otherwise, the incremental looks good.  However, I realized that there
> is one more issue: when we pass up the flow key for UDP GSO packets,
> they will all reflect the first fragment and not the later packets.

Hmm, good point.

Here's a build-tested-only try at this.  It needs testing and comments,
at least.  Is it on the right track?

(I've appended two copies.  The first one is a full diff, the second one
uses "git diff -b" to omit changes that reindented lines.)

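The interesting bit is in queue_gso_packets(): TCP GSO segments are
complete packets, but for UDP GSO every segment after the first is an
IP "later" fragment, so its upcall key must say OVS_FRAG_TYPE_LATER
rather than repeating the first-fragment key.  Pulled out as a
hypothetical helper (the patch below does this inline), the key rewrite
amounts to:

/* Sketch only: derive a later-fragment key from the first-fragment key. */
static void make_later_frag_key(struct sw_flow_key *later,
				const struct sw_flow_key *first)
{
	*later = *first;
	later->ip.tos_frag &= ~OVS_FRAG_TYPE_MASK;
	later->ip.tos_frag |= OVS_FRAG_TYPE_LATER;
}
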
--8<--------------------------cut here-------------------------->8--

diff --git a/datapath/datapath.c b/datapath/datapath.c
index 2f8027c..2815bcc 100644
--- a/datapath/datapath.c
+++ b/datapath/datapath.c
@@ -84,8 +84,10 @@ EXPORT_SYMBOL(dp_ioctl_hook);
 static LIST_HEAD(dps);
 
 static struct vport *new_vport(const struct vport_parms *);
-static int queue_userspace_packets(struct datapath *, struct sk_buff *,
-				 const struct dp_upcall_info *);
+static int queue_gso_packets(int dp_ifindex, struct sk_buff *,
+			     const struct dp_upcall_info *);
+static int queue_userspace_packet(int dp_ifindex, struct sk_buff *,
+				  const struct dp_upcall_info *);
 
 /* Must be called with rcu_read_lock, genl_mutex, or RTNL lock. */
 struct datapath *get_dp(int dp_ifindex)
@@ -353,8 +355,8 @@ static struct genl_family dp_packet_genl_family = {
 int dp_upcall(struct datapath *dp, struct sk_buff *skb,
 	      const struct dp_upcall_info *upcall_info)
 {
-	struct sk_buff *segs = NULL;
 	struct dp_stats_percpu *stats;
+	int dp_ifindex;
 	int err;
 
 	if (upcall_info->pid == 0) {
@@ -362,30 +364,18 @@ int dp_upcall(struct datapath *dp, struct sk_buff *skb,
 		goto err;
 	}
 
-	forward_ip_summed(skb, true);
-
-	/* Break apart GSO packets into their component pieces.  Otherwise
-	 * userspace may try to stuff a 64kB packet into a 1500-byte MTU. */
-	if (skb_is_gso(skb)) {
-		segs = skb_gso_segment(skb, NETIF_F_SG | NETIF_F_HW_CSUM);
-		
-		if (IS_ERR(segs)) {
-			err = PTR_ERR(segs);
-			goto err;
-		}
-		skb = segs;
+	dp_ifindex = get_dpifindex(dp);
+	if (!dp_ifindex) {
+		err = -ENODEV;
+		goto err;
 	}
 
-	err = queue_userspace_packets(dp, skb, upcall_info);
-	if (segs) {
-		struct sk_buff *next;
-		/* Free GSO-segments */
-		do {
-			next = segs->next;
-			kfree_skb(segs);
-		} while ((segs = next) != NULL);
-	}
+	forward_ip_summed(skb, true);
 
+	if (!skb_is_gso(skb))
+		err = queue_userspace_packet(dp_ifindex, skb, upcall_info);
+	else
+		err = queue_gso_packets(dp_ifindex, skb, upcall_info);
 	if (err)
 		goto err;
 
@@ -401,68 +391,93 @@ err:
 	return err;
 }
 
-/* Send each packet in the 'skb' list to userspace for 'dp' as directed by
- * 'upcall_info'.  There will be only one packet unless we broke up a GSO
- * packet.
- */
-static int queue_userspace_packets(struct datapath *dp, struct sk_buff *skb,
-				   const struct dp_upcall_info *upcall_info)
+static int queue_gso_packets(int dp_ifindex, struct sk_buff *skb,
+			     const struct dp_upcall_info *upcall_info)
 {
-	int dp_ifindex;
+	struct dp_upcall_info later_info;
+	struct sw_flow_key later_key;
+	struct sk_buff *segs, *nskb;
+	int err;
 
-	dp_ifindex = get_dpifindex(dp);
-	if (!dp_ifindex)
-		return -ENODEV;
+	segs = skb_gso_segment(skb, NETIF_F_SG | NETIF_F_HW_CSUM);
+	if (IS_ERR(segs))
+		return PTR_ERR(segs);
 
+	/* Queue all of the segments. */
+	skb = segs;
 	do {
-		struct ovs_header *upcall;
-		struct sk_buff *user_skb; /* to be queued to userspace */
-		struct nlattr *nla;
-		unsigned int len;
-		int err;
-
-		err = vlan_deaccel_tag(skb);
-		if (unlikely(err))
-			return err;
-
-		if (nla_attr_size(skb->len) > USHRT_MAX)
-			return -EFBIG;
-
-		len = sizeof(struct ovs_header);
-		len += nla_total_size(skb->len);
-		len += nla_total_size(FLOW_BUFSIZE);
-		if (upcall_info->cmd == OVS_PACKET_CMD_ACTION)
-			len += nla_total_size(8);
-
-		user_skb = genlmsg_new(len, GFP_ATOMIC);
-		if (!user_skb)
-			return -ENOMEM;
-
-		upcall = genlmsg_put(user_skb, 0, 0, &dp_packet_genl_family,
-					 0, upcall_info->cmd);
-		upcall->dp_ifindex = dp_ifindex;
-
-		nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_KEY);
-		flow_to_nlattrs(upcall_info->key, user_skb);
-		nla_nest_end(user_skb, nla);
-
-		if (upcall_info->userdata)
-			nla_put_u64(user_skb, OVS_PACKET_ATTR_USERDATA,
-				    nla_get_u64(upcall_info->userdata));
-
-		nla = __nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, skb->len);
-		if (skb->ip_summed == CHECKSUM_PARTIAL)
-			copy_and_csum_skb(skb, nla_data(nla));
-		else
-			skb_copy_bits(skb, 0, nla_data(nla), skb->len);
-
-		err = genlmsg_unicast(&init_net, user_skb, upcall_info->pid);
+		err = queue_userspace_packet(dp_ifindex, skb, upcall_info);
 		if (err)
-			return err;
+			break;
+
+		if (skb == segs && skb_shinfo(skb)->gso_type & SKB_GSO_UDP) {
+			later_key = *upcall_info->key;
+			later_key.ip.tos_frag &= ~OVS_FRAG_TYPE_MASK;
+			later_key.ip.tos_frag |= OVS_FRAG_TYPE_LATER;
 
+			later_info = *upcall_info;
+			later_info.key = &later_key;
+			upcall_info = &later_info;
+		}
 	} while ((skb = skb->next));
 
-	return 0;
+	/* Free all of the segments. */
+	skb = segs;
+	do {
+		nskb = skb->next;
+		if (err)
+			kfree_skb(skb);
+		else
+			consume_skb(skb);
+	} while ((skb = nskb));
+	return err;
+}
+
+static int queue_userspace_packet(int dp_ifindex, struct sk_buff *skb,
+				  const struct dp_upcall_info *upcall_info)
+{
+	struct ovs_header *upcall;
+	struct sk_buff *user_skb; /* to be queued to userspace */
+	struct nlattr *nla;
+	unsigned int len;
+	int err;
+
+	err = vlan_deaccel_tag(skb);
+	if (unlikely(err))
+		return err;
+
+	if (nla_attr_size(skb->len) > USHRT_MAX)
+		return -EFBIG;
+
+	len = sizeof(struct ovs_header);
+	len += nla_total_size(skb->len);
+	len += nla_total_size(FLOW_BUFSIZE);
+	if (upcall_info->cmd == OVS_PACKET_CMD_ACTION)
+		len += nla_total_size(8);
+
+	user_skb = genlmsg_new(len, GFP_ATOMIC);
+	if (!user_skb)
+		return -ENOMEM;
+
+	upcall = genlmsg_put(user_skb, 0, 0, &dp_packet_genl_family,
+			     0, upcall_info->cmd);
+	upcall->dp_ifindex = dp_ifindex;
+
+	nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_KEY);
+	flow_to_nlattrs(upcall_info->key, user_skb);
+	nla_nest_end(user_skb, nla);
+
+	if (upcall_info->userdata)
+		nla_put_u64(user_skb, OVS_PACKET_ATTR_USERDATA,
+			    nla_get_u64(upcall_info->userdata));
+
+	nla = __nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, skb->len);
+	if (skb->ip_summed == CHECKSUM_PARTIAL)
+		copy_and_csum_skb(skb, nla_data(nla));
+	else
+		skb_copy_bits(skb, 0, nla_data(nla), skb->len);
+
+	return genlmsg_unicast(&init_net, user_skb, upcall_info->pid);
 }
 
 /* Called with genl_mutex. */

--8<--------------------------cut here-------------------------->8--

diff --git a/datapath/datapath.c b/datapath/datapath.c
index 2f8027c..2815bcc 100644
--- a/datapath/datapath.c
+++ b/datapath/datapath.c
@@ -84,7 +84,9 @@ EXPORT_SYMBOL(dp_ioctl_hook);
 static LIST_HEAD(dps);
 
 static struct vport *new_vport(const struct vport_parms *);
-static int queue_userspace_packets(struct datapath *, struct sk_buff *,
+static int queue_gso_packets(int dp_ifindex, struct sk_buff *,
+			     const struct dp_upcall_info *);
+static int queue_userspace_packet(int dp_ifindex, struct sk_buff *,
 				 const struct dp_upcall_info *);
 
 /* Must be called with rcu_read_lock, genl_mutex, or RTNL lock. */
@@ -353,8 +355,8 @@ static struct genl_family dp_packet_genl_family = {
 int dp_upcall(struct datapath *dp, struct sk_buff *skb,
 	      const struct dp_upcall_info *upcall_info)
 {
-	struct sk_buff *segs = NULL;
 	struct dp_stats_percpu *stats;
+	int dp_ifindex;
 	int err;
 
 	if (upcall_info->pid == 0) {
@@ -362,30 +364,18 @@ int dp_upcall(struct datapath *dp, struct sk_buff *skb,
 		goto err;
 	}
 
-	forward_ip_summed(skb, true);
-
-	/* Break apart GSO packets into their component pieces.  Otherwise
-	 * userspace may try to stuff a 64kB packet into a 1500-byte MTU. */
-	if (skb_is_gso(skb)) {
-		segs = skb_gso_segment(skb, NETIF_F_SG | NETIF_F_HW_CSUM);
-		
-		if (IS_ERR(segs)) {
-			err = PTR_ERR(segs);
+	dp_ifindex = get_dpifindex(dp);
+	if (!dp_ifindex) {
+		err = -ENODEV;
 			goto err;
 		}
-		skb = segs;
-	}
 
-	err = queue_userspace_packets(dp, skb, upcall_info);
-	if (segs) {
-		struct sk_buff *next;
-		/* Free GSO-segments */
-		do {
-			next = segs->next;
-			kfree_skb(segs);
-		} while ((segs = next) != NULL);
-	}
+	forward_ip_summed(skb, true);
 
+	if (!skb_is_gso(skb))
+		err = queue_userspace_packet(dp_ifindex, skb, upcall_info);
+	else
+		err = queue_gso_packets(dp_ifindex, skb, upcall_info);
 	if (err)
 		goto err;
 
@@ -401,20 +391,51 @@ err:
 	return err;
 }
 
-/* Send each packet in the 'skb' list to userspace for 'dp' as directed by
- * 'upcall_info'.  There will be only one packet unless we broke up a GSO
- * packet.
- */
-static int queue_userspace_packets(struct datapath *dp, struct sk_buff *skb,
+static int queue_gso_packets(int dp_ifindex, struct sk_buff *skb,
 				   const struct dp_upcall_info *upcall_info)
 {
-	int dp_ifindex;
+	struct dp_upcall_info later_info;
+	struct sw_flow_key later_key;
+	struct sk_buff *segs, *nskb;
+	int err;
 
-	dp_ifindex = get_dpifindex(dp);
-	if (!dp_ifindex)
-		return -ENODEV;
+	segs = skb_gso_segment(skb, NETIF_F_SG | NETIF_F_HW_CSUM);
+	if (IS_ERR(segs))
+		return PTR_ERR(segs);
+
+	/* Queue all of the segments. */
+	skb = segs;
+	do {
+		err = queue_userspace_packet(dp_ifindex, skb, upcall_info);
+		if (err)
+			break;
 
+		if (skb == segs && skb_shinfo(skb)->gso_type & SKB_GSO_UDP) {
+			later_key = *upcall_info->key;
+			later_key.ip.tos_frag &= ~OVS_FRAG_TYPE_MASK;
+			later_key.ip.tos_frag |= OVS_FRAG_TYPE_LATER;
+
+			later_info = *upcall_info;
+			later_info.key = &later_key;
+			upcall_info = &later_info;
+		}
+	} while ((skb = skb->next));
+
+	/* Free all of the segments. */
+	skb = segs;
 	do {
+		nskb = skb->next;
+		if (err)
+			kfree_skb(skb);
+		else
+			consume_skb(skb);
+	} while ((skb = nskb));
+	return err;
+}
+
+static int queue_userspace_packet(int dp_ifindex, struct sk_buff *skb,
+				  const struct dp_upcall_info *upcall_info)
+{
 		struct ovs_header *upcall;
 		struct sk_buff *user_skb; /* to be queued to userspace */
 		struct nlattr *nla;
@@ -456,13 +477,7 @@ static int queue_userspace_packets(struct datapath *dp, struct sk_buff *skb,
 		else
 			skb_copy_bits(skb, 0, nla_data(nla), skb->len);
 
-		err = genlmsg_unicast(&init_net, user_skb, upcall_info->pid);
-		if (err)
-			return err;
-
-	} while ((skb = skb->next));
-
-	return 0;
+	return genlmsg_unicast(&init_net, user_skb, upcall_info->pid);
 }
 
 /* Called with genl_mutex. */
