[ovs-dev] [PATCH] Implement new fragment handling policy.
Ben Pfaff
blp at nicira.com
Thu Oct 20 05:56:21 UTC 2011
On Wed, Oct 19, 2011 at 06:09:30PM -0700, Jesse Gross wrote:
> > switch (wc->tos_frag_mask & FLOW_FRAG_MASK) {
> > - case FLOW_FRAG_ANY | FLOW_FRAG_FIRST:
> > + case FLOW_FRAG_ANY | FLOW_FRAG_LATER:
> > ds_put_format(s, "frag=%s,",
> > f->tos_frag & FLOW_FRAG_ANY
> > - ? (f->tos_frag & FLOW_FRAG_FIRST ? "first" : "later")
> > - : (f->tos_frag & FLOW_FRAG_FIRST ? "<error>" : "no"));
> > + ? (f->tos_frag & FLOW_FRAG_LATER ? "later" : "first")
> > + : (f->tos_frag & FLOW_FRAG_LATER ? "no" : "<error>"));
>
> I think the last condition is reversed because !FLOW_FRAG_ANY and
> FLOW_FRAG_LATER is still an error.
Oops. Applied:
diff --git a/lib/classifier.c b/lib/classifier.c
index 0335f13..869029f 100644
--- a/lib/classifier.c
+++ b/lib/classifier.c
@@ -618,7 +618,7 @@ cls_rule_format(const struct cls_rule *rule, struct ds *s)
ds_put_format(s, "frag=%s,",
f->tos_frag & FLOW_FRAG_ANY
? (f->tos_frag & FLOW_FRAG_LATER ? "later" : "first")
- : (f->tos_frag & FLOW_FRAG_LATER ? "no" : "<error>"));
+ : (f->tos_frag & FLOW_FRAG_LATER ? "<error>" : "no"));
break;
case FLOW_FRAG_ANY:
> > diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c
> > index 58fc18e..536a213 100644
> > --- a/ofproto/ofproto-dpif.c
> > +++ b/ofproto/ofproto-dpif.c
> > @@ -2954,7 +2954,7 @@ rule_dpif_lookup(struct ofproto_dpif *ofproto, const struct flow *flow,
> > }
> >
> > cls = &ofproto->up.tables[table_id];
> > - if (flow->tos_frag & FLOW_FRAG_FIRST
> > + if ((flow->tos_frag & FLOW_FRAG_MASK) == FLOW_FRAG_ANY
>
> Is there a reason for this to not be (flow->tos_frag & FLOW_FRAG_ANY)?
> I realize that it does the masking out for last frags as well, which
> is not strictly necessary although I'm not sure that it matters all
> that much.
Over-optimization: I wanted to avoid the "struct flow" copy in that
case.
I changed it:
--- a/ofproto/ofproto-dpif.c
+++ b/ofproto/ofproto-dpif.c
@@ -2954,7 +2954,7 @@ rule_dpif_lookup(struct ofproto_dpif *ofproto, const struct flow *flow,
}
cls = &ofproto->up.tables[table_id];
- if ((flow->tos_frag & FLOW_FRAG_MASK) == FLOW_FRAG_ANY
+ if (flow->tos_frag & FLOW_FRAG_ANY
&& ofproto->up.frag_handling == OFPC_FRAG_NORMAL) {
/* For OFPC_NORMAL frag_handling, we must pretend that transport ports
* are unavailable. */
> Otherwise, the incremental looks good. However, I realized that there
> is one more issue: when we pass up the flow key for UDP GSO packets,
> they will all reflect the first fragment and not the later packets.
Hmm, good point.
Here's a build-tested-only try at this. It needs testing and comments,
at least. Is it on the right track?
(I've appended two copies. The first one is a full diff, the second one
uses "git diff -b" to omit changes that reindented lines.)
--8<--------------------------cut here-------------------------->8--
diff --git a/datapath/datapath.c b/datapath/datapath.c
index 2f8027c..2815bcc 100644
--- a/datapath/datapath.c
+++ b/datapath/datapath.c
@@ -84,8 +84,10 @@ EXPORT_SYMBOL(dp_ioctl_hook);
static LIST_HEAD(dps);
static struct vport *new_vport(const struct vport_parms *);
-static int queue_userspace_packets(struct datapath *, struct sk_buff *,
- const struct dp_upcall_info *);
+static int queue_gso_packets(int dp_ifindex, struct sk_buff *,
+ const struct dp_upcall_info *);
+static int queue_userspace_packet(int dp_ifindex, struct sk_buff *,
+ const struct dp_upcall_info *);
/* Must be called with rcu_read_lock, genl_mutex, or RTNL lock. */
struct datapath *get_dp(int dp_ifindex)
@@ -353,8 +355,8 @@ static struct genl_family dp_packet_genl_family = {
int dp_upcall(struct datapath *dp, struct sk_buff *skb,
const struct dp_upcall_info *upcall_info)
{
- struct sk_buff *segs = NULL;
struct dp_stats_percpu *stats;
+ int dp_ifindex;
int err;
if (upcall_info->pid == 0) {
@@ -362,30 +364,18 @@ int dp_upcall(struct datapath *dp, struct sk_buff *skb,
goto err;
}
- forward_ip_summed(skb, true);
-
- /* Break apart GSO packets into their component pieces. Otherwise
- * userspace may try to stuff a 64kB packet into a 1500-byte MTU. */
- if (skb_is_gso(skb)) {
- segs = skb_gso_segment(skb, NETIF_F_SG | NETIF_F_HW_CSUM);
-
- if (IS_ERR(segs)) {
- err = PTR_ERR(segs);
- goto err;
- }
- skb = segs;
+ dp_ifindex = get_dpifindex(dp);
+ if (!dp_ifindex) {
+ err = -ENODEV;
+ goto err;
}
- err = queue_userspace_packets(dp, skb, upcall_info);
- if (segs) {
- struct sk_buff *next;
- /* Free GSO-segments */
- do {
- next = segs->next;
- kfree_skb(segs);
- } while ((segs = next) != NULL);
- }
+ forward_ip_summed(skb, true);
+ if (!skb_is_gso(skb))
+ err = queue_userspace_packet(dp_ifindex, skb, upcall_info);
+ else
+ err = queue_gso_packets(dp_ifindex, skb, upcall_info);
if (err)
goto err;
@@ -401,68 +391,93 @@ err:
return err;
}
-/* Send each packet in the 'skb' list to userspace for 'dp' as directed by
- * 'upcall_info'. There will be only one packet unless we broke up a GSO
- * packet.
- */
-static int queue_userspace_packets(struct datapath *dp, struct sk_buff *skb,
- const struct dp_upcall_info *upcall_info)
+static int queue_gso_packets(int dp_ifindex, struct sk_buff *skb,
+ const struct dp_upcall_info *upcall_info)
{
- int dp_ifindex;
+ struct dp_upcall_info later_info;
+ struct sw_flow_key later_key;
+ struct sk_buff *segs, *nskb;
+ int err;
- dp_ifindex = get_dpifindex(dp);
- if (!dp_ifindex)
- return -ENODEV;
+ segs = skb_gso_segment(skb, NETIF_F_SG | NETIF_F_HW_CSUM);
+ if (IS_ERR(segs))
+ return PTR_ERR(segs);
+ /* Queue all of the segments. */
+ skb = segs;
do {
- struct ovs_header *upcall;
- struct sk_buff *user_skb; /* to be queued to userspace */
- struct nlattr *nla;
- unsigned int len;
- int err;
-
- err = vlan_deaccel_tag(skb);
- if (unlikely(err))
- return err;
-
- if (nla_attr_size(skb->len) > USHRT_MAX)
- return -EFBIG;
-
- len = sizeof(struct ovs_header);
- len += nla_total_size(skb->len);
- len += nla_total_size(FLOW_BUFSIZE);
- if (upcall_info->cmd == OVS_PACKET_CMD_ACTION)
- len += nla_total_size(8);
-
- user_skb = genlmsg_new(len, GFP_ATOMIC);
- if (!user_skb)
- return -ENOMEM;
-
- upcall = genlmsg_put(user_skb, 0, 0, &dp_packet_genl_family,
- 0, upcall_info->cmd);
- upcall->dp_ifindex = dp_ifindex;
-
- nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_KEY);
- flow_to_nlattrs(upcall_info->key, user_skb);
- nla_nest_end(user_skb, nla);
-
- if (upcall_info->userdata)
- nla_put_u64(user_skb, OVS_PACKET_ATTR_USERDATA,
- nla_get_u64(upcall_info->userdata));
-
- nla = __nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, skb->len);
- if (skb->ip_summed == CHECKSUM_PARTIAL)
- copy_and_csum_skb(skb, nla_data(nla));
- else
- skb_copy_bits(skb, 0, nla_data(nla), skb->len);
-
- err = genlmsg_unicast(&init_net, user_skb, upcall_info->pid);
+ err = queue_userspace_packet(dp_ifindex, skb, upcall_info);
if (err)
- return err;
+ break;
+
+ if (skb == segs && skb_shinfo(skb)->gso_type & SKB_GSO_UDP) {
+ later_key = *upcall_info->key;
+ later_key.ip.tos_frag &= ~OVS_FRAG_TYPE_MASK;
+ later_key.ip.tos_frag |= OVS_FRAG_TYPE_LATER;
+ later_info = *upcall_info;
+ later_info.key = &later_key;
+ upcall_info = &later_info;
+ }
} while ((skb = skb->next));
- return 0;
+ /* Free all of the segments. */
+ skb = segs;
+ do {
+ nskb = skb->next;
+ if (err)
+ kfree_skb(skb);
+ else
+ consume_skb(skb);
+ } while ((skb = nskb));
+ return err;
+}
+
+static int queue_userspace_packet(int dp_ifindex, struct sk_buff *skb,
+ const struct dp_upcall_info *upcall_info)
+{
+ struct ovs_header *upcall;
+ struct sk_buff *user_skb; /* to be queued to userspace */
+ struct nlattr *nla;
+ unsigned int len;
+ int err;
+
+ err = vlan_deaccel_tag(skb);
+ if (unlikely(err))
+ return err;
+
+ if (nla_attr_size(skb->len) > USHRT_MAX)
+ return -EFBIG;
+
+ len = sizeof(struct ovs_header);
+ len += nla_total_size(skb->len);
+ len += nla_total_size(FLOW_BUFSIZE);
+ if (upcall_info->cmd == OVS_PACKET_CMD_ACTION)
+ len += nla_total_size(8);
+
+ user_skb = genlmsg_new(len, GFP_ATOMIC);
+ if (!user_skb)
+ return -ENOMEM;
+
+ upcall = genlmsg_put(user_skb, 0, 0, &dp_packet_genl_family,
+ 0, upcall_info->cmd);
+ upcall->dp_ifindex = dp_ifindex;
+
+ nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_KEY);
+ flow_to_nlattrs(upcall_info->key, user_skb);
+ nla_nest_end(user_skb, nla);
+
+ if (upcall_info->userdata)
+ nla_put_u64(user_skb, OVS_PACKET_ATTR_USERDATA,
+ nla_get_u64(upcall_info->userdata));
+
+ nla = __nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, skb->len);
+ if (skb->ip_summed == CHECKSUM_PARTIAL)
+ copy_and_csum_skb(skb, nla_data(nla));
+ else
+ skb_copy_bits(skb, 0, nla_data(nla), skb->len);
+
+ return genlmsg_unicast(&init_net, user_skb, upcall_info->pid);
}
/* Called with genl_mutex. */
--8<--------------------------cut here-------------------------->8--
diff --git a/datapath/datapath.c b/datapath/datapath.c
index 2f8027c..2815bcc 100644
--- a/datapath/datapath.c
+++ b/datapath/datapath.c
@@ -84,7 +84,9 @@ EXPORT_SYMBOL(dp_ioctl_hook);
static LIST_HEAD(dps);
static struct vport *new_vport(const struct vport_parms *);
-static int queue_userspace_packets(struct datapath *, struct sk_buff *,
+static int queue_gso_packets(int dp_ifindex, struct sk_buff *,
+ const struct dp_upcall_info *);
+static int queue_userspace_packet(int dp_ifindex, struct sk_buff *,
const struct dp_upcall_info *);
/* Must be called with rcu_read_lock, genl_mutex, or RTNL lock. */
@@ -353,8 +355,8 @@ static struct genl_family dp_packet_genl_family = {
int dp_upcall(struct datapath *dp, struct sk_buff *skb,
const struct dp_upcall_info *upcall_info)
{
- struct sk_buff *segs = NULL;
struct dp_stats_percpu *stats;
+ int dp_ifindex;
int err;
if (upcall_info->pid == 0) {
@@ -362,30 +364,18 @@ int dp_upcall(struct datapath *dp, struct sk_buff *skb,
goto err;
}
- forward_ip_summed(skb, true);
-
- /* Break apart GSO packets into their component pieces. Otherwise
- * userspace may try to stuff a 64kB packet into a 1500-byte MTU. */
- if (skb_is_gso(skb)) {
- segs = skb_gso_segment(skb, NETIF_F_SG | NETIF_F_HW_CSUM);
-
- if (IS_ERR(segs)) {
- err = PTR_ERR(segs);
+ dp_ifindex = get_dpifindex(dp);
+ if (!dp_ifindex) {
+ err = -ENODEV;
goto err;
}
- skb = segs;
- }
- err = queue_userspace_packets(dp, skb, upcall_info);
- if (segs) {
- struct sk_buff *next;
- /* Free GSO-segments */
- do {
- next = segs->next;
- kfree_skb(segs);
- } while ((segs = next) != NULL);
- }
+ forward_ip_summed(skb, true);
+ if (!skb_is_gso(skb))
+ err = queue_userspace_packet(dp_ifindex, skb, upcall_info);
+ else
+ err = queue_gso_packets(dp_ifindex, skb, upcall_info);
if (err)
goto err;
@@ -401,20 +391,51 @@ err:
return err;
}
-/* Send each packet in the 'skb' list to userspace for 'dp' as directed by
- * 'upcall_info'. There will be only one packet unless we broke up a GSO
- * packet.
- */
-static int queue_userspace_packets(struct datapath *dp, struct sk_buff *skb,
+static int queue_gso_packets(int dp_ifindex, struct sk_buff *skb,
const struct dp_upcall_info *upcall_info)
{
- int dp_ifindex;
+ struct dp_upcall_info later_info;
+ struct sw_flow_key later_key;
+ struct sk_buff *segs, *nskb;
+ int err;
- dp_ifindex = get_dpifindex(dp);
- if (!dp_ifindex)
- return -ENODEV;
+ segs = skb_gso_segment(skb, NETIF_F_SG | NETIF_F_HW_CSUM);
+ if (IS_ERR(segs))
+ return PTR_ERR(segs);
+
+ /* Queue all of the segments. */
+ skb = segs;
+ do {
+ err = queue_userspace_packet(dp_ifindex, skb, upcall_info);
+ if (err)
+ break;
+ if (skb == segs && skb_shinfo(skb)->gso_type & SKB_GSO_UDP) {
+ later_key = *upcall_info->key;
+ later_key.ip.tos_frag &= ~OVS_FRAG_TYPE_MASK;
+ later_key.ip.tos_frag |= OVS_FRAG_TYPE_LATER;
+
+ later_info = *upcall_info;
+ later_info.key = &later_key;
+ upcall_info = &later_info;
+ }
+ } while ((skb = skb->next));
+
+ /* Free all of the segments. */
+ skb = segs;
do {
+ nskb = skb->next;
+ if (err)
+ kfree_skb(skb);
+ else
+ consume_skb(skb);
+ } while ((skb = nskb));
+ return err;
+}
+
+static int queue_userspace_packet(int dp_ifindex, struct sk_buff *skb,
+ const struct dp_upcall_info *upcall_info)
+{
struct ovs_header *upcall;
struct sk_buff *user_skb; /* to be queued to userspace */
struct nlattr *nla;
@@ -456,13 +477,7 @@ static int queue_userspace_packets(struct datapath *dp, struct sk_buff *skb,
else
skb_copy_bits(skb, 0, nla_data(nla), skb->len);
- err = genlmsg_unicast(&init_net, user_skb, upcall_info->pid);
- if (err)
- return err;
-
- } while ((skb = skb->next));
-
- return 0;
+ return genlmsg_unicast(&init_net, user_skb, upcall_info->pid);
}
/* Called with genl_mutex. */
More information about the dev
mailing list