[ovs-dev] [PATCH 6/8] datapath: Add support for CAPWAP UDP transport.

Ben Pfaff blp at nicira.com
Mon Aug 23 18:21:05 UTC 2010


On Wed, Aug 18, 2010 at 12:08:50AM -0700, Jesse Gross wrote:
> Add support for the transport portion of the CAPWAP protocol as
> an alternative to GRE for L2 over L3 tunneling.  This is not
> full support for the CAPWAP protocol.
> 
> Signed-off-by: Jesse Gross <jesse at nicira.com>

This looks very clean.  Thank you.  Detailed comments follow.

I see that you are checking for whether to build CAPWAP support at
configure time, but I don't understand a reason to do that.  Why not
just do #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,25) in the two
places where it matters (or figure out BUILD_CAPWAP from a header file
at build time)?

> --- a/datapath/tunnel.h
> +++ b/datapath/tunnel.h
> @@ -24,6 +24,7 @@
>  
>  /* One of these goes in your struct tnl_ops and in tnl_find_port(). */
>  #define TNL_T_PROTO_GRE		(1 << 10)
> +#define TNL_T_PROTO_CAPWAP	(1 << 11)

Does it make sense to have separate bits like this, or should it be more
like:
        #define TNL_T_PROTO_MASK        (1 << 10)
        #define TNL_T_PROTO_GRE         (0 << 10)
        #define TNL_T_PROTO_CAPWAP      (1 << 10)

I don't actually know, I'm just asking.

I see that there are many multiline comments that don't follow the style
set out by the Linux CodingStyle:

	/*
	 * This is the preferred style for multi-line
	 * comments in the Linux kernel source code.
	 * Please use it consistently.
	 *
	 * Description:  A column of asterisks on the left side,
	 * with beginning and ending almost-blank lines.
	 */

Beats me why the blank lines on the /* and */ lines, but people actually
complain about it on the linux-kernel list if you leave them out.

> +/* The fragment offset is actually the high 13 bits of the last 16 bit field,
> + * so we would normally need to right shift 3 places.  However, it stores the
> + * offset in 8 byte chunks, which would involve a 3 place left shift.  So we
> + * just mask off the last 3 bits and be done with it. */
> +#define FRAG_OFF_MASK ~0x7

There's not much that can go wrong with that macro definition, since so
few operators have higher precedence, but I'd still feel more
comfortable with (~0x7).  Or even with (~0x7U) to make sure that it
doesn't have a signed type; bitwise operators on negative signed
integers always makes me nervous.

> +struct frag_skb_cb {
> +	u16 offset;
> +};
> +#define FRAG_CB(skb) ((struct frag_skb_cb *)(skb)->cb)

Is there benefit to this being a macro instead of an inline function?
Andrew Morton will nail you for that ;-)

> +static struct socket *rcv_socket;

That's a pretty generic name, although it is static.  capwap_rcv_socket
would make it more likely to be unique.

> +static struct sk_buff *capwap_build_header(struct sk_buff *skb,
> +					   const struct vport *vport,
> +					   const struct tnl_mutable_config *mutable,
> +					   struct dst_entry *dst)
> +{
> +	struct udphdr *udph = udp_hdr(skb);
> +	struct capwap_hdr *cwh = (struct capwap_hdr *)(udph + 1);

The above line of code, or similar, appears a number of times.  Would it
be a good idea to write a capwap_hdr() helper to find the capwap_hdr
given the skb?

> +	udph->source = htons(CAPWAP_SRC_PORT);
> +	udph->dest = htons(CAPWAP_DST_PORT);
> +	udph->len = htons(skb->len - sizeof(struct iphdr));
> +	udph->check = 0;
> +
> +	cwh->begin = NO_FRAG_HDR;
> +	cwh->frag_id = 0;
> +	cwh->frag_off = 0;
> +
> +	if (unlikely(skb->len > dst_mtu(dst)))
> +		skb = fragment(skb, vport, dst);
> +	else
> +		skb->next = NULL;

I'm surprised that the "skb->next = NULL;" is necessary here.  I've lost
the context of this call, though.

> +	return skb;
> +}
> +
> +static inline struct sk_buff *process_capwap_proto(struct sk_buff *skb)
> +{
> +	struct capwap_hdr *cwh = (struct capwap_hdr *)(udp_hdr(skb) + 1);
> +
> +	if (likely(cwh->begin == NO_FRAG_HDR))
> +		return skb;
> +	else if (cwh->begin == FRAG_HDR)
> +		return defrag(skb, false);
> +	else if (cwh->begin == FRAG_LAST_HDR)
> +		return defrag(skb, true);
> +	else {

Is this an odd enough situation that we should log something?

> +		kfree_skb(skb);
> +		return NULL;
> +	}
> +}
> +
> +/* Called with rcu_read_lock and BH disabled. */
> +static int capwap_rcv(struct sock *sk, struct sk_buff *skb)
> +{
> +	struct vport *vport;
> +	const struct tnl_mutable_config *mutable;
> +	struct iphdr *iph;
> +
> +	if (unlikely(!pskb_may_pull(skb, CAPWAP_HLEN + ETH_HLEN)))
> +		goto error;
> +
> +	__skb_pull(skb, CAPWAP_HLEN);
> +	skb_postpull_rcsum(skb, skb_transport_header(skb), CAPWAP_HLEN + ETH_HLEN);

Speaking naively, I am surprised that the final argument to __skb_pull()
is different from the final argument to skb_postpull_rcsum().  The
definition of skb_pull_rcsum() implies that the common case, at least,
is that they should be the same.

> +static struct sk_buff *fragment(struct sk_buff *skb, const struct vport *vport,
> +				struct dst_entry *dst)
> +{
> +	struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
> +	unsigned int hlen = sizeof(struct iphdr) + CAPWAP_HLEN;

May CAPWAP assume that the IP header does not have IP options?

> +	unsigned int headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len;
> +	struct sk_buff *result = NULL, *list_cur = NULL;
> +	unsigned int remaining;
> +	unsigned int offset;
> +	__be16 frag_id;
> +
> +	remaining = skb->len - hlen;
> +	offset = 0;
> +	frag_id = htons(atomic_inc_return(&tnl_vport->frag_id));
> +
> +	while (remaining) {
> +		struct sk_buff *skb2;
> +		int frag_size;
> +		struct iphdr *iph;
> +		struct udphdr *udph;
> +		struct capwap_hdr *cwh;
> +
> +		frag_size = min(hlen + remaining, dst_mtu(dst));
> +		frag_size -= hlen;

Would the two lines above be better written as:
        frag_size = min(remaining, dst_mtu(dst) - hlen);

> +		if (hlen + frag_size > dst_mtu(dst))
> +			frag_size &= FRAG_OFF_MASK;

Can the above "if" condition ever be true?

> +		skb2 = alloc_skb(headroom + hlen + frag_size, GFP_ATOMIC);
> +		if (!skb2)
> +			goto error;
> +
> +		skb_reserve(skb2, headroom);
> +		skb_put(skb2, hlen + frag_size);

I think you can use __skb_put() here.

> +		if (remaining - frag_size)

I would think "remaining > frag_size" would be more natural here.

> +			cwh->begin = FRAG_HDR;
> +		else
> +			cwh->begin = FRAG_LAST_HDR;
> +		cwh->frag_id = frag_id;
> +		cwh->frag_off = htons(offset);
> +
> +		if (result) {
> +			list_cur->next = skb2;
> +			list_cur = skb2;
> +		} else
> +			result = list_cur = skb2;
> +
> +		offset += frag_size;
> +		remaining -= frag_size;
> +	};

Stray ";" here.

> +	goto out;
> +
> +error:
> +	while (result) {
> +		list_cur = result->next;
> +		kfree_skb(result);
> +		result = list_cur;
> +	};

Here too.

> +out:
> +	kfree_skb(skb);
> +	return result;
> +}

> +static struct sk_buff *frag_reasm(struct frag_queue *fq, struct net_device *dev)
> +{
> +	struct sk_buff *head = fq->ifq.fragments;
> +	struct sk_buff *frag;
> +
> +	/* Succeed or fail, we're done with this queue. */
> +	inet_frag_kill(&fq->ifq, &frag_state);
> +
> +	if (fq->ifq.len > 65535)
> +		return NULL;
> +
> +	/* Can't have the head be a clone. */
> +	if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC))
> +		return NULL;
> +
> +	/* We're about to build frag list for this SKB.  If it already has a
> +	 * frag list, alloc a new SKB and put the existing frag list there. */
> +	if (skb_shinfo(head)->frag_list) {

The code in this 'if' block (and in fact in this whole function) looks a
little tricky.  I don't feel competent to review it.  Is it a
cut-and-paste from somewhere else in the kernel?

> +		int i;
> +		int paged_len = 0;
> +
> +		frag = alloc_skb(0, GFP_ATOMIC);
> +		if (!frag)
> +			return NULL;

Do we need to free 'head' in this case?

> +		frag->next = head->next;
> +		head->next = frag;
> +		skb_shinfo(frag)->frag_list = skb_shinfo(head)->frag_list;
> +		skb_shinfo(head)->frag_list = NULL;
> +
> +		for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
> +			paged_len += skb_shinfo(head)->frags[i].size;
> +		frag->len = frag->data_len = head->data_len - paged_len;
> +		head->data_len -= frag->len;
> +		head->len -= frag->len;
> +
> +		frag->ip_summed = head->ip_summed;
> +		atomic_add(frag->truesize, &fq->ifq.net->mem);
> +	}
> +
> +	skb_shinfo(head)->frag_list = head->next;
> +	atomic_sub(head->truesize, &fq->ifq.net->mem);
> +
> +	/* Properly account for data in various packets. */
> +	for (frag = head->next; frag; frag = frag->next) {
> +		head->data_len += frag->len;
> +		head->len += frag->len;
> +
> +		if (head->ip_summed != frag->ip_summed)
> +			head->ip_summed = CHECKSUM_NONE;
> +		else if (head->ip_summed == CHECKSUM_COMPLETE)
> +			head->csum = csum_add(head->csum, frag->csum);
> +
> +		head->truesize += frag->truesize;
> +		atomic_sub(frag->truesize, &fq->ifq.net->mem);
> +	}
> +
> +	head->next = NULL;
> +	head->dev = dev;
> +	head->tstamp = fq->ifq.stamp;
> +	fq->ifq.fragments = NULL;
> +
> +	return head;
> +}
> +
> +static struct sk_buff *frag_queue(struct frag_queue *fq, struct sk_buff *skb,
> +				  u16 offset, bool frag_last)
> +{
> +	struct sk_buff *prev, *next;
> +	struct net_device *dev;
> +	int end;
> +
> +	if (fq->ifq.last_in & INET_FRAG_COMPLETE)
> +		goto error;
> +
> +	if (!skb->len)
> +		goto error;
> +
> +	end = offset + skb->len;
> +
> +	if (frag_last) {
> +		/* Last fragment, shouldn't already have data past our end or
> +		 * have another last fragment. */
> +		if (end < fq->ifq.len || fq->ifq.last_in & INET_FRAG_LAST_IN)
> +			goto error;

Packet duplication could cause duplicate last fragments, should we
handle that?

> +		fq->ifq.last_in |= INET_FRAG_LAST_IN;
> +		fq->ifq.len = end;
> +	} else {
> +		/* Fragments should align to 8 byte chunks. */
> +		if (end & ~FRAG_OFF_MASK)
> +			goto error;
> +
> +		if (end > fq->ifq.len) {
> +			/* Shouldn't have data past the end, if we already
> +			 * have one. */
> +			if (fq->ifq.last_in & INET_FRAG_LAST_IN)
> +				goto error;
> +
> +			fq->ifq.len = end;
> +		}
> +	}
> +
> +	/* Find where we fit in. */
> +	prev = NULL;
> +	for (next = fq->ifq.fragments; next != NULL; next = next->next) {
> +		if (FRAG_CB(next)->offset >= offset)
> +			break;
> +		prev = next;
> +	}
> +
> +	/* Overlapping fragments aren't allowed.  We shouldn't start before
> +	 * the end of the previous fragment. */
> +	if (prev && FRAG_CB(prev)->offset + prev->len > offset)
> +		goto error;
> +
> +	/* We also shouldn't end after the beginning of the next fragment. */
> +	if (next && end > FRAG_CB(next)->offset)
> +		goto error;

Should we allow the case where the fragment is an exact duplicate of a
previously seen fragment, to allow for packet duplication?

> +static struct sk_buff *defrag(struct sk_buff *skb, bool frag_last)
> +{
> +	struct iphdr *iph = ip_hdr(skb);
> +	struct capwap_hdr *cwh = (struct capwap_hdr *)(udp_hdr(skb) + 1);
> +	struct frag_match match;
> +	u16 frag_off;
> +	struct frag_queue *fq;
> +
> +	if (atomic_read(&frag_netns_state.mem) > frag_netns_state.high_thresh)
> +		inet_frag_evictor(&frag_netns_state, &frag_state);
> +
> +	match.daddr = iph->daddr;
> +	match.saddr = iph->saddr;
> +	match.id = cwh->frag_id;
> +	frag_off = ntohs(cwh->frag_off) & FRAG_OFF_MASK;
> +
> +	if ((fq = queue_find(&match))) {

Assignment within a condition is mildly unusual in most kernel code.

> +		spin_lock(&fq->ifq.lock);
> +		skb = frag_queue(fq, skb, frag_off, frag_last);
> +		spin_unlock(&fq->ifq.lock);
> +
> +		inet_frag_put(&fq->ifq, &frag_state);
> +
> +		return skb;
> +	}
> +	kfree_skb(skb);
> +	return NULL;
> +}
> +
> +static void defrag_init(void)
> +{
> +	inet_frags_init(&frag_state);
> +	inet_frags_init_net(&frag_netns_state);
> +}
> +
> +static void defrag_exit(void)
> +{
> +	inet_frags_exit_net(&frag_netns_state, &frag_state);
> +	inet_frags_fini(&frag_state);
> +}
> +
> +static void capwap_frag_init(struct inet_frag_queue *ifq, void *match_)
> +{
> +	struct frag_queue *fq = container_of(ifq, struct frag_queue, ifq);

This container_of() gets repeated a lot, would a helper function be a
good idea?

> +static int capwap_frag_match(struct inet_frag_queue *ifq, void *match_)
> +{
> +	struct frag_queue *fq;
> +	struct frag_match *match = match_;
> +
> +	fq = container_of(ifq, struct frag_queue, ifq);
> +	return !memcmp(&fq->match, match, sizeof(struct frag_match));

struct frag_match has 16 bits of padding at its tail.  Is it safe to do
memcmp() like this, i.e. do you always memset that padding to zeros?

GCC is pretty stupid about optimizing memcmp anyway, in my experience,
so it might be better to just write out the comparisons.

> +}




More information about the dev mailing list