[ovs-dev] [PATCH v2] datapath: Remove tunnel header caching.

Pravin B Shelar pshelar at nicira.com
Thu Oct 25 18:14:38 UTC 2012


Changes from v1 to v2:
	- Fixed the capwap fragment case.
	- Simplified tnl_send.
--8<--------------------------cut here-------------------------->8--

Tunnel header caching was added to reduce CPU utilization on the TX path
by caching the packet header, so the performance gain is directly
proportional to the number of skbs transferred. But with the help of
offloads, skbs are getting larger, so there are fewer skbs. Therefore
header caching does not show the gains we have seen in the past. And now
kernel 3.6 has removed dst caching from networking, which makes header
caching even more tricky. So this commit removes header caching from OVS
tunnelling.

Signed-off-by: Pravin B Shelar <pshelar at nicira.com>
---
 datapath/flow.c              |   15 +-
 datapath/flow.h              |    4 -
 datapath/tunnel.c            |  462 +++---------------------------------------
 datapath/tunnel.h            |  100 +--------
 datapath/vport-capwap.c      |   40 +---
 datapath/vport-gre.c         |   59 +-----
 include/openvswitch/tunnel.h |    1 -
 lib/netdev-vport.c           |   11 +-
 vswitchd/vswitch.xml         |   10 -
 9 files changed, 56 insertions(+), 646 deletions(-)

diff --git a/datapath/flow.c b/datapath/flow.c
index c70daee..8b3f21b 100644
--- a/datapath/flow.c
+++ b/datapath/flow.c
@@ -226,9 +226,7 @@ struct sw_flow *ovs_flow_alloc(void)
 		return ERR_PTR(-ENOMEM);
 
 	spin_lock_init(&flow->lock);
-	atomic_set(&flow->refcnt, 1);
 	flow->sf_acts = NULL;
-	flow->dead = false;
 
 	return flow;
 }
@@ -292,7 +290,6 @@ struct flow_table *ovs_flow_tbl_alloc(int new_size)
 
 static void flow_free(struct sw_flow *flow)
 {
-	flow->dead = true;
 	ovs_flow_put(flow);
 }
 
@@ -423,7 +420,6 @@ static void rcu_free_flow_callback(struct rcu_head *rcu)
 {
 	struct sw_flow *flow = container_of(rcu, struct sw_flow, rcu);
 
-	flow->dead = true;
 	ovs_flow_put(flow);
 }
 
@@ -434,20 +430,13 @@ void ovs_flow_deferred_free(struct sw_flow *flow)
 	call_rcu(&flow->rcu, rcu_free_flow_callback);
 }
 
-void ovs_flow_hold(struct sw_flow *flow)
-{
-	atomic_inc(&flow->refcnt);
-}
-
 void ovs_flow_put(struct sw_flow *flow)
 {
 	if (unlikely(!flow))
 		return;
 
-	if (atomic_dec_and_test(&flow->refcnt)) {
-		kfree((struct sf_flow_acts __force *)flow->sf_acts);
-		kmem_cache_free(flow_cache, flow);
-	}
+	kfree((struct sf_flow_acts __force *)flow->sf_acts);
+	kmem_cache_free(flow_cache, flow);
 }
 
 /* RCU callback used by ovs_flow_deferred_free_acts. */
diff --git a/datapath/flow.h b/datapath/flow.h
index f4ef285..373df67 100644
--- a/datapath/flow.h
+++ b/datapath/flow.h
@@ -104,9 +104,6 @@ struct sw_flow {
 	struct sw_flow_key key;
 	struct sw_flow_actions __rcu *sf_acts;
 
-	atomic_t refcnt;
-	bool dead;
-
 	spinlock_t lock;	/* Lock for values below. */
 	unsigned long used;	/* Last used time (in jiffies). */
 	u64 packet_count;	/* Number of packets matched. */
@@ -137,7 +134,6 @@ void ovs_flow_deferred_free(struct sw_flow *);
 struct sw_flow_actions *ovs_flow_actions_alloc(const struct nlattr *);
 void ovs_flow_deferred_free_acts(struct sw_flow_actions *);
 
-void ovs_flow_hold(struct sw_flow *);
 void ovs_flow_put(struct sw_flow *);
 
 int ovs_flow_extract(struct sk_buff *, u16 in_port, struct sw_flow_key *,
diff --git a/datapath/tunnel.c b/datapath/tunnel.c
index bfb09ce..a5222d1 100644
--- a/datapath/tunnel.c
+++ b/datapath/tunnel.c
@@ -52,43 +52,9 @@
 #include "vport-generic.h"
 #include "vport-internal_dev.h"
 
-#ifdef NEED_CACHE_TIMEOUT
-/*
- * On kernels where we can't quickly detect changes in the rest of the system
- * we use an expiration time to invalidate the cache.  A shorter expiration
- * reduces the length of time that we may potentially blackhole packets while
- * a longer time increases performance by reducing the frequency that the
- * cache needs to be rebuilt.  A variety of factors may cause the cache to be
- * invalidated before the expiration time but this is the maximum.  The time
- * is expressed in jiffies.
- */
-#define MAX_CACHE_EXP HZ
-#endif
-
-/*
- * Interval to check for and remove caches that are no longer valid.  Caches
- * are checked for validity before they are used for packet encapsulation and
- * old caches are removed at that time.  However, if no packets are sent through
- * the tunnel then the cache will never be destroyed.  Since it holds
- * references to a number of system objects, the cache will continue to use
- * system resources by not allowing those objects to be destroyed.  The cache
- * cleaner is periodically run to free invalid caches.  It does not
- * significantly affect system performance.  A lower interval will release
- * resources faster but will itself consume resources by requiring more frequent
- * checks.  A longer interval may result in messages being printed to the kernel
- * message buffer about unreleased resources.  The interval is expressed in
- * jiffies.
- */
-#define CACHE_CLEANER_INTERVAL (5 * HZ)
-
-#define CACHE_DATA_ALIGN 16
 #define PORT_TABLE_SIZE  1024
 
 static struct hlist_head *port_table __read_mostly;
-static int port_table_count;
-
-static void cache_cleaner(struct work_struct *work);
-static DECLARE_DELAYED_WORK(cache_cleaner_wq, cache_cleaner);
 
 /*
  * These are just used as an optimization: they don't require any kind of
@@ -109,60 +75,17 @@ static unsigned int multicast_ports __read_mostly;
 #define rt_dst(rt) (rt->u.dst)
 #endif
 
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,1,0)
-static struct hh_cache *rt_hh(struct rtable *rt)
-{
-	struct neighbour *neigh = dst_get_neighbour_noref(&rt->dst);
-	if (!neigh || !(neigh->nud_state & NUD_CONNECTED) ||
-			!neigh->hh.hh_len)
-		return NULL;
-	return &neigh->hh;
-}
-#else
-#define rt_hh(rt) (rt_dst(rt).hh)
-#endif
-
 static struct vport *tnl_vport_to_vport(const struct tnl_vport *tnl_vport)
 {
 	return vport_from_priv(tnl_vport);
 }
 
-/* This is analogous to rtnl_dereference for the tunnel cache.  It checks that
- * cache_lock is held, so it is only for update side code.
- */
-static struct tnl_cache *cache_dereference(struct tnl_vport *tnl_vport)
-{
-	return rcu_dereference_protected(tnl_vport->cache,
-				 lockdep_is_held(&tnl_vport->cache_lock));
-}
-
-static void schedule_cache_cleaner(void)
-{
-	schedule_delayed_work(&cache_cleaner_wq, CACHE_CLEANER_INTERVAL);
-}
-
-static void free_cache(struct tnl_cache *cache)
-{
-	if (!cache)
-		return;
-
-	ovs_flow_put(cache->flow);
-	ip_rt_put(cache->rt);
-	kfree(cache);
-}
-
 static void free_config_rcu(struct rcu_head *rcu)
 {
 	struct tnl_mutable_config *c = container_of(rcu, struct tnl_mutable_config, rcu);
 	kfree(c);
 }
 
-static void free_cache_rcu(struct rcu_head *rcu)
-{
-	struct tnl_cache *c = container_of(rcu, struct tnl_cache, rcu);
-	free_cache(c);
-}
-
 /* Frees the portion of 'mutable' that requires RTNL and thus can't happen
  * within an RCU callback.  Fortunately this part doesn't require waiting for
  * an RCU grace period.
@@ -191,18 +114,6 @@ static void assign_config_rcu(struct vport *vport,
 	call_rcu(&old_config->rcu, free_config_rcu);
 }
 
-static void assign_cache_rcu(struct vport *vport, struct tnl_cache *new_cache)
-{
-	struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
-	struct tnl_cache *old_cache;
-
-	old_cache = cache_dereference(tnl_vport);
-	rcu_assign_pointer(tnl_vport->cache, new_cache);
-
-	if (old_cache)
-		call_rcu(&old_cache->rcu, free_cache_rcu);
-}
-
 static unsigned int *find_port_pool(const struct tnl_mutable_config *mutable)
 {
 	bool is_multicast = ipv4_is_multicast(mutable->key.daddr);
@@ -242,13 +153,9 @@ static void port_table_add_port(struct vport *vport)
 	const struct tnl_mutable_config *mutable;
 	u32 hash;
 
-	if (port_table_count == 0)
-		schedule_cache_cleaner();
-
 	mutable = rtnl_dereference(tnl_vport->mutable);
 	hash = port_hash(&mutable->key);
 	hlist_add_head_rcu(&tnl_vport->hash_node, find_bucket(hash));
-	port_table_count++;
 
 	(*find_port_pool(rtnl_dereference(tnl_vport->mutable)))++;
 }
@@ -274,10 +181,6 @@ static void port_table_remove_port(struct vport *vport)
 
 	hlist_del_init_rcu(&tnl_vport->hash_node);
 
-	port_table_count--;
-	if (port_table_count == 0)
-		cancel_delayed_work_sync(&cache_cleaner_wq);
-
 	(*find_port_pool(rtnl_dereference(tnl_vport->mutable)))--;
 }
 
@@ -798,227 +701,7 @@ static bool check_mtu(struct sk_buff *skb,
 	return true;
 }
 
-static void create_tunnel_header(const struct vport *vport,
-				 const struct tnl_mutable_config *mutable,
-				 const struct ovs_key_ipv4_tunnel *tun_key,
-				 const struct rtable *rt, void *header)
-{
-	struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
-	struct iphdr *iph = header;
-
-	iph->version	= 4;
-	iph->ihl	= sizeof(struct iphdr) >> 2;
-	iph->frag_off	= htons(IP_DF);
-	iph->protocol	= tnl_vport->tnl_ops->ipproto;
-	iph->tos	= mutable->tos;
-	iph->daddr	= rt->rt_dst;
-	iph->saddr	= rt->rt_src;
-	iph->ttl	= mutable->ttl;
-	if (!iph->ttl)
-		iph->ttl = ip4_dst_hoplimit(&rt_dst(rt));
-
-	tnl_vport->tnl_ops->build_header(vport, mutable, tun_key, iph + 1);
-}
-
-static void *get_cached_header(const struct tnl_cache *cache)
-{
-	return (void *)cache + ALIGN(sizeof(struct tnl_cache), CACHE_DATA_ALIGN);
-}
-
-#ifdef HAVE_RT_GENID
-static inline int rt_genid(struct net *net)
-{
-	return atomic_read(&net->ipv4.rt_genid);
-}
-#endif
-
-static bool check_cache_valid(const struct tnl_cache *cache,
-			      const struct tnl_mutable_config *mutable)
-{
-	struct hh_cache *hh;
-
-	if (!cache)
-		return false;
-
-	hh = rt_hh(cache->rt);
-	return hh &&
-#ifdef NEED_CACHE_TIMEOUT
-		time_before(jiffies, cache->expiration) &&
-#endif
-#ifdef HAVE_RT_GENID
-		rt_genid(dev_net(rt_dst(cache->rt).dev)) == cache->rt->rt_genid &&
-#endif
-#ifdef HAVE_HH_SEQ
-		hh->hh_lock.sequence == cache->hh_seq &&
-#endif
-		mutable->seq == cache->mutable_seq &&
-		(!ovs_is_internal_dev(rt_dst(cache->rt).dev) ||
-		(cache->flow && !cache->flow->dead));
-}
-
-static void __cache_cleaner(struct tnl_vport *tnl_vport)
-{
-	const struct tnl_mutable_config *mutable =
-			rcu_dereference(tnl_vport->mutable);
-	const struct tnl_cache *cache = rcu_dereference(tnl_vport->cache);
-
-	if (cache && !check_cache_valid(cache, mutable) &&
-	    spin_trylock_bh(&tnl_vport->cache_lock)) {
-		assign_cache_rcu(tnl_vport_to_vport(tnl_vport), NULL);
-		spin_unlock_bh(&tnl_vport->cache_lock);
-	}
-}
-
-static void cache_cleaner(struct work_struct *work)
-{
-	int i;
-
-	schedule_cache_cleaner();
-
-	rcu_read_lock();
-	for (i = 0; i < PORT_TABLE_SIZE; i++) {
-		struct hlist_node *n;
-		struct hlist_head *bucket;
-		struct tnl_vport *tnl_vport;
-
-		bucket = &port_table[i];
-		hlist_for_each_entry_rcu(tnl_vport, n, bucket, hash_node)
-			__cache_cleaner(tnl_vport);
-	}
-	rcu_read_unlock();
-}
-
-static void create_eth_hdr(struct tnl_cache *cache, struct hh_cache *hh)
-{
-	void *cache_data = get_cached_header(cache);
-	int hh_off;
-
-#ifdef HAVE_HH_SEQ
-	unsigned hh_seq;
-
-	do {
-		hh_seq = read_seqbegin(&hh->hh_lock);
-		hh_off = HH_DATA_ALIGN(hh->hh_len) - hh->hh_len;
-		memcpy(cache_data, (void *)hh->hh_data + hh_off, hh->hh_len);
-		cache->hh_len = hh->hh_len;
-	} while (read_seqretry(&hh->hh_lock, hh_seq));
-
-	cache->hh_seq = hh_seq;
-#else
-	read_lock(&hh->hh_lock);
-	hh_off = HH_DATA_ALIGN(hh->hh_len) - hh->hh_len;
-	memcpy(cache_data, (void *)hh->hh_data + hh_off, hh->hh_len);
-	cache->hh_len = hh->hh_len;
-	read_unlock(&hh->hh_lock);
-#endif
-}
-
-static struct tnl_cache *build_cache(struct vport *vport,
-				     const struct tnl_mutable_config *mutable,
-				     struct rtable *rt)
-{
-	struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
-	static const struct ovs_key_ipv4_tunnel tun_key;
-	struct tnl_cache *cache;
-	void *cache_data;
-	int cache_len;
-	struct hh_cache *hh;
-	int tunnel_hlen;
-
-	if (!(mutable->flags & TNL_F_HDR_CACHE))
-		return NULL;
-
-	tunnel_hlen = tnl_vport->tnl_ops->hdr_len(mutable, &tun_key);
-	if (tunnel_hlen < 0)
-		return NULL;
-
-	tunnel_hlen += sizeof(struct iphdr);
-
-	/*
-	 * If there is no entry in the ARP cache or if this device does not
-	 * support hard header caching just fall back to the IP stack.
-	 */
-
-	hh = rt_hh(rt);
-	if (!hh)
-		return NULL;
-
-	/*
-	 * If lock is contended fall back to directly building the header.
-	 * We're not going to help performance by sitting here spinning.
-	 */
-	if (!spin_trylock(&tnl_vport->cache_lock))
-		return NULL;
-
-	cache = cache_dereference(tnl_vport);
-	if (check_cache_valid(cache, mutable))
-		goto unlock;
-	else
-		cache = NULL;
-
-	cache_len = LL_RESERVED_SPACE(rt_dst(rt).dev) + tunnel_hlen;
-
-	cache = kzalloc(ALIGN(sizeof(struct tnl_cache), CACHE_DATA_ALIGN) +
-			cache_len, GFP_ATOMIC);
-	if (!cache)
-		goto unlock;
-
-	create_eth_hdr(cache, hh);
-	cache_data = get_cached_header(cache) + cache->hh_len;
-	cache->len = cache->hh_len + tunnel_hlen;
-
-	create_tunnel_header(vport, mutable, &tun_key, rt, cache_data);
-
-	cache->mutable_seq = mutable->seq;
-	cache->rt = rt;
-#ifdef NEED_CACHE_TIMEOUT
-	cache->expiration = jiffies + tnl_vport->cache_exp_interval;
-#endif
-
-	if (ovs_is_internal_dev(rt_dst(rt).dev)) {
-		struct sw_flow_key flow_key;
-		struct vport *dst_vport;
-		struct sk_buff *skb;
-		int err;
-		int flow_key_len;
-		struct sw_flow *flow;
-
-		dst_vport = ovs_internal_dev_get_vport(rt_dst(rt).dev);
-		if (!dst_vport)
-			goto done;
-
-		skb = alloc_skb(cache->len, GFP_ATOMIC);
-		if (!skb)
-			goto done;
-
-		__skb_put(skb, cache->len);
-		memcpy(skb->data, get_cached_header(cache), cache->len);
-
-		err = ovs_flow_extract(skb, dst_vport->port_no, &flow_key,
-				       &flow_key_len);
-
-		consume_skb(skb);
-		if (err)
-			goto done;
-
-		flow = ovs_flow_tbl_lookup(rcu_dereference(dst_vport->dp->table),
-					   &flow_key, flow_key_len);
-		if (flow) {
-			cache->flow = flow;
-			ovs_flow_hold(flow);
-		}
-	}
-
-done:
-	assign_cache_rcu(vport, cache);
-
-unlock:
-	spin_unlock(&tnl_vport->cache_lock);
-
-	return cache;
-}
-
-static struct rtable *__find_route(const struct tnl_mutable_config *mutable,
+static struct rtable *find_route(const struct tnl_mutable_config *mutable,
 				   __be32 saddr, __be32 daddr, u8 ipproto,
 				   u8 tos)
 {
@@ -1047,35 +730,6 @@ static struct rtable *__find_route(const struct tnl_mutable_config *mutable,
 #endif
 }
 
-static struct rtable *find_route(struct vport *vport,
-				 const struct tnl_mutable_config *mutable,
-				 __be32 saddr, __be32 daddr, u8 tos,
-				 struct tnl_cache **cache)
-{
-	struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
-	struct tnl_cache *cur_cache = rcu_dereference(tnl_vport->cache);
-
-	*cache = NULL;
-	tos = RT_TOS(tos);
-
-	if (tos == RT_TOS(mutable->tos) &&
-	    check_cache_valid(cur_cache, mutable)) {
-		*cache = cur_cache;
-		return cur_cache->rt;
-	} else {
-		struct rtable *rt;
-
-		rt = __find_route(mutable, saddr, daddr,
-				  tnl_vport->tnl_ops->ipproto, tos);
-		if (IS_ERR(rt))
-			return NULL;
-		if (likely(tos == RT_TOS(mutable->tos)))
-			*cache = build_cache(vport, mutable, rt);
-
-		return rt;
-	}
-}
-
 static bool need_linearize(const struct sk_buff *skb)
 {
 	int i;
@@ -1197,8 +851,6 @@ int ovs_tnl_send(struct vport *vport, struct sk_buff *skb)
 	const struct tnl_mutable_config *mutable = rcu_dereference(tnl_vport->mutable);
 	enum vport_err_type err = VPORT_E_TX_ERROR;
 	struct rtable *rt;
-	struct dst_entry *unattached_dst = NULL;
-	struct tnl_cache *cache;
 	struct ovs_key_ipv4_tunnel tun_key;
 	int sent_len = 0;
 	int tunnel_hlen;
@@ -1294,11 +946,9 @@ int ovs_tnl_send(struct vport *vport, struct sk_buff *skb)
 	}
 
 	/* Route lookup */
-	rt = find_route(vport, mutable, saddr, daddr, tos, &cache);
-	if (unlikely(!rt))
+	rt = find_route(mutable, saddr, daddr, tnl_vport->tnl_ops->ipproto, tos);
+	if (IS_ERR(rt))
 		goto error_free;
-	if (unlikely(!cache))
-		unattached_dst = &rt_dst(rt);
 
 	/* Reset SKB */
 	nf_reset(skb);
@@ -1308,24 +958,15 @@ int ovs_tnl_send(struct vport *vport, struct sk_buff *skb)
 
 	/* Offloading */
 	skb = handle_offloads(skb, mutable, rt, tunnel_hlen);
-	if (IS_ERR(skb))
-		goto error;
+	if (IS_ERR(skb)) {
+		skb = NULL;
+		goto err_free_rt;
+	}
 
 	/* MTU */
 	if (unlikely(!check_mtu(skb, vport, mutable, rt, &frag_off, tunnel_hlen))) {
 		err = VPORT_E_TX_DROPPED;
-		goto error_free;
-	}
-
-	/*
-	 * If we are over the MTU, allow the IP stack to handle fragmentation.
-	 * Fragmentation is a slow path anyways.
-	 */
-	if (unlikely(skb->len + tunnel_hlen > dst_mtu(&rt_dst(rt)) &&
-		     cache)) {
-		unattached_dst = &rt_dst(rt);
-		dst_hold(unattached_dst);
-		cache = NULL;
+		goto err_free_rt;
 	}
 
 	/* TTL Fixup. */
@@ -1344,67 +985,35 @@ int ovs_tnl_send(struct vport *vport, struct sk_buff *skb)
 		if (unlikely(vlan_deaccel_tag(skb)))
 			goto next;
 
-		if (likely(cache)) {
-			skb_push(skb, cache->len);
-			memcpy(skb->data, get_cached_header(cache), cache->len);
-			skb_reset_mac_header(skb);
-			skb_set_network_header(skb, cache->hh_len);
-
-		} else {
-			skb_push(skb, tunnel_hlen);
-			create_tunnel_header(vport, mutable, OVS_CB(skb)->tun_key, rt, skb->data);
-			skb_reset_network_header(skb);
-
-			if (next_skb)
-				skb_dst_set(skb, dst_clone(unattached_dst));
-			else {
-				skb_dst_set(skb, unattached_dst);
-				unattached_dst = NULL;
-			}
-		}
+		skb_push(skb, tunnel_hlen);
+		skb_reset_network_header(skb);
 		skb_set_transport_header(skb, skb_network_offset(skb) + sizeof(struct iphdr));
 
+		if (next_skb)
+			skb_dst_set(skb, dst_clone(&rt_dst(rt)));
+		else
+			skb_dst_set(skb, &rt_dst(rt));
+
+		/* Push IP header. */
 		iph = ip_hdr(skb);
-		iph->tos = tos;
-		iph->ttl = ttl;
-		iph->frag_off = frag_off;
+		iph->version	= 4;
+		iph->ihl	= sizeof(struct iphdr) >> 2;
+		iph->frag_off	= htons(IP_DF);
+		iph->protocol	= tnl_vport->tnl_ops->ipproto;
+		iph->daddr	= rt->rt_dst;
+		iph->saddr	= rt->rt_src;
+		iph->tos	= tos;
+		iph->ttl	= ttl;
+		iph->frag_off	= frag_off;
 		ip_select_ident(iph, &rt_dst(rt), NULL);
 
+		/* Push Tunnel header. */
 		skb = tnl_vport->tnl_ops->update_header(vport, mutable,
 							&rt_dst(rt), skb, tunnel_hlen);
 		if (unlikely(!skb))
 			goto next;
 
-		if (likely(cache)) {
-			int orig_len = skb->len - cache->len;
-			struct vport *cache_vport;
-
-			cache_vport = ovs_internal_dev_get_vport(rt_dst(rt).dev);
-			skb->protocol = htons(ETH_P_IP);
-			iph = ip_hdr(skb);
-			iph->tot_len = htons(skb->len - skb_network_offset(skb));
-			ip_send_check(iph);
-
-			if (cache_vport) {
-				if (unlikely(compute_ip_summed(skb, true))) {
-					kfree_skb(skb);
-					goto next;
-				}
-
-				OVS_CB(skb)->flow = cache->flow;
-				ovs_vport_receive(cache_vport, skb);
-				sent_len += orig_len;
-			} else {
-				int xmit_err;
-
-				skb->dev = rt_dst(rt).dev;
-				xmit_err = dev_queue_xmit(skb);
-
-				if (likely(net_xmit_eval(xmit_err) == 0))
-					sent_len += orig_len;
-			}
-		} else
-			sent_len += send_frags(skb, tunnel_hlen);
+		sent_len += send_frags(skb, tunnel_hlen);
 
 next:
 		skb = next_skb;
@@ -1413,14 +1022,13 @@ next:
 	if (unlikely(sent_len == 0))
 		ovs_vport_record_error(vport, VPORT_E_TX_DROPPED);
 
-	goto out;
+	return sent_len;
 
+err_free_rt:
+	ip_rt_put(rt);
 error_free:
 	ovs_tnl_free_linked_skbs(skb);
-error:
 	ovs_vport_record_error(vport, err);
-out:
-	dst_release(unattached_dst);
 	return sent_len;
 }
 
@@ -1495,7 +1103,7 @@ static int tnl_set_config(struct net *net, struct nlattr *options,
 		struct net_device *dev;
 		struct rtable *rt;
 
-		rt = __find_route(mutable, mutable->key.saddr, mutable->key.daddr,
+		rt = find_route(mutable, mutable->key.saddr, mutable->key.daddr,
 				  tnl_ops->ipproto, mutable->tos);
 		if (IS_ERR(rt))
 			return -EADDRNOTAVAIL;
@@ -1552,13 +1160,6 @@ struct vport *ovs_tnl_create(const struct vport_parms *parms,
 	if (err)
 		goto error_free_mutable;
 
-	spin_lock_init(&tnl_vport->cache_lock);
-
-#ifdef NEED_CACHE_TIMEOUT
-	tnl_vport->cache_exp_interval = MAX_CACHE_EXP -
-				       (net_random() % (MAX_CACHE_EXP / 2));
-#endif
-
 	rcu_assign_pointer(tnl_vport->mutable, mutable);
 
 	port_table_add_port(vport);
@@ -1649,7 +1250,6 @@ static void free_port_rcu(struct rcu_head *rcu)
 	struct tnl_vport *tnl_vport = container_of(rcu,
 						   struct tnl_vport, rcu);
 
-	free_cache((struct tnl_cache __force *)tnl_vport->cache);
 	kfree((struct tnl_mutable __force *)tnl_vport->mutable);
 	ovs_vport_free(tnl_vport_to_vport(tnl_vport));
 }
diff --git a/datapath/tunnel.h b/datapath/tunnel.h
index 951a6f1..f296ecb 100644
--- a/datapath/tunnel.h
+++ b/datapath/tunnel.h
@@ -56,7 +56,7 @@
 /* All public tunnel flags. */
 #define TNL_F_PUBLIC (TNL_F_CSUM | TNL_F_TOS_INHERIT | TNL_F_TTL_INHERIT | \
 		      TNL_F_DF_INHERIT | TNL_F_DF_DEFAULT | TNL_F_PMTUD | \
-		      TNL_F_HDR_CACHE | TNL_F_IPSEC)
+		      TNL_F_IPSEC)
 
 /**
  * struct port_lookup_key - Tunnel port key, used as hash table key.
@@ -134,19 +134,7 @@ struct tnl_ops {
 		       const struct ovs_key_ipv4_tunnel *);
 
 	/*
-	 * Builds the static portion of the tunnel header, which is stored in
-	 * the header cache.  In general the performance of this function is
-	 * not too important as we try to only call it when building the cache
-	 * so it is preferable to shift as much work as possible here.  However,
-	 * in some circumstances caching is disabled and this function will be
-	 * called for every packet, so try not to make it too slow.
-	 */
-	void (*build_header)(const struct vport *,
-			     const struct tnl_mutable_config *,
-			     const struct ovs_key_ipv4_tunnel *, void *header);
-
-	/*
-	 * Updates the cached header of a packet to match the actual packet
+	 * Updates the header of a packet to match the actual packet
 	 * data.  Typical things that might need to be updated are length,
 	 * checksum, etc.  The IP header will have already been updated and this
 	 * is the final step before transmission.  Returns a linked list of
@@ -159,77 +147,6 @@ struct tnl_ops {
 					 int tunnel_hlen);
 };
 
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
-/*
- * On these kernels we have a fast mechanism to tell if the ARP cache for a
- * particular destination has changed.
- */
-#define HAVE_HH_SEQ
-#endif
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,27)
-/*
- * On these kernels we have a fast mechanism to tell if the routing table
- * has changed.
- */
-#define HAVE_RT_GENID
-#endif
-#if !defined(HAVE_HH_SEQ) || !defined(HAVE_RT_GENID)
-/* If we can't detect all system changes directly we need to use a timeout. */
-#define NEED_CACHE_TIMEOUT
-#endif
-struct tnl_cache {
-	struct rcu_head rcu;
-
-	int len;		/* Length of data to be memcpy'd from cache. */
-	int hh_len;		/* Hardware hdr length, cached from hh_cache. */
-
-	/* Sequence number of mutable->seq from which this cache was
-	 * generated. */
-	unsigned mutable_seq;
-
-#ifdef HAVE_HH_SEQ
-	/*
-	 * The sequence number from the seqlock protecting the hardware header
-	 * cache (in the ARP cache).  Since every write increments the counter
-	 * this gives us an easy way to tell if it has changed.
-	 */
-	unsigned hh_seq;
-#endif
-
-#ifdef NEED_CACHE_TIMEOUT
-	/*
-	 * If we don't have direct mechanisms to detect all important changes in
-	 * the system fall back to an expiration time.  This expiration time
-	 * can be relatively short since at high rates there will be millions of
-	 * packets per second, so we'll still get plenty of benefit from the
-	 * cache.  Note that if something changes we may blackhole packets
-	 * until the expiration time (depending on what changed and the kernel
-	 * version we may be able to detect the change sooner).  Expiration is
-	 * expressed as a time in jiffies.
-	 */
-	unsigned long expiration;
-#endif
-
-	/*
-	 * The routing table entry that is the result of looking up the tunnel
-	 * endpoints.  It also contains a sequence number (called a generation
-	 * ID) that can be compared to a global sequence to tell if the routing
-	 * table has changed (and therefore there is a potential that this
-	 * cached route has been invalidated).
-	 */
-	struct rtable *rt;
-
-	/*
-	 * If the output device for tunnel traffic is an OVS internal device,
-	 * the flow of that datapath.  Since all tunnel traffic will have the
-	 * same headers this allows us to cache the flow lookup.  NULL if the
-	 * output device is not OVS or if there is no flow installed.
-	 */
-	struct sw_flow *flow;
-
-	/* The cached header follows after padding for alignment. */
-};
-
 struct tnl_vport {
 	struct rcu_head rcu;
 	struct hlist_node hash_node;
@@ -245,19 +162,6 @@ struct tnl_vport {
 	 * this is not needed.
 	 */
 	atomic_t frag_id;
-
-	spinlock_t cache_lock;
-	struct tnl_cache __rcu *cache;	/* Protected by RCU/cache_lock. */
-
-#ifdef NEED_CACHE_TIMEOUT
-	/*
-	 * If we must rely on expiration time to invalidate the cache, this is
-	 * the interval.  It is randomized within a range (defined by
-	 * MAX_CACHE_EXP in tunnel.c) to avoid synchronized expirations caused
-	 * by creation of a large number of tunnels at a one time.
-	 */
-	unsigned long cache_exp_interval;
-#endif
 };
 
 struct vport *ovs_tnl_create(const struct vport_parms *, const struct vport_ops *,
diff --git a/datapath/vport-capwap.c b/datapath/vport-capwap.c
index 39aec42..072bc22 100644
--- a/datapath/vport-capwap.c
+++ b/datapath/vport-capwap.c
@@ -199,12 +199,14 @@ static int capwap_hdr_len(const struct tnl_mutable_config *mutable,
 	return size;
 }
 
-static void capwap_build_header(const struct vport *vport,
-				const struct tnl_mutable_config *mutable,
-				const struct ovs_key_ipv4_tunnel *tun_key,
-				void *header)
+static struct sk_buff *capwap_update_header(const struct vport *vport,
+					    const struct tnl_mutable_config *mutable,
+					    struct dst_entry *dst,
+					    struct sk_buff *skb,
+					    int tunnel_hlen)
 {
-	struct udphdr *udph = header;
+	struct ovs_key_ipv4_tunnel *tun_key = OVS_CB(skb)->tun_key;
+	struct udphdr *udph = udp_hdr(skb);
 	struct capwaphdr *cwh = (struct capwaphdr *)(udph + 1);
 	u32 flags;
 	__be64 out_key;
@@ -218,7 +220,8 @@ static void capwap_build_header(const struct vport *vport,
 	cwh->frag_id = 0;
 	cwh->frag_off = 0;
 
-	if (out_key || (flags & TNL_F_OUT_KEY_ACTION)) {
+	if (out_key || flags & TNL_F_OUT_KEY_ACTION) {
+		/* first field in WSI is key */
 		struct capwaphdr_wsi *wsi = (struct capwaphdr_wsi *)(cwh + 1);
 
 		cwh->begin = CAPWAP_KEYED;
@@ -237,30 +240,6 @@ static void capwap_build_header(const struct vport *vport,
 		/* make packet readable by old capwap code */
 		cwh->begin = CAPWAP_NO_WSI;
 	}
-}
-
-static struct sk_buff *capwap_update_header(const struct vport *vport,
-					    const struct tnl_mutable_config *mutable,
-					    struct dst_entry *dst,
-					    struct sk_buff *skb,
-					    int tunnel_hlen)
-{
-	const struct ovs_key_ipv4_tunnel *tun_key = OVS_CB(skb)->tun_key;
-	struct udphdr *udph = udp_hdr(skb);
-	u32 flags;
-	__be64 out_key;
-
-	get_capwap_param(mutable, tun_key, &flags, &out_key);
-
-	if (flags & TNL_F_OUT_KEY_ACTION) {
-		/* first field in WSI is key */
-		struct capwaphdr *cwh = (struct capwaphdr *)(udph + 1);
-		struct capwaphdr_wsi *wsi = (struct capwaphdr_wsi *)(cwh + 1);
-		struct capwaphdr_wsi_key *opt = (struct capwaphdr_wsi_key *)(wsi + 1);
-
-		opt->key = out_key;
-	}
-
 	udph->len = htons(skb->len - skb_transport_offset(skb));
 
 	if (unlikely(skb->len - skb_network_offset(skb) > dst_mtu(dst))) {
@@ -396,7 +375,6 @@ static const struct tnl_ops capwap_tnl_ops = {
 	.tunnel_type	= TNL_T_PROTO_CAPWAP,
 	.ipproto	= IPPROTO_UDP,
 	.hdr_len	= capwap_hdr_len,
-	.build_header	= capwap_build_header,
 	.update_header	= capwap_update_header,
 };
 
diff --git a/datapath/vport-gre.c b/datapath/vport-gre.c
index d02d4ec..3a877f3 100644
--- a/datapath/vport-gre.c
+++ b/datapath/vport-gre.c
@@ -115,45 +115,6 @@ static __be32 be64_get_high32(__be64 x)
 #endif
 }
 
-static void gre_build_header(const struct vport *vport,
-			     const struct tnl_mutable_config *mutable,
-			     const struct ovs_key_ipv4_tunnel *tun_key,
-			     void *header)
-{
-	struct gre_base_hdr *greh = header;
-	__be32 *options = (__be32 *)(greh + 1);
-	u32 flags;
-	u32 tunnel_type;
-	__be64 out_key;
-
-	get_gre_param(mutable, tun_key, &flags, &tunnel_type, &out_key);
-
-	greh->protocol = htons(ETH_P_TEB);
-	greh->flags = 0;
-
-	if (flags & TNL_F_CSUM) {
-		greh->flags |= GRE_CSUM;
-		*options = 0;
-		options++;
-	}
-
-	if (flags & TNL_F_OUT_KEY_ACTION) {
-		greh->flags |= GRE_KEY;
-		if (tunnel_type & TNL_T_PROTO_GRE64)
-			greh->flags |= GRE_SEQ;
-
-	} else if (out_key ||
-		   tunnel_type & TNL_T_PROTO_GRE64) {
-		greh->flags |= GRE_KEY;
-		*options = be64_get_low32(out_key);
-		if (tunnel_type & TNL_T_PROTO_GRE64) {
-			options++;
-			*options = be64_get_high32(out_key);
-			greh->flags |= GRE_SEQ;
-		}
-	}
-}
-
 static struct sk_buff *gre_update_header(const struct vport *vport,
 					 const struct tnl_mutable_config *mutable,
 					 struct dst_entry *dst,
@@ -166,29 +127,34 @@ static struct sk_buff *gre_update_header(const struct vport *vport,
 	const struct ovs_key_ipv4_tunnel *tun_key = OVS_CB(skb)->tun_key;
 	__be32 *options = (__be32 *)(skb_network_header(skb) + tunnel_hlen
 					       - GRE_HEADER_SECTION);
+	struct gre_base_hdr *greh = (struct gre_base_hdr *) skb_transport_header(skb);
 
 	get_gre_param(mutable, tun_key, &flags, &tunnel_type, &out_key);
 
+	greh->protocol = htons(ETH_P_TEB);
+	greh->flags = 0;
+
 	/* Work backwards over the options so the checksum is last. */
-	if (flags & TNL_F_OUT_KEY_ACTION) {
+	if (out_key || flags & TNL_F_OUT_KEY_ACTION || tunnel_type & TNL_T_PROTO_GRE64) {
+		greh->flags |= GRE_KEY;
 		if (tunnel_type & TNL_T_PROTO_GRE64) {
 			/* Set higher 32 bits to seq. */
 			*options = be64_get_high32(out_key);
 			options--;
+			greh->flags |= GRE_SEQ;
 		}
 		*options = be64_get_low32(out_key);
 		options--;
-	} else if (out_key || tunnel_type & TNL_T_PROTO_GRE64) {
-		options--;
-		if (tunnel_type & TNL_T_PROTO_GRE64)
-			options--;
 	}
 
-	if (flags & TNL_F_CSUM)
+	if (flags & TNL_F_CSUM) {
+		greh->flags |= GRE_CSUM;
+		*options = 0;
 		*(__sum16 *)options = csum_fold(skb_checksum(skb,
 						skb_transport_offset(skb),
 						skb->len - skb_transport_offset(skb),
 						0));
+	}
 	/*
 	 * Allow our local IP stack to fragment the outer packet even if the
 	 * DF bit is set as a last resort.  We also need to force selection of
@@ -196,7 +162,6 @@ static struct sk_buff *gre_update_header(const struct vport *vport,
 	 * packet originally had DF set.
 	 */
 	skb->local_df = 1;
-	__ip_select_ident(ip_hdr(skb), dst, 0);
 
 	return skb;
 }
@@ -479,7 +444,6 @@ static const struct tnl_ops gre_tnl_ops = {
 	.tunnel_type	= TNL_T_PROTO_GRE,
 	.ipproto	= IPPROTO_GRE,
 	.hdr_len	= gre_hdr_len,
-	.build_header	= gre_build_header,
 	.update_header	= gre_update_header,
 };
 
@@ -492,7 +456,6 @@ static const struct tnl_ops gre64_tnl_ops = {
 	.tunnel_type	= TNL_T_PROTO_GRE64,
 	.ipproto	= IPPROTO_GRE,
 	.hdr_len	= gre_hdr_len,
-	.build_header	= gre_build_header,
 	.update_header	= gre_update_header,
 };
 
diff --git a/include/openvswitch/tunnel.h b/include/openvswitch/tunnel.h
index c494791..0938f70 100644
--- a/include/openvswitch/tunnel.h
+++ b/include/openvswitch/tunnel.h
@@ -69,7 +69,6 @@ enum {
 #define TNL_F_DF_DEFAULT	(1 << 4) /* Set DF bit if inherit off or
 					  * not IP. */
 #define TNL_F_PMTUD		(1 << 5) /* Enable path MTU discovery. */
-#define TNL_F_HDR_CACHE		(1 << 6) /* Enable tunnel header caching. */
 #define TNL_F_IPSEC		(1 << 7) /* Traffic is IPsec encrypted. */
 
 #endif /* openvswitch/tunnel.h */
diff --git a/lib/netdev-vport.c b/lib/netdev-vport.c
index 621abd1..ca68756 100644
--- a/lib/netdev-vport.c
+++ b/lib/netdev-vport.c
@@ -590,14 +590,13 @@ parse_tunnel_config(const char *name, const char *type,
     ovs_be32 saddr = htonl(0);
     uint32_t flags;
 
-    flags = TNL_F_DF_DEFAULT | TNL_F_PMTUD | TNL_F_HDR_CACHE;
+    flags = TNL_F_DF_DEFAULT | TNL_F_PMTUD;
     if (!strcmp(type, "gre") || !strcmp(type, "gre64")) {
         is_gre = true;
     } else if (!strcmp(type, "ipsec_gre") || !strcmp(type, "ipsec_gre64")) {
         is_gre = true;
         is_ipsec = true;
         flags |= TNL_F_IPSEC;
-        flags &= ~TNL_F_HDR_CACHE;
     }
 
     SMAP_FOR_EACH (node, args) {
@@ -650,10 +649,6 @@ parse_tunnel_config(const char *name, const char *type,
             if (!strcmp(node->value, "false")) {
                 flags &= ~TNL_F_PMTUD;
             }
-        } else if (!strcmp(node->key, "header_cache")) {
-            if (!strcmp(node->value, "false")) {
-                flags &= ~TNL_F_HDR_CACHE;
-            }
         } else if (!strcmp(node->key, "peer_cert") && is_ipsec) {
             if (smap_get(args, "certificate")) {
                 ipsec_mech_set = true;
@@ -785,10 +780,6 @@ unparse_tunnel_config(const char *name OVS_UNUSED, const char *type OVS_UNUSED,
     }
 
     flags = nl_attr_get_u32(a[OVS_TUNNEL_ATTR_FLAGS]);
-    if (!(flags & TNL_F_HDR_CACHE) == !(flags & TNL_F_IPSEC)) {
-        smap_add(args, "header_cache",
-                 flags & TNL_F_HDR_CACHE ? "true" : "false");
-    }
 
     daddr = nl_attr_get_be32(a[OVS_TUNNEL_ATTR_DST_IPV4]);
     smap_add_format(args, "remote_ip", IP_FMT, IP_ARGS(&daddr));
diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
index e9ea0c4..14060ca 100644
--- a/vswitchd/vswitch.xml
+++ b/vswitchd/vswitch.xml
@@ -1373,16 +1373,6 @@
         <p>
           Only <code>gre</code> interfaces support these options.
         </p>
-
-        <column name="options" key="header_cache" type='{"type": "boolean"}'>
-          Enable caching of tunnel headers and the output path.  This can lead
-          to a significant performance increase without changing behavior.  In
-          general it should not be necessary to adjust this setting.  However,
-          the caching can bypass certain components of the IP stack (such as
-          <code>iptables</code>) and it may be useful to disable it if these
-          features are required or as a debugging measure.  Default is enabled,
-          set to <code>false</code> to disable.
-        </column>
       </group>
 
       <group title="Tunnel Options: gre and ipsec_gre only">
-- 
1.7.10




More information about the dev mailing list