[ovs-dev] [net-next RFC 05/14] route: Per route tunnel metadata with RTA_TUNNEL

Thomas Graf tgraf at suug.ch
Mon Jun 1 14:27:29 UTC 2015


Introduce a new Netlink attribute, RTA_TUNNEL, which allows routes
to carry tunnel transmit metadata and to specify the tunnel endpoint
or tunnel ID on a per-route basis. The route must point to a tunnel
device which understands per-skb tunnel metadata and has been put
into the respective mode.

Signed-off-by: Thomas Graf <tgraf at suug.ch>
---
 include/net/ip_fib.h           |  3 +++
 include/net/ip_tunnels.h       |  1 -
 include/net/route.h            | 10 ++++++++
 include/uapi/linux/rtnetlink.h | 16 ++++++++++++
 net/ipv4/fib_frontend.c        | 57 ++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/fib_semantics.c       | 45 +++++++++++++++++++++++++++++++++
 net/ipv4/route.c               | 30 +++++++++++++++++++++-
 net/openvswitch/vport.h        |  1 +
 8 files changed, 161 insertions(+), 2 deletions(-)

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 54271ed..1cd7cf8 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -22,6 +22,7 @@
 #include <net/fib_rules.h>
 #include <net/inetpeer.h>
 #include <linux/percpu.h>
+#include <net/ip_tunnels.h>
 
 struct fib_config {
 	u8			fc_dst_len;
@@ -44,6 +45,7 @@ struct fib_config {
 	u32			fc_flow;
 	u32			fc_nlflags;
 	struct nl_info		fc_nlinfo;
+	struct ip_tunnel_info	fc_tunnel;
  };
 
 struct fib_info;
@@ -117,6 +119,7 @@ struct fib_info {
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	int			fib_power;
 #endif
+	struct ip_tunnel_info	*fib_tunnel;
 	struct rcu_head		rcu;
 	struct fib_nh		fib_nh[0];
 #define fib_dev		fib_nh[0].nh_dev
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index df8cfd3..b4ab930 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -9,7 +9,6 @@
 #include <net/dsfield.h>
 #include <net/gro_cells.h>
 #include <net/inet_ecn.h>
-#include <net/ip.h>
 #include <net/netns/generic.h>
 #include <net/rtnetlink.h>
 #include <net/flow.h>
diff --git a/include/net/route.h b/include/net/route.h
index 6ede321..dbda603 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -28,6 +28,7 @@
 #include <net/inetpeer.h>
 #include <net/flow.h>
 #include <net/inet_sock.h>
+#include <net/ip_tunnels.h>
 #include <linux/in_route.h>
 #include <linux/rtnetlink.h>
 #include <linux/rcupdate.h>
@@ -66,6 +67,7 @@ struct rtable {
 
 	struct list_head	rt_uncached;
 	struct uncached_list	*rt_uncached_list;
+	struct ip_tunnel_info	*rt_tun_info;
 };
 
 static inline bool rt_is_input_route(const struct rtable *rt)
@@ -198,6 +200,8 @@ struct in_ifaddr;
 void fib_add_ifaddr(struct in_ifaddr *);
 void fib_del_ifaddr(struct in_ifaddr *, struct in_ifaddr *);
 
+int fib_dump_tun_info(struct sk_buff *skb, struct ip_tunnel_info *tun_info);
+
 static inline void ip_rt_put(struct rtable *rt)
 {
 	/* dst_release() accepts a NULL parameter.
@@ -317,9 +321,15 @@ static inline int ip4_dst_hoplimit(const struct dst_entry *dst)
 
 static inline struct ip_tunnel_info *skb_tunnel_info(struct sk_buff *skb)
 {
+	struct rtable *rt;
+
 	if (skb_shinfo(skb)->tun_info)
 		return skb_shinfo(skb)->tun_info;
 
+	/* Else use route metadata; assumes dst is an rtable - TODO confirm */
+	rt = skb_rtable(skb);
+	if (rt)
+		return rt->rt_tun_info;
 	return NULL;
 }
 
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 17fb02f..1f7aa68 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -286,6 +286,21 @@ enum rt_class_t {
 
 /* Routing message attributes */
 
+enum rta_tunnel_t {		/* attributes nested in RTA_TUNNEL */
+	RTA_TUN_UNSPEC,
+	RTA_TUN_ID,		/* u64 tunnel id */
+	RTA_TUN_DST,		/* be32 IPv4 tunnel destination */
+	RTA_TUN_SRC,		/* be32 IPv4 tunnel source */
+	RTA_TUN_TTL,		/* u8 IPv4 TTL */
+	RTA_TUN_TOS,		/* u8 IPv4 TOS */
+	RTA_TUN_SPORT,		/* be16 source port */
+	RTA_TUN_DPORT,		/* be16 destination port */
+	RTA_TUN_FLAGS,		/* u16 tunnel flags */
+	__RTA_TUN_MAX,
+};
+
+#define RTA_TUN_MAX (__RTA_TUN_MAX - 1)
+
 enum rtattr_type_t {
 	RTA_UNSPEC,
 	RTA_DST,
@@ -308,6 +323,7 @@ enum rtattr_type_t {
 	RTA_VIA,
 	RTA_NEWDST,
 	RTA_PREF,
+	RTA_TUNNEL,	/* destination VTEP */
 	__RTA_MAX
 };
 
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 872494e..bfa77a6 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -580,6 +580,57 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 	return -EINVAL;
 }
 
+static const struct nla_policy tunnel_policy[RTA_TUN_MAX + 1] = {
+	[RTA_TUN_ID]		= { .type = NLA_U64 },
+	[RTA_TUN_DST]		= { .type = NLA_U32 },
+	[RTA_TUN_SRC]		= { .type = NLA_U32 },
+	[RTA_TUN_TTL]		= { .type = NLA_U8 },
+	[RTA_TUN_TOS]		= { .type = NLA_U8 },
+	[RTA_TUN_SPORT]		= { .type = NLA_U16 },
+	[RTA_TUN_DPORT]		= { .type = NLA_U16 },
+	[RTA_TUN_FLAGS]		= { .type = NLA_U16 },
+};
+
+/* Parse nested RTA_TUNNEL @attr into cfg->fc_tunnel and mark it TX mode. */
+static int parse_rta_tunnel(struct fib_config *cfg, struct nlattr *attr)
+{
+	struct ip_tunnel_info *info = &cfg->fc_tunnel;
+	struct nlattr *tb[RTA_TUN_MAX + 1];
+	int rc = nla_parse_nested(tb, RTA_TUN_MAX, attr, tunnel_policy);
+
+	if (rc < 0)
+		return rc;
+
+	if (tb[RTA_TUN_ID])
+		info->key.tun_id = nla_get_u64(tb[RTA_TUN_ID]);
+
+	if (tb[RTA_TUN_DST])
+		info->key.ipv4_dst = nla_get_be32(tb[RTA_TUN_DST]);
+
+	if (tb[RTA_TUN_SRC])
+		info->key.ipv4_src = nla_get_be32(tb[RTA_TUN_SRC]);
+
+	if (tb[RTA_TUN_TTL])
+		info->key.ipv4_ttl = nla_get_u8(tb[RTA_TUN_TTL]);
+
+	if (tb[RTA_TUN_TOS])
+		info->key.ipv4_tos = nla_get_u8(tb[RTA_TUN_TOS]);
+
+	if (tb[RTA_TUN_SPORT])
+		info->key.tp_src = nla_get_be16(tb[RTA_TUN_SPORT]);
+
+	if (tb[RTA_TUN_DPORT])
+		info->key.tp_dst = nla_get_be16(tb[RTA_TUN_DPORT]);
+
+	if (tb[RTA_TUN_FLAGS])
+		info->key.tun_flags = nla_get_u16(tb[RTA_TUN_FLAGS]);
+
+	info->options_len = 0;
+	info->options = NULL;
+	info->mode = IP_TUNNEL_INFO_TX;
+	return 0;
+}
+
 const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
 	[RTA_DST]		= { .type = NLA_U32 },
 	[RTA_SRC]		= { .type = NLA_U32 },
@@ -591,6 +642,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
 	[RTA_METRICS]		= { .type = NLA_NESTED },
 	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
 	[RTA_FLOW]		= { .type = NLA_U32 },
+	[RTA_TUNNEL]		= { .type = NLA_NESTED },
 };
 
 static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
@@ -656,6 +708,11 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
 		case RTA_TABLE:
 			cfg->fc_table = nla_get_u32(attr);
 			break;
+		case RTA_TUNNEL:
+			err = parse_rta_tunnel(cfg, attr);
+			if (err < 0)
+				goto errout;
+			break;
 		}
 	}
 
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 28ec3c1..1e94c81 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -215,6 +215,9 @@ static void free_fib_info_rcu(struct rcu_head *head)
 
 	if (fi->fib_metrics != (u32 *) dst_default_metrics)
 		kfree(fi->fib_metrics);
+
+	ip_tunnel_info_put(fi->fib_tunnel);
+
 	kfree(fi);
 }
 
@@ -760,6 +763,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 	struct fib_info *ofi;
 	int nhs = 1;
 	struct net *net = cfg->fc_nlinfo.nl_net;
+	struct ip_tunnel_info *tun_info = NULL;
 
 	if (cfg->fc_type > RTN_MAX)
 		goto err_inval;
@@ -856,6 +860,19 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 		}
 	}
 
+	if (cfg->fc_tunnel.mode) {
+		/* TODO: Allow specification of options */
+		tun_info = ip_tunnel_info_alloc(0, GFP_KERNEL);
+		if (!tun_info) {
+			err = -ENOMEM;
+			goto failure;
+		}
+
+		memcpy(tun_info, &cfg->fc_tunnel, sizeof(*tun_info));
+		ip_tunnel_info_get(tun_info);
+		fi->fib_tunnel = tun_info;
+	}
+
 	if (cfg->fc_mp) {
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 		err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
@@ -975,6 +992,8 @@ err_inval:
 	err = -EINVAL;
 
 failure:
+	/* fi->fib_tunnel already owns tun_info and free_fib_info_rcu() puts
+	 * it; a kfree() here would release it a second time. */
 	if (fi) {
 		fi->fib_dead = 1;
 		free_fib_info(fi);
@@ -983,6 +1002,29 @@ failure:
 	return ERR_PTR(err);
 }
 
+/* Add tun_info->key to skb as a nested RTA_TUNNEL; 0 or -ENOMEM on error. */
+int fib_dump_tun_info(struct sk_buff *skb, struct ip_tunnel_info *tun_info)
+{
+	struct nlattr *tun_attr = nla_nest_start(skb, RTA_TUNNEL);
+
+	if (!tun_attr)
+		return -ENOMEM;
+
+	if (nla_put_u64(skb, RTA_TUN_ID, tun_info->key.tun_id) ||
+	    nla_put_be32(skb, RTA_TUN_DST, tun_info->key.ipv4_dst) ||
+	    nla_put_be32(skb, RTA_TUN_SRC, tun_info->key.ipv4_src) ||
+	    nla_put_u8(skb, RTA_TUN_TOS, tun_info->key.ipv4_tos) ||
+	    nla_put_u8(skb, RTA_TUN_TTL, tun_info->key.ipv4_ttl) ||
+	    nla_put_be16(skb, RTA_TUN_SPORT, tun_info->key.tp_src) ||
+	    nla_put_be16(skb, RTA_TUN_DPORT, tun_info->key.tp_dst) ||
+	    nla_put_u16(skb, RTA_TUN_FLAGS, tun_info->key.tun_flags)) {
+		nla_nest_cancel(skb, tun_attr);	/* drop the partial nest */
+		return -ENOMEM;
+	}
+	nla_nest_end(skb, tun_attr);
+	return 0;
+}
+
 int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
 		  u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos,
 		  struct fib_info *fi, unsigned int flags)
@@ -1068,6 +1110,9 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
 		nla_nest_end(skb, mp);
 	}
 #endif
+	if (fi->fib_tunnel && fib_dump_tun_info(skb, fi->fib_tunnel))
+		goto nla_put_failure;
+
 	nlmsg_end(skb, nlh);
 	return 0;
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 6e8e1be..f53c62f 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1356,6 +1356,8 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
 		list_del(&rt->rt_uncached);
 		spin_unlock_bh(&ul->lock);
 	}
+
+	ip_tunnel_info_put(rt->rt_tun_info);
 }
 
 void rt_flush_dev(struct net_device *dev)
@@ -1489,6 +1491,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	rth->rt_gateway	= 0;
 	rth->rt_uses_gateway = 0;
 	INIT_LIST_HEAD(&rth->rt_uncached);
+	rth->rt_tun_info = NULL;
 	if (our) {
 		rth->dst.input= ip_local_deliver;
 		rth->rt_flags |= RTCF_LOCAL;
@@ -1543,6 +1546,7 @@ static int __mkroute_input(struct sk_buff *skb,
 			   struct in_device *in_dev,
 			   __be32 daddr, __be32 saddr, u32 tos)
 {
+	struct fib_info *fi = res->fi;
 	struct fib_nh_exception *fnhe;
 	struct rtable *rth;
 	int err;
@@ -1590,7 +1594,7 @@ static int __mkroute_input(struct sk_buff *skb,
 	}
 
 	fnhe = find_exception(&FIB_RES_NH(*res), daddr);
-	if (do_cache) {
+	if (do_cache && !(fi && fi->fib_tunnel)) {
 		if (fnhe)
 			rth = rcu_dereference(fnhe->fnhe_rth_input);
 		else
@@ -1621,6 +1625,13 @@ static int __mkroute_input(struct sk_buff *skb,
 	INIT_LIST_HEAD(&rth->rt_uncached);
 	RT_CACHE_STAT_INC(in_slow_tot);
 
+	if (fi && fi->fib_tunnel) {
+		ip_tunnel_info_get(fi->fib_tunnel);
+		rth->rt_tun_info = fi->fib_tunnel;
+	} else {
+		rth->rt_tun_info = NULL;
+	}
+
 	rth->dst.input = ip_forward;
 	rth->dst.output = ip_output;
 
@@ -1794,6 +1805,7 @@ local_input:
 	rth->rt_gateway	= 0;
 	rth->rt_uses_gateway = 0;
 	INIT_LIST_HEAD(&rth->rt_uncached);
+	rth->rt_tun_info = NULL;
 	RT_CACHE_STAT_INC(in_slow_tot);
 	if (res.type == RTN_UNREACHABLE) {
 		rth->dst.input= ip_error;
@@ -1940,6 +1952,11 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 
 	fnhe = NULL;
 	do_cache &= fi != NULL;
+
+	/* Force dst for flows with tunnel encapsulation */
+	if (fi && fi->fib_tunnel)
+		goto add;
+
 	if (do_cache) {
 		struct rtable __rcu **prth;
 		struct fib_nh *nh = &FIB_RES_NH(*res);
@@ -1984,6 +2001,13 @@ add:
 	rth->rt_uses_gateway = 0;
 	INIT_LIST_HEAD(&rth->rt_uncached);
 
+	if (fi && fi->fib_tunnel) {
+		ip_tunnel_info_get(fi->fib_tunnel);
+		rth->rt_tun_info = fi->fib_tunnel;
+	} else {
+		rth->rt_tun_info = NULL;
+	}
+
 	RT_CACHE_STAT_INC(out_slow_tot);
 
 	if (flags & RTCF_LOCAL)
@@ -2263,6 +2287,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 		rt->rt_uses_gateway = ort->rt_uses_gateway;
 
 		INIT_LIST_HEAD(&rt->rt_uncached);
+		rt->rt_tun_info = NULL;
 
 		dst_free(new);
 	}
@@ -2394,6 +2419,9 @@ static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
 		goto nla_put_failure;
 
+	if (rt->rt_tun_info && fib_dump_tun_info(skb, rt->rt_tun_info))
+		goto nla_put_failure;
+
 	nlmsg_end(skb, nlh);
 	return 0;
 
diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h
index 4750fb6..75d6824 100644
--- a/net/openvswitch/vport.h
+++ b/net/openvswitch/vport.h
@@ -27,6 +27,7 @@
 #include <linux/skbuff.h>
 #include <linux/spinlock.h>
 #include <linux/u64_stats_sync.h>
+#include <net/route.h>
 
 #include "datapath.h"
 
-- 
2.3.5




More information about the dev mailing list