[ovs-dev] [tunnels2 4/5] datapath: Add multicast tunnel support.

Ben Pfaff blp at nicira.com
Fri Oct 21 23:24:09 UTC 2011


On Thu, Oct 20, 2011 at 05:11:33PM -0700, Jesse Gross wrote:
> On Mon, Oct 17, 2011 at 3:14 PM, Ben Pfaff <blp at nicira.com> wrote:
> > diff --git a/datapath/tunnel.c b/datapath/tunnel.c
> > index 8edff06..6fde389 100644
> > --- a/datapath/tunnel.c
> > +++ b/datapath/tunnel.c
> >  int tnl_set_addr(struct vport *vport, const unsigned char *addr)
> >  {
> >        struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
> > -       struct tnl_mutable_config *mutable;
> > +       struct tnl_mutable_config *old_mutable, *mutable;
> >
> > -       mutable = kmemdup(rtnl_dereference(tnl_vport->mutable),
> > -                         sizeof(struct tnl_mutable_config), GFP_KERNEL);
> > +       old_mutable = rtnl_dereference(tnl_vport->mutable);
> > +       mutable = kmemdup(old_mutable, sizeof(struct tnl_mutable_config), GFP_KERNEL);
> >        if (!mutable)
> >                return -ENOMEM;
> >
> > +       mutable->mlink = mutable->mlink;
> 
> I think this line probably isn't going to do a whole lot...

Oops.  The one part that I don't test...

Changed to:
	mutable->mlink = old_mutable->mlink;

> > diff --git a/include/openvswitch/tunnel.h b/include/openvswitch/tunnel.h
> > index 110e652..ecdd821 100644
> > --- a/include/openvswitch/tunnel.h
> > +++ b/include/openvswitch/tunnel.h
> > @@ -57,6 +57,7 @@ enum {
> >        OVS_TUNNEL_ATTR_IN_KEY,   /* __be64 key to match on input. */
> >        OVS_TUNNEL_ATTR_TOS,      /* 8-bit TOS value. */
> >        OVS_TUNNEL_ATTR_TTL,      /* 8-bit TTL value. */
> > +       OVS_TUNNEL_ATTR_LINK,     /* 32-bit ifindex of linked device. */
> 
> I think this was supposed to be part of the following patch.

Deleted.

> > diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
> > index d579b87..d09ade4 100644
> > --- a/vswitchd/vswitch.xml
> > +++ b/vswitchd/vswitch.xml
> >       <column name="options" key="local_ip">
> > -        Optional.  The destination IP that received packets must
> > -        match.  Default is to match all addresses.
> > +        Optional.  The destination IP that received packets must match.
> > +        Default is to match all addresses.  Must be omitted when <ref
> > +        column="options" key="remote_ip"/> is a multicast address.
> >       </column>
> 
> I think we probably should do some validation on this - otherwise
> ports that specify a local address and a multicast address silently
> won't match.

OK, how's this:

--- a/datapath/tunnel.c
+++ b/datapath/tunnel.c
@@ -1336,9 +1336,12 @@ static int tnl_set_config(struct nlattr *options, const struct tnl_ops *tnl_ops,
 
 	mutable->flags = nla_get_u32(a[OVS_TUNNEL_ATTR_FLAGS]) & TNL_F_PUBLIC;
 
-	if (a[OVS_TUNNEL_ATTR_SRC_IPV4])
-		mutable->key.saddr = nla_get_be32(a[OVS_TUNNEL_ATTR_SRC_IPV4]);
 	mutable->key.daddr = nla_get_be32(a[OVS_TUNNEL_ATTR_DST_IPV4]);
+	if (a[OVS_TUNNEL_ATTR_SRC_IPV4]) {
+		if (ipv4_is_multicast(mutable->key.daddr))
+			return -EINVAL;
+		mutable->key.saddr = nla_get_be32(a[OVS_TUNNEL_ATTR_SRC_IPV4]);
+	}
 
 	if (a[OVS_TUNNEL_ATTR_TOS]) {
 		mutable->tos = nla_get_u8(a[OVS_TUNNEL_ATTR_TOS]);
--- a/lib/netdev-vport.c
+++ b/lib/netdev-vport.c
@@ -578,6 +578,7 @@ parse_tunnel_config(const char *name, const char *type,
     struct shash_node *node;
     bool ipsec_mech_set = false;
     ovs_be32 daddr = htonl(0);
+    ovs_be32 saddr = htonl(0);
     uint32_t flags;
 
     flags = TNL_F_DF_DEFAULT | TNL_F_PMTUD | TNL_F_HDR_CACHE;
@@ -603,8 +604,7 @@ parse_tunnel_config(const char *name, const char *type,
             if (lookup_ip(node->data, &in_addr)) {
                 VLOG_WARN("%s: bad %s 'local_ip'", name, type);
             } else {
-                nl_msg_put_be32(options, OVS_TUNNEL_ATTR_SRC_IPV4,
-                                in_addr.s_addr);
+                saddr = in_addr.s_addr;
             }
         } else if (!strcmp(node->name, "tos")) {
             if (!strcmp(node->data, "inherit")) {
@@ -707,6 +707,14 @@ parse_tunnel_config(const char *name, const char *type,
     }
     nl_msg_put_be32(options, OVS_TUNNEL_ATTR_DST_IPV4, daddr);
 
+    if (saddr) {
+        if (ip_is_multicast(daddr)) {
+            VLOG_WARN("%s: remote_ip is multicast, ignoring local_ip", name);
+        } else {
+            nl_msg_put_be32(options, OVS_TUNNEL_ATTR_SRC_IPV4, saddr);
+        }
+    }
+
     nl_msg_put_u32(options, OVS_TUNNEL_ATTR_FLAGS, flags);
 
     return 0;
diff --git a/lib/packets.h b/lib/packets.h
index e727cc9..c63ac3f 100644
--- a/lib/packets.h
+++ b/lib/packets.h
@@ -291,6 +291,11 @@ ip_is_cidr(ovs_be32 netmask)
     uint32_t x = ~ntohl(netmask);
     return !(x & (x + 1));
 }
+static inline bool
+ip_is_multicast(ovs_be32 ip)
+{
+    return (ip & htonl(0xf0000000)) == htonl(0xe0000000);
+}
 int ip_count_cidr_bits(ovs_be32 netmask);
 void ip_format_masked(ovs_be32 ip, ovs_be32 mask, struct ds *);
 
> Maybe we should also document that routing table changes for multicast
> tunnels require deleting and recreating them.

It's an embarrassing limitation.  For now:

       <column name="options" key="remote_ip">
        <p>
          Required.  The tunnel endpoint.  Unicast and multicast endpoints are
          both supported.
        </p>

        <p>
          When a multicast endpoint is specified, a routing table lookup occurs
          only when the tunnel is created.  Following a routing change, delete
          and then re-create the tunnel to force a new routing table lookup.
        </p>
       </column>

> I think in the gre_err() code path, we should ignore ICMP messages
> sent to multicast addresses.

OK, like this?

diff --git a/datapath/vport-gre.c b/datapath/vport-gre.c
index cc64d12..95ac4bb 100644
--- a/datapath/vport-gre.c
+++ b/datapath/vport-gre.c
@@ -188,6 +188,8 @@ static void gre_err(struct sk_buff *skb, u32 info)
 		return;
 
 	iph = (struct iphdr *)skb->data;
+	if (ipv4_is_multicast(iph->daddr))
+		return;
 
 	tunnel_hdr_len = parse_header(iph, &flags, &key);
 	if (tunnel_hdr_len < 0)

Revised patch follows.

--8<--------------------------cut here-------------------------->8--

From: Ben Pfaff <blp at nicira.com>
Date: Fri, 21 Oct 2011 15:35:53 -0700
Subject: [PATCH] datapath: Add multicast tunnel support.

Something like this, on two separate vswitches, works to try it out:
    route add -net 224.0.0.0 netmask 240.0.0.0 dev eth0
    ovs-vsctl \
        -- add-port br0 gre0 \
        -- set interface gre0 type=gre options:remote_ip=224.0.0.1

Runtime tested on Linux 3.0, build tested on Linux 2.6.18, both i386.

Signed-off-by: Ben Pfaff <blp at nicira.com>
---
 datapath/tunnel.c    |  111 +++++++++++++++++++++++++++++++++++++++++---------
 datapath/tunnel.h    |    3 +
 vswitchd/vswitch.xml |    8 ++-
 3 files changed, 99 insertions(+), 23 deletions(-)

diff --git a/datapath/tunnel.c b/datapath/tunnel.c
index 8edff06..6fde389 100644
--- a/datapath/tunnel.c
+++ b/datapath/tunnel.c
@@ -10,8 +10,10 @@
 #include <linux/if_ether.h>
 #include <linux/ip.h>
 #include <linux/if_vlan.h>
+#include <linux/igmp.h>
 #include <linux/in.h>
 #include <linux/in_route.h>
+#include <linux/inetdevice.h>
 #include <linux/jhash.h>
 #include <linux/list.h>
 #include <linux/kernel.h>
@@ -134,6 +136,21 @@ static void free_cache_rcu(struct rcu_head *rcu)
 	free_cache(c);
 }
 
+/* Frees the portion of 'mutable' that requires RTNL and thus can't happen
+ * within an RCU callback.  Fortunately this part doesn't require waiting for
+ * an RCU grace period.
+ */
+static void free_mutable_rtnl(struct tnl_mutable_config *mutable)
+{
+	ASSERT_RTNL();
+	if (ipv4_is_multicast(mutable->key.daddr) && mutable->mlink) {
+		struct in_device *in_dev;
+		in_dev = inetdev_by_index(&init_net, mutable->mlink);
+		if (in_dev)
+			ip_mc_dec_group(in_dev, mutable->key.daddr);
+	}
+}
+
 static void assign_config_rcu(struct vport *vport,
 			      struct tnl_mutable_config *new_config)
 {
@@ -142,6 +159,8 @@ static void assign_config_rcu(struct vport *vport,
 
 	old_config = rtnl_dereference(tnl_vport->mutable);
 	rcu_assign_pointer(tnl_vport->mutable, new_config);
+
+	free_mutable_rtnl(old_config);
 	call_rcu(&old_config->rcu, free_config_rcu);
 }
 
@@ -257,6 +276,26 @@ struct vport *tnl_find_port(__be32 saddr, __be32 daddr, __be64 key,
 	struct port_lookup_key lookup;
 	struct vport *vport;
 
+	if (ipv4_is_multicast(saddr)) {
+		lookup.saddr = 0;
+		lookup.daddr = saddr;
+		if (key_remote_ports) {
+			lookup.tunnel_type = tunnel_type | TNL_T_KEY_EXACT;
+			lookup.in_key = key;
+			vport = port_table_lookup(&lookup, mutable);
+			if (vport)
+				return vport;
+		}
+		if (remote_ports) {
+			lookup.tunnel_type = tunnel_type | TNL_T_KEY_MATCH;
+			lookup.in_key = 0;
+			vport = port_table_lookup(&lookup, mutable);
+			if (vport)
+				return vport;
+		}
+		return NULL;
+	}
+
 	lookup.saddr = saddr;
 	lookup.daddr = daddr;
 
@@ -909,6 +948,31 @@ unlock:
 	return cache;
 }
 
+static struct rtable *__find_route(const struct tnl_mutable_config *mutable,
+				   u8 ipproto, u8 tos)
+{
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,39)
+	struct flowi fl = { .nl_u = { .ip4_u =
+				      { .daddr = mutable->key.daddr,
+					.saddr = mutable->key.saddr,
+					.tos = tos } },
+			    .proto = ipproto };
+	struct rtable *rt;
+
+	if (unlikely(ip_route_output_key(&init_net, &rt, &fl)))
+		return ERR_PTR(-EADDRNOTAVAIL);
+
+	return rt;
+#else
+	struct flowi4 fl = { .daddr = mutable->key.daddr,
+			     .saddr = mutable->key.saddr,
+			     .flowi4_tos = tos,
+			     .flowi4_proto = ipproto };
+
+	return ip_route_output_key(&init_net, &fl);
+#endif
+}
+
 static struct rtable *find_route(struct vport *vport,
 				 const struct tnl_mutable_config *mutable,
 				 u8 tos, struct tnl_cache **cache)
@@ -924,25 +988,10 @@ static struct rtable *find_route(struct vport *vport,
 		return cur_cache->rt;
 	} else {
 		struct rtable *rt;
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,39)
-		struct flowi fl = { .nl_u = { .ip4_u =
-					      { .daddr = mutable->key.daddr,
-						.saddr = mutable->key.saddr,
-						.tos = tos } },
-				    .proto = tnl_vport->tnl_ops->ipproto };
 
-		if (unlikely(ip_route_output_key(&init_net, &rt, &fl)))
-			return NULL;
-#else
-		struct flowi4 fl = { .daddr = mutable->key.daddr,
-				     .saddr = mutable->key.saddr,
-				     .flowi4_tos = tos,
-				     .flowi4_proto = tnl_vport->tnl_ops->ipproto };
-
-		rt = ip_route_output_key(&init_net, &fl);
+		rt = __find_route(mutable, tnl_vport->tnl_ops->ipproto, tos);
 		if (IS_ERR(rt))
 			return NULL;
-#endif
 
 		if (likely(tos == mutable->tos))
 			*cache = build_cache(vport, mutable, rt);
@@ -1324,6 +1373,22 @@ static int tnl_set_config(struct nlattr *options, const struct tnl_ops *tnl_ops,
 	if (old_vport && old_vport != cur_vport)
 		return -EEXIST;
 
+	mutable->mlink = 0;
+	if (ipv4_is_multicast(mutable->key.daddr)) {
+		struct net_device *dev;
+		struct rtable *rt;
+
+		rt = __find_route(mutable, tnl_ops->ipproto, mutable->tos);
+		if (IS_ERR(rt))
+			return -EADDRNOTAVAIL;
+		dev = rt_dst(rt).dev;
+		ip_rt_put(rt);
+		if (__in_dev_get_rtnl(dev) == NULL)
+			return -EADDRNOTAVAIL;
+		mutable->mlink = dev->ifindex;
+		ip_mc_inc_group(__in_dev_get_rtnl(dev), mutable->key.daddr);
+	}
+
 	return 0;
 }
 
@@ -1376,6 +1441,7 @@ struct vport *tnl_create(const struct vport_parms *parms,
 	return vport;
 
 error_free_mutable:
+	free_mutable_rtnl(mutable);
 	kfree(mutable);
 error_free_vport:
 	vport_free(vport);
@@ -1414,6 +1480,7 @@ int tnl_set_options(struct vport *vport, struct nlattr *options)
 	return 0;
 
 error_free:
+	free_mutable_rtnl(mutable);
 	kfree(mutable);
 error:
 	return err;
@@ -1457,23 +1524,27 @@ static void free_port_rcu(struct rcu_head *rcu)
 void tnl_destroy(struct vport *vport)
 {
 	struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
-	const struct tnl_mutable_config *mutable;
+	struct tnl_mutable_config *mutable;
 
 	mutable = rtnl_dereference(tnl_vport->mutable);
 	port_table_remove_port(vport);
+	free_mutable_rtnl(mutable);
 	call_rcu(&tnl_vport->rcu, free_port_rcu);
 }
 
 int tnl_set_addr(struct vport *vport, const unsigned char *addr)
 {
 	struct tnl_vport *tnl_vport = tnl_vport_priv(vport);
-	struct tnl_mutable_config *mutable;
+	struct tnl_mutable_config *old_mutable, *mutable;
 
-	mutable = kmemdup(rtnl_dereference(tnl_vport->mutable),
-			  sizeof(struct tnl_mutable_config), GFP_KERNEL);
+	old_mutable = rtnl_dereference(tnl_vport->mutable);
+	mutable = kmemdup(old_mutable, sizeof(struct tnl_mutable_config), GFP_KERNEL);
 	if (!mutable)
 		return -ENOMEM;
 
+	mutable->mlink = mutable->mlink;
+	old_mutable->mlink = 0;
+
 	memcpy(mutable->eth_addr, addr, ETH_ALEN);
 	assign_config_rcu(vport, mutable);
 
diff --git a/datapath/tunnel.h b/datapath/tunnel.h
index 8d20c77..1e707a9 100644
--- a/datapath/tunnel.h
+++ b/datapath/tunnel.h
@@ -89,6 +89,9 @@ struct tnl_mutable_config {
 	u32	flags;
 	u8	tos;
 	u8	ttl;
+
+	/* Multicast configuration. */
+	int	mlink;
 };
 
 struct tnl_ops {
diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
index 683b27e..a888e59 100644
--- a/vswitchd/vswitch.xml
+++ b/vswitchd/vswitch.xml
@@ -965,12 +965,14 @@
       </p>
 
       <column name="options" key="remote_ip">
-        Required.  The tunnel endpoint.
+        Required.  The tunnel endpoint.  Unicast and multicast endpoints are
+        both supported.
       </column>
 
       <column name="options" key="local_ip">
-        Optional.  The destination IP that received packets must
-        match.  Default is to match all addresses.
+        Optional.  The destination IP that received packets must match.
+        Default is to match all addresses.  Must be omitted when <ref
+        column="options" key="remote_ip"/> is a multicast address.
       </column>
 
       <column name="options" key="in_key">
-- 
1.7.4.4




More information about the dev mailing list