[ovs-dev] [PATCH v2] VxLAN-gpe implementation

Thadeu Lima de Souza Cascardo cascardo at redhat.com
Mon Jun 20 13:11:14 UTC 2016


On Mon, Jun 20, 2016 at 08:36:43PM +0800, Yi Yang wrote:
> Current Linux kernel git tree has included VxLAN-gpe implementation
> 
> author  Jiri Benc <jbenc at redhat.com>
> committer       David S. Miller <davem at davemloft.net>
> commit  e1e5314de08ba6003b358125eafc9ad9e75a950c (patch)
> tree    1e18cdabf1c9d9ef17e26c6480e629465447f77f /drivers/net/vxlan.c
> parent  a6d5bbf34efa8330af7b0b1dba0f38148516ed97 (diff)
> vxlan: implement GPE
> 
> This patch is to port it to ovs in order that people also can use VxLAN-gpe
> even if they don't replace their kernels with latest Linux kernel.
> 
> Signed-off-by: Johnson Li <johnson.li at intel.com>
> Signed-off-by: Yi Yang <yi.y.yang at intel.com>


Hi, Yi Yang.

Before adding the OVS_VXLAN_EXT_GPE extension to the out-of-tree module, you
should send it to the mainline kernel. Besides, you need a very good
justification for why you can't wait for my patchset to be accepted and have
VXLAN-GPE enabled using rtnetlink.

Also, I would split any changes to the datapath and userspace parts of the code
into multiple commits.

Meanwhile, you could backport only the upstreamed portions of VXLAN-GPE and send
that as a single commit, with no userspace changes.

Cascardo.

> ---
>  datapath/linux/compat/include/linux/if_link.h     |   4 +
>  datapath/linux/compat/include/linux/openvswitch.h |   1 +
>  datapath/linux/compat/include/net/vxlan.h         |  73 ++++
>  datapath/linux/compat/vxlan.c                     | 461 ++++++++++++++++++++--
>  lib/dpif-netlink.c                                |   5 +
>  lib/netdev-vport.c                                |   4 +-
>  6 files changed, 512 insertions(+), 36 deletions(-)
> 
> diff --git a/datapath/linux/compat/include/linux/if_link.h b/datapath/linux/compat/include/linux/if_link.h
> index 6209dcb..de87769 100644
> --- a/datapath/linux/compat/include/linux/if_link.h
> +++ b/datapath/linux/compat/include/linux/if_link.h
> @@ -100,6 +100,10 @@ enum {
>  	IFLA_VXLAN_REMCSUM_NOPARTIAL,
>  #define IFLA_VXLAN_COLLECT_METADATA rpl_IFLA_VXLAN_COLLECT_METADATA
>  	IFLA_VXLAN_COLLECT_METADATA,
> +#define IFLA_VXLAN_LABEL rpl_IFLA_VXLAN_LABEL
> +        IFLA_VXLAN_LABEL,
> +#define IFLA_VXLAN_GPE rpl_IFLA_VXLAN_GPE
> +        IFLA_VXLAN_GPE,
>  #define __IFLA_VXLAN_MAX rpl___IFLA_VXLAN_MAX
>  	__IFLA_VXLAN_MAX
>  };
> diff --git a/datapath/linux/compat/include/linux/openvswitch.h b/datapath/linux/compat/include/linux/openvswitch.h
> index edfa7a1..761d9c6 100644
> --- a/datapath/linux/compat/include/linux/openvswitch.h
> +++ b/datapath/linux/compat/include/linux/openvswitch.h
> @@ -287,6 +287,7 @@ enum ovs_vport_attr {
>  enum {
>  	OVS_VXLAN_EXT_UNSPEC,
>  	OVS_VXLAN_EXT_GBP,      /* Flag or __u32 */
> +	OVS_VXLAN_EXT_GPE,      /* Flag, Generic Protocol Extension */
>  	__OVS_VXLAN_EXT_MAX,
>  };
>  
> diff --git a/datapath/linux/compat/include/net/vxlan.h b/datapath/linux/compat/include/net/vxlan.h
> index 75a5a7a..b3f45c4 100644
> --- a/datapath/linux/compat/include/net/vxlan.h
> +++ b/datapath/linux/compat/include/net/vxlan.h
> @@ -84,6 +84,66 @@ struct vxlanhdr_gbp {
>  #define VXLAN_GBP_POLICY_APPLIED	(BIT(3) << 16)
>  #define VXLAN_GBP_ID_MASK		(0xFFFF)
>  
> +/*
> + * VXLAN Generic Protocol Extension (VXLAN_F_GPE):
> + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
> + * |R|R|Ver|I|P|R|O|       Reserved                |Next Protocol  |
> + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
> + * |                VXLAN Network Identifier (VNI) |   Reserved    |
> + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
> + *
> + * Ver = Version. Indicates VXLAN GPE protocol version.
> + *
> + * P = Next Protocol Bit. The P bit is set to indicate that the
> + *     Next Protocol field is present.
> + *
> + * O = OAM Flag Bit. The O bit is set to indicate that the packet
> + *     is an OAM packet.
> + *
> + * Next Protocol = This 8 bit field indicates the protocol header
> + * immediately following the VXLAN GPE header.
> + *
> + * https://tools.ietf.org/html/draft-ietf-nvo3-vxlan-gpe-01
> + */
> +
> +struct vxlanhdr_gpe {
> +#if defined(__LITTLE_ENDIAN_BITFIELD)
> +       u8      oam_flag:1,
> +               reserved_flags1:1,
> +               np_applied:1,
> +               instance_applied:1,
> +               version:2,
> +reserved_flags2:2;
> +#elif defined(__BIG_ENDIAN_BITFIELD)
> +       u8      reserved_flags2:2,
> +               version:2,
> +               instance_applied:1,
> +               np_applied:1,
> +               reserved_flags1:1,
> +               oam_flag:1;
> +#endif
> +       u8      reserved_flags3;
> +       u8      reserved_flags4;
> +       u8      next_protocol;
> +       __be32  vx_vni;
> +};
> +
> +/* VXLAN-GPE header flags. */
> +#define VXLAN_HF_VER   (BIT(29) | BIT(28))
> +#define VXLAN_HF_NP    (BIT(26))
> +#define VXLAN_HF_OAM   (BIT(24))
> +#define VXLAN_HF_GPE   (BIT(26))
> +
> +#define VXLAN_GPE_USED_BITS (VXLAN_HF_VER | VXLAN_HF_NP | VXLAN_HF_OAM | \
> +                            (0xFF))
> +
> +/* VXLAN-GPE header Next Protocol. */
> +#define VXLAN_GPE_NP_IPV4      0x01
> +#define VXLAN_GPE_NP_IPV6      0x02
> +#define VXLAN_GPE_NP_ETHERNET  0x03
> +#define VXLAN_GPE_NP_NSH       0x04
> +#define ETH_P_NSH              0x894f
> +
>  /* VXLAN protocol header:
>   * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>   * |G|R|R|R|I|R|R|C|               Reserved                        |
> @@ -167,6 +227,7 @@ struct vxlan_config {
>  	__u16			port_max;
>  	__u8			tos;
>  	__u8			ttl;
> +	__be32                  label;
>  	u32			flags;
>  	unsigned long		age_interval;
>  	unsigned int		addrmax;
> @@ -205,15 +266,27 @@ struct vxlan_dev {
>  #define VXLAN_F_GBP			0x800
>  #define VXLAN_F_REMCSUM_NOPARTIAL	0x1000
>  #define VXLAN_F_COLLECT_METADATA	0x2000
> +#define VXLAN_F_GPE                     0x4000
> +#define VXLAN_F_UDP_ZERO_CSUM_TX VXLAN_F_UDP_CSUM
>  
>  /* Flags that are used in the receive path. These flags must match in
>   * order for a socket to be shareable
>   */
>  #define VXLAN_F_RCV_FLAGS		(VXLAN_F_GBP |			\
> +                                         VXLAN_F_GPE |                  \
>  					 VXLAN_F_UDP_ZERO_CSUM6_RX |	\
>  					 VXLAN_F_REMCSUM_RX |		\
>  					 VXLAN_F_REMCSUM_NOPARTIAL |	\
>  					 VXLAN_F_COLLECT_METADATA)
> +
> +/* Flags that can be set together with VXLAN_F_GPE. */
> +#define VXLAN_F_ALLOWED_GPE             (VXLAN_F_GPE |                  \
> +                                         VXLAN_F_IPV6 |                 \
> +                                         VXLAN_F_UDP_CSUM |     \
> +                                         VXLAN_F_UDP_ZERO_CSUM6_TX |    \
> +                                         VXLAN_F_UDP_ZERO_CSUM6_RX |    \
> +                                         VXLAN_F_COLLECT_METADATA)
> +
>  #define vxlan_dev_create rpl_vxlan_dev_create
>  struct net_device *rpl_vxlan_dev_create(struct net *net, const char *name,
>  				    u8 name_assign_type, struct vxlan_config *conf);
> diff --git a/datapath/linux/compat/vxlan.c b/datapath/linux/compat/vxlan.c
> index 4faa18f..570d2d9 100644
> --- a/datapath/linux/compat/vxlan.c
> +++ b/datapath/linux/compat/vxlan.c
> @@ -812,6 +812,45 @@ static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh,
>  }
>  #endif
>  
> +static bool vxlan_parse_gpe_hdr(struct vxlanhdr *unparsed,
> +			       __be32 *protocol,
> +			       struct sk_buff *skb, u32 vxflags)
> +{
> +       struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)unparsed;
> +
> +       /* Need to have Next Protocol set for interfaces in GPE mode. */
> +       if (!gpe->np_applied)
> +	       return false;
> +       /* "The initial version is 0. If a receiver does not support the
> +	* version indicated it MUST drop the packet.
> +	*/
> +       if (gpe->version != 0)
> +	       return false;
> +       /* "When the O bit is set to 1, the packet is an OAM packet and OAM
> +	* processing MUST occur." However, we don't implement OAM
> +	* processing, thus drop the packet.
> +	*/
> +       if (gpe->oam_flag)
> +	       return false;
> +
> +       switch (gpe->next_protocol) {
> +       case VXLAN_GPE_NP_IPV4:
> +	       *protocol = htons(ETH_P_IP);
> +	       break;
> +       case VXLAN_GPE_NP_IPV6:
> +	       *protocol = htons(ETH_P_IPV6);
> +	       break;
> +       case VXLAN_GPE_NP_ETHERNET:
> +	       *protocol = htons(ETH_P_TEB);
> +	       break;
> +       default:
> +	       return false;
> +       }
> +
> +       unparsed->vx_flags &= ~VXLAN_GPE_USED_BITS;
> +       return true;
> +}
> +
>  static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
>  		      struct vxlan_metadata *md, u32 vni,
>  		      struct metadata_dst *tun_dst)
> @@ -822,6 +861,9 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
>  	struct pcpu_sw_netstats *stats;
>  	union vxlan_addr saddr;
>  	int err = 0;
> +	struct vxlanhdr unparsed;
> +	__be32 protocol = htons(ETH_P_TEB);
> +	bool raw_proto = false;
>  
>  	/* For flow based devices, map all packets to VNI 0 */
>  	if (vs->flags & VXLAN_F_COLLECT_METADATA)
> @@ -832,14 +874,35 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
>  	if (!vxlan)
>  		goto drop;
>  
> -	skb_reset_mac_header(skb);
> -	skb_scrub_packet(skb, !net_eq(vxlan->net, dev_net(vxlan->dev)));
> -	skb->protocol = eth_type_trans(skb, vxlan->dev);
> -	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
> +	/* For backwards compatibility, only allow reserved fields to be
> +	 * used by VXLAN extensions if explicitly requested.
> +	 */
> +	if (vs->flags & VXLAN_F_GPE) {
> +		unparsed = *(struct vxlanhdr *)(udp_hdr(skb) + 1);
> +		if (!vxlan_parse_gpe_hdr(&unparsed, &protocol, skb, vs->flags))
> +			goto drop;
> +		if (protocol != htons(ETH_P_TEB)) {
> +		    raw_proto = true;
> +		}
> +	}
>  
> -	/* Ignore packet loops (and multicast echo) */
> -	if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr))
> -		goto drop;
> +	if (!raw_proto) {
> +		skb_reset_mac_header(skb);
> +		skb_scrub_packet(skb, !net_eq(vxlan->net, dev_net(vxlan->dev)));
> +		skb->protocol = eth_type_trans(skb, vxlan->dev);
> +		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
> +
> +		/* Ignore packet loops (and multicast echo) */
> +		if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr))
> +			goto drop;
> +
> +		if ((vxlan->flags & VXLAN_F_LEARN) &&
> +		    vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source))
> +			goto drop;
> +	} else {
> +		skb->dev = vxlan->dev;
> +		skb->pkt_type = PACKET_HOST;
> +	}
>  
>  	/* Get data from the outer IP header */
>  	if (vxlan_get_sk_family(vs) == AF_INET) {
> @@ -861,10 +924,6 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
>  		goto drop;
>  	}
>  
> -	if ((vxlan->flags & VXLAN_F_LEARN) &&
> -	    vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source))
> -		goto drop;
> -
>  	skb_reset_network_header(skb);
>  	/* In flow-based mode, GBP is carried in dst_metadata */
>  	if (!(vs->flags & VXLAN_F_COLLECT_METADATA))
> @@ -908,6 +967,8 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
>  		struct metadata_dst dst;
>  		char buf[sizeof(struct metadata_dst) + sizeof(*md)];
>  	} buf;
> +	struct vxlanhdr unparsed;
> +	__be32 protocol = htons(ETH_P_TEB);
>  
>  	/* Need Vxlan and inner Ethernet header to be present */
>  	if (!pskb_may_pull(skb, VXLAN_HLEN))
> @@ -924,14 +985,25 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
>  		goto bad_flags;
>  	}
>  
> -	if (iptunnel_pull_header(skb, VXLAN_HLEN, htons(ETH_P_TEB)))
> -		goto drop;
> -	vxh = (struct vxlanhdr *)(udp_hdr(skb) + 1);
> -
>  	vs = rcu_dereference_sk_user_data(sk);
>  	if (!vs)
>  		goto drop;
>  
> +	/* For backwards compatibility, only allow reserved fields to be
> +	 * used by VXLAN extensions if explicitly requested.
> +	 */
> +	if (vs->flags & VXLAN_F_GPE) {
> +		unparsed = *(struct vxlanhdr *)(udp_hdr(skb) + 1);
> +		if (!vxlan_parse_gpe_hdr(&unparsed, &protocol, skb, vs->flags))
> +			goto drop;
> +		buf.dst.u.tun_info.key.tun_flags |= TUNNEL_VXLAN_OPT;
> +		flags &= ~VXLAN_GPE_USED_BITS;
> +	}
> +
> +	if (iptunnel_pull_header(skb, VXLAN_HLEN, protocol))
> +		goto drop;
> +	vxh = (struct vxlanhdr *)(udp_hdr(skb) + 1);
> +
>  #ifdef HAVE_VXLAN_HF_RCO
>  	if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) {
>  		vxh = vxlan_remcsum(skb, vxh, sizeof(struct vxlanhdr), vni,
> @@ -1023,6 +1095,33 @@ static void vxlan_build_gbp_hdr(struct vxlanhdr *vxh, u32 vxflags,
>  	gbp->policy_id = htons(md->gbp & VXLAN_GBP_ID_MASK);
>  }
>  
> +static int vxlan_build_gpe_hdr(struct vxlanhdr *vxh, u32 vxflags,
> +			       __be16 protocol)
> +{
> +	struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)vxh;
> +
> +	vxh->vx_flags |= htonl(VXLAN_HF_GPE);
> +	gpe->np_applied = 1;
> +	gpe->version = 0;
> +	gpe->oam_flag = 0;
> +
> +	switch (protocol) {
> +	case htons(ETH_P_IP):
> +		gpe->next_protocol = VXLAN_GPE_NP_IPV4;
> +		return 0;
> +	case htons(ETH_P_IPV6):
> +		gpe->next_protocol = VXLAN_GPE_NP_IPV6;
> +		return 0;
> +	case htons(ETH_P_TEB):
> +		gpe->next_protocol = VXLAN_GPE_NP_ETHERNET;
> +		return 0;
> +	case htons(ETH_P_NSH):
> +		gpe->next_protocol = VXLAN_GPE_NP_NSH;
> +		return 0;
> +	}
> +	return -EPFNOSUPPORT;
> +}
> +
>  #if IS_ENABLED(CONFIG_IPV6)
>  static int vxlan6_xmit_skb(struct dst_entry *dst, struct sock *sk,
>  			   struct sk_buff *skb,
> @@ -1036,6 +1135,7 @@ static int vxlan6_xmit_skb(struct dst_entry *dst, struct sock *sk,
>  	int err;
>  	bool udp_sum = !(vxflags & VXLAN_F_UDP_ZERO_CSUM6_TX);
>  	int type = 0;
> +	__be16 inner_protocol = htons(ETH_P_TEB);
>  
>  	if ((vxflags & VXLAN_F_REMCSUM_TX) &&
>  	    skb->ip_summed == CHECKSUM_PARTIAL) {
> @@ -1106,8 +1206,14 @@ static int vxlan6_xmit_skb(struct dst_entry *dst, struct sock *sk,
>  
>  	if (vxflags & VXLAN_F_GBP)
>  		vxlan_build_gbp_hdr(vxh, vxflags, md);
> +	if (vxflags & VXLAN_F_GPE) {
> +		err = vxlan_build_gpe_hdr(vxh, vxflags, skb->protocol);
> +		if (err < 0)
> +			goto err;
> +		inner_protocol = skb->protocol;
> +	}
>  
> -	ovs_skb_set_inner_protocol(skb, htons(ETH_P_TEB));
> +	ovs_skb_set_inner_protocol(skb, inner_protocol);
>  
>  	udp_tunnel6_xmit_skb(dst, sk, skb, dev, saddr, daddr, prio,
>  			     ttl, src_port, dst_port,
> @@ -1129,6 +1235,7 @@ static int vxlan_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *sk
>  	int err;
>  	bool udp_sum = !!(vxflags & VXLAN_F_UDP_CSUM);
>  	int type = 0;
> +	__be16 inner_protocol = htons(ETH_P_TEB);
>  
>  	if ((vxflags & VXLAN_F_REMCSUM_TX) &&
>  	    skb->ip_summed == CHECKSUM_PARTIAL) {
> @@ -1191,8 +1298,14 @@ static int vxlan_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *sk
>  	}
>  	if (vxflags & VXLAN_F_GBP)
>  		vxlan_build_gbp_hdr(vxh, vxflags, md);
> +	if (vxflags & VXLAN_F_GPE) {
> +		err = vxlan_build_gpe_hdr(vxh, vxflags, skb->protocol);
> +		if (err < 0)
> +			return err;
> +		inner_protocol = skb->protocol;
> +	}
>  
> -	ovs_skb_set_inner_protocol(skb, htons(ETH_P_TEB));
> +	ovs_skb_set_inner_protocol(skb, inner_protocol);
>  
>  	return udp_tunnel_xmit_skb(rt, sk, skb, src, dst, tos,
>  				   ttl, df, src_port, dst_port, xnet,
> @@ -1419,7 +1532,7 @@ tx_free:
>   *
>   * Outer IP header inherits ECN and DF from inner header.
>   * Outer UDP destination is the VXLAN assigned port.
> - *           source port is based on hash of flow
> + *	   source port is based on hash of flow
>   */
>  netdev_tx_t rpl_vxlan_xmit(struct sk_buff *skb)
>  {
> @@ -1648,7 +1761,7 @@ static netdev_tx_t vxlan_dev_xmit(struct sk_buff *skb, struct net_device *dev)
>  	return NETDEV_TX_OK;
>  }
>  
> -static const struct net_device_ops vxlan_netdev_ops = {
> +static const struct net_device_ops vxlan_netdev_ether_ops = {
>  	.ndo_init		= vxlan_init,
>  	.ndo_uninit		= vxlan_uninit,
>  	.ndo_get_stats64	= ip_tunnel_get_stats64,
> @@ -1661,6 +1774,16 @@ static const struct net_device_ops vxlan_netdev_ops = {
>  	.ndo_set_mac_address	= eth_mac_addr,
>  };
>  
> +static const struct net_device_ops vxlan_netdev_raw_ops = {
> +	.ndo_init		= vxlan_init,
> +	.ndo_uninit		= vxlan_uninit,
> +	.ndo_get_stats64	= ip_tunnel_get_stats64,
> +	.ndo_open		= vxlan_open,
> +	.ndo_stop		= vxlan_stop,
> +	.ndo_start_xmit		= vxlan_dev_xmit,
> +	.ndo_change_mtu		= vxlan_change_mtu,
> +};
> +
>  /* Info for udev, that this is a virtual tunnel endpoint */
>  static struct device_type vxlan_type = {
>  	.name = "vxlan",
> @@ -1675,7 +1798,7 @@ static void vxlan_setup(struct net_device *dev)
>  	eth_hw_addr_random(dev);
>  	ether_setup(dev);
>  
> -	dev->netdev_ops = &vxlan_netdev_ops;
> +	dev->netdev_ops = &vxlan_netdev_ether_ops;
>  	dev->destructor = free_netdev;
>  	SET_NETDEV_DEVTYPE(dev, &vxlan_type);
>  
> @@ -1712,8 +1835,51 @@ static void vxlan_setup(struct net_device *dev)
>  		INIT_HLIST_HEAD(&vxlan->fdb_head[h]);
>  }
>  
> +static void vxlan_ether_setup(struct net_device *dev)
> +{
> +	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
> +	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
> +	dev->netdev_ops = &vxlan_netdev_ether_ops;
> +}
> +
> +static void vxlan_raw_setup(struct net_device *dev)
> +{
> +	dev->header_ops = NULL;
> +	dev->type = ARPHRD_NONE;
> +	dev->hard_header_len = 0;
> +	dev->addr_len = 0;
> +	dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
> +	dev->netdev_ops = &vxlan_netdev_raw_ops;
> +}
> +
>  static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
> -	[IFLA_VXLAN_PORT]	= { .type = NLA_U16 },
> +	[IFLA_VXLAN_ID]	 = { .type = NLA_U32 },
> +	[IFLA_VXLAN_GROUP]      = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
> +	[IFLA_VXLAN_GROUP6]     = { .len = sizeof(struct in6_addr) },
> +	[IFLA_VXLAN_LINK]       = { .type = NLA_U32 },
> +	[IFLA_VXLAN_LOCAL]      = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
> +	[IFLA_VXLAN_LOCAL6]     = { .len = sizeof(struct in6_addr) },
> +	[IFLA_VXLAN_TOS]	= { .type = NLA_U8 },
> +	[IFLA_VXLAN_TTL]	= { .type = NLA_U8 },
> +	[IFLA_VXLAN_LABEL]      = { .type = NLA_U32 },
> +	[IFLA_VXLAN_LEARNING]   = { .type = NLA_U8 },
> +	[IFLA_VXLAN_AGEING]     = { .type = NLA_U32 },
> +	[IFLA_VXLAN_LIMIT]      = { .type = NLA_U32 },
> +	[IFLA_VXLAN_PORT_RANGE] = { .len  = sizeof(struct ifla_vxlan_port_range) },
> +	[IFLA_VXLAN_PROXY]      = { .type = NLA_U8 },
> +	[IFLA_VXLAN_RSC]	= { .type = NLA_U8 },
> +	[IFLA_VXLAN_L2MISS]     = { .type = NLA_U8 },
> +	[IFLA_VXLAN_L3MISS]     = { .type = NLA_U8 },
> +	[IFLA_VXLAN_COLLECT_METADATA]   = { .type = NLA_U8 },
> +	[IFLA_VXLAN_PORT]       = { .type = NLA_U16 },
> +	[IFLA_VXLAN_UDP_CSUM]   = { .type = NLA_U8 },
> +	[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]  = { .type = NLA_U8 },
> +	[IFLA_VXLAN_UDP_ZERO_CSUM6_RX]  = { .type = NLA_U8 },
> +	[IFLA_VXLAN_REMCSUM_TX] = { .type = NLA_U8 },
> +	[IFLA_VXLAN_REMCSUM_RX] = { .type = NLA_U8 },
> +	[IFLA_VXLAN_GBP]	= { .type = NLA_FLAG, },
> +	[IFLA_VXLAN_GPE]	= { .type = NLA_FLAG, },
> +	[IFLA_VXLAN_REMCSUM_NOPARTIAL]  = { .type = NLA_FLAG },
>  };
>  
>  static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[])
> @@ -1897,6 +2063,21 @@ static int vxlan_dev_configure(struct net *src_net, struct net_device *dev,
>  	__be16 default_port = vxlan->cfg.dst_port;
>  	struct net_device *lowerdev = NULL;
>  
> +	if (conf->flags & VXLAN_F_GPE) {
> +		if (conf->flags & ~VXLAN_F_ALLOWED_GPE)
> +			return -EINVAL;
> +		/* For now, allow GPE only together with COLLECT_METADATA.
> +		 * This can be relaxed later; in such case, the other side
> +		 * of the PtP link will have to be provided.
> +		 */
> +		if (!(conf->flags & VXLAN_F_COLLECT_METADATA))
> +			return -EINVAL;
> +
> +		vxlan_raw_setup(dev);
> +	} else {
> +		vxlan_ether_setup(dev);
> +	}
> +
>  	vxlan->net = src_net;
>  
>  	dst->remote_vni = conf->vni;
> @@ -2023,7 +2204,136 @@ static int vxlan_newlink(struct net_device *dev,
>  			 struct nlattr *tb[], struct nlattr *data[])
>  #endif
>  {
> -	return -EINVAL;
> +	struct vxlan_config conf;
> +	int err;
> +
> +	memset(&conf, 0, sizeof(conf));
> +
> +	if (data[IFLA_VXLAN_ID])
> +		conf.vni = cpu_to_be32(nla_get_u32(data[IFLA_VXLAN_ID]));
> +
> +	if (data[IFLA_VXLAN_GROUP]) {
> +		conf.remote_ip.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_GROUP]);
> +	} else if (data[IFLA_VXLAN_GROUP6]) {
> +		if (!IS_ENABLED(CONFIG_IPV6))
> +			return -EPFNOSUPPORT;
> +
> +		conf.remote_ip.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_GROUP6]);
> +		conf.remote_ip.sa.sa_family = AF_INET6;
> +	}
> +
> +	if (data[IFLA_VXLAN_LOCAL]) {
> +		conf.saddr.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_LOCAL]);
> +		conf.saddr.sa.sa_family = AF_INET;
> +	} else if (data[IFLA_VXLAN_LOCAL6]) {
> +		if (!IS_ENABLED(CONFIG_IPV6))
> +			return -EPFNOSUPPORT;
> +
> +		/* TODO: respect scope id */
> +		conf.saddr.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_LOCAL6]);
> +		conf.saddr.sa.sa_family = AF_INET6;
> +	}
> +
> +	if (data[IFLA_VXLAN_LINK])
> +		conf.remote_ifindex = nla_get_u32(data[IFLA_VXLAN_LINK]);
> +
> +	if (data[IFLA_VXLAN_TOS])
> +		conf.tos  = nla_get_u8(data[IFLA_VXLAN_TOS]);
> +
> +	if (data[IFLA_VXLAN_TTL])
> +		conf.ttl = nla_get_u8(data[IFLA_VXLAN_TTL]);
> +
> +	if (data[IFLA_VXLAN_LABEL])
> +		conf.label = nla_get_be32(data[IFLA_VXLAN_LABEL]) &
> +			     IPV6_FLOWLABEL_MASK;
> +
> +	if (!data[IFLA_VXLAN_LEARNING] || nla_get_u8(data[IFLA_VXLAN_LEARNING]))
> +		conf.flags |= VXLAN_F_LEARN;
> +
> +	if (data[IFLA_VXLAN_AGEING])
> +		conf.age_interval = nla_get_u32(data[IFLA_VXLAN_AGEING]);
> +
> +	if (data[IFLA_VXLAN_PROXY] && nla_get_u8(data[IFLA_VXLAN_PROXY]))
> +		conf.flags |= VXLAN_F_PROXY;
> +
> +	if (data[IFLA_VXLAN_RSC] && nla_get_u8(data[IFLA_VXLAN_RSC]))
> +		conf.flags |= VXLAN_F_RSC;
> +
> +	if (data[IFLA_VXLAN_L2MISS] && nla_get_u8(data[IFLA_VXLAN_L2MISS]))
> +		conf.flags |= VXLAN_F_L2MISS;
> +
> +	if (data[IFLA_VXLAN_L3MISS] && nla_get_u8(data[IFLA_VXLAN_L3MISS]))
> +		conf.flags |= VXLAN_F_L3MISS;
> +
> +	if (data[IFLA_VXLAN_LIMIT])
> +		conf.addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]);
> +
> +	if (data[IFLA_VXLAN_COLLECT_METADATA] &&
> +	    nla_get_u8(data[IFLA_VXLAN_COLLECT_METADATA]))
> +		conf.flags |= VXLAN_F_COLLECT_METADATA;
> +
> +	if (data[IFLA_VXLAN_PORT_RANGE]) {
> +		const struct ifla_vxlan_port_range *p
> +			= nla_data(data[IFLA_VXLAN_PORT_RANGE]);
> +		conf.port_min = ntohs(p->low);
> +		conf.port_max = ntohs(p->high);
> +	}
> +
> +	if (data[IFLA_VXLAN_PORT])
> +		conf.dst_port = nla_get_be16(data[IFLA_VXLAN_PORT]);
> +
> +	if (data[IFLA_VXLAN_UDP_CSUM] &&
> +	    !nla_get_u8(data[IFLA_VXLAN_UDP_CSUM]))
> +		conf.flags |= VXLAN_F_UDP_ZERO_CSUM_TX;
> +
> +	if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX] &&
> +	    nla_get_u8(data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]))
> +		conf.flags |= VXLAN_F_UDP_ZERO_CSUM6_TX;
> +
> +	if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX] &&
> +	    nla_get_u8(data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX]))
> +		conf.flags |= VXLAN_F_UDP_ZERO_CSUM6_RX;
> +
> +	if (data[IFLA_VXLAN_REMCSUM_TX] &&
> +	    nla_get_u8(data[IFLA_VXLAN_REMCSUM_TX]))
> +		conf.flags |= VXLAN_F_REMCSUM_TX;
> +
> +	if (data[IFLA_VXLAN_REMCSUM_RX] &&
> +	    nla_get_u8(data[IFLA_VXLAN_REMCSUM_RX]))
> +		conf.flags |= VXLAN_F_REMCSUM_RX;
> +
> +	if (data[IFLA_VXLAN_GBP])
> +		conf.flags |= VXLAN_F_GBP;
> +
> +	if (data[IFLA_VXLAN_GPE])
> +		conf.flags |= VXLAN_F_GPE;
> +
> +	if (data[IFLA_VXLAN_REMCSUM_NOPARTIAL])
> +		conf.flags |= VXLAN_F_REMCSUM_NOPARTIAL;
> +
> +	if (tb[IFLA_MTU])
> +		conf.mtu = nla_get_u32(tb[IFLA_MTU]);
> +
> +	err = vxlan_dev_configure(src_net, dev, &conf);
> +	switch (err) {
> +	case -ENODEV:
> +		pr_info("ifindex %d does not exist\n", conf.remote_ifindex);
> +		break;
> +
> +	case -EPERM:
> +		pr_info("IPv6 is disabled via sysctl\n");
> +		break;
> +
> +	case -EEXIST:
> +		pr_info("duplicate VNI %u\n", be32_to_cpu(conf.vni));
> +		break;
> +
> +	case -EINVAL:
> +		pr_info("unsupported combination of extensions\n");
> +		break;
> +	}
> +
> +	return err;
>  }
>  
>  #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39)
> @@ -2047,20 +2357,21 @@ static void vxlan_dellink(struct net_device *dev)
>  static size_t vxlan_get_size(const struct net_device *dev)
>  {
>  
> -	return nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_ID */
> +	return nla_total_size(sizeof(__u32)) +  /* IFLA_VXLAN_ID */
>  		nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_GROUP{6} */
> -		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_LINK */
> +		nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LINK */
>  		nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_LOCAL{6} */
> -		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TTL */
> -		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TOS */
> -		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_LEARNING */
> -		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_PROXY */
> -		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_RSC */
> -		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_L2MISS */
> -		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_L3MISS */
> -		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_COLLECT_METADATA */
> -		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_AGEING */
> -		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_LIMIT */
> +		nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_TTL */
> +		nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_TOS */
> +		nla_total_size(sizeof(__be32)) + /* IFLA_VXLAN_LABEL */
> +		nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_LEARNING */
> +		nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_PROXY */
> +		nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_RSC */
> +		nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_L2MISS */
> +		nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_L3MISS */
> +		nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_COLLECT_METADATA */
> +		nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_AGEING */
> +		nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LIMIT */
>  		nla_total_size(sizeof(struct ifla_vxlan_port_range)) +
>  		nla_total_size(sizeof(__be16)) + /* IFLA_VXLAN_PORT */
>  		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_CSUM */
> @@ -2074,8 +2385,88 @@ static size_t vxlan_get_size(const struct net_device *dev)
>  static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
>  {
>  	const struct vxlan_dev *vxlan = netdev_priv(dev);
> +	const struct vxlan_rdst *dst = &vxlan->default_dst;
> +	struct ifla_vxlan_port_range ports = {
> +		.low =  htons(vxlan->cfg.port_min),
> +		.high = htons(vxlan->cfg.port_max),
> +	};
> +
> +	if (nla_put_u32(skb, IFLA_VXLAN_ID, be32_to_cpu(dst->remote_vni)))
> +		goto nla_put_failure;
> +
> +	if (!vxlan_addr_any(&dst->remote_ip)) {
> +		if (dst->remote_ip.sa.sa_family == AF_INET) {
> +			if (nla_put_in_addr(skb, IFLA_VXLAN_GROUP,
> +					    dst->remote_ip.sin.sin_addr.s_addr))
> +				goto nla_put_failure;
> +#if IS_ENABLED(CONFIG_IPV6)
> +		} else {
> +			if (nla_put_in6_addr(skb, IFLA_VXLAN_GROUP6,
> +					     &dst->remote_ip.sin6.sin6_addr))
> +				goto nla_put_failure;
> +#endif
> +		}
> +	}
> +
> +	if (dst->remote_ifindex && nla_put_u32(skb, IFLA_VXLAN_LINK, dst->remote_ifindex))
> +		goto nla_put_failure;
> +
> +	if (!vxlan_addr_any(&vxlan->cfg.saddr)) {
> +		if (vxlan->cfg.saddr.sa.sa_family == AF_INET) {
> +			if (nla_put_in_addr(skb, IFLA_VXLAN_LOCAL,
> +					    vxlan->cfg.saddr.sin.sin_addr.s_addr))
> +				goto nla_put_failure;
> +#if IS_ENABLED(CONFIG_IPV6)
> +		} else {
> +			if (nla_put_in6_addr(skb, IFLA_VXLAN_LOCAL6,
> +					     &vxlan->cfg.saddr.sin6.sin6_addr))
> +				goto nla_put_failure;
> +#endif
> +		}
> +	}
> +
> +	if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->cfg.ttl) ||
> +	    nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) ||
> +	    nla_put_be32(skb, IFLA_VXLAN_LABEL, vxlan->cfg.label) ||
> +	    nla_put_u8(skb, IFLA_VXLAN_LEARNING,
> +			!!(vxlan->flags & VXLAN_F_LEARN)) ||
> +	    nla_put_u8(skb, IFLA_VXLAN_PROXY,
> +			!!(vxlan->flags & VXLAN_F_PROXY)) ||
> +	    nla_put_u8(skb, IFLA_VXLAN_RSC, !!(vxlan->flags & VXLAN_F_RSC)) ||
> +	    nla_put_u8(skb, IFLA_VXLAN_L2MISS,
> +			!!(vxlan->flags & VXLAN_F_L2MISS)) ||
> +	    nla_put_u8(skb, IFLA_VXLAN_L3MISS,
> +			!!(vxlan->flags & VXLAN_F_L3MISS)) ||
> +	    nla_put_u8(skb, IFLA_VXLAN_COLLECT_METADATA,
> +		       !!(vxlan->flags & VXLAN_F_COLLECT_METADATA)) ||
> +	    nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->cfg.age_interval) ||
> +	    nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->cfg.addrmax) ||
> +	    nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->cfg.dst_port) ||
> +	    nla_put_u8(skb, IFLA_VXLAN_UDP_CSUM,
> +			!(vxlan->flags & VXLAN_F_UDP_ZERO_CSUM_TX)) ||
> +	    nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_TX,
> +			!!(vxlan->flags & VXLAN_F_UDP_ZERO_CSUM6_TX)) ||
> +	    nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_RX,
> +			!!(vxlan->flags & VXLAN_F_UDP_ZERO_CSUM6_RX)) ||
> +	    nla_put_u8(skb, IFLA_VXLAN_REMCSUM_TX,
> +			!!(vxlan->flags & VXLAN_F_REMCSUM_TX)) ||
> +	    nla_put_u8(skb, IFLA_VXLAN_REMCSUM_RX,
> +			!!(vxlan->flags & VXLAN_F_REMCSUM_RX)))
> +		goto nla_put_failure;
> +
> +	if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports))
> +		goto nla_put_failure;
> +
> +	if (vxlan->flags & VXLAN_F_GBP &&
> +	    nla_put_flag(skb, IFLA_VXLAN_GBP))
> +		goto nla_put_failure;
> +
> +	if (vxlan->flags & VXLAN_F_GPE &&
> +	    nla_put_flag(skb, IFLA_VXLAN_GPE))
> +		goto nla_put_failure;
>  
> -	if (nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->cfg.dst_port))
> +	if (vxlan->flags & VXLAN_F_REMCSUM_NOPARTIAL &&
> +	    nla_put_flag(skb, IFLA_VXLAN_REMCSUM_NOPARTIAL))
>  		goto nla_put_failure;
>  
>  	return 0;
> diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
> index 1e88c13..2b07e54 100644
> --- a/lib/dpif-netlink.c
> +++ b/lib/dpif-netlink.c
> @@ -988,6 +988,8 @@ netdev_geneve_destroy(const char *name)
>  #define IFLA_VXLAN_UDP_ZERO_CSUM6_RX 20
>  #define IFLA_VXLAN_GBP 23
>  #define IFLA_VXLAN_COLLECT_METADATA 25
> +#define IFLA_VXLAN_LABEL 26
> +#define IFLA_VXLAN_GPE 27
>  #endif
>  
>  #if IFLA_GRE_MAX < 18
> @@ -1037,6 +1039,9 @@ netdev_vxlan_create(struct netdev *netdev)
>              if (tnl_cfg->exts & (1 << OVS_VXLAN_EXT_GBP)) {
>                  nl_msg_put_flag(&request, IFLA_VXLAN_GBP);
>              }
> +            else if (tnl_cfg->exts & (1 << OVS_VXLAN_EXT_GPE)) {
> +                nl_msg_put_flag(&request, IFLA_VXLAN_GPE);
> +            }
>              nl_msg_put_be16(&request, IFLA_VXLAN_PORT, tnl_cfg->dst_port);
>          nl_msg_end_nested(&request, infodata_off);
>      nl_msg_end_nested(&request, linkinfo_off);
> diff --git a/lib/netdev-vport.c b/lib/netdev-vport.c
> index ec5c44e..fa56af5 100644
> --- a/lib/netdev-vport.c
> +++ b/lib/netdev-vport.c
> @@ -541,7 +541,9 @@ set_tunnel_config(struct netdev *dev_, const struct smap *args)
>              while (ext) {
>                  if (!strcmp(type, "vxlan") && !strcmp(ext, "gbp")) {
>                      tnl_cfg.exts |= (1 << OVS_VXLAN_EXT_GBP);
> -                } else {
> +                } else if (!strcmp(type, "vxlan") && !strcmp(ext, "gpe")) {
> +                     tnl_cfg.exts |= (1 << OVS_VXLAN_EXT_GPE);
> +		} else {
>                      VLOG_WARN("%s: unknown extension '%s'", name, ext);
>                  }
>  
> -- 
> 1.9.3
> 



More information about the dev mailing list