[ovs-dev] [RFC PATCH 5/5] openvswitch: Interface with NAT.

Thomas Graf tgraf at suug.ch
Wed Oct 21 10:59:16 UTC 2015


On 10/20/15 at 03:20pm, Jarno Rajahalme wrote:
> Extend OVS conntrack interface to cover NAT.  New nested
> OVS_CT_ATTR_NAT may be used to include NAT with a CT action.  A bare
> OVS_CT_ATTR_NAT only mangles existing connections.  If
> OVS_NAT_ATTR_SRC or OVS_NAT_ATTR_DST is included within the nested
> attributes, new (non-committed/non-confirmed) connections are mangled
> according to the rest of the nested attributes.
> 
> This work extends on a branch by Thomas Graf at
> https://github.com/tgraf/ovs/tree/nat.
> 
> Signed-off-by: Jarno Rajahalme <jrajahalme at nicira.com>

Awesome work. This is a great start.

There are some, probably unintended, unrelated style changes. More
comments below.

> +enum ovs_nat_attr {
> +	OVS_NAT_ATTR_UNSPEC,
> +	OVS_NAT_ATTR_SRC,
> +	OVS_NAT_ATTR_DST,
> +	OVS_NAT_ATTR_IP_MIN,
> +	OVS_NAT_ATTR_IP_MAX,
> +	OVS_NAT_ATTR_PROTO_MIN,
> +	OVS_NAT_ATTR_PROTO_MAX,
> +	OVS_NAT_ATTR_PERSISTENT,
> +	OVS_NAT_ATTR_PROTO_HASH,
> +	OVS_NAT_ATTR_PROTO_RANDOM,

Simplify this with an OVS_NAT_ATTR_FLAGS?

> @@ -137,11 +159,17 @@ static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state,
>  	ovs_ct_get_label(ct, &key->ct.label);
>  }
>  
> -/* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has
> - * previously sent the packet to conntrack via the ct action.
> +/* Update 'key' based on skb->nfct.  If 'post_ct' is true, then OVS has
> + * previously sent the packet to conntrack via the ct action.  If
> + * 'keep_nat_flags' is true, the existing NAT flags retained, else they are
> + * initialized from the connection status.
>   */
>  static void ovs_ct_update_key(const struct sk_buff *skb,
> -			      struct sw_flow_key *key, bool post_ct)
> +			      struct sw_flow_key *key, bool post_ct
> +#ifdef CONFIG_NF_NAT_NEEDED
> +			      , bool keep_nat_flags
> +#endif
> +			      )

I suggest keeping the argument even for !CONFIG_NF_NAT_NEEDED. This
unclutters the call sites of this function. An ifdef inside the
keep_nat_flags branch should be enough. The compiler will optimize
the code away and it's much prettier to read.

>  {
>  	const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
>  	enum ip_conntrack_info ctinfo;
> @@ -151,8 +179,20 @@ static void ovs_ct_update_key(const struct sk_buff *skb,
>  	ct = nf_ct_get(skb, &ctinfo);
>  	if (ct) {
>  		state = ovs_ct_get_state(ctinfo);
> +		/* OVS persists the related flag for the duration of the
> +		 * connection. */
>  		if (ct->master)
>  			state |= OVS_CS_F_RELATED;
> +#ifdef CONFIG_NF_NAT_NEEDED
> +		if (keep_nat_flags)
> +			state |= key->ct.state & OVS_CS_F_NAT_MASK;
> +		else {
> +			if (ct->status & IPS_SRC_NAT)
> +				state |= OVS_CS_F_SRC_NAT;
> +			if (ct->status & IPS_DST_NAT)
> +				state |= OVS_CS_F_DST_NAT;
> +		}
> +#endif
>  		zone = nf_ct_zone(ct);
>  	} else if (post_ct) {
>  		state = OVS_CS_F_TRACKED | OVS_CS_F_INVALID;
> @@ -291,7 +337,16 @@ static int ovs_ct_helper(struct sk_buff *skb, u16 proto)
>  		return NF_DROP;
>  	}
>  
> -	return helper->help(skb, protoff, ct, ctinfo);
> +	if (helper->help(skb, protoff, ct, ctinfo) != NF_ACCEPT)
> +		return NF_DROP;

Return the returned value here instead of hardcoding NF_DROP?

> +#ifdef CONFIG_NF_NAT_NEEDED
> +	/* Adjust seqs after helper. */

A comment on why this is needed would be helpful.

> +	if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status)
> +	    && !nf_ct_seq_adjust(skb, ct, ctinfo, protoff))
> +		return NF_DROP;
> +#endif
> +	return NF_ACCEPT;

> @@ -377,7 +432,211 @@ static bool skb_nfct_cached(const struct net *net, const struct sk_buff *skb,
>  	return true;
>  }
>  
> -static int __ovs_ct_lookup(struct net *net, const struct sw_flow_key *key,
> +#ifdef CONFIG_NF_NAT_NEEDED
> +/* Modeled after nf_nat_ipv[46]_fn().
> + * range is only used for new, uninitialized NAT state.
> + * Returns either NF_ACCEPT or NF_DROP. */
> +static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
> +			      enum ip_conntrack_info ctinfo,
> +			      const struct nf_nat_range *range,
> +			      enum nf_nat_manip_type maniptype)
> +{
> +	int hooknum, nh_off, err = NF_ACCEPT;
> +
> +	nh_off = skb_network_offset(skb);
> +	skb_pull(skb, nh_off);
> +
> +	/* See HOOK2MANIP(). */
> +	if (maniptype == NF_NAT_MANIP_SRC)
> +		hooknum = NF_INET_LOCAL_IN; /* Source NAT */
> +	else
> +		hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */
> +
> +	switch (ctinfo) {
> +	case IP_CT_RELATED:
> +	case IP_CT_RELATED_REPLY:
> +		if (skb->protocol == htons(ETH_P_IP)
> +		    && ip_hdr(skb)->protocol == IPPROTO_ICMP) {
> +			if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
> +							   hooknum))
> +				err = NF_DROP;
> +			goto push;
> +		} else if (skb->protocol == htons(ETH_P_IPV6)) {
> +			__be16 frag_off;
> +			u8 nexthdr = ipv6_hdr(skb)->nexthdr;
> +			int hdrlen = ipv6_skip_exthdr(skb,
> +						      sizeof(struct ipv6hdr),
> +						      &nexthdr, &frag_off);
> +
> +			if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) {
> +				if (!nf_nat_icmpv6_reply_translation(skb, ct,
> +								     ctinfo,
> +								     hooknum,
> +								     hdrlen))
> +					err = NF_DROP;
> +				goto push;
> +			}
> +		}
> +		/* Non-ICMP, fall thru to initialize if needed. */
> +	case IP_CT_NEW:
> +		/* Seen it before?  This can happen for loopback, retrans,
> +		 * or local packets.
> +		 */
> +		if (!nf_nat_initialized(ct, maniptype)) {

Explicit unlikely()?

> +			/* Initialize according to the NAT action. */
> +			err = (range && range->flags & NF_NAT_RANGE_MAP_IPS)
> +				/* Action is set up to establish a new
> +				 * mapping */
> +				? nf_nat_setup_info(ct, range, maniptype)
> +				: nf_nat_alloc_null_binding(ct, hooknum);
> +		}
> +		break;
> +
> +	case IP_CT_ESTABLISHED:
> +	case IP_CT_ESTABLISHED_REPLY:
> +		break;
> +
> +	default:
> +		err = NF_DROP;
> +		goto push;
> +	}
> +
> +	if (err == NF_ACCEPT)
> +		err = nf_nat_packet(ct, ctinfo, hooknum, skb);

If you goto push on init failure (IP_CT_NEW branch), then this
conditional is no longer needed and a more straight forward exception
handling is seen.

> +push:
> +	skb_push(skb, nh_off);
> +
> +	return err;
> +}

> +/* Returns NF_DROP if the packet should be dropped, NF_ACCEPT otherwise.
> + * This action can be used to both NAT and reverse NAT, however, reverse NAT
> + * can also be done with the conntrack action. */
> +static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
> +		      const struct ovs_conntrack_info *info,
> +		      struct sk_buff *skb)
> +{
> +	enum nf_nat_manip_type maniptype;
> +	enum ip_conntrack_info ctinfo;
> +	struct nf_conn *ct;
> +	int err;
> +
> +	/* No NAT action or already NATed? */
> +	if (!(info->flags & OVS_CT_F_NAT_MASK)
> +	    || key->ct.state & OVS_CS_F_NAT_MASK)
> +		return NF_ACCEPT;
> +
> +	ct = nf_ct_get(skb, &ctinfo);
> +	/* Check if an existing conntrack entry may be found for this skb.
> +	 * This happens when we lose the ct entry pointer due to an upcall.
> +	 * Don't lookup invalid connections. */
> +	if (!ct && key->ct.state & OVS_CS_F_TRACKED
> +	    && !(key->ct.state & OVS_CS_F_INVALID))
> +		ct = ovs_ct_find_existing(net, &info->zone, info->family, skb,
> +					  &ctinfo);
> +	if (!ct || nf_ct_is_untracked(ct))
> +		/* A NAT action may only be performed on tracked packets. */
> +		return NF_ACCEPT;

Braces

> +	/* Add NAT extension if not commited yet. */
> +	if (!nf_ct_is_confirmed(ct)) {
> +		if (!nf_ct_nat_ext_add(ct))
> +			return NF_ACCEPT;   /* Can't NAT. */
> +	}

&&

> +	/* Determine NAT type.
> +	 * Check if the NAT type can be deduced from the tracked connection.
> +	 * Make sure expected traffic is NATted only when commiting. */
> +	if (info->flags & OVS_CT_F_NAT && ctinfo != IP_CT_NEW
> +	    && ct->status & IPS_NAT_MASK
> +	    && (!(ct->status & IPS_EXPECTED_BIT)
> +		|| info->flags & OVS_CT_F_COMMIT)) {
> +		/* NAT an established or related connection like before. */
> +		if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY)
> +			/* This is the REPLY direction for a connection
> +			 * for which NAT was applied in the forward
> +			 * direction.  Do the reverse NAT. */
> +			maniptype = ct->status & IPS_SRC_NAT
> +				? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC;
> +		else
> +			maniptype = ct->status & IPS_SRC_NAT
> +				? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST;
> +	}
> +	else if (info->flags & OVS_CT_F_SRC_NAT)
> +		maniptype = NF_NAT_MANIP_SRC;
> +	else if (info->flags & OVS_CT_F_DST_NAT)
> +		maniptype = NF_NAT_MANIP_DST;
> +	else
> +		return NF_ACCEPT; /* Connection is not NATed. */
> +
> +	err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range, maniptype);
> +
> +	/* Mark NAT done if successful. */
> +	if (err == NF_ACCEPT)
> +		key->ct.state |= (maniptype == NF_NAT_MANIP_SRC)
> +			? OVS_CS_F_SRC_NAT : OVS_CS_F_DST_NAT;
> +	return err;
> +}
> +#endif
> +
> +static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
>  			   const struct ovs_conntrack_info *info,
>  			   struct sk_buff *skb)
>  {


> @@ -538,6 +819,131 @@ static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name,
>  	return 0;
>  }
>  
> +#ifdef CONFIG_NF_NAT_NEEDED
> +static int parse_nat(const struct nlattr *attr,
> +		     struct ovs_conntrack_info *info, bool log)
> +{
> +	struct nlattr *a;
> +	int rem;
> +	bool have_ip_max = false;
> +	bool have_proto_max = false;
> +	bool ip_vers = (info->family == NFPROTO_IPV6);
> +
> +	nla_for_each_nested(a, attr, rem) {
> +		static const u16 ovs_nat_attr_lens[OVS_NAT_ATTR_MAX + 1][2] = {
> +			[OVS_NAT_ATTR_SRC] = {0, 0},
> +			[OVS_NAT_ATTR_DST] = {0, 0},
> +			[OVS_NAT_ATTR_IP_MIN] = {sizeof(struct in_addr),
> +						 sizeof(struct in6_addr)},
> +			[OVS_NAT_ATTR_IP_MAX] = {sizeof(struct in_addr),
> +						 sizeof(struct in6_addr)},
> +			[OVS_NAT_ATTR_PROTO_MIN] = {sizeof(u16),sizeof(u16)},
> +			[OVS_NAT_ATTR_PROTO_MAX] = {sizeof(u16),sizeof(u16)},
> +			[OVS_NAT_ATTR_PERSISTENT] = {0, 0},
> +			[OVS_NAT_ATTR_PROTO_HASH] = {0, 0},
> +			[OVS_NAT_ATTR_PROTO_RANDOM] = {0, 0},
> +		};
> +		int type = nla_type(a);
> +
> +		if (type > OVS_NAT_ATTR_MAX) {
> +			OVS_NLERR(log, "Unknown nat attribute (type=%d, max=%d).\n",
> +			type, OVS_NAT_ATTR_MAX);

Formatting

> +			return -EINVAL;
> +		}
> +
> +		case OVS_NAT_ATTR_IP_MIN:
> +			nla_memcpy(&info->range.min_addr, a, nla_len(a));

The length attribute should be sizeof of min_addr like for max_addr
below.

> +			info->range.flags |= NF_NAT_RANGE_MAP_IPS;
> +			break;
> +
> +		case OVS_NAT_ATTR_IP_MAX:
> +			have_ip_max = true;
> +			nla_memcpy(&info->range.max_addr, a,
> +				   sizeof(info->range.max_addr));
> +			info->range.flags |= NF_NAT_RANGE_MAP_IPS;
> +			break;
> +
> +	}

>  static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
>  	[OVS_CT_ATTR_COMMIT]	= { .minlen = 0,
>  				    .maxlen = 0 },
> @@ -548,7 +954,11 @@ static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
>  	[OVS_CT_ATTR_LABEL]	= { .minlen = sizeof(struct md_label),
>  				    .maxlen = sizeof(struct md_label) },
>  	[OVS_CT_ATTR_HELPER]	= { .minlen = 1,
> -				    .maxlen = NF_CT_HELPER_NAME_LEN }
> +				    .maxlen = NF_CT_HELPER_NAME_LEN },
> +#ifdef CONFIG_NF_NAT_NEEDED
> +	[OVS_CT_ATTR_NAT]	= { .minlen = 0,
> +				    .maxlen = 96 }
> +#endif

Is the 96 a temporary hack here?

> @@ -607,6 +1017,14 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
>  				return -EINVAL;
>  			}
>  			break;
> +#ifdef CONFIG_NF_NAT_NEEDED
> +		case OVS_CT_ATTR_NAT: {
> +			int err = parse_nat(a, info, log);
> +			if (err)
> +				return err;
> +			break;
> +		}
> +#endif

We should probably bark if user space provides a OVS_CT_ATTR_NAT but the
kernel is compiled without support for it.

> +#ifdef CONFIG_NF_NAT_NEEDED
> +static bool ovs_ct_nat_to_attr(const struct ovs_conntrack_info *info,
> +			       struct sk_buff *skb)
> +{
> +	struct nlattr *start;
> +
> +	start = nla_nest_start(skb, OVS_CT_ATTR_NAT);
> +	if (!start)
> +		return false;
> +
> +	if (info->flags & OVS_CT_F_SRC_NAT) {
> +		if (nla_put_flag(skb, OVS_NAT_ATTR_SRC))
> +			return false;
> +	} else if (info->flags & OVS_CT_F_DST_NAT) {
> +		if (nla_put_flag(skb, OVS_NAT_ATTR_DST))
> +			return false;
> +	} else {
> +		goto out;

Is the empty nested attribute intended here?



More information about the dev mailing list