[ovs-dev] [PATCH v2 15/22] odp: Support conntrack orig tuple key.

Joe Stringer joe at ovn.org
Fri Mar 3 18:09:21 UTC 2017


On 28 February 2017 at 17:17, Jarno Rajahalme <jarno at ovn.org> wrote:
> Userspace support for datapath original direction conntrack tuple.
>
> Signed-off-by: Jarno Rajahalme <jarno at ovn.org>

Thanks for the submission. Some feedback below.

<snip>

> diff --git a/include/openvswitch/meta-flow.h b/include/openvswitch/meta-flow.h
> index aac9945..94cee20 100644
> --- a/include/openvswitch/meta-flow.h
> +++ b/include/openvswitch/meta-flow.h
> @@ -740,6 +740,139 @@ enum OVS_PACKED_ENUM mf_field_id {
>       */
>      MFF_CT_LABEL,
>
> +    /* "ct_nw_proto".
> +     *
> +     * The "protocol" byte in the IPv4 or IPv6 header for the original
> +     * direction conntrack tuple, or of the master conntrack entry, if the
> +     * current connection is a related connection.
> +     *
> +     * The value is initially zero and populated by the CT action.  The value
> +     * remains zero after the CT action only if the packet can not be
> +     * associated with a tracked connection, in which case the prerequisites

"Tracked" in the current API documentation refers to whether the
packet was submitted to the connection tracker during the current
pipeline processing, and not connection state. To refer to connections
which have been committed, we call that "committed". See the
"Connection Tracking Fields" section of ovs-fields(7) for more
details.

<snip>

> @@ -383,61 +388,63 @@ parse_ethertype(const void **datap, size_t *sizep)
>      return htons(FLOW_DL_TYPE_NONE);
>  }
>
> -static inline void
> +/* Returns 'true' if the packet is an ND packet. */
> +static inline bool
>  parse_icmpv6(const void **datap, size_t *sizep, const struct icmp6_hdr *icmp,
>               const struct in6_addr **nd_target,
>               struct eth_addr arp_buf[2])
>  {
> -    if (icmp->icmp6_code == 0 &&
> -        (icmp->icmp6_type == ND_NEIGHBOR_SOLICIT ||
> -         icmp->icmp6_type == ND_NEIGHBOR_ADVERT)) {
> +    if (icmp->icmp6_code != 0 ||
> +        (icmp->icmp6_type != ND_NEIGHBOR_SOLICIT &&
> +         icmp->icmp6_type != ND_NEIGHBOR_ADVERT)) {
> +        return false;
> +    }
>
> -        *nd_target = data_try_pull(datap, sizep, sizeof **nd_target);
> -        if (OVS_UNLIKELY(!*nd_target)) {
> -            return;
> -        }
> +    *nd_target = data_try_pull(datap, sizep, sizeof **nd_target);
> +    if (OVS_UNLIKELY(!*nd_target)) {
> +        return true;
> +    }
>
> -        while (*sizep >= 8) {
> -            /* The minimum size of an option is 8 bytes, which also is
> -             * the size of Ethernet link-layer options. */
> -            const struct ovs_nd_opt *nd_opt = *datap;
> -            int opt_len = nd_opt->nd_opt_len * ND_OPT_LEN;
> +    while (*sizep >= 8) {
> +        /* The minimum size of an option is 8 bytes, which also is
> +         * the size of Ethernet link-layer options. */
> +        const struct ovs_nd_opt *nd_opt = *datap;
> +        int opt_len = nd_opt->nd_opt_len * ND_OPT_LEN;
>
> -            if (!opt_len || opt_len > *sizep) {
> -                return;
> -            }
> +        if (!opt_len || opt_len > *sizep) {
> +            return true;
> +        }
>
> -            /* Store the link layer address if the appropriate option is
> -             * provided.  It is considered an error if the same link
> -             * layer option is specified twice. */
> -            if (nd_opt->nd_opt_type == ND_OPT_SOURCE_LINKADDR
> -                && opt_len == 8) {
> -                if (OVS_LIKELY(eth_addr_is_zero(arp_buf[0]))) {
> -                    arp_buf[0] = nd_opt->nd_opt_mac;
> -                } else {
> -                    goto invalid;
> -                }
> -            } else if (nd_opt->nd_opt_type == ND_OPT_TARGET_LINKADDR
> -                       && opt_len == 8) {
> -                if (OVS_LIKELY(eth_addr_is_zero(arp_buf[1]))) {
> -                    arp_buf[1] = nd_opt->nd_opt_mac;
> -                } else {
> -                    goto invalid;
> -                }
> +        /* Store the link layer address if the appropriate option is
> +         * provided.  It is considered an error if the same link
> +         * layer option is specified twice. */
> +        if (nd_opt->nd_opt_type == ND_OPT_SOURCE_LINKADDR
> +            && opt_len == 8) {
> +            if (OVS_LIKELY(eth_addr_is_zero(arp_buf[0]))) {
> +                arp_buf[0] = nd_opt->nd_opt_mac;
> +            } else {
> +                goto invalid;
>              }
> -
> -            if (OVS_UNLIKELY(!data_try_pull(datap, sizep, opt_len))) {
> -                return;
> +        } else if (nd_opt->nd_opt_type == ND_OPT_TARGET_LINKADDR
> +                   && opt_len == 8) {
> +            if (OVS_LIKELY(eth_addr_is_zero(arp_buf[1]))) {
> +                arp_buf[1] = nd_opt->nd_opt_mac;
> +            } else {
> +                goto invalid;
>              }
>          }
> -    }
>
> -    return;
> +        if (OVS_UNLIKELY(!data_try_pull(datap, sizep, opt_len))) {
> +            return true;
> +        }
> +    }
> +    return true;
>
>  invalid:
>      *nd_target = NULL;
>      arp_buf[0] = eth_addr_zero;
>      arp_buf[1] = eth_addr_zero;
> +    return true;
>  }

It is strange for this parse_icmpv6() function to return one of two
values, but only populate arp_buf[] in some of the cases that it
returns true -- while the caller does initialize the buffer, it'd be
more robust if this function consistently initialized the buffer.

> diff --git a/lib/flow.h b/lib/flow.h
> index 62315bc..14a3004 100644
> --- a/lib/flow.h
> +++ b/lib/flow.h
> @@ -862,9 +862,35 @@ flow_union_with_miniflow(struct flow *dst, const struct miniflow *src)
>      flow_union_with_miniflow_subset(dst, src, src->map);
>  }
>
> +static inline bool is_ct_valid(const struct flow *flow,
> +                               const struct flow_wildcards *mask,
> +                               struct flow_wildcards *wc)
> +{
> +    /* Matches are checked with 'mask' and without 'wc'. */
> +    if (mask && !wc) {
> +        /* Must match at least one of the bits that implies a valid
> +         * conntrack entry, or an explicit not-invalid. */
> +        return flow->ct_state & (CS_NEW | CS_ESTABLISHED | CS_RELATED
> +                                 | CS_REPLY_DIR | CS_SRC_NAT | CS_DST_NAT)

So these bits indicate valid? (...)

> +            || (flow->ct_state & CS_TRACKED
> +                && mask->masks.ct_state & CS_INVALID
> +                && !(flow->ct_state & CS_INVALID));
> +    }
> +    /* Else we are checking a fully extracted flow, where valid CT state always
> +     * has either 'new', 'established', or 'reply_dir' bit set. */
> +#define CS_VALID_MASK (CS_NEW | CS_ESTABLISHED | CS_REPLY_DIR)

(...) But for these bits, we define a macro to identify a different
set of valid bits?

> diff --git a/lib/meta-flow.xml b/lib/meta-flow.xml
> index 3db0f82..7a7f03d 100644
> --- a/lib/meta-flow.xml
> +++ b/lib/meta-flow.xml
> @@ -2479,6 +2479,98 @@ actions=clone(load:0->NXM_OF_IN_PORT[],output:123)
>        parameter to the <code>ct</code> action, to the connection to which the
>        current packet belongs.
>      </field>
> +
> +    <p>
> +      Open vSwitch 2.8 introduced the matching support for connection
> +      tracker original direction 5-tuple fields.
> +    </p>
> +
> +    <p>
> +      For non-committed non-related connections the conntrack original
> +      direction tuple fields always have the same values as the
> +      corresponding headers in the packet itself.  For any other packets of
> +      a committed connection the conntrack original direction tuple fields
> +      reflect the values from that initial non-committed non-related packet,
> +      and generally are different from the actual packet headers, as the

Picking nits a bit, but this states that the ct 5tuple is generally
different from the packet headers, but this depends on your traffic
pattern and your actions. If the majority of the traffic for the
connection is in the forward direction and there's no NAT involved,
this statement doesn't really hold.

> +      actual packet headers may in reverse direction (for reply packets),

"may in"

> @@ -607,7 +610,8 @@ nx_pull_match(struct ofpbuf *b, unsigned int match_len, struct match *match,
>  }
>
>  /* Behaves the same as nx_pull_match(), but skips over unknown NXM headers,
> - * instead of failing with an error. */
> + * instead of failing with an error, and does not check for field
> + * prerequisities. */
>  enum ofperr
>  nx_pull_match_loose(struct ofpbuf *b, unsigned int match_len,
>                      struct match *match,
> @@ -664,8 +668,9 @@ oxm_pull_match(struct ofpbuf *b, const struct tun_table *tun_table,
>      return oxm_pull_match__(b, true, tun_table, match);
>  }
>
> -/* Behaves the same as oxm_pull_match() with one exception.  Skips over unknown
> - * OXM headers instead of failing with an error when they are encountered. */
> +/* Behaves the same as oxm_pull_match() with two exceptions.  Skips over
> + * unknown OXM headers instead of failing with an error when they are
> + * encountered, and does not check for field prerequisities. */
>  enum ofperr
>  oxm_pull_match_loose(struct ofpbuf *b, const struct tun_table *tun_table,
>                       struct match *match)
> @@ -676,14 +681,15 @@ oxm_pull_match_loose(struct ofpbuf *b, const struct tun_table *tun_table,
>  /* Parses the OXM match description in the 'oxm_len' bytes in 'oxm'.  Stores
>   * the result in 'match'.
>   *
> - * Fails with an error when encountering unknown OXM headers.
> + * Does NOT fail with an error when encountering unknown OXM headers.  Also
> + * does not check for field prerequisities.

The original comment defines return value/early exit semantics from
this function, but the new comment just defines the side-effects /
inner operation of the function. This is a bit more natural to read if
it's integrated into the overall description of this function in the
previous comment --- just like you did for nx_pull_match_loose().


> @@ -3397,8 +3397,9 @@ decode_nx_packet_in2(const struct ofp_header *oh, bool loose,
>          }
>
>          case NXPINT_METADATA:
> -            error = oxm_decode_match(payload.msg, ofpbuf_msgsize(&payload),
> -                                     tun_table, &pin->flow_metadata);
> +            error = oxm_decode_match_loose(payload.msg,
> +                                           ofpbuf_msgsize(&payload),
> +                                           tun_table, &pin->flow_metadata);
>              break;
>
>          case NXPINT_USERDATA:

Previously we would immediately reject unknown OXM headers; now we
don't due to the above, right? Is this intentional? This kind of
behavioural change would be easier to validate if there was a separate
patch outlining why, what it affects, etc.

> diff --git a/lib/packets.h b/lib/packets.h
> index f7e1d82..35e5d95 100644
> --- a/lib/packets.h
> +++ b/lib/packets.h
> @@ -100,9 +100,14 @@ struct pkt_metadata {
>      uint32_t skb_priority;      /* Packet priority for QoS. */
>      uint32_t pkt_mark;          /* Packet mark. */
>      uint8_t  ct_state;          /* Connection state. */
> +    bool ct_orig_tuple_ipv6;
>      uint16_t ct_zone;           /* Connection zone. */
>      uint32_t ct_mark;           /* Connection mark. */
>      ovs_u128 ct_label;          /* Connection label. */
> +    union {
> +        struct ovs_key_ct_tuple_ipv4 ipv4;
> +        struct ovs_key_ct_tuple_ipv6 ipv6;
> +    } ct_orig_tuple;
>      union flow_in_port in_port; /* Input port. */
>      struct flow_tnl tunnel;     /* Encapsulating tunnel parameters. Note that
>                                   * if 'ip_dst' == 0, the rest of the fields may

Should there be a comment that if ct_state is populated, then
ct_orig_tuple is populated? If "ct_orig_tuple_ipv6" is true, then
ct_orig_tuple.ipv6 is used, otherwise ct_orig_tuple.ipv4 is used.


More information about the dev mailing list