[ovs-dev] [PATCH 2/2] dpif-netdev: Translate Geneve options per-flow, not per-packet.
Traynor, Kevin
kevin.traynor at intel.com
Tue Aug 4 22:13:19 UTC 2015
> -----Original Message-----
> From: dev [mailto:dev-bounces at openvswitch.org] On Behalf Of Jesse Gross
> Sent: Thursday, July 30, 2015 4:10 AM
> To: dev at openvswitch.org
> Subject: [ovs-dev] [PATCH 2/2] dpif-netdev: Translate Geneve options per-
> flow, not per-packet.
>
> The kernel implementation of Geneve options stores the TLV option
> data in the flow exactly as received, without any further parsing.
> This is then translated to known options for the purposes of matching
> on flow setup (which will then install a datapath flow in the form
> the kernel is expecting).
>
> The userspace implementation behaves a little bit differently - it
> looks up known options as each packet is received. The reason for this
> is there is a much tighter coupling between datapath and flow translation
> and the representation is generally expected to be the same. This works
> but it incurs work on a per-packet basis that could be done per-flow
> instead.
>
> This introduces a small translation step for Geneve packets between
> datapath and flow lookup for the userspace datapath in order to
> allow the same kind of processing that the kernel does.
>
> There is a second benefit to this as well: for some operations it is
> preferable to keep the options exactly as they were received on the wire,
> which this enables. One example is that for packets that are executed from
> ofproto-dpif-upcall to the datapath, this avoids the translation of
> Geneve metadata. Since this conversion is potentially lossy (for unknown
> options), keeping everything in the same format removes the possibility
> of dropping options if the packet comes back up to userspace and the
> Geneve option translation table has changed. To help with these types of
> operations, most functions can understand both formats of data and
> seamlessly
> do the right thing.
I tested standard bi-directional phy-to-phy flows with DPDK to see whether this
patch affects their performance, and it looks fine - performance is the same
with and without the patch.
In general, my performance is down a few percent from a couple of weeks ago,
but I think that is caused by something in my setup.
>
> Signed-off-by: Jesse Gross <jesse at nicira.com>
> ---
> lib/automake.mk | 1 +
> lib/dpif-netdev.c | 55 ++++++-
> lib/flow.c | 48 ++++--
> lib/flow.h | 13 +-
> lib/geneve.h | 63 ++++++++
> lib/meta-flow.c | 6 +-
> lib/netdev-vport.c | 26 ++--
> lib/odp-execute.c | 2 +-
> lib/odp-util.c | 58 ++++---
> lib/odp-util.h | 12 +-
> lib/packets.h | 41 +----
> lib/tun-metadata.c | 352 ++++++++++++++++++++++++++++++---------
> ---
> lib/tun-metadata.h | 74 ++++++---
> ofproto/ofproto-dpif-sflow.c | 2 +-
> ofproto/ofproto-dpif-upcall.c | 2 +-
> tests/tunnel-push-pop.at | 2 +-
> 16 files changed, 534 insertions(+), 223 deletions(-)
> create mode 100644 lib/geneve.h
>
> diff --git a/lib/automake.mk b/lib/automake.mk
> index faca968..5b6e9e8 100644
> --- a/lib/automake.mk
> +++ b/lib/automake.mk
> @@ -81,6 +81,7 @@ lib_libopenvswitch_la_SOURCES = \
> lib/fatal-signal.h \
> lib/flow.c \
> lib/flow.h \
> + lib/geneve.h \
> lib/guarded-list.c \
> lib/guarded-list.h \
> lib/hash.c \
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index f587df5..c31a7e0 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -1884,8 +1884,8 @@ dpif_netdev_mask_from_nlattrs(const struct nlattr
> *key, uint32_t key_len,
> if (mask_key_len) {
> enum odp_key_fitness fitness;
>
> - fitness = odp_flow_key_to_mask(mask_key, mask_key_len, key,
> key_len,
> - &wc->masks, flow);
> + fitness = odp_flow_key_to_mask_udpif(mask_key, mask_key_len, key,
> + key_len, &wc->masks, flow);
> if (fitness) {
> /* This should not happen: it indicates that
> * odp_flow_key_from_mask() and odp_flow_key_to_mask()
> @@ -1919,7 +1919,7 @@ dpif_netdev_flow_from_nlattrs(const struct nlattr
> *key, uint32_t key_len,
> {
> odp_port_t in_port;
>
> - if (odp_flow_key_to_flow(key, key_len, flow)) {
> + if (odp_flow_key_to_flow_udpif(key, key_len, flow)) {
> /* This should not happen: it indicates that
> odp_flow_key_from_flow()
> * and odp_flow_key_to_flow() disagree on the acceptable form of a
> * flow. Log the problem as an error, with enough details to
> enable
> @@ -3014,11 +3014,25 @@ dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd,
> struct dp_packet *packet_,
> struct ofpbuf *actions, struct ofpbuf *put_actions)
> {
> struct dp_netdev *dp = pmd->dp;
> + struct flow_tnl orig_tunnel;
> + int err;
>
> if (OVS_UNLIKELY(!dp->upcall_cb)) {
> return ENODEV;
> }
>
> + orig_tunnel.flags = flow->tunnel.flags;
> + if (flow->tunnel.flags & FLOW_TNL_F_UDPIF) {
> + orig_tunnel.metadata.present.len = flow-
> >tunnel.metadata.present.len;
> + memcpy(orig_tunnel.metadata.opts.gnv, flow-
> >tunnel.metadata.opts.gnv,
> + flow->tunnel.metadata.present.len);
> + err = tun_metadata_from_geneve_udpif(&orig_tunnel, &orig_tunnel,
> + &flow->tunnel);
> + if (err) {
> + return err;
> + }
> + }
> +
> if (OVS_UNLIKELY(!VLOG_DROP_DBG(&upcall_rl))) {
> struct ds ds = DS_EMPTY_INITIALIZER;
> char *packet_str;
> @@ -3046,8 +3060,39 @@ dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd,
> struct dp_packet *packet_,
> ds_destroy(&ds);
> }
>
> - return dp->upcall_cb(packet_, flow, ufid, pmd->core_id, type, userdata,
> - actions, wc, put_actions, dp->upcall_aux);
> + err = dp->upcall_cb(packet_, flow, ufid, pmd->core_id, type, userdata,
> + actions, wc, put_actions, dp->upcall_aux);
> + if (err && err != ENOSPC) {
> + return err;
> + }
> +
> + if (wc) {
> + if (wc->masks.tunnel.metadata.present.map) {
> + struct geneve_opt opts[GENEVE_TOT_OPT_SIZE /
> + sizeof(struct geneve_opt)];
> +
> + tun_metadata_to_geneve_udpif_mask(&flow->tunnel,
> + &wc->masks.tunnel,
> +
> orig_tunnel.metadata.opts.gnv,
> +
> orig_tunnel.metadata.present.len,
> + opts);
> +
> + memset(&wc->masks.tunnel.metadata, 0,
> + sizeof wc->masks.tunnel.metadata);
> + memcpy(&wc->masks.tunnel.metadata.opts.gnv, opts,
> + orig_tunnel.metadata.present.len);
> + }
> + wc->masks.tunnel.metadata.present.len = 0xff;
> + }
> +
> + if (orig_tunnel.flags & FLOW_TNL_F_UDPIF) {
> + memcpy(&flow->tunnel.metadata.opts.gnv,
> orig_tunnel.metadata.opts.gnv,
> + orig_tunnel.metadata.present.len);
> + flow->tunnel.metadata.present.len =
> orig_tunnel.metadata.present.len;
> + flow->tunnel.flags |= FLOW_TNL_F_UDPIF;
> + }
> +
> + return err;
> }
>
> static inline uint32_t
> diff --git a/lib/flow.c b/lib/flow.c
> index 352e9b8..d3d25e4 100644
> --- a/lib/flow.c
> +++ b/lib/flow.c
> @@ -462,9 +462,22 @@ miniflow_extract(struct dp_packet *packet, struct
> miniflow *dst)
> miniflow_push_words(mf, tunnel, &md->tunnel,
> offsetof(struct flow_tnl, metadata) /
> sizeof(uint64_t));
> - if (md->tunnel.metadata.opt_map) {
> - miniflow_push_words(mf, tunnel.metadata, &md->tunnel.metadata,
> - sizeof md->tunnel.metadata /
> sizeof(uint64_t));
> +
> + if (!(md->tunnel.flags & FLOW_TNL_F_UDPIF)) {
> + if (md->tunnel.metadata.present.map) {
> + miniflow_push_words(mf, tunnel.metadata, &md-
> >tunnel.metadata,
> + sizeof md->tunnel.metadata /
> + sizeof(uint64_t));
> + }
> + } else {
> + if (md->tunnel.metadata.present.len) {
> + miniflow_push_words(mf, tunnel.metadata.present,
> + &md->tunnel.metadata.present, 1);
> + miniflow_push_words(mf, tunnel.metadata.opts.gnv,
> + md->tunnel.metadata.opts.gnv,
> + DIV_ROUND_UP(md-
> >tunnel.metadata.present.len,
> + sizeof(uint64_t)));
> + }
> }
> }
> if (md->skb_priority || md->pkt_mark) {
> @@ -815,7 +828,7 @@ flow_get_metadata(const struct flow *flow, struct match
> *flow_metadata)
> if (flow->tunnel.gbp_flags) {
> match_set_tun_gbp_flags(flow_metadata, flow->tunnel.gbp_flags);
> }
> - tun_metadata_get_fmd(&flow->tunnel.metadata, flow_metadata);
> + tun_metadata_get_fmd(&flow->tunnel, flow_metadata);
> if (flow->metadata != htonll(0)) {
> match_set_metadata(flow_metadata, flow->metadata);
> }
> @@ -1161,9 +1174,16 @@ void flow_wildcards_init_for_packet(struct
> flow_wildcards *wc,
> WC_MASK_FIELD(wc, tunnel.gbp_id);
> WC_MASK_FIELD(wc, tunnel.gbp_flags);
>
> - if (flow->tunnel.metadata.opt_map) {
> - wc->masks.tunnel.metadata.opt_map = flow-
> >tunnel.metadata.opt_map;
> - WC_MASK_FIELD(wc, tunnel.metadata.opts);
> + if (!(flow->tunnel.flags & FLOW_TNL_F_UDPIF)) {
> + if (flow->tunnel.metadata.present.map) {
> + wc->masks.tunnel.metadata.present.map =
> + flow-
> >tunnel.metadata.present.map;
> + WC_MASK_FIELD(wc, tunnel.metadata.opts.u8);
> + }
> + } else {
> + WC_MASK_FIELD(wc, tunnel.metadata.present.len);
> + memset(wc->masks.tunnel.metadata.opts.gnv, 0xff,
> + flow->tunnel.metadata.present.len);
> }
> } else if (flow->tunnel.tun_id) {
> WC_MASK_FIELD(wc, tunnel.tun_id);
> @@ -1253,9 +1273,17 @@ flow_wc_map(const struct flow *flow, struct miniflow
> *map)
>
> map->tnl_map = 0;
> if (flow->tunnel.ip_dst) {
> - map->tnl_map = MINIFLOW_TNL_MAP(tunnel);
> - if (!flow->tunnel.metadata.opt_map) {
> - map->tnl_map &= ~MINIFLOW_TNL_MAP(tunnel.metadata);
> + map->tnl_map |= MINIFLOW_TNL_MAP__(tunnel,
> + offsetof(struct flow_tnl,
> metadata));
> + if (!(flow->tunnel.flags & FLOW_TNL_F_UDPIF)) {
> + if (flow->tunnel.metadata.present.map) {
> + map->tnl_map |= MINIFLOW_TNL_MAP__(tunnel.metadata,
> + sizeof(flow-
> >tunnel.metadata));
> + }
> + } else {
> + map->tnl_map |= MINIFLOW_TNL_MAP(tunnel.metadata.present.len);
> + map->tnl_map |= MINIFLOW_TNL_MAP__(tunnel.metadata.opts.gnv,
> + flow-
> >tunnel.metadata.present.len);
> }
> }
>
> diff --git a/lib/flow.h b/lib/flow.h
> index 96aa4aa..5bc9267 100644
> --- a/lib/flow.h
> +++ b/lib/flow.h
> @@ -80,6 +80,12 @@ BUILD_ASSERT_DECL(FLOW_TNL_F_OAM == NX_TUN_FLAG_OAM);
>
> #define FLOW_TNL_F_MASK ((1 << 4) - 1)
>
> +/* Purely internal to OVS userspace. These flags should never be exposed to
> + * the outside world and so aren't included in the flags mask. */
> +
> +/* Tunnel information is in userspace datapath format. */
> +#define FLOW_TNL_F_UDPIF (1 << 4)
> +
> const char *flow_tun_flag_to_string(uint32_t flags);
>
> /* Maximum number of supported MPLS labels. */
> @@ -518,9 +524,12 @@ flow_values_get_next_in_maps(struct
> flow_for_each_in_maps_aux *aux,
> #define FLOW_U64_SIZE(FIELD) \
> DIV_ROUND_UP(sizeof(((struct flow *)0)->FIELD), sizeof(uint64_t))
>
> -#define MINIFLOW_TNL_MAP(FIELD) \
> - (((UINT64_C(1) << FLOW_U64_SIZE(FIELD)) - 1) \
> +#define MINIFLOW_TNL_MAP__(FIELD, LEN) \
> + (((UINT64_C(1) << DIV_ROUND_UP(LEN, sizeof(uint64_t))) - 1) \
> << (offsetof(struct flow, FIELD) / sizeof(uint64_t)))
> +
> +#define MINIFLOW_TNL_MAP(FIELD) \
> + MINIFLOW_TNL_MAP__(FIELD, sizeof(((struct flow *)0)->FIELD))
> #define MINIFLOW_PKT_MAP(FIELD) \
> (((UINT64_C(1) << FLOW_U64_SIZE(FIELD)) - 1) \
> << ((offsetof(struct flow, FIELD) / sizeof(uint64_t)) -
> FLOW_TNL_U64S))
> diff --git a/lib/geneve.h b/lib/geneve.h
> new file mode 100644
> index 0000000..f0256b1
> --- /dev/null
> +++ b/lib/geneve.h
> @@ -0,0 +1,63 @@
> +/*
> + * Copyright (c) 2015 Nicira, Inc.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#ifndef GENEVE_H
> +#define GENEVE_H 1
> +
> +#include "openvswitch/types.h"
> +
> +#define GENEVE_MAX_OPT_SIZE 124
> +#define GENEVE_TOT_OPT_SIZE 252
> +
> +#define GENEVE_CRIT_OPT_TYPE (1 << 7)
> +
> +struct geneve_opt {
> + ovs_be16 opt_class;
> + uint8_t type;
> +#ifdef WORDS_BIGENDIAN
> + uint8_t r1:1;
> + uint8_t r2:1;
> + uint8_t r3:1;
> + uint8_t length:5;
> +#else
> + uint8_t length:5;
> + uint8_t r3:1;
> + uint8_t r2:1;
> + uint8_t r1:1;
> +#endif
> + /* Option data */
> +};
> +
> +struct genevehdr {
> +#ifdef WORDS_BIGENDIAN
> + uint8_t ver:2;
> + uint8_t opt_len:6;
> + uint8_t oam:1;
> + uint8_t critical:1;
> + uint8_t rsvd1:6;
> +#else
> + uint8_t opt_len:6;
> + uint8_t ver:2;
> + uint8_t rsvd1:6;
> + uint8_t critical:1;
> + uint8_t oam:1;
> +#endif
> + ovs_be16 proto_type;
> + ovs_16aligned_be32 vni;
> + struct geneve_opt options[];
> +};
> +
> +#endif /* geneve.h */
> diff --git a/lib/meta-flow.c b/lib/meta-flow.c
> index 0c01414..4c7cf2c 100644
> --- a/lib/meta-flow.c
> +++ b/lib/meta-flow.c
> @@ -196,7 +196,7 @@ mf_is_all_wild(const struct mf_field *mf, const struct
> flow_wildcards *wc)
> CASE_MFF_TUN_METADATA: {
> union mf_value value;
>
> - tun_metadata_read(&wc->masks.tunnel.metadata, mf, &value);
> + tun_metadata_read(&wc->masks.tunnel, mf, &value);
> return is_all_zeros(&value.tun_metadata, mf->n_bytes);
> }
> case MFF_METADATA:
> @@ -616,7 +616,7 @@ mf_get_value(const struct mf_field *mf, const struct
> flow *flow,
> value->u8 = flow->tunnel.ip_tos;
> break;
> CASE_MFF_TUN_METADATA:
> - tun_metadata_read(&flow->tunnel.metadata, mf, value);
> + tun_metadata_read(&flow->tunnel, mf, value);
> break;
>
> case MFF_METADATA:
> @@ -1119,7 +1119,7 @@ mf_set_flow_value(const struct mf_field *mf,
> flow->tunnel.ip_ttl = value->u8;
> break;
> CASE_MFF_TUN_METADATA:
> - tun_metadata_write(&flow->tunnel.metadata, mf, value);
> + tun_metadata_write(&flow->tunnel, mf, value);
> break;
> case MFF_METADATA:
> flow->metadata = value->be64;
> diff --git a/lib/netdev-vport.c b/lib/netdev-vport.c
> index a3394dd..a0e53b8 100644
> --- a/lib/netdev-vport.c
> +++ b/lib/netdev-vport.c
> @@ -1054,11 +1054,10 @@ parse_gre_header(struct dp_packet *packet,
> static void
> pkt_metadata_init_tnl(struct pkt_metadata *md)
> {
> - memset(md, 0, offsetof(struct pkt_metadata, tunnel.metadata));
> -
> - /* If 'opt_map' is zero then none of the rest of the tunnel metadata
> - * will be read, so we can skip clearing it. */
> - md->tunnel.metadata.opt_map = 0;
> + /* Zero up through the tunnel metadata options. The length and table
> + * are before this and as long as they are empty, the options won't
> + * be looked at. */
> + memset(md, 0, offsetof(struct pkt_metadata, tunnel.metadata.opts));
> }
>
> static int
> @@ -1208,8 +1207,7 @@ netdev_geneve_pop_header(struct dp_packet *packet)
> struct pkt_metadata *md = &packet->md;
> struct flow_tnl *tnl = &md->tunnel;
> struct genevehdr *gnh;
> - unsigned int hlen;
> - int err;
> + unsigned int hlen, opts_len;
>
> pkt_metadata_init_tnl(md);
> if (GENEVE_BASE_HLEN > dp_packet_size(packet)) {
> @@ -1223,7 +1221,8 @@ netdev_geneve_pop_header(struct dp_packet *packet)
> return EINVAL;
> }
>
> - hlen = GENEVE_BASE_HLEN + gnh->opt_len * 4;
> + opts_len = gnh->opt_len * 4;
> + hlen = GENEVE_BASE_HLEN + opts_len;
> if (hlen > dp_packet_size(packet)) {
> VLOG_WARN_RL(&err_rl, "geneve packet too small: header len=%u
> packet size=%u\n",
> hlen, dp_packet_size(packet));
> @@ -1245,12 +1244,9 @@ netdev_geneve_pop_header(struct dp_packet *packet)
> tnl->tun_id = htonll(ntohl(get_16aligned_be32(&gnh->vni)) >> 8);
> tnl->flags |= FLOW_TNL_F_KEY;
>
> - err = tun_metadata_from_geneve_header(gnh->options, gnh->opt_len * 4,
> - &tnl->metadata);
> - if (err) {
> - VLOG_WARN_RL(&err_rl, "invalid geneve options");
> - return err;
> - }
> + memcpy(tnl->metadata.opts.gnv, gnh->options, opts_len);
> + tnl->metadata.present.len = opts_len;
> + tnl->flags |= FLOW_TNL_F_UDPIF;
>
> dp_packet_reset_packet(packet, hlen);
>
> @@ -1278,7 +1274,7 @@ netdev_geneve_build_header(const struct netdev
> *netdev,
>
> ovs_mutex_unlock(&dev->mutex);
>
> - opt_len = tun_metadata_to_geneve_header(&tnl_flow->tunnel.metadata,
> + opt_len = tun_metadata_to_geneve_header(&tnl_flow->tunnel,
> gnh->options, &crit_opt);
>
> gnh->opt_len = opt_len / 4;
> diff --git a/lib/odp-execute.c b/lib/odp-execute.c
> index c676451..c4806e1 100644
> --- a/lib/odp-execute.c
> +++ b/lib/odp-execute.c
> @@ -151,7 +151,7 @@ odp_set_tunnel_action(const struct nlattr *a, struct
> flow_tnl *tun_key)
> {
> enum odp_key_fitness fitness;
>
> - fitness = odp_tun_key_from_attr(a, tun_key);
> + fitness = odp_tun_key_from_attr(a, true, tun_key);
> ovs_assert(fitness != ODP_FIT_ERROR);
> }
>
> diff --git a/lib/odp-util.c b/lib/odp-util.c
> index eec0bfb..f142f03 100644
> --- a/lib/odp-util.c
> +++ b/lib/odp-util.c
> @@ -1264,7 +1264,8 @@ ovs_frag_type_to_string(enum ovs_frag_type type)
> static enum odp_key_fitness
> odp_tun_key_from_attr__(const struct nlattr *attr,
> const struct nlattr *flow_attrs, size_t
> flow_attr_len,
> - const struct flow_tnl *src_tun, struct flow_tnl
> *tun)
> + const struct flow_tnl *src_tun, struct flow_tnl
> *tun,
> + bool udpif)
> {
> unsigned int left;
> const struct nlattr *a;
> @@ -1335,8 +1336,7 @@ odp_tun_key_from_attr__(const struct nlattr *attr,
> }
> case OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS:
> if (tun_metadata_from_geneve_nlattr(a, flow_attrs,
> flow_attr_len,
> - &src_tun->metadata,
> - &tun->metadata)) {
> + src_tun, udpif, tun)) {
> return ODP_FIT_ERROR;
> }
> break;
> @@ -1359,10 +1359,11 @@ odp_tun_key_from_attr__(const struct nlattr *attr,
> }
>
> enum odp_key_fitness
> -odp_tun_key_from_attr(const struct nlattr *attr, struct flow_tnl *tun)
> +odp_tun_key_from_attr(const struct nlattr *attr, bool udpif,
> + struct flow_tnl *tun)
> {
> memset(tun, 0, sizeof *tun);
> - return odp_tun_key_from_attr__(attr, NULL, 0, NULL, tun);
> + return odp_tun_key_from_attr__(attr, NULL, 0, NULL, tun, udpif);
> }
>
> static void
> @@ -1411,13 +1412,7 @@ tun_key_to_attr(struct ofpbuf *a, const struct
> flow_tnl *tun_key,
> (tun_key->gbp_flags << 16) | ntohs(tun_key-
> >gbp_id));
> nl_msg_end_nested(a, vxlan_opts_ofs);
> }
> -
> - if (tun_key == tun_flow_key) {
> - tun_metadata_to_geneve_nlattr_flow(&tun_key->metadata, a);
> - } else {
> - tun_metadata_to_geneve_nlattr_mask(key_buf, &tun_key->metadata,
> - &tun_flow_key->metadata, a);
> - }
> + tun_metadata_to_geneve_nlattr(tun_key, tun_flow_key, key_buf, a);
>
> nl_msg_end_nested(a, tun_key_ofs);
> }
> @@ -3597,7 +3592,7 @@ odp_key_to_pkt_metadata(const struct nlattr *key,
> size_t key_len,
> case OVS_KEY_ATTR_TUNNEL: {
> enum odp_key_fitness res;
>
> - res = odp_tun_key_from_attr(nla, &md->tunnel);
> + res = odp_tun_key_from_attr(nla, true, &md->tunnel);
> if (res == ODP_FIT_ERROR) {
> memset(&md->tunnel, 0, sizeof md->tunnel);
> } else if (res == ODP_FIT_PERFECT) {
> @@ -4107,7 +4102,8 @@ parse_8021q_onward(const struct nlattr
> *attrs[OVS_KEY_ATTR_MAX + 1],
> static enum odp_key_fitness
> odp_flow_key_to_flow__(const struct nlattr *key, size_t key_len,
> const struct nlattr *src_key, size_t src_key_len,
> - struct flow *flow, const struct flow *src_flow)
> + struct flow *flow, const struct flow *src_flow,
> + bool udpif)
> {
> const struct nlattr *attrs[OVS_KEY_ATTR_MAX + 1];
> uint64_t expected_attrs;
> @@ -4150,9 +4146,10 @@ odp_flow_key_to_flow__(const struct nlattr *key,
> size_t key_len,
> if (present_attrs & (UINT64_C(1) << OVS_KEY_ATTR_TUNNEL)) {
> enum odp_key_fitness res;
>
> - res = odp_tun_key_from_attr__(attrs[OVS_KEY_ATTR_TUNNEL], src_key,
> + res = odp_tun_key_from_attr__(attrs[OVS_KEY_ATTR_TUNNEL],
> + is_mask ? src_key : NULL,
> src_key_len, &src_flow->tunnel,
> - &flow->tunnel);
> + &flow->tunnel, udpif);
> if (res == ODP_FIT_ERROR) {
> return ODP_FIT_ERROR;
> } else if (res == ODP_FIT_PERFECT) {
> @@ -4224,7 +4221,7 @@ enum odp_key_fitness
> odp_flow_key_to_flow(const struct nlattr *key, size_t key_len,
> struct flow *flow)
> {
> - return odp_flow_key_to_flow__(key, key_len, NULL, 0, flow, flow);
> + return odp_flow_key_to_flow__(key, key_len, NULL, 0, flow, flow, false);
> }
>
> /* Converts the 'mask_key_len' bytes of OVS_KEY_ATTR_* attributes in
> 'mask_key'
> @@ -4238,7 +4235,32 @@ odp_flow_key_to_mask(const struct nlattr *mask_key,
> size_t mask_key_len,
> struct flow *mask, const struct flow *flow)
> {
> return odp_flow_key_to_flow__(mask_key, mask_key_len, flow_key,
> flow_key_len,
> - mask, flow);
> + mask, flow, false);
> +}
> +
> +/* These functions are similar to their non-"_udpif" variants but output a
> + * 'flow' that is suitable for fast-path packet processing.
> + *
> + * Some fields have different representation for flow setup and per-
> + * packet processing (i.e. different between ofproto-dpif and userspace
> + * datapath). In particular, with the non-"_udpif" functions, struct
> + * tun_metadata is in the per-flow format (using 'present.map' and
> 'opts.u8');
> + * with these functions, struct tun_metadata is in the per-packet format
> + * (using 'present.len' and 'opts.gnv'). */
> +enum odp_key_fitness
> +odp_flow_key_to_flow_udpif(const struct nlattr *key, size_t key_len,
> + struct flow *flow)
> +{
> + return odp_flow_key_to_flow__(key, key_len, NULL, 0, flow, flow, true);
> +}
> +
> +enum odp_key_fitness
> +odp_flow_key_to_mask_udpif(const struct nlattr *mask_key, size_t
> mask_key_len,
> + const struct nlattr *flow_key, size_t
> flow_key_len,
> + struct flow *mask, const struct flow *flow)
> +{
> + return odp_flow_key_to_flow__(mask_key, mask_key_len, flow_key,
> flow_key_len,
> + mask, flow, true);
> }
>
> /* Returns 'fitness' as a string, for use in debug messages. */
> diff --git a/lib/odp-util.h b/lib/odp-util.h
> index 1eaa06b..bc27794 100644
> --- a/lib/odp-util.h
> +++ b/lib/odp-util.h
> @@ -144,7 +144,7 @@ struct odputil_keybuf {
> uint32_t keybuf[DIV_ROUND_UP(ODPUTIL_FLOW_KEY_BYTES, 4)];
> };
>
> -enum odp_key_fitness odp_tun_key_from_attr(const struct nlattr *,
> +enum odp_key_fitness odp_tun_key_from_attr(const struct nlattr *, bool
> udpif,
> struct flow_tnl *);
>
> int odp_ufid_from_string(const char *s_, ovs_u128 *ufid);
> @@ -225,6 +225,16 @@ enum odp_key_fitness odp_flow_key_to_mask(const struct
> nlattr *mask_key,
> size_t flow_key_len,
> struct flow *mask,
> const struct flow *flow);
> +
> +enum odp_key_fitness odp_flow_key_to_flow_udpif(const struct nlattr *,
> size_t,
> + struct flow *);
> +enum odp_key_fitness odp_flow_key_to_mask_udpif(const struct nlattr
> *mask_key,
> + size_t mask_key_len,
> + const struct nlattr
> *flow_key,
> + size_t flow_key_len,
> + struct flow *mask,
> + const struct flow *flow);
> +
> const char *odp_key_fitness_to_string(enum odp_key_fitness);
>
> void commit_odp_tunnel_action(const struct flow *, struct flow *base,
> diff --git a/lib/packets.h b/lib/packets.h
> index c709af5..38af37b 100644
> --- a/lib/packets.h
> +++ b/lib/packets.h
> @@ -23,6 +23,7 @@
> #include <stdint.h>
> #include <string.h>
> #include "compiler.h"
> +#include "geneve.h"
> #include "openvswitch/types.h"
> #include "random.h"
> #include "hash.h"
> @@ -802,46 +803,6 @@ static inline bool dl_type_is_ip_any(ovs_be16 dl_type)
> }
>
> /* Tunnel header */
> -#define GENEVE_MAX_OPT_SIZE 124
> -#define GENEVE_TOT_OPT_SIZE 252
> -
> -#define GENEVE_CRIT_OPT_TYPE (1 << 7)
> -
> -struct geneve_opt {
> - ovs_be16 opt_class;
> - uint8_t type;
> -#ifdef WORDS_BIGENDIAN
> - uint8_t r1:1;
> - uint8_t r2:1;
> - uint8_t r3:1;
> - uint8_t length:5;
> -#else
> - uint8_t length:5;
> - uint8_t r3:1;
> - uint8_t r2:1;
> - uint8_t r1:1;
> -#endif
> - /* Option data */
> -};
> -
> -struct genevehdr {
> -#ifdef WORDS_BIGENDIAN
> - uint8_t ver:2;
> - uint8_t opt_len:6;
> - uint8_t oam:1;
> - uint8_t critical:1;
> - uint8_t rsvd1:6;
> -#else
> - uint8_t opt_len:6;
> - uint8_t ver:2;
> - uint8_t rsvd1:6;
> - uint8_t critical:1;
> - uint8_t oam:1;
> -#endif
> - ovs_be16 proto_type;
> - ovs_16aligned_be32 vni;
> - struct geneve_opt options[];
> -};
>
> /* GRE protocol header */
> struct gre_base_hdr {
> diff --git a/lib/tun-metadata.c b/lib/tun-metadata.c
> index 7d82fb7..216d5e4 100644
> --- a/lib/tun-metadata.c
> +++ b/lib/tun-metadata.c
> @@ -226,7 +226,7 @@ tun_metadata_table_request(struct
> ofputil_geneve_table_reply *gtr)
> }
> }
>
> -/* Copies the value of field 'mf' from 'metadata' into 'value'.
> +/* Copies the value of field 'mf' from 'tnl' (which must be in non-UDPIF
> format) into 'value'.
> *
> * 'mf' must be an MFF_TUN_METADATA* field.
> *
> @@ -234,7 +234,7 @@ tun_metadata_table_request(struct
> ofputil_geneve_table_reply *gtr)
> * tun_metadata_init(). If no such table has been created or if 'mf'
> hasn't
> * been allocated in it yet, this just zeros 'value'. */
> void
> -tun_metadata_read(const struct tun_metadata *metadata,
> +tun_metadata_read(const struct flow_tnl *tnl,
> const struct mf_field *mf, union mf_value *value)
> {
> struct tun_table *map = ovsrcu_get(struct tun_table *, &metadata_tab);
> @@ -250,10 +250,10 @@ tun_metadata_read(const struct tun_metadata *metadata,
>
> memset(value->tun_metadata, 0, mf->n_bytes - loc->len);
> memcpy_from_metadata(value->tun_metadata + mf->n_bytes - loc->len,
> - metadata, loc);
> + &tnl->metadata, loc);
> }
>
> -/* Copies 'value' into field 'mf' in 'metadata'.
> +/* Copies 'value' into field 'mf' in 'tnl' (in non-UDPIF format).
> *
> * 'mf' must be an MFF_TUN_METADATA* field.
> *
> @@ -261,7 +261,7 @@ tun_metadata_read(const struct tun_metadata *metadata,
> * tun_metadata_init(). If no such table has been created or if 'mf'
> hasn't
> * been allocated in it yet, this function does nothing. */
> void
> -tun_metadata_write(struct tun_metadata *metadata,
> +tun_metadata_write(struct flow_tnl *tnl,
> const struct mf_field *mf, const union mf_value *value)
> {
> struct tun_table *map = ovsrcu_get(struct tun_table *, &metadata_tab);
> @@ -274,9 +274,9 @@ tun_metadata_write(struct tun_metadata *metadata,
>
> loc = &map->entries[idx].loc;
>
> - ULLONG_SET1(metadata->opt_map, idx);
> - memcpy_to_metadata(metadata, value->tun_metadata + mf->n_bytes - loc-
> >len,
> - loc);
> + ULLONG_SET1(tnl->metadata.present.map, idx);
> + memcpy_to_metadata(&tnl->metadata,
> + value->tun_metadata + mf->n_bytes - loc->len, loc);
> }
>
> static const struct tun_metadata_loc *
> @@ -310,7 +310,7 @@ metadata_loc_from_match(struct tun_table *map, struct
> match *match,
>
> /* Makes 'match' match 'value'/'mask' on field 'mf'.
> *
> - * 'mf' must be an MFF_TUN_METADATA* field.
> + * 'mf' must be an MFF_TUN_METADATA* field. 'match' must be in non-UDPIF
> format.
> *
> * If there is global tunnel metadata matching table, this function is
> * effective only if there is already a mapping for 'mf'. Otherwise, the
> @@ -334,6 +334,8 @@ tun_metadata_set_match(const struct mf_field *mf, const
> union mf_value *value,
> unsigned int data_offset;
> union mf_value data;
>
> + ovs_assert(!(match->flow.tunnel.flags & FLOW_TNL_F_UDPIF));
> +
> field_len = mf_field_len(mf, value, mask);
> loc = metadata_loc_from_match(map, match, idx, field_len);
> if (!loc) {
> @@ -353,7 +355,7 @@ tun_metadata_set_match(const struct mf_field *mf, const
> union mf_value *value,
> mask->tun_metadata[data_offset + i];
> }
> }
> - ULLONG_SET1(match->flow.tunnel.metadata.opt_map, idx);
> + ULLONG_SET1(match->flow.tunnel.metadata.present.map, idx);
> memcpy_to_metadata(&match->flow.tunnel.metadata, data.tun_metadata,
> loc);
>
> if (!value) {
> @@ -363,31 +365,67 @@ tun_metadata_set_match(const struct mf_field *mf,
> const union mf_value *value,
> } else {
> memcpy(data.tun_metadata, mask->tun_metadata + data_offset, loc-
> >len);
> }
> - ULLONG_SET1(match->wc.masks.tunnel.metadata.opt_map, idx);
> + ULLONG_SET1(match->wc.masks.tunnel.metadata.present.map, idx);
> memcpy_to_metadata(&match->wc.masks.tunnel.metadata, data.tun_metadata,
> loc);
> }
>
> -/* Copies all MFF_TUN_METADATA* fields from 'metadata' to 'flow_metadata'.
> */
> +static bool
> +udpif_to_parsed(const struct flow_tnl *flow, const struct flow_tnl *mask,
> + struct flow_tnl *flow_xlate, struct flow_tnl *mask_xlate)
> +{
> + if (flow->flags & FLOW_TNL_F_UDPIF) {
> + int err;
> +
> + err = tun_metadata_from_geneve_udpif(flow, flow, flow_xlate);
> + if (err) {
> + return false;
> + }
> +
> + if (mask) {
> + tun_metadata_from_geneve_udpif(flow, mask, mask_xlate);
> + if (err) {
> + return false;
> + }
> + }
> + } else {
> + if (flow->metadata.present.map == 0) {
> + /* There is no tunnel metadata, don't bother copying. */
> + return false;
> + }
> +
> + memcpy(flow_xlate, flow, sizeof *flow_xlate);
> + if (mask) {
> + memcpy(mask_xlate, mask, sizeof *mask_xlate);
> + }
> +
> + if (!flow_xlate->metadata.tab) {
> + flow_xlate->metadata.tab = ovsrcu_get(struct tun_table *,
> + &metadata_tab);
> + }
> + }
> +
> + return true;
> +}
> +
> +/* Copies all MFF_TUN_METADATA* fields from 'tnl' to 'flow_metadata'. */
> void
> -tun_metadata_get_fmd(const struct tun_metadata *metadata,
> - struct match *flow_metadata)
> +tun_metadata_get_fmd(const struct flow_tnl *tnl, struct match
> *flow_metadata)
> {
> - struct tun_table *map;
> + struct flow_tnl flow;
> int i;
>
> - map = metadata->tab;
> - if (!map) {
> - map = ovsrcu_get(struct tun_table *, &metadata_tab);
> + if (!udpif_to_parsed(tnl, NULL, &flow, NULL)) {
> + return;
> }
>
> - ULLONG_FOR_EACH_1 (i, metadata->opt_map) {
> + ULLONG_FOR_EACH_1 (i, flow.metadata.present.map) {
> union mf_value opts;
> - const struct tun_metadata_loc *old_loc = &map->entries[i].loc;
> + const struct tun_metadata_loc *old_loc = &flow.metadata.tab-
> >entries[i].loc;
> const struct tun_metadata_loc *new_loc;
>
> new_loc = metadata_loc_from_match(NULL, flow_metadata, i, old_loc-
> >len);
>
> - memcpy_from_metadata(opts.tun_metadata, metadata, old_loc);
> + memcpy_from_metadata(opts.tun_metadata, &flow.metadata, old_loc);
> memcpy_to_metadata(&flow_metadata->flow.tunnel.metadata,
> opts.tun_metadata, new_loc);
>
> @@ -424,7 +462,7 @@ memcpy_to_metadata(struct tun_metadata *dst, const void
> *src,
> int addr = 0;
>
> while (chain) {
> - memcpy(dst->opts + loc->c.offset + addr, (uint8_t *)src + addr,
> + memcpy(dst->opts.u8 + loc->c.offset + addr, (uint8_t *)src + addr,
> chain->len);
> addr += chain->len;
> chain = chain->next;
> @@ -439,7 +477,7 @@ memcpy_from_metadata(void *dst, const struct
> tun_metadata *src,
> int addr = 0;
>
> while (chain) {
> - memcpy((uint8_t *)dst + addr, src->opts + loc->c.offset + addr,
> + memcpy((uint8_t *)dst + addr, src->opts.u8 + loc->c.offset + addr,
> chain->len);
> addr += chain->len;
> chain = chain->next;
> @@ -579,10 +617,21 @@ tun_metadata_del_entry(struct tun_table *map, uint8_t
> idx)
> }
>
> static int
> -tun_metadata_from_geneve__(struct tun_table *map, const struct geneve_opt
> *opt,
> +tun_metadata_from_geneve__(const struct tun_metadata *flow_metadata,
> + const struct geneve_opt *opt,
> const struct geneve_opt *flow_opt, int opts_len,
> struct tun_metadata *metadata)
> {
> + struct tun_table *map;
> + bool is_mask = flow_opt != opt;
> +
> + if (!is_mask) {
> + map = ovsrcu_get(struct tun_table *, &metadata_tab);
> + metadata->tab = map;
> + } else {
> + map = flow_metadata->tab;
> + }
> +
> if (!map) {
> return 0;
> }
> @@ -606,7 +655,7 @@ tun_metadata_from_geneve__(struct tun_table *map, const
> struct geneve_opt *opt,
> if (entry) {
> if (entry->loc.len == flow_opt->length * 4) {
> memcpy_to_metadata(metadata, opt + 1, &entry->loc);
> - ULLONG_SET1(metadata->opt_map, entry - map->entries);
> + ULLONG_SET1(metadata->present.map, entry - map->entries);
> } else {
> return EINVAL;
> }
> @@ -622,59 +671,97 @@ tun_metadata_from_geneve__(struct tun_table *map,
> const struct geneve_opt *opt,
> return 0;
> }
>
> +static const struct nlattr *
> +tun_metadata_find_geneve_key(const struct nlattr *key, uint32_t key_len)
> +{
> + const struct nlattr *tnl_key;
> +
> + tnl_key = nl_attr_find__(key, key_len, OVS_KEY_ATTR_TUNNEL);
> + if (!tnl_key) {
> + return NULL;
> + }
> +
> + return nl_attr_find_nested(tnl_key, OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS);
> +}
> +
> +/* Converts from Geneve netlink attributes in 'attr' to tunnel metadata
> + * in 'tun'. The result may be in either UDPIF format or not, as determined
> + * by 'udpif'.
> + *
> + * In the event that a mask is being converted, it is also necessary to
> + * pass in flow information. This includes the full set of netlink
> attributes
> + * (i.e. not just the Geneve attribute) in 'flow_attrs'/'flow_attr_len' and
> + * the previously converted tunnel metadata 'flow_tun'.
> + *
> + * If a flow rather than mask is being converted, 'flow_attrs' must be
> NULL. */
> int
> tun_metadata_from_geneve_nlattr(const struct nlattr *attr,
> const struct nlattr *flow_attrs,
> size_t flow_attr_len,
> - const struct tun_metadata *flow_metadata,
> - struct tun_metadata *metadata)
> + const struct flow_tnl *flow_tun, bool
> udpif,
> + struct flow_tnl *tun)
> {
> - struct tun_table *map;
> bool is_mask = !!flow_attrs;
> + int attr_len = nl_attr_get_size(attr);
> const struct nlattr *flow;
>
> - if (is_mask) {
> - const struct nlattr *tnl_key;
> - int mask_len = nl_attr_get_size(attr);
> + /* No need for real translation, just copy things over. */
> + if (udpif) {
> + memcpy(tun->metadata.opts.gnv, nl_attr_get(attr), attr_len);
>
> - tnl_key = nl_attr_find__(flow_attrs, flow_attr_len,
> OVS_KEY_ATTR_TUNNEL);
> - if (!tnl_key) {
> - return mask_len ? EINVAL : 0;
> + if (!is_mask) {
> + tun->metadata.present.len = attr_len;
> + tun->flags |= FLOW_TNL_F_UDPIF;
> + } else {
> + /* We need to exact match on the length so we don't
> + * accidentally match on sets of options that are the same
> + * at the beginning but with additional options after. */
> + tun->metadata.present.len = 0xff;
> }
>
> - flow = nl_attr_find_nested(tnl_key,
> OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS);
> + return 0;
> + }
> +
> + if (is_mask) {
> + flow = tun_metadata_find_geneve_key(flow_attrs, flow_attr_len);
> if (!flow) {
> - return mask_len ? EINVAL : 0;
> + return attr_len ? EINVAL : 0;
> }
>
> - if (mask_len != nl_attr_get_size(flow)) {
> + if (attr_len != nl_attr_get_size(flow)) {
> return EINVAL;
> }
> } else {
> flow = attr;
> }
>
> - if (!is_mask) {
> - map = ovsrcu_get(struct tun_table *, &metadata_tab);
> - metadata->tab = map;
> - } else {
> - map = flow_metadata->tab;
> - }
> -
> - return tun_metadata_from_geneve__(map, nl_attr_get(attr),
> nl_attr_get(flow),
> - nl_attr_get_size(flow), metadata);
> + return tun_metadata_from_geneve__(&flow_tun->metadata,
> nl_attr_get(attr),
> + nl_attr_get(flow),
> nl_attr_get_size(flow),
> + &tun->metadata);
> }
>
> +/* Converts from the flat Geneve options representation extracted directly
> + * from the tunnel header to the representation that maps options to
> + * pre-allocated locations. The original version (in UDPIF form) is passed
> + * in 'src' and the translated form is stored in 'dst'. To handle masks,
> the
> + * flow must also be passed in through 'flow' (in the original, raw form).
> */
> int
> -tun_metadata_from_geneve_header(const struct geneve_opt *opts, int opt_len,
> - struct tun_metadata *metadata)
> +tun_metadata_from_geneve_udpif(const struct flow_tnl *flow,
> + const struct flow_tnl *src,
> + struct flow_tnl *dst)
> {
> - struct tun_table *map;
> -
> - map = ovsrcu_get(struct tun_table *, &metadata_tab);
> - metadata->tab = map;
> + ovs_assert(flow->flags & FLOW_TNL_F_UDPIF);
>
> - return tun_metadata_from_geneve__(map, opts, opts, opt_len, metadata);
> + if (flow == src) {
> + dst->flags = flow->flags & ~FLOW_TNL_F_UDPIF;
> + } else {
> + dst->metadata.tab = NULL;
> + }
> + dst->metadata.present.map = 0;
> + return tun_metadata_from_geneve__(&flow->metadata, src->metadata.opts.gnv,
> + flow->metadata.opts.gnv,
> + flow->metadata.present.len,
> + &dst->metadata);
> }
>
> static void
> @@ -691,7 +778,7 @@ tun_metadata_to_geneve__(const struct tun_metadata
> *flow, struct ofpbuf *b,
>
> *crit_opt = false;
>
> - ULLONG_FOR_EACH_1 (i, flow->opt_map) {
> + ULLONG_FOR_EACH_1 (i, flow->present.map) {
> struct tun_meta_entry *entry = &map->entries[i];
> struct geneve_opt *opt;
>
> @@ -709,14 +796,14 @@ tun_metadata_to_geneve__(const struct tun_metadata
> *flow, struct ofpbuf *b,
> }
> }
>
> -void
> -tun_metadata_to_geneve_nlattr_flow(const struct tun_metadata *flow,
> +static void
> +tun_metadata_to_geneve_nlattr_flow(const struct flow_tnl *flow,
> struct ofpbuf *b)
> {
> size_t nlattr_offset;
> bool crit_opt;
>
> - if (!flow->opt_map) {
> + if (!flow->metadata.present.map) {
> return;
> }
>
> @@ -725,58 +812,43 @@ tun_metadata_to_geneve_nlattr_flow(const struct
> tun_metadata *flow,
> * similar enough that we can use the same mechanism. */
> nlattr_offset = nl_msg_start_nested(b,
> OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS);
>
> - tun_metadata_to_geneve__(flow, b, &crit_opt);
> + tun_metadata_to_geneve__(&flow->metadata, b, &crit_opt);
>
> nl_msg_end_nested(b, nlattr_offset);
> }
>
> +/* Converts from processed tunnel metadata information (in non-udpif
> + * format) in 'flow' to a stream of Geneve options suitable for
> + * transmission in 'opts'. Additionally returns whether there were
> + * any critical options in 'crit_opt' as well as the total length of
> + * data. */
> int
> -tun_metadata_to_geneve_header(const struct tun_metadata *flow,
> +tun_metadata_to_geneve_header(const struct flow_tnl *flow,
> struct geneve_opt *opts, bool *crit_opt)
> {
> struct ofpbuf b;
>
> + ovs_assert(!(flow->flags & FLOW_TNL_F_UDPIF));
> +
> ofpbuf_use_stack(&b, opts, GENEVE_TOT_OPT_SIZE);
> - tun_metadata_to_geneve__(flow, &b, crit_opt);
> + tun_metadata_to_geneve__(&flow->metadata, &b, crit_opt);
>
> return b.size;
> }
>
> -void
> -tun_metadata_to_geneve_nlattr_mask(const struct ofpbuf *key,
> - const struct tun_metadata *mask,
> - const struct tun_metadata *flow,
> - struct ofpbuf *b)
> +static void
> +tun_metadata_to_geneve_mask__(const struct tun_metadata *flow,
> + const struct tun_metadata *mask,
> + struct geneve_opt *opt, int opts_len)
> {
> struct tun_table *map = flow->tab;
> - const struct nlattr *tnl_key, *geneve_key;
> - struct nlattr *geneve_mask;
> - struct geneve_opt *opt;
> - int opts_len;
>
> if (!map) {
> return;
> }
>
> - tnl_key = nl_attr_find(key, 0, OVS_KEY_ATTR_TUNNEL);
> - if (!tnl_key) {
> - return;
> - }
> -
> - geneve_key = nl_attr_find_nested(tnl_key,
> - OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS);
> - if (!geneve_key) {
> - return;
> - }
> -
> - geneve_mask = ofpbuf_tail(b);
> - nl_msg_put(b, geneve_key, geneve_key->nla_len);
> -
> /* All of these options have already been validated, so no need
> * for sanity checking. */
> - opt = CONST_CAST(struct geneve_opt *, nl_attr_get(geneve_mask));
> - opts_len = nl_attr_get_size(geneve_mask);
> -
> while (opts_len > 0) {
> struct tun_meta_entry *entry;
> int len = sizeof(*opt) + opt->length * 4;
> @@ -801,6 +873,80 @@ tun_metadata_to_geneve_nlattr_mask(const struct ofpbuf
> *key,
> }
> }
>
> +static void
> +tun_metadata_to_geneve_nlattr_mask(const struct ofpbuf *key,
> + const struct flow_tnl *mask,
> + const struct flow_tnl *flow,
> + struct ofpbuf *b)
> +{
> + const struct nlattr *geneve_key;
> + struct nlattr *geneve_mask;
> + struct geneve_opt *opt;
> + int opts_len;
> +
> + if (!key) {
> + return;
> + }
> +
> + geneve_key = tun_metadata_find_geneve_key(key->data, key->size);
> + if (!geneve_key) {
> + return;
> + }
> +
> + geneve_mask = ofpbuf_tail(b);
> + nl_msg_put(b, geneve_key, geneve_key->nla_len);
> +
> + opt = CONST_CAST(struct geneve_opt *, nl_attr_get(geneve_mask));
> + opts_len = nl_attr_get_size(geneve_mask);
> +
> + tun_metadata_to_geneve_mask__(&flow->metadata, &mask->metadata,
> + opt, opts_len);
> +}
> +
> +/* Convert from the tunnel metadata in 'tun' to netlink attributes stored
> + * in 'b'. Either UDPIF or non-UDPIF input forms are accepted.
> + *
> + * To assist with parsing, it is necessary to also pass in the tunnel
> metadata
> + * from the flow in 'flow' as well as the original netlink form of the flow
> in
> + * 'key'. */
> +void
> +tun_metadata_to_geneve_nlattr(const struct flow_tnl *tun,
> + const struct flow_tnl *flow,
> + const struct ofpbuf *key,
> + struct ofpbuf *b)
> +{
> + bool is_mask = tun != flow;
> +
> + if (!(flow->flags & FLOW_TNL_F_UDPIF)) {
> + if (!is_mask) {
> + tun_metadata_to_geneve_nlattr_flow(tun, b);
> + } else {
> + tun_metadata_to_geneve_nlattr_mask(key, tun, flow, b);
> + }
> + } else if (flow->metadata.present.len || is_mask) {
> + nl_msg_put_unspec(b, OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS,
> + tun->metadata.opts.gnv,
> + flow->metadata.present.len);
> + }
> +}
> +
> +/* Converts 'mask_src' (in non-UDPIF format) to a series of masked options
> in
> + * 'dst'. 'flow_src' (also in non-UDPIF format) and the original set of
> + * options 'flow_src_opt'/'opts_len' are needed as a guide to interpret the
> + * mask data. */
> +void
> +tun_metadata_to_geneve_udpif_mask(const struct flow_tnl *flow_src,
> + const struct flow_tnl *mask_src,
> + const struct geneve_opt *flow_src_opt,
> + int opts_len, struct geneve_opt *dst)
> +{
> + ovs_assert(!(flow_src->flags & FLOW_TNL_F_UDPIF));
> +
> + memcpy(dst, flow_src_opt, opts_len);
> + tun_metadata_to_geneve_mask__(&flow_src->metadata,
> + &mask_src->metadata, dst, opts_len);
> +}
> +
> static const struct tun_metadata_loc *
> metadata_loc_from_match_read(struct tun_table *map, const struct match
> *match,
> unsigned int idx)
> @@ -816,19 +962,22 @@ void
> tun_metadata_to_nx_match(struct ofpbuf *b, enum ofp_version oxm,
> const struct match *match)
> {
> - struct tun_table *map = ovsrcu_get(struct tun_table *, &metadata_tab);
> - const struct tun_metadata *metadata = &match->flow.tunnel.metadata;
> - const struct tun_metadata *mask = &match->wc.masks.tunnel.metadata;
> + struct flow_tnl flow, mask;
> int i;
>
> - ULLONG_FOR_EACH_1 (i, mask->opt_map) {
> + if (!udpif_to_parsed(&match->flow.tunnel, &match->wc.masks.tunnel,
> + &flow, &mask)) {
> + return;
> + }
> +
> + ULLONG_FOR_EACH_1 (i, mask.metadata.present.map) {
> const struct tun_metadata_loc *loc;
> union mf_value opts;
> union mf_value mask_opts;
>
> - loc = metadata_loc_from_match_read(map, match, i);
> - memcpy_from_metadata(opts.tun_metadata, metadata, loc);
> - memcpy_from_metadata(mask_opts.tun_metadata, mask, loc);
> + loc = metadata_loc_from_match_read(flow.metadata.tab, match, i);
> + memcpy_from_metadata(opts.tun_metadata, &flow.metadata, loc);
> + memcpy_from_metadata(mask_opts.tun_metadata, &mask.metadata, loc);
> nxm_put(b, MFF_TUN_METADATA0 + i, oxm, opts.tun_metadata,
> mask_opts.tun_metadata, loc->len);
> }
> @@ -837,22 +986,25 @@ tun_metadata_to_nx_match(struct ofpbuf *b, enum
> ofp_version oxm,
> void
> tun_metadata_match_format(struct ds *s, const struct match *match)
> {
> - struct tun_table *map = ovsrcu_get(struct tun_table *, &metadata_tab);
> - const struct tun_metadata *metadata = &match->flow.tunnel.metadata;
> - const struct tun_metadata *mask = &match->wc.masks.tunnel.metadata;
> + struct flow_tnl flow, mask;
> unsigned int i;
>
> - ULLONG_FOR_EACH_1 (i, mask->opt_map) {
> + if (!udpif_to_parsed(&match->flow.tunnel, &match->wc.masks.tunnel,
> + &flow, &mask)) {
> + return;
> + }
> +
> + ULLONG_FOR_EACH_1 (i, mask.metadata.present.map) {
> const struct tun_metadata_loc *loc;
> union mf_value opts;
>
> - loc = metadata_loc_from_match_read(map, match, i);
> + loc = metadata_loc_from_match_read(flow.metadata.tab, match, i);
>
> ds_put_format(s, "tun_metadata%u=", i);
> - memcpy_from_metadata(opts.tun_metadata, metadata, loc);
> + memcpy_from_metadata(opts.tun_metadata, &flow.metadata, loc);
> ds_put_hex(s, opts.tun_metadata, loc->len);
>
> - memcpy_from_metadata(opts.tun_metadata, mask, loc);
> + memcpy_from_metadata(opts.tun_metadata, &mask.metadata, loc);
> if (!is_all_ones(opts.tun_metadata, loc->len)) {
> ds_put_char(s, '/');
> ds_put_hex(s, opts.tun_metadata, loc->len);
> diff --git a/lib/tun-metadata.h b/lib/tun-metadata.h
> index 56bdf2a..49db511 100644
> --- a/lib/tun-metadata.h
> +++ b/lib/tun-metadata.h
> @@ -20,35 +20,56 @@
> #include <stdint.h>
>
> #include "dynamic-string.h"
> +#include "geneve.h"
> #include "netlink.h"
> #include "ofpbuf.h"
> #include "openflow/openflow.h"
>
> +struct flow_tnl;
> struct match;
> struct mf_field;
> union mf_value;
> struct ofputil_geneve_table_mod;
> struct ofputil_geneve_table_reply;
> struct tun_table;
> -struct geneve_opt;
>
> #define TUN_METADATA_NUM_OPTS 64
> #define TUN_METADATA_TOT_OPT_SIZE 256
>
> /* Tunnel option data, plus metadata to aid in their interpretation.
> *
> - * 'opt_map' is indexed by type, that is, by the <i> in TUN_METADATA<i>, so
> - * that e.g. TUN_METADATA5 is present if 'opt_map & (1ULL << 5)' is
> nonzero.
> - * The actual data for TUN_METADATA5, if present, might be anywhere in
> 'opts'
> - * (not necessarily even contiguous), and finding it requires referring to
> - * 'tab'. */
> + * The option data exists in two forms and is interpreted differently
> depending
> + * on whether FLOW_TNL_F_UDPIF is set in struct flow_tnl flags:
> + *
> + * When FLOW_TNL_F_UDPIF is set, the tunnel metadata is in "userspace
> datapath
> + * format". This is typically used for fast-path packet processing to avoid
> + * the cost of translating options and in situations where we need to
> maintain
> + * tunnel metadata exactly as it came in. In this case 'opts.gnv' is raw
> + * packet data from the tunnel header and 'present.len' indicates the
> length
> + * of the data stored there. In these situations, 'tab' is NULL.
> + *
> + * In all other cases, we are doing flow-based processing (such as during
> + * upcalls). FLOW_TNL_F_UDPIF is not set and options are reordered into
> + * pre-allocated locations. 'present.map' is indexed by type, that is, by
> the
> + * <i> in TUN_METADATA<i>, so that e.g. TUN_METADATA5 is present if
> + * 'present.map & (1ULL << 5)' is nonzero. The actual data for
> TUN_METADATA5,
> + * if present, might be anywhere in 'opts.u8' (not necessarily even
> contiguous),
> + * and finding it requires referring to 'tab', if set, or the global
> metadata
> + * table. */
> struct tun_metadata {
> - uint64_t opt_map; /* 1-bit for each present TLV.
> */
> - uint8_t opts[TUN_METADATA_TOT_OPT_SIZE]; /* Values from tunnel TLVs. */
> + union { /* Valid members of 'opts'. When 'opts' is sorted into known
> types,
> + * 'map' is used. When 'opts' is raw packet data, 'len' is
> used. */
> + uint64_t map; /* 1-bit for each present TLV.
> */
> + uint8_t len; /* Length of data in 'opts'. */
> + } present;
> struct tun_table *tab; /* Types & lengths for 'opts' and
> 'opt_map'. */
> uint8_t pad[sizeof(uint64_t) - sizeof(struct tun_table *)]; /* Make 8
> bytes */
> + union {
> + uint8_t u8[TUN_METADATA_TOT_OPT_SIZE]; /* Values from tunnel TLVs.
> */
> + struct geneve_opt gnv[GENEVE_TOT_OPT_SIZE / sizeof(struct
> geneve_opt)];
> + } opts;
> };
> -BUILD_ASSERT_DECL(sizeof(((struct tun_metadata *)0)->opt_map) * 8 >=
> +BUILD_ASSERT_DECL(sizeof(((struct tun_metadata *)0)->present.map) * 8 >=
> TUN_METADATA_NUM_OPTS);
>
> /* The location of an option can be stored either as a single offset/len
> @@ -81,31 +102,34 @@ void tun_metadata_init(void);
> enum ofperr tun_metadata_table_mod(struct ofputil_geneve_table_mod *);
> void tun_metadata_table_request(struct ofputil_geneve_table_reply *);
>
> -void tun_metadata_read(const struct tun_metadata *,
> +void tun_metadata_read(const struct flow_tnl *,
> const struct mf_field *, union mf_value *);
> -void tun_metadata_write(struct tun_metadata *,
> +void tun_metadata_write(struct flow_tnl *,
> const struct mf_field *, const union mf_value *);
> void tun_metadata_set_match(const struct mf_field *,
> const union mf_value *value,
> const union mf_value *mask, struct match *);
> -void tun_metadata_get_fmd(const struct tun_metadata *,
> - struct match *flow_metadata);
> +void tun_metadata_get_fmd(const struct flow_tnl *, struct match
> *flow_metadata);
>
> int tun_metadata_from_geneve_nlattr(const struct nlattr *attr,
> const struct nlattr *flow_attrs,
> size_t flow_attr_len,
> - const struct tun_metadata
> *flow_metadata,
> - struct tun_metadata *metadata);
> -int tun_metadata_from_geneve_header(const struct geneve_opt *, int opt_len,
> - struct tun_metadata *metadata);
> -
> -void tun_metadata_to_geneve_nlattr_flow(const struct tun_metadata *flow,
> - struct ofpbuf *);
> -void tun_metadata_to_geneve_nlattr_mask(const struct ofpbuf *key,
> - const struct tun_metadata *mask,
> - const struct tun_metadata *flow,
> - struct ofpbuf *);
> -int tun_metadata_to_geneve_header(const struct tun_metadata *flow,
> + const struct flow_tnl *flow_tun,
> + bool udpif, struct flow_tnl *tun);
> +void tun_metadata_to_geneve_nlattr(const struct flow_tnl *tun,
> + const struct flow_tnl *flow,
> + const struct ofpbuf *key,
> + struct ofpbuf *);
> +
> +int tun_metadata_from_geneve_udpif(const struct flow_tnl *flow,
> + const struct flow_tnl *src,
> + struct flow_tnl *dst);
> +void tun_metadata_to_geneve_udpif_mask(const struct flow_tnl *flow_src,
> + const struct flow_tnl *mask_src,
> + const struct geneve_opt
> *flow_src_opt,
> + int opts_len, struct geneve_opt
> *dst);
> +
> +int tun_metadata_to_geneve_header(const struct flow_tnl *flow,
> struct geneve_opt *, bool *crit_opt);
>
> void tun_metadata_to_nx_match(struct ofpbuf *b, enum ofp_version oxm,
> diff --git a/ofproto/ofproto-dpif-sflow.c b/ofproto/ofproto-dpif-sflow.c
> index e54d3fb..185addf 100644
> --- a/ofproto/ofproto-dpif-sflow.c
> +++ b/ofproto/ofproto-dpif-sflow.c
> @@ -972,7 +972,7 @@ sflow_read_set_action(const struct nlattr *attr,
> /* Do not handle multi-encap for now. */
> sflow_actions->tunnel_err = true;
> } else {
> - if (odp_tun_key_from_attr(attr, &sflow_actions->tunnel)
> + if (odp_tun_key_from_attr(attr, false, &sflow_actions->tunnel)
> == ODP_FIT_ERROR) {
> /* Tunnel parsing error. */
> sflow_actions->tunnel_err = true;
> diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c
> index 440f9e9..2d75b13 100644
> --- a/ofproto/ofproto-dpif-upcall.c
> +++ b/ofproto/ofproto-dpif-upcall.c
> @@ -1164,7 +1164,7 @@ process_upcall(struct udpif *udpif, struct upcall
> *upcall,
> memcpy(&cookie, nl_attr_get(userdata), sizeof cookie.ipfix);
>
> if (upcall->out_tun_key) {
> - odp_tun_key_from_attr(upcall->out_tun_key,
> + odp_tun_key_from_attr(upcall->out_tun_key, false,
> &output_tunnel_key);
> }
> dpif_ipfix_bridge_sample(upcall->ipfix, packet, flow,
> diff --git a/tests/tunnel-push-pop.at b/tests/tunnel-push-pop.at
> index bd95c8e..0f1724a 100644
> --- a/tests/tunnel-push-pop.at
> +++ b/tests/tunnel-push-pop.at
> @@ -132,7 +132,7 @@ AT_CHECK([ovs-ofctl dump-ports int-br | grep 'port 5'],
> [0], [dnl
> port 5: rx pkts=1, bytes=98, drop=0, errs=0, frame=0, over=0, crc=0
> ])
> AT_CHECK([ovs-appctl dpif/dump-flows int-br], [0], [dnl
> -tunnel(tun_id=0x7b,src=1.1.2.92,dst=1.1.2.88,ttl=64,geneve({class=0xffff,type=0x80,len=4,0xa/0xf}),flags(-df-csum+key)),skb_mark(0),recirc_id(0),in_port(6081),eth_type(0x0800),ipv4(frag=no), packets:0, bytes:0, used:never, actions:drop
> +tunnel(tun_id=0x7b,src=1.1.2.92,dst=1.1.2.88,ttl=64,geneve({class=0xffff,type=0x80,len=4,0xa/0xf}{class=0xffff,type=0,len=4}),flags(-df-csum+key)),skb_mark(0),recirc_id(0),in_port(6081),eth_type(0x0800),ipv4(frag=no), packets:0, bytes:0, used:never, actions:drop
> ])
>
> OVS_VSWITCHD_STOP
> --
> 2.1.4
>
> _______________________________________________
> dev mailing list
> dev at openvswitch.org
> http://openvswitch.org/mailman/listinfo/dev
More information about the dev
mailing list