[ovs-dev] [PATCH 8/8] native tunnel: Add support for STT

Pravin B Shelar pshelar at nicira.com
Mon Jan 11 07:18:14 UTC 2016


This patch uses the userspace tunneling mechanism to implement the
STT tunneling protocol.

Signed-off-by: Pravin B Shelar <pshelar at nicira.com>
---
 lib/netdev-vport.c       |   6 +-
 lib/odp-util.c           |  41 ++-
 lib/packets.h            |  26 ++
 lib/tnl-push-pop.c       | 639 ++++++++++++++++++++++++++++++++++++++++++++++-
 lib/tnl-push-pop.h       |  11 +
 tests/tunnel-push-pop.at |  27 ++
 6 files changed, 745 insertions(+), 5 deletions(-)

diff --git a/lib/netdev-vport.c b/lib/netdev-vport.c
index 8b41201..c70a596 100644
--- a/lib/netdev-vport.c
+++ b/lib/netdev-vport.c
@@ -907,8 +907,12 @@ netdev_vport_tunnel_register(void)
         TUNNEL_CLASS("vxlan", "vxlan_sys", NULL, netdev_vxlan_build_header,
                                            push_udp_header,
                                            netdev_vxlan_pop_header),
+        TUNNEL_CLASS("stt", "stt_sys", netdev_stt_class_init,
+                                       netdev_stt_build_header,
+                                       netdev_stt_push_header,
+                                       netdev_stt_pop_header),
+
         TUNNEL_CLASS("lisp", "lisp_sys", NULL, NULL, NULL, NULL),
-        TUNNEL_CLASS("stt", "stt_sys", NULL, NULL, NULL, NULL),
     };
     static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
 
diff --git a/lib/odp-util.c b/lib/odp-util.c
index f16e113..bdc7391 100644
--- a/lib/odp-util.c
+++ b/lib/odp-util.c
@@ -85,6 +85,8 @@ static void format_geneve_opts(const struct geneve_opt *opt,
 static struct nlattr *generate_all_wildcard_mask(const struct attr_len_tbl tbl[],
                                                  int max, struct ofpbuf *,
                                                  const struct nlattr *key);
+static void format_be64(struct ds *ds, const char *name, ovs_be64 key,
+                        const ovs_be64 *mask, bool verbose);
 static void format_u128(struct ds *ds, const ovs_u128 *value,
                         const ovs_u128 *mask, bool verbose);
 static int scan_u128(const char *s, ovs_u128 *value, ovs_u128 *mask);
@@ -445,13 +447,29 @@ format_udp_tnl_push_header(struct ds *ds, const struct udp_header *udp)
     return udp + 1;
 }
 
+/* Appends to 'ds' a textual rendering of the outer TCP header 'tcp' of a
+ * tunnel push action: source and destination ports, sequence and
+ * acknowledgment numbers, control flags, checksum and urgent pointer.
+ *
+ * Returns 'tcp + 1', i.e. the address just past the fixed-size TCP header. */
+static const void *
+format_tcp_tnl_push_header(struct ds *ds, const struct tcp_header *tcp)
+{
+    /* tcp_seq and tcp_ack are converted with ntohl() and are 32 bits wide,
+     * so they must be printed with the 32-bit conversion macro. */
+    ds_put_format(ds, "tcp(src=%"PRIu16",dst=%"PRIu16",seq=0x%"PRIx32","
+                  "ack=0x%"PRIx32",", ntohs(tcp->tcp_src), ntohs(tcp->tcp_dst),
+                  ntohl(get_16aligned_be32(&tcp->tcp_seq)),
+                  ntohl(get_16aligned_be32(&tcp->tcp_ack)));
+
+    format_flags_masked(ds, "flags", packet_tcp_flag_to_string,
+                        ntohs(tcp->tcp_ctl), TCP_FLAGS(OVS_BE16_MAX),
+                        TCP_FLAGS(OVS_BE16_MAX));
+
+    ds_put_format(ds, ",csum=0x%"PRIx16",urg=0x%"PRIx16")",
+                  ntohs(tcp->tcp_csum), ntohs(tcp->tcp_urg));
+    return tcp + 1;
+}
+
 static void
 format_odp_tnl_push_header(struct ds *ds, struct ovs_action_push_tnl *data)
 {
     const struct eth_header *eth;
     const void *l3;
     const void *l4;
-    const struct udp_header *udp;
 
     eth = (const struct eth_header *)data->header;
 
@@ -491,11 +509,12 @@ format_odp_tnl_push_header(struct ds *ds, struct ovs_action_push_tnl *data)
         l4 = (ip6 + 1);
     }
 
-    udp = (const struct udp_header *) l4;
 
     if (data->tnl_type == OVS_VPORT_TYPE_VXLAN) {
         const struct vxlanhdr *vxh;
+        const struct udp_header *udp;
 
+        udp = (const struct udp_header *) l4;
         vxh = format_udp_tnl_push_header(ds, udp);
 
         ds_put_format(ds, "vxlan(flags=0x%"PRIx32",vni=0x%"PRIx32")",
@@ -503,7 +522,9 @@ format_odp_tnl_push_header(struct ds *ds, struct ovs_action_push_tnl *data)
                       ntohl(get_16aligned_be32(&vxh->vx_vni)) >> 8);
     } else if (data->tnl_type == OVS_VPORT_TYPE_GENEVE) {
         const struct genevehdr *gnh;
+        const struct udp_header *udp;
 
+        udp = (const struct udp_header *) l4;
         gnh = format_udp_tnl_push_header(ds, udp);
 
         ds_put_format(ds, "geneve(%s%svni=0x%"PRIx32,
@@ -541,6 +562,22 @@ format_odp_tnl_push_header(struct ds *ds, struct ovs_action_push_tnl *data)
             options++;
         }
         ds_put_format(ds, ")");
+    } else if (data->tnl_type == OVS_VPORT_TYPE_STT) {
+        const struct tcp_header *tcp;
+        const struct stthdr *stth;
+
+        tcp = (const struct tcp_header *) l4;
+        stth = format_tcp_tnl_push_header(ds, tcp);
+        ds_put_format(ds, ",stt(");
+        format_be64(ds, "tun_id", get_32aligned_be64(&stth->key), NULL, false);
+        ds_put_format(ds, "ver=0x%"PRIx8",flags=0x%"PRIx8","
+                          "l4_offset=0x%"PRIx8",res=0x%"PRIx8","
+                          "mss=0x%"PRIx16",",
+                           stth->version, stth->flags,
+                           stth->l4_offset, stth->reserved, ntohs(stth->mss));
+
+        format_vlan_tci(ds, stth->vlan_tci, OVS_BE16_MAX, false);
+        ds_put_format(ds, ")");
     }
     ds_put_format(ds, ")");
 }
diff --git a/lib/packets.h b/lib/packets.h
index 2157657..5459b0f 100644
--- a/lib/packets.h
+++ b/lib/packets.h
@@ -1021,6 +1021,32 @@ struct vxlanhdr {
 
 #define VXLAN_FLAGS 0x08000000  /* struct vxlanhdr.vx_flags required value. */
 
+/* STT header */
+
+/* STT frame header, placed directly after the outer TCP header.  Fixed-width
+ * <stdint.h> types are used instead of the Linux-only __u8 so that the
+ * header builds on every platform. */
+struct stthdr {
+    uint8_t             version;    /* Only version 0 is accepted on receive. */
+    uint8_t             flags;      /* STT_CSUM_* and STT_PROTO_* bits. */
+    uint8_t             l4_offset;  /* Offset of the inner L4 header. */
+    uint8_t             reserved;
+    ovs_be16            mss;
+    ovs_be16            vlan_tci;
+    ovs_32aligned_be64  key;        /* 64-bit tunnel id. */
+};
+
+/* Padding after the end of the tunnel headers to provide alignment
+ * for inner packet IP header after 14 byte Ethernet header.
+ */
+#define STT_ETH_PAD 2
+
+#define STT_BASE_HLEN   (sizeof(struct stthdr) + STT_ETH_PAD)
+#define STT_HEADER_LEN	(sizeof(struct tcp_header) + STT_BASE_HLEN)
+
+#define STT_CSUM_VERIFIED       (1 << 0)
+#define STT_CSUM_PARTIAL        (1 << 1)
+#define STT_PROTO_IPV4          (1 << 2)
+#define STT_PROTO_TCP           (1 << 3)
+#define STT_PROTO_TYPES         (STT_PROTO_IPV4 | STT_PROTO_TCP)
+
 void ipv6_format_addr(const struct in6_addr *addr, struct ds *);
 void ipv6_format_addr_bracket(const struct in6_addr *addr, struct ds *,
                               bool bracket);
diff --git a/lib/tnl-push-pop.c b/lib/tnl-push-pop.c
index 86023c2..9440033 100644
--- a/lib/tnl-push-pop.c
+++ b/lib/tnl-push-pop.c
@@ -16,8 +16,6 @@
 
 #include <config.h>
 
-#include "netdev-vport.h"
-
 #include <errno.h>
 #include <fcntl.h>
 #include <sys/socket.h>
@@ -25,6 +23,10 @@
 #include <netinet/ip6.h>
 #include <sys/ioctl.h>
 
+#include <errno.h>
+#include <stdlib.h>
+#include <sys/time.h>
+
 #include "byte-order.h"
 #include "csum.h"
 #include "daemon.h"
@@ -32,20 +34,25 @@
 #include "dpif.h"
 #include "dp-packet.h"
 #include "dynamic-string.h"
+#include "entropy.h"
 #include "flow.h"
 #include "hash.h"
 #include "hmap.h"
+#include "id-pool.h"
 #include "list.h"
 #include "netdev-provider.h"
+#include "netdev-vport.h"
 #include "netdev-vport-private.h"
 #include "odp-netlink.h"
 #include "dp-packet.h"
 #include "ovs-router.h"
 #include "packets.h"
 #include "poll-loop.h"
+#include "random.h"
 #include "route-table.h"
 #include "shash.h"
 #include "socket-util.h"
+#include "timeval.h"
 #include "tnl-push-pop.h"
 #include "openvswitch/vlog.h"
 #include "unaligned.h"
@@ -631,6 +638,634 @@ netdev_geneve_build_header(const struct netdev *netdev,
     return 0;
 }
 
+
+/* STT */
+
+/* The maximum amount of memory used to store packets waiting to be reassembled
+ * on a given CPU.  Once this threshold is exceeded we will begin freeing the
+ * least recently used fragments.
+ */
+#define REASM_HI_THRESH (4 * 1024 * 1024)
+/* The target for the high memory evictor.  Once we have exceeded
+ * REASM_HI_THRESH, we will continue freeing fragments until we hit
+ * this limit.
+ */
+#define REASM_LO_THRESH (3 * 1024 * 1024)
+/* The length of time a given packet has to be reassembled from the time the
+ * first fragment arrives.  Once this limit is exceeded it becomes available
+ * for cleaning.
+ */
+
+#define FRAG_EXP_TIME  frag_exp_time
+
+#define FRAG_HASH_SHIFT         8
+#define FRAG_HASH_ENTRIES       (1 << FRAG_HASH_SHIFT)
+#define FRAG_HASH_SEGS          ((sizeof(uint32_t) * 8) / FRAG_HASH_SHIFT)
+
+/* The length and offset of a fragment are encoded in the sequence number.
+ * STT_SEQ_LEN_SHIFT is the left shift needed to store the length.
+ * STT_SEQ_OFFSET_MASK is the mask to extract the offset.
+ */
+#define STT_SEQ_LEN_SHIFT      16
+#define STT_SEQ_OFFSET_MASK    ((1 << STT_SEQ_LEN_SHIFT) - 1)
+
+/* Identifies one STT frame under reassembly.  Outer addresses are kept as
+ * IPv6 addresses (IPv4 ones in IPv4-mapped form) and 'pkt_seq' is taken from
+ * the outer TCP acknowledgment field.  Keys are compared with memcmp() over
+ * the whole structure. */
+struct pkt_key {
+    struct in6_addr ipv6_src;
+    struct in6_addr ipv6_dst;
+    ovs_be32 pkt_seq;
+};
+
+/* One slot of the reassembly hash table. */
+struct pkt_frag {
+    struct dp_packet *pkts;     /* Head of the fragment chain; NULL if the
+                                 * slot is unused. */
+    unsigned long timestamp;    /* cycles_counter() value recorded when the
+                                 * first fragment was stored. */
+    struct ovs_list lru_node;   /* Element of stt_reassemble.frag_lru. */
+    struct pkt_key key;         /* Frame this slot is collecting. */
+};
+
+/* Per-chain bookkeeping, kept with the packet at the head of a fragment
+ * chain. */
+struct first_frag {
+    struct dp_packet *last_pkt; /* Tail of the chain. */
+    unsigned int mem_used;      /* Sum of dp_packet_get_allocated() over the
+                                 * chain. */
+    uint16_t tot_len;           /* Frame length announced in the sequence
+                                 * number. */
+    uint16_t rcvd_len;          /* Bytes collected so far. */
+    bool set_ecn_ce;            /* Whether a later fragment carried ECN CE. */
+};
+
+/* Per-packet reassembly state reached through FRAG_DATA(). */
+struct frag_packet_data {
+    uint16_t offset;            /* Offset of this fragment within the frame. */
+    uint16_t pkt_size;          /* Bytes following the outer TCP header. */
+    /* Only valid for the first packet in the chain. */
+    struct first_frag first;
+    struct dp_packet *next;     /* Next fragment by offset, or NULL. */
+};
+
+BUILD_ASSERT_DECL(DP_PACKET_CONTEXT_SIZE >= sizeof(struct frag_packet_data));
+
+#define FRAG_DATA(packet) ((struct frag_packet_data *)(packet)->data)
+#define STT_PACKET_DATA(pkt)	((unsigned char *)dp_packet_l4(pkt) + STT_HEADER_LEN)
+
+/* Per-thread STT state: the fragment cache used on receive plus the
+ * identifiers used to build the TCP acknowledgment field on transmit. */
+struct stt_reassemble {
+    struct pkt_frag frag_hash[FRAG_HASH_ENTRIES];  /* Fragment cache slots. */
+    struct ovs_list frag_lru;       /* Occupied slots; new and refreshed
+                                     * entries are appended at the back. */
+    unsigned int frag_mem_used;     /* Memory held by all queued chains. */
+    uint32_t id;                    /* Id allocated from the id pool. */
+    uint32_t counter;               /* Incremented by ack_seq(). */
+};
+
+/* Held around every allocation from and release to 'id_ppol'. */
+static struct ovs_mutex thread_is_lock;
+/* Pool from which each thread's stt_reassemble.id is allocated. */
+static struct id_pool *id_ppol;
+/* Salt mixed into pkt_key_hash(); set once in netdev_stt_class_init(). */
+static uint32_t frag_hash_seed;
+/* Thread-specific key under which each thread's stt_reassemble is stored. */
+static ovsthread_key_t per_thread_reasm_data;
+/* Forward declaration: used by reasm_destructor() before its definition. */
+static void evict_frags(struct stt_reassemble *reasm, int mem_limit);
+/* Value behind FRAG_EXP_TIME: how long a partially received frame is kept. */
+static uint64_t frag_exp_time;
+
+/* Returns the calling thread's STT state, creating it on first use.
+ *
+ * A newly created state is fully zeroed, so every hash slot starts empty and
+ * the memory accounting starts at zero, and it is given an id from
+ * 'id_ppol'.  Aborts if the id pool is exhausted. */
+static struct stt_reassemble *
+get_reasm(void)
+{
+    struct stt_reassemble *reasm;
+    uint32_t id;
+
+    reasm = ovsthread_getspecific(per_thread_reasm_data);
+    /* Every call after a thread's first one takes this path. */
+    if (OVS_LIKELY(reasm)) {
+        return reasm;
+    }
+
+    reasm = xmalloc_cacheline(sizeof *reasm);
+    /* xmalloc_cacheline() memory is not cleared here: without this,
+     * frag_hash[].pkts and frag_mem_used would be read uninitialized. */
+    memset(reasm, 0, sizeof *reasm);
+    list_init(&reasm->frag_lru);
+
+    ovs_mutex_lock(&thread_is_lock);
+    /* The mutex is held, so no id can be released concurrently and retrying
+     * a failed allocation cannot succeed; a single attempt is enough. */
+    if (!id_pool_alloc_id(id_ppol, &id)) {
+        OVS_NOT_REACHED();
+    }
+    ovs_mutex_unlock(&thread_is_lock);
+
+    reasm->id = id;
+    ovsthread_setspecific(per_thread_reasm_data, reasm);
+    return reasm;
+}
+
+/* Thread-exit destructor registered for 'per_thread_reasm_data'.  Drops all
+ * queued fragments, returns the thread's id to the pool and releases the
+ * state block itself. */
+static void
+reasm_destructor(void *_reasm)
+{
+    struct stt_reassemble *reasm = _reasm;
+
+    evict_frags(reasm, 0);
+
+    ovs_mutex_lock(&thread_is_lock);
+    id_pool_free_id(id_ppol, reasm->id);
+    ovs_mutex_unlock(&thread_is_lock);
+
+    /* Counterpart of the xmalloc_cacheline() in get_reasm(); without it the
+     * block is leaked every time a thread exits. */
+    free_cacheline(reasm);
+}
+
+/* Initialization hook for the "stt" tunnel class: creates the thread-specific
+ * key for reassembly state, picks the hash seed, and sets up the id pool, its
+ * mutex and the fragment expiration time.
+ *
+ * Always returns 0. */
+int
+netdev_stt_class_init(void)
+{
+    ovsthread_key_create(&per_thread_reasm_data, reasm_destructor);
+    /* Use the generator from "random.h" rather than libc random(), which
+     * yields the same value on every run unless seeded elsewhere and so
+     * makes the fragment hash predictable. */
+    frag_hash_seed = random_uint32();
+    ovs_mutex_init(&thread_is_lock);
+    id_ppol = id_pool_create(0, USHRT_MAX);
+    frag_exp_time = 30 * OVS_HZ;
+    return 0;
+}
+
+/* Returns true if 'a' and 'b' are byte-for-byte identical. */
+static bool
+pkt_key_match(const struct pkt_key *a, const struct pkt_key *b)
+{
+    return memcmp(a, b, sizeof *a) == 0;
+}
+
+/* Returns a hash of 'key': the two addresses are hashed as bytes and then
+ * combined with the sequence field and 'frag_hash_seed'. */
+static uint32_t
+pkt_key_hash(const struct pkt_key *key)
+{
+    /* Hash the bytes 'key' points at.  Passing '&key' would hash the pointer
+     * variable's own storage and read beyond it. */
+    uint32_t addr_hash = hash_bytes(key, offsetof(struct pkt_key, pkt_seq), 0);
+
+    return hash_3words(addr_hash, (uint32_t) key->pkt_seq, frag_hash_seed);
+}
+
+/* Deletes every packet of the fragment chain headed by 'pkt', which must not
+ * be NULL. */
+static inline void
+list_packet_delete(struct dp_packet *pkt)
+{
+    struct dp_packet *cur = pkt;
+
+    do {
+        /* Fetch the link before the packet that holds it goes away. */
+        struct dp_packet *following = FRAG_DATA(cur)->next;
+
+        dp_packet_delete(cur);
+        cur = following;
+    } while (cur != NULL);
+}
+
+/* Releases the fragment chain held in slot 'frag' and marks the slot empty,
+ * subtracting the chain's memory from 'reasm'.  The slot's 'lru_node' is not
+ * touched here. */
+static void
+free_frag(struct stt_reassemble *reasm, struct pkt_frag *frag)
+{
+    struct dp_packet *head = frag->pkts;
+
+    reasm->frag_mem_used -= FRAG_DATA(head)->first.mem_used;
+    list_packet_delete(head);
+    frag->pkts = NULL;
+}
+
+/* Maps an LRU list element back to the hash slot that embeds it. */
+static struct pkt_frag *
+pkt_frag_from_node(const struct ovs_list *node)
+{
+    struct pkt_frag *slot = CONTAINER_OF(node, struct pkt_frag, lru_node);
+
+    return slot;
+}
+
+/* Frees queued fragment chains, least recently used first, until no more
+ * than 'mem_limit' bytes remain accounted in 'reasm' or nothing is queued. */
+static void
+evict_frags(struct stt_reassemble *reasm, int mem_limit)
+{
+    while (!list_is_empty(&reasm->frag_lru) &&
+           reasm->frag_mem_used > mem_limit) {
+        struct pkt_frag *frag;
+
+        /* New and refreshed entries are pushed onto the back of the list,
+         * so the least recently used entry is the one at the front. */
+        frag = pkt_frag_from_node(list_pop_front(&reasm->frag_lru));
+        free_frag(reasm, frag);
+    }
+
+    /* Update Fragment cache expiration time. */
+    frag_exp_time = 30 * OVS_HZ;
+}
+
+/* Finds the hash slot to use for 'key'.
+ *
+ * Each FRAG_HASH_SHIFT-bit segment of 'hash' selects one candidate slot.  If
+ * a candidate holds an unexpired chain for 'key', it is returned with its
+ * chain intact.  Otherwise the best victim among the candidates is returned
+ * empty: an unused slot if there is one, else the slot with the oldest
+ * timestamp, whose chain is unlinked from the LRU list and freed. */
+static struct pkt_frag *
+lookup_frag(struct stt_reassemble *reasm,
+            const struct pkt_key *key, uint32_t hash)
+{
+    struct pkt_frag *victim_frag = NULL;
+    size_t i;
+
+    for (i = 0; i < FRAG_HASH_SEGS; i++) {
+        struct pkt_frag *frag;
+
+        frag = &reasm->frag_hash[hash & (FRAG_HASH_ENTRIES - 1)];
+
+        /* A hit requires that the entry has NOT yet expired, i.e. that the
+         * current time is still before timestamp + FRAG_EXP_TIME. */
+        if (frag->pkts &&
+            pkt_key_match(&frag->key, key) &&
+            time_before(cycles_counter(), frag->timestamp + FRAG_EXP_TIME)) {
+            return frag;
+        }
+
+        if (!victim_frag ||
+            (victim_frag->pkts &&
+             (!frag->pkts ||
+              time_before(frag->timestamp, victim_frag->timestamp)))) {
+            victim_frag = frag;
+        }
+
+        hash >>= FRAG_HASH_SHIFT;
+    }
+
+    if (victim_frag->pkts) {
+        /* An occupied slot is linked on frag_lru.  Unlink it before reuse,
+         * otherwise the caller's list_push_back() of the same node would
+         * corrupt the list. */
+        list_remove(&victim_frag->lru_node);
+        free_frag(reasm, victim_frag);
+    }
+
+    return victim_frag;
+}
+
+/* Builds a single packet from the fragment chain headed by 'pkt' and deletes
+ * every packet of the chain.
+ *
+ * The result starts as a clone of the head packet; the bytes that follow the
+ * outer TCP header of each later fragment are then appended in chain order.
+ * Returns the merged packet, which the caller owns. */
+static struct dp_packet *
+packet_merge(struct dp_packet *pkt)
+{
+    struct dp_packet *next;
+    struct dp_packet *m;
+
+    m = dp_packet_clone(pkt);
+    next = FRAG_DATA(pkt)->next;
+    dp_packet_delete(pkt);
+
+    pkt = next;
+    while (pkt) {
+        /* 'pkt_size' counts everything after the outer TCP header, so the
+         * copy has to start right there.  Starting at STT_PACKET_DATA()
+         * would skip STT_BASE_HLEN payload bytes and read the same number of
+         * bytes past the end of the fragment. */
+        const unsigned char *payload =
+            (const unsigned char *) dp_packet_l4(pkt)
+            + sizeof(struct tcp_header);
+        uint16_t len = FRAG_DATA(pkt)->pkt_size;
+        void *data;
+
+        data = dp_packet_put_uninit(m, len);
+        memcpy(data, payload, len);
+        next = FRAG_DATA(pkt)->next;
+        dp_packet_delete(pkt);
+        pkt = next;
+    }
+
+    return m;
+}
+
+/* Feeds one received STT segment into the calling thread's reassembly state.
+ *
+ * The outer TCP sequence number encodes the total frame length in its upper
+ * 16 bits and this segment's offset in its lower 16 bits; the outer TCP
+ * acknowledgment number, together with the outer addresses, identifies the
+ * frame the segment belongs to.
+ *
+ * Returns 'packet' itself when the segment already carries the whole frame,
+ * the merged packet when this segment completes a frame, and NULL when the
+ * segment was queued for later or dropped.  In the NULL and merged cases the
+ * caller must no longer use 'packet'. */
+static struct dp_packet *
+reassemble(struct dp_packet *packet)
+{
+    struct tcp_header *tcph = dp_packet_l4(packet);
+    int pkt_size = dp_packet_l4_size(packet) - sizeof(*tcph);
+    uint32_t seq = ntohl(get_16aligned_be32(&tcph->tcp_seq));
+    struct stt_reassemble *reasm = get_reasm();
+    struct dp_packet *last_pkt;
+    struct pkt_frag *frag;
+    struct pkt_key key;
+    uint32_t hash;
+    bool is_ipv6;
+    uint8_t tos;
+    int tot_len;
+
+    tot_len = seq >> STT_SEQ_LEN_SHIFT;
+    FRAG_DATA(packet)->offset = seq & STT_SEQ_OFFSET_MASK;
+    FRAG_DATA(packet)->next = NULL;
+    FRAG_DATA(packet)->pkt_size = pkt_size;
+
+    /* Too short to hold an STT header plus padding: drop. */
+    if (STT_BASE_HLEN > pkt_size) {
+        goto out_free;
+    }
+
+    /* The segment must not extend past the announced frame length. */
+    if (FRAG_DATA(packet)->offset + pkt_size > tot_len) {
+        goto out_free;
+    }
+
+    /* Unfragmented frame: nothing to reassemble. */
+    if (tot_len == pkt_size) {
+        goto out;
+    }
+
+    is_ipv6 = is_header_ipv6(dp_packet_data(packet));
+
+    /* Build the lookup key; IPv4 addresses are stored in IPv4-mapped form so
+     * that both address families share one key layout. */
+    if (is_ipv6) {
+        struct ovs_16aligned_ip6_hdr *ip6 = dp_packet_l3(packet);
+
+        memcpy(&key.ipv6_src, &ip6->ip6_src.be16, sizeof ip6->ip6_src);
+        memcpy(&key.ipv6_dst, &ip6->ip6_dst.be16, sizeof ip6->ip6_dst);
+    } else {
+        struct ip_header *iph = dp_packet_l3(packet);
+
+        in6_addr_set_mapped_ipv4(&key.ipv6_src, get_16aligned_be32(&iph->ip_src));
+        in6_addr_set_mapped_ipv4(&key.ipv6_dst, get_16aligned_be32(&iph->ip_dst));
+    }
+    key.pkt_seq = get_16aligned_be32(&tcph->tcp_ack);
+    hash = pkt_key_hash(&key);
+
+    /* Make room before queueing if this packet would push the cache over the
+     * high threshold. */
+    if (reasm->frag_mem_used + dp_packet_get_allocated(packet) > REASM_HI_THRESH) {
+        evict_frags(reasm, REASM_LO_THRESH);
+    }
+
+    frag = lookup_frag(reasm, &key, hash);
+    if (!frag->pkts) {
+        /* First segment seen for this frame: start a new chain in the slot
+         * and put the slot on the LRU list. */
+        frag->pkts = packet;
+        frag->key = key;
+        frag->timestamp = cycles_counter();
+        FRAG_DATA(packet)->first.last_pkt = packet;
+        FRAG_DATA(packet)->first.mem_used = dp_packet_get_allocated(packet);
+        FRAG_DATA(packet)->first.tot_len = tot_len;
+        FRAG_DATA(packet)->first.rcvd_len = pkt_size;
+        FRAG_DATA(packet)->first.set_ecn_ce = false;
+        list_push_back(&reasm->frag_lru, &frag->lru_node);
+        reasm->frag_mem_used += dp_packet_get_allocated(packet);
+
+        packet = NULL;
+        goto out;
+    }
+
+    /* Optimize for the common case where fragments are received in-order
+     * and not overlapping.
+     */
+    last_pkt = FRAG_DATA(frag->pkts)->first.last_pkt;
+    if (FRAG_DATA(last_pkt)->offset + FRAG_DATA(last_pkt)->pkt_size ==
+           FRAG_DATA(packet)->offset) {
+        FRAG_DATA(last_pkt)->next = packet;
+        FRAG_DATA(frag->pkts)->first.last_pkt = packet;
+    } else {
+        struct dp_packet *prev = NULL, *next;
+
+        /* Find the insertion point: 'prev' is the last fragment that starts
+         * before 'packet', 'next' the first one that does not. */
+        for (next = frag->pkts; next; next = FRAG_DATA(next)->next) {
+            if (FRAG_DATA(next)->offset >= FRAG_DATA(packet)->offset) {
+                break;
+            }
+            prev = next;
+        }
+
+        /* Overlapping fragments aren't allowed.  We shouldn't start
+         * before the end of the previous fragment.
+         */
+        if (prev &&
+            FRAG_DATA(prev)->offset + FRAG_DATA(prev)->pkt_size > FRAG_DATA(packet)->offset) {
+            goto unlock_free;
+        }
+
+        /* We also shouldn't end after the beginning of the next
+         * fragment.
+         */
+        if (next &&
+            FRAG_DATA(packet)->offset + pkt_size > FRAG_DATA(next)->offset) {
+            goto unlock_free;
+        }
+
+        if (prev) {
+            FRAG_DATA(prev)->next = packet;
+        } else {
+            /* 'packet' becomes the new head, so the per-chain bookkeeping
+             * moves over to it. */
+            FRAG_DATA(packet)->first = FRAG_DATA(frag->pkts)->first;
+            frag->pkts = packet;
+        }
+
+        if (next) {
+            FRAG_DATA(packet)->next = next;
+        } else {
+            FRAG_DATA(frag->pkts)->first.last_pkt = packet;
+        }
+    }
+
+    /* Fetch the outer TOS / traffic class of this segment for the ECN check
+     * below. */
+    if (is_ipv6) {
+        struct ovs_16aligned_ip6_hdr *ip6 = dp_packet_l3(packet);
+
+        tos = ntohl(get_16aligned_be32(&ip6->ip6_flow)) >> 20;
+    } else {
+        struct ip_header *iph = dp_packet_l3(packet);
+        tos = iph->ip_tos;
+    }
+
+    FRAG_DATA(frag->pkts)->first.set_ecn_ce |= IP_ECN_is_ce(tos);
+    FRAG_DATA(frag->pkts)->first.rcvd_len += pkt_size;
+    FRAG_DATA(frag->pkts)->first.mem_used += dp_packet_get_allocated(packet);
+    reasm->frag_mem_used += dp_packet_get_allocated(packet);
+
+    if (FRAG_DATA(frag->pkts)->first.tot_len ==
+        FRAG_DATA(frag->pkts)->first.rcvd_len) {
+        /* Frame complete: release the slot and hand back the merged
+         * packet. */
+        struct dp_packet *frag_head = frag->pkts;
+
+        if (FRAG_DATA(frag_head)->first.set_ecn_ce) {
+            IP_ECN_set_ce(frag_head, is_ipv6);
+        }
+
+        list_remove(&frag->lru_node);
+        reasm->frag_mem_used -= FRAG_DATA(frag_head)->first.mem_used;
+        frag->pkts = NULL;
+        packet = packet_merge(frag_head);
+    } else {
+        /* Still incomplete: move the slot to the back of the LRU list. */
+        list_remove(&frag->lru_node);
+        list_push_back(&reasm->frag_lru, &frag->lru_node);
+        packet = NULL;
+    }
+    goto out;
+
+    /* NOTE(review): no lock is taken in this function; 'unlock_free' and
+     * 'out_free' both just delete the packet and return NULL. */
+unlock_free:
+    dp_packet_delete(packet);
+    packet = NULL;
+out:
+    return packet;
+out_free:
+    dp_packet_delete(packet);
+    return NULL;
+}
+
+/* Returns true if the outer TCP checksum of 'packet', computed over the
+ * IPv4 or IPv6 pseudo-header plus the whole L4 segment, verifies. */
+static bool
+valid_tcp_checksum(struct dp_packet *packet)
+{
+    bool outer_is_v6 = is_header_ipv6(dp_packet_data(packet));
+    uint32_t sum;
+
+    sum = outer_is_v6
+          ? packet_csum_pseudoheader6(dp_packet_l3(packet))
+          : packet_csum_pseudoheader(dp_packet_l3(packet));
+    sum = csum_continue(sum, dp_packet_l4(packet), dp_packet_l4_size(packet));
+
+    /* A correct checksum folds to zero. */
+    return !csum_finish(sum);
+}
+
+/* Fills 'tnl' from the outer IP and TCP headers of 'packet' and stores the
+ * value produced by ip_extract_tnl_md() in '*hlen'.
+ *
+ * Returns a pointer just past the outer TCP header, or NULL if
+ * ip_extract_tnl_md() fails. */
+static void *
+tcp_extract_tnl_md(struct dp_packet *packet, struct flow_tnl *tnl,
+                   unsigned int *hlen)
+{
+    struct tcp_header *outer_tcp = ip_extract_tnl_md(packet, tnl, hlen);
+
+    if (outer_tcp == NULL) {
+        return NULL;
+    }
+
+    tnl->flags |= FLOW_TNL_F_CSUM;
+    tnl->tp_src = outer_tcp->tcp_src;
+    tnl->tp_dst = outer_tcp->tcp_dst;
+
+    return outer_tcp + 1;
+}
+
+/* Extracts tunnel metadata from the outer headers of a complete STT frame in
+ * 'packet', strips those headers and, if the sender flagged a partial
+ * checksum, fills in the inner L4 checksum.
+ *
+ * Returns 0 on success, EINVAL if the headers cannot be parsed, the STT
+ * version is not 0, or 'l4_offset' is inconsistent with the packet. */
+static int
+stt_extract_tnl_md(struct dp_packet *packet)
+{
+    struct pkt_metadata *md = &packet->md;
+    struct flow_tnl *tnl = &md->tunnel;
+    uint8_t flags, l4_offset;
+    struct stthdr *stth;
+    uint32_t hlen;
+
+    pkt_metadata_init_tnl(md);
+    stth = tcp_extract_tnl_md(packet, tnl, &hlen);
+    if (!stth) {
+        return EINVAL;
+    }
+
+    if (stth->version != 0) {
+        VLOG_WARN_RL(&err_rl, "invalid STT version = %d\n", stth->version);
+        return EINVAL;
+    }
+    flags = stth->flags;
+    l4_offset = stth->l4_offset;
+
+    tnl->tun_id = get_32aligned_be64(&stth->key);
+    tnl->flags |= FLOW_TNL_F_KEY;
+
+    /* Drop the outer headers, the STT header and its padding; from here on
+     * dp_packet_data() is the start of the inner frame. */
+    dp_packet_reset_packet(packet, hlen + STT_HEADER_LEN);
+
+    if (flags & STT_CSUM_PARTIAL) {
+        uint8_t proto_type;
+        uint16_t csum_offset;
+        int l3_header_size;
+        int l4_header_size;
+        uint32_t l4_csum;
+        ovs_be16 *csum_ptr;
+
+        /* NOTE(review): 'stth' is dereferenced after the reset above; this
+         * presumably relies on the stripped bytes staying in place.  The
+         * local 'flags' holds the same value. */
+        proto_type = stth->flags & STT_PROTO_TYPES;
+        /* The two STT_PROTO_* bits select the inner L3/L4 combination. */
+        if (proto_type == (STT_PROTO_IPV4 | STT_PROTO_TCP)) {
+           /* TCP/IPv4 */
+           csum_offset = offsetof(struct tcp_header, tcp_csum);
+           l3_header_size = sizeof(struct ip_header);
+           l4_header_size = sizeof(struct tcp_header);
+        } else if (proto_type == STT_PROTO_TCP) {
+           /* TCP/IPv6 */
+           csum_offset = offsetof(struct tcp_header, tcp_csum);
+           l3_header_size = sizeof(struct ovs_16aligned_ip6_hdr);
+           l4_header_size = sizeof(struct tcp_header);
+        } else if (proto_type == STT_PROTO_IPV4) {
+           /* UDP/IPv4 */
+           csum_offset = offsetof(struct udp_header, udp_csum);
+           l3_header_size = sizeof(struct ip_header);
+           l4_header_size = sizeof(struct udp_header);
+        } else {
+           /* UDP/IPv6 */
+           csum_offset = offsetof(struct udp_header, udp_csum);
+           l3_header_size = sizeof(struct ovs_16aligned_ip6_hdr);
+           l4_header_size = sizeof(struct udp_header);
+        }
+
+        /* The inner L4 header cannot start inside the Ethernet or L3
+         * header, and must fit entirely in the packet. */
+        if (l4_offset < ETH_HEADER_LEN + l3_header_size) {
+            return EINVAL;
+        }
+        if (dp_packet_size(packet) < l4_offset + l4_header_size) {
+            return EINVAL;
+        }
+        /* 'csum_ptr' addresses the inner L4 header in 16-bit units; the
+         * checksum is computed from there to the end of the packet and
+         * stored in the L4 header's checksum field. */
+        csum_ptr = (ovs_be16 *) ((uint16_t *) dp_packet_data(packet) + (l4_offset >> 1));
+        l4_csum = csum_continue(0, csum_ptr, dp_packet_size(packet) - l4_offset);
+        *(csum_ptr + (csum_offset >> 1)) = csum_finish(l4_csum);
+    }
+
+    return 0;
+}
+
+/* Tunnel-pop handler for STT.
+ *
+ * Verifies the outer TCP checksum of '*p_packet' and passes the packet to
+ * the reassembler.  '*p_packet' is replaced by the reassembler's result,
+ * which is NULL when no complete frame is available yet.
+ *
+ * Returns EINVAL if the checksum is bad, the result of stt_extract_tnl_md()
+ * when a complete frame is available, and 0 otherwise. */
+int
+netdev_stt_pop_header(struct dp_packet **p_packet)
+{
+    struct dp_packet *whole;
+
+    if (!valid_tcp_checksum(*p_packet)) {
+        return EINVAL;
+    }
+
+    whole = reassemble(*p_packet);
+    *p_packet = whole;
+
+    return whole ? stt_extract_tnl_md(whole) : 0;
+}
+
+/* Completes the outer IP header in 'data->header' for a TCP-based tunnel and
+ * fills in the constant fields of the TCP header behind it: destination port
+ * from 'tnl_cfg', ACK|PSH flags with an option-less data offset, and the
+ * maximum window.
+ *
+ * Stores the combined Ethernet + IP header length in '*hlen' and returns a
+ * pointer just past the TCP header. */
+static void *
+tcp_build_header(struct netdev_tunnel_config *tnl_cfg,
+                 struct ovs_action_push_tnl *data,
+                 unsigned int *hlen)
+{
+    struct tcp_header *tcp;
+
+    *hlen = sizeof(struct eth_header);
+
+    if (is_header_ipv6(data->header)) {
+        struct ovs_16aligned_ip6_hdr *ip6 = ipv6_hdr(data->header);
+
+        ip6->ip6_nxt = IPPROTO_TCP;
+        tcp = (struct tcp_header *) (ip6 + 1);
+        *hlen += IPV6_HEADER_LEN;
+    } else {
+        struct ip_header *ip = ip_hdr(data->header);
+
+        ip->ip_proto = IPPROTO_TCP;
+        tcp = (struct tcp_header *) (ip + 1);
+        *hlen += IP_HEADER_LEN;
+    }
+
+    tcp->tcp_dst = tnl_cfg->dst_port;
+    tcp->tcp_ctl = TCP_CTL(TCP_ACK | TCP_PSH, sizeof(struct tcp_header) >> 2);
+    tcp->tcp_winsz = htons(USHRT_MAX);
+
+    return tcp + 1;
+}
+
+/* Header-build handler for STT: writes the TCP and STT parts of the
+ * encapsulation header template into 'data', taking the tunnel id from
+ * 'tnl_flow', and records the header length and tunnel type.
+ *
+ * Only the 'flags', 'vlan_tci' and 'key' members of the STT header are
+ * written here.  Always returns 0. */
+int
+netdev_stt_build_header(const struct netdev *netdev,
+                        struct ovs_action_push_tnl *data,
+                        const struct flow *tnl_flow)
+{
+    struct netdev_vport *dev = netdev_vport_cast(netdev);
+    struct stthdr *stth;
+    unsigned int outer_len;
+
+    /* XXX: RCUfy tnl_cfg. */
+    ovs_mutex_lock(&dev->mutex);
+
+    stth = tcp_build_header(&dev->tnl_cfg, data, &outer_len);
+    stth->flags = STT_CSUM_VERIFIED;
+    stth->vlan_tci = 0;
+    put_32aligned_be64(&stth->key, tnl_flow->tunnel.tun_id);
+
+    ovs_mutex_unlock(&dev->mutex);
+
+    data->header_len = outer_len + STT_HEADER_LEN;
+    data->tnl_type = OVS_VPORT_TYPE_STT;
+    return 0;
+}
+
+/* Returns the next value for the outer TCP acknowledgment field: the calling
+ * thread's counter shifted into the upper 16 bits, OR-ed with the thread's
+ * id.  The counter is advanced on every call. */
+static uint32_t
+ack_seq(void)
+{
+    struct stt_reassemble *state = get_reasm();
+    uint32_t value = (state->counter << 16) | state->id;
+
+    state->counter++;
+    return value;
+}
+
+/* Prepends the header template in 'data' to 'packet' and fills in the
+ * per-packet TCP fields: source port, sequence number (frame length in the
+ * upper 16 bits, offset 0), acknowledgment number from ack_seq(), and the
+ * TCP checksum over the pseudo-header and 'ip_tot_size' bytes from the TCP
+ * header on. */
+static void
+push_tcp_header(struct dp_packet *packet,
+                const struct ovs_action_push_tnl *data)
+{
+    struct tcp_header *tcp;
+    uint32_t frame_len;
+    uint32_t sum;
+    int ip_tot_size;
+
+    tcp = push_ip_header(packet, data->header, data->header_len, &ip_tot_size);
+
+    /* set tcp src port */
+    tcp->tcp_src = get_src_port(packet);
+
+    frame_len = ip_tot_size - sizeof(struct tcp_header);
+    put_16aligned_be32(&tcp->tcp_seq, htonl(frame_len << STT_SEQ_LEN_SHIFT));
+    put_16aligned_be32(&tcp->tcp_ack, htonl(ack_seq()));
+
+    sum = is_header_ipv6(dp_packet_data(packet))
+          ? packet_csum_pseudoheader6(ipv6_hdr(dp_packet_data(packet)))
+          : packet_csum_pseudoheader(ip_hdr(dp_packet_data(packet)));
+    sum = csum_continue(sum, tcp, ip_tot_size);
+
+    tcp->tcp_csum = csum_finish(sum);
+}
+
+/* Tunnel-push handler for STT: encapsulates 'packet' with the header
+ * template in 'data' by delegating to push_tcp_header(). */
+void
+netdev_stt_push_header(struct dp_packet *packet,
+                       const struct ovs_action_push_tnl *data)
+{
+    push_tcp_header(packet, data);
+}
+
 void
 netdev_vport_range(struct unixctl_conn *conn, int argc,
                    const char *argv[], void *aux OVS_UNUSED)
diff --git a/lib/tnl-push-pop.h b/lib/tnl-push-pop.h
index be84ecd..c1f2a68 100644
--- a/lib/tnl-push-pop.h
+++ b/lib/tnl-push-pop.h
@@ -53,4 +53,15 @@ void
 netdev_vport_range(struct unixctl_conn *conn, int argc,
                    const char *argv[], void *aux OVS_UNUSED);
 
+int
+netdev_stt_pop_header(struct dp_packet **packet);
+int
+netdev_stt_build_header(const struct netdev *netdev,
+                        struct ovs_action_push_tnl *data,
+                        const struct flow *tnl_flow);
+void
+netdev_stt_push_header(struct dp_packet *packet,
+                       const struct ovs_action_push_tnl *data);
+int
+netdev_stt_class_init(void);
 #endif
diff --git a/tests/tunnel-push-pop.at b/tests/tunnel-push-pop.at
index b04f4a6..b0363f2 100644
--- a/tests/tunnel-push-pop.at
+++ b/tests/tunnel-push-pop.at
@@ -12,6 +12,8 @@ AT_CHECK([ovs-vsctl add-port int-br t2 -- set Interface t2 type=vxlan \
                        options:remote_ip=1.1.2.93 options:out_key=flow options:csum=true ofport_request=4\
                     -- add-port int-br t4 -- set Interface t4 type=geneve \
                        options:remote_ip=flow options:key=123 ofport_request=5\
+                    -- add-port int-br t5 -- set Interface t5 type=stt \
+                       options:remote_ip=1.1.2.92 options:key=789 ofport_request=6 options:csum=true\
                        ], [0])
 
 AT_CHECK([ovs-appctl dpif/show], [0], [dnl
@@ -25,6 +27,7 @@ dummy at ovs-dummy: hit:0 missed:0
 		t2 2/4789: (vxlan: key=123, remote_ip=1.1.2.92)
 		t3 4/4789: (vxlan: csum=true, out_key=flow, remote_ip=1.1.2.93)
 		t4 5/6081: (geneve: key=123, remote_ip=flow)
+		t5 6/7471: (stt: csum=true, key=789, remote_ip=1.1.2.92)
 ])
 
 dnl First setup dummy interface IP address, then add the route
@@ -51,6 +54,7 @@ AT_CHECK([ovs-appctl tnl/ports/show |sort], [0], [dnl
 Listening ports:
 genev_sys_6081 (6081)
 gre_sys (3)
+stt_sys_7471 (7471)
 vxlan_sys_4789 (4789)
 ])
 
@@ -71,6 +75,13 @@ AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(1),eth(src=f8:bc:12:44:34:
 AT_CHECK([tail -1 stdout], [0],
   [Datapath actions: tnl_pop(6081)
 ])
+
+
+dnl Check STT tunnel pop
+AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(1),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:00),eth_type(0x0800),ipv4(src=1.1.2.92,dst=1.1.2.88,proto=6,tos=0,ttl=64,frag=no),tcp(src=51283,dst=7471)'], [0], [stdout])
+AT_CHECK([tail -1 stdout], [0],
+  [Datapath actions: tnl_pop(7471)
+])
 
 dnl Check VXLAN tunnel push
 AT_CHECK([ovs-ofctl add-flow int-br action=2])
@@ -108,6 +119,13 @@ AT_CHECK([tail -1 stdout], [0],
   [Datapath actions: tnl_push(tnl_port(6081),header(size=58,type=5,eth(dst=f8:bc:12:44:34:b6,src=aa:55:aa:55:00:00,dl_type=0x0800),ipv4(src=1.1.2.88,dst=1.1.2.92,proto=17,tos=0,ttl=64,frag=0x40),udp(src=0,dst=6081,csum=0x0),geneve(crit,vni=0x7b,options({class=0xffff,type=0x80,len=4,0xa}))),out_port(100))
 ])
 
+dnl Check STT tunnel push
+AT_CHECK([ovs-ofctl add-flow int-br action=6])
+AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(2),eth_type(0x0800),ipv4(src=1.1.3.88,dst=1.1.3.112,proto=47,tos=0,ttl=64,frag=no)'], [0], [stdout])
+AT_CHECK([tail -1 stdout], [0],
+  [Datapath actions: tnl_push(tnl_port(7471),header(size=72,type=106,eth(dst=f8:bc:12:44:34:b6,src=aa:55:aa:55:00:00,dl_type=0x0800),ipv4(src=1.1.2.88,dst=1.1.2.92,proto=6,tos=0,ttl=64,frag=0x40),tcp(src=0,dst=7471,seq=0x0,ack=0x0,flags=psh|ack|0x5000,csum=0x0,urg=0x0),stt(tun_id=0x315,ver=0x0,flags=0x1,l4_offset=0x0,res=0x0,mss=0x0,vid=0,pcp=0,cfi=0)),out_port(100))
+])
+
 dnl Check decapsulation of GRE packet
 AT_CHECK([ovs-appctl netdev-dummy/receive p0 'aa55aa550000001b213cab6408004500007e79464000402fba550101025c0101025820006558000001c8fe71d883724fbeb6f4e1494a080045000054ba200000400184861e0000011e00000200004227e75400030af3195500000000f265010000000000101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f3031323334353637'])
 ovs-appctl time/warp 1000
@@ -124,6 +142,15 @@ AT_CHECK([ovs-ofctl dump-ports int-br | grep 'port  3'], [0], [dnl
   port  3: rx pkts=1, bytes=98, drop=0, errs=0, frame=0, over=0, crc=0
 ])
 
+dnl Check STT only accepts encapsulated Ethernet frames
+AT_CHECK([ovs-ofctl del-flows int-br])
+AT_CHECK([ovs-appctl netdev-dummy/receive p0 'aa55aa550000001b213cab6408004500009c00004000400633a70101025c01010258204e1d2f007400000000001c5018ffff7cb8000000010000000000000000000000000315000066b2591a7347427c73ecf94a080045000054b15e400040017e950101045c010104580800a59658cb00018a7b8f560000000016f80a0000000000101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f3031323334353637'])
+ovs-appctl time/warp 1000
+
+AT_CHECK([ovs-ofctl dump-ports int-br | grep 'port  6'], [0], [dnl
+  port  6: rx pkts=1, bytes=98, drop=0, errs=0, frame=0, over=0, crc=0
+])
+
 dnl Check decapsulation of Geneve packet with options
 AT_CAPTURE_FILE([ofctl_monitor.log])
 AT_CHECK([ovs-ofctl monitor int-br 65534 --detach --no-chdir --pidfile 2> ofctl_monitor.log])
-- 
1.8.3.1




More information about the dev mailing list