[ovs-dev] [PATCH 8/8] native tunnel: Add support for STT
Pravin B Shelar
pshelar at nicira.com
Mon Jan 11 07:18:14 UTC 2016
This patch uses the userspace tunneling mechanism to implement the
STT tunneling protocol.
Signed-off-by: Pravin B Shelar <pshelar at nicira.com>
---
lib/netdev-vport.c | 6 +-
lib/odp-util.c | 41 ++-
lib/packets.h | 26 ++
lib/tnl-push-pop.c | 639 ++++++++++++++++++++++++++++++++++++++++++++++-
lib/tnl-push-pop.h | 11 +
tests/tunnel-push-pop.at | 27 ++
6 files changed, 745 insertions(+), 5 deletions(-)
diff --git a/lib/netdev-vport.c b/lib/netdev-vport.c
index 8b41201..c70a596 100644
--- a/lib/netdev-vport.c
+++ b/lib/netdev-vport.c
@@ -907,8 +907,12 @@ netdev_vport_tunnel_register(void)
TUNNEL_CLASS("vxlan", "vxlan_sys", NULL, netdev_vxlan_build_header,
push_udp_header,
netdev_vxlan_pop_header),
+ TUNNEL_CLASS("stt", "stt_sys", netdev_stt_class_init,
+ netdev_stt_build_header,
+ netdev_stt_push_header,
+ netdev_stt_pop_header),
+
TUNNEL_CLASS("lisp", "lisp_sys", NULL, NULL, NULL, NULL),
- TUNNEL_CLASS("stt", "stt_sys", NULL, NULL, NULL, NULL),
};
static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
diff --git a/lib/odp-util.c b/lib/odp-util.c
index f16e113..bdc7391 100644
--- a/lib/odp-util.c
+++ b/lib/odp-util.c
@@ -85,6 +85,8 @@ static void format_geneve_opts(const struct geneve_opt *opt,
static struct nlattr *generate_all_wildcard_mask(const struct attr_len_tbl tbl[],
int max, struct ofpbuf *,
const struct nlattr *key);
+static void format_be64(struct ds *ds, const char *name, ovs_be64 key,
+ const ovs_be64 *mask, bool verbose);
static void format_u128(struct ds *ds, const ovs_u128 *value,
const ovs_u128 *mask, bool verbose);
static int scan_u128(const char *s, ovs_u128 *value, ovs_u128 *mask);
@@ -445,13 +447,29 @@ format_udp_tnl_push_header(struct ds *ds, const struct udp_header *udp)
return udp + 1;
}
+/* Appends a textual rendering of the TCP header 'tcp' to 'ds' in the form
+ * "tcp(src=...,dst=...,seq=...,ack=...,flags=...,csum=...,urg=...)".
+ *
+ * Returns a pointer to the byte that follows the fixed-size TCP header, which
+ * the caller interprets as the start of the tunnel-specific header. */
+static const void *
+format_tcp_tnl_push_header(struct ds *ds, const struct tcp_header *tcp)
+{
+    /* The sequence and acknowledgment numbers are 32 bits wide, so they must
+     * be printed with 32-bit conversion specifiers. */
+    ds_put_format(ds, "tcp(src=%"PRIu16",dst=%"PRIu16",seq=0x%"PRIx32","
+                  "ack=0x%"PRIx32",", ntohs(tcp->tcp_src), ntohs(tcp->tcp_dst),
+                  ntohl(get_16aligned_be32(&tcp->tcp_seq)),
+                  ntohl(get_16aligned_be32(&tcp->tcp_ack)));
+
+    format_flags_masked(ds, "flags", packet_tcp_flag_to_string,
+                        ntohs(tcp->tcp_ctl), TCP_FLAGS(OVS_BE16_MAX),
+                        TCP_FLAGS(OVS_BE16_MAX));
+
+    ds_put_format(ds, ",csum=0x%"PRIx16",urg=0x%"PRIx16")",
+                  ntohs(tcp->tcp_csum), ntohs(tcp->tcp_urg));
+    return tcp + 1;
+}
+
+
static void
format_odp_tnl_push_header(struct ds *ds, struct ovs_action_push_tnl *data)
{
const struct eth_header *eth;
const void *l3;
const void *l4;
- const struct udp_header *udp;
eth = (const struct eth_header *)data->header;
@@ -491,11 +509,12 @@ format_odp_tnl_push_header(struct ds *ds, struct ovs_action_push_tnl *data)
l4 = (ip6 + 1);
}
- udp = (const struct udp_header *) l4;
if (data->tnl_type == OVS_VPORT_TYPE_VXLAN) {
const struct vxlanhdr *vxh;
+ const struct udp_header *udp;
+ udp = (const struct udp_header *) l4;
vxh = format_udp_tnl_push_header(ds, udp);
ds_put_format(ds, "vxlan(flags=0x%"PRIx32",vni=0x%"PRIx32")",
@@ -503,7 +522,9 @@ format_odp_tnl_push_header(struct ds *ds, struct ovs_action_push_tnl *data)
ntohl(get_16aligned_be32(&vxh->vx_vni)) >> 8);
} else if (data->tnl_type == OVS_VPORT_TYPE_GENEVE) {
const struct genevehdr *gnh;
+ const struct udp_header *udp;
+ udp = (const struct udp_header *) l4;
gnh = format_udp_tnl_push_header(ds, udp);
ds_put_format(ds, "geneve(%s%svni=0x%"PRIx32,
@@ -541,6 +562,22 @@ format_odp_tnl_push_header(struct ds *ds, struct ovs_action_push_tnl *data)
options++;
}
ds_put_format(ds, ")");
+ } else if (data->tnl_type == OVS_VPORT_TYPE_STT) {
+ const struct tcp_header *tcp;
+ const struct stthdr *stth;
+
+ tcp = (const struct tcp_header *) l4;
+ stth = format_tcp_tnl_push_header(ds, tcp);
+ ds_put_format(ds, ",stt(");
+ format_be64(ds, "tun_id", get_32aligned_be64(&stth->key), NULL, false);
+ ds_put_format(ds, "ver=0x%"PRIx8",flags=0x%"PRIx8","
+ "l4_offset=0x%"PRIx8",res=0x%"PRIx8","
+ "mss=0x%"PRIx16",",
+ stth->version, stth->flags,
+ stth->l4_offset, stth->reserved, ntohs(stth->mss));
+
+ format_vlan_tci(ds, stth->vlan_tci, OVS_BE16_MAX, false);
+ ds_put_format(ds, ")");
}
ds_put_format(ds, ")");
}
diff --git a/lib/packets.h b/lib/packets.h
index 2157657..5459b0f 100644
--- a/lib/packets.h
+++ b/lib/packets.h
@@ -1021,6 +1021,32 @@ struct vxlanhdr {
#define VXLAN_FLAGS 0x08000000 /* struct vxlanhdr.vx_flags required value. */
+/* STT header */
+
+/* STT frame header as it appears on the wire, immediately after the TCP-like
+ * header.  Uses fixed-width standard types rather than the Linux-only __u8 so
+ * that this header stays portable. */
+struct stthdr {
+    uint8_t version;            /* Only version 0 is accepted on receive. */
+    uint8_t flags;              /* Combination of the STT_* flag bits. */
+    uint8_t l4_offset;          /* Offset of the inner L4 header from the
+                                 * start of the inner frame. */
+    uint8_t reserved;
+    ovs_be16 mss;
+    ovs_be16 vlan_tci;
+    ovs_32aligned_be64 key;     /* Tunnel ID. */
+};
+
+/* Padding after the end of the tunnel headers to provide alignment
+ * for inner packet IP header after 14 byte Ethernet header.
+ */
+#define STT_ETH_PAD 2
+
+#define STT_BASE_HLEN (sizeof(struct stthdr) + STT_ETH_PAD)
+#define STT_HEADER_LEN (sizeof(struct tcp_header) + STT_BASE_HLEN)
+
+/* Bits of struct stthdr.flags.
+ *
+ * STT_CSUM_VERIFIED is the only flag set on frames built by
+ * netdev_stt_build_header().  When STT_CSUM_PARTIAL is set on a received
+ * frame, stt_extract_tnl_md() fills in the inner L4 checksum, using
+ * STT_PROTO_IPV4 (set: IPv4, clear: IPv6) and STT_PROTO_TCP (set: TCP,
+ * clear: UDP) to locate it. */
+#define STT_CSUM_VERIFIED (1 << 0)
+#define STT_CSUM_PARTIAL (1 << 1)
+#define STT_PROTO_IPV4 (1 << 2)
+#define STT_PROTO_TCP (1 << 3)
+#define STT_PROTO_TYPES (STT_PROTO_IPV4 | STT_PROTO_TCP)
+
void ipv6_format_addr(const struct in6_addr *addr, struct ds *);
void ipv6_format_addr_bracket(const struct in6_addr *addr, struct ds *,
bool bracket);
diff --git a/lib/tnl-push-pop.c b/lib/tnl-push-pop.c
index 86023c2..9440033 100644
--- a/lib/tnl-push-pop.c
+++ b/lib/tnl-push-pop.c
@@ -16,8 +16,6 @@
#include <config.h>
-#include "netdev-vport.h"
-
#include <errno.h>
#include <fcntl.h>
#include <sys/socket.h>
@@ -25,6 +23,10 @@
#include <netinet/ip6.h>
#include <sys/ioctl.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <sys/time.h>
+
#include "byte-order.h"
#include "csum.h"
#include "daemon.h"
@@ -32,20 +34,25 @@
#include "dpif.h"
#include "dp-packet.h"
#include "dynamic-string.h"
+#include "entropy.h"
#include "flow.h"
#include "hash.h"
#include "hmap.h"
+#include "id-pool.h"
#include "list.h"
#include "netdev-provider.h"
+#include "netdev-vport.h"
#include "netdev-vport-private.h"
#include "odp-netlink.h"
#include "dp-packet.h"
#include "ovs-router.h"
#include "packets.h"
#include "poll-loop.h"
+#include "random.h"
#include "route-table.h"
#include "shash.h"
#include "socket-util.h"
+#include "timeval.h"
#include "tnl-push-pop.h"
#include "openvswitch/vlog.h"
#include "unaligned.h"
@@ -631,6 +638,634 @@ netdev_geneve_build_header(const struct netdev *netdev,
return 0;
}
+
+/* STT */
+
+/* The maximum amount of memory used to store packets waiting to be reassembled
+ * on a given CPU. Once this threshold is exceeded we will begin freeing the
+ * least recently used fragments.
+ */
+#define REASM_HI_THRESH (4 * 1024 * 1024)
+/* The target for the high memory evictor. Once we have exceeded
+ * REASM_HI_THRESH, we will continue freeing fragments until we hit
+ * this limit.
+ */
+#define REASM_LO_THRESH (3 * 1024 * 1024)
+/* The length of time a given packet has to be reassembled from the time the
+ * first fragment arrives. Once this limit is exceeded it becomes available
+ * for cleaning.
+ */
+
+/* Fragment expiry interval.  Expands to the file-scope variable
+ * 'frag_exp_time' rather than to a constant. */
+#define FRAG_EXP_TIME frag_exp_time
+
+/* Geometry of the per-thread fragment table: FRAG_HASH_ENTRIES slots, indexed
+ * by FRAG_HASH_SHIFT bits of the hash at a time.  A 32-bit hash therefore
+ * yields FRAG_HASH_SEGS candidate slots per lookup. */
+#define FRAG_HASH_SHIFT 8
+#define FRAG_HASH_ENTRIES (1 << FRAG_HASH_SHIFT)
+#define FRAG_HASH_SEGS ((sizeof(uint32_t) * 8) / FRAG_HASH_SHIFT)
+
+/* The length and offset of a fragment are encoded in the sequence number.
+ * STT_SEQ_LEN_SHIFT is the left shift needed to store the length.
+ * STT_SEQ_OFFSET_MASK is the mask to extract the offset.
+ */
+#define STT_SEQ_LEN_SHIFT 16
+#define STT_SEQ_OFFSET_MASK ((1 << STT_SEQ_LEN_SHIFT) - 1)
+
+/* Identifies the STT frame that a received segment belongs to.
+ *
+ * Keys are compared bytewise with memcmp(), so the structure must not
+ * contain uninitialized padding. */
+struct pkt_key {
+ /* Outer IP addresses; IPv4 addresses are stored in IPv4-mapped form. */
+ struct in6_addr ipv6_src;
+ struct in6_addr ipv6_dst;
+ /* TCP acknowledgment number of the segment, in network byte order. */
+ ovs_be32 pkt_seq;
+};
+
+/* One slot of the fragment hash table: a partially reassembled STT frame. */
+struct pkt_frag {
+ /* Head of the chain of received segments, or NULL if the slot is free. */
+ struct dp_packet *pkts;
+ /* cycles_counter() value taken when the first segment was stored. */
+ unsigned long timestamp;
+ /* Element of struct stt_reassemble.frag_lru while the slot is in use. */
+ struct ovs_list lru_node;
+ struct pkt_key key;
+};
+
+/* Per-frame bookkeeping, kept in the head packet of a segment chain. */
+struct first_frag {
+ /* Tail of the chain, for appending in-order segments without a walk. */
+ struct dp_packet *last_pkt;
+ /* Sum of dp_packet_get_allocated() over every packet in the chain. */
+ unsigned int mem_used;
+ /* Total frame length announced in the upper half of the TCP sequence
+  * number. */
+ uint16_t tot_len;
+ /* Payload bytes received so far; the frame is complete when this equals
+  * 'tot_len'. */
+ uint16_t rcvd_len;
+ /* True if a segment added after the first carried the ECN CE mark. */
+ bool set_ecn_ce;
+};
+
+/* Reassembly state attached to every queued segment (see FRAG_DATA()). */
+struct frag_packet_data {
+ /* Offset of this segment within the frame (lower half of the TCP sequence
+  * number). */
+ uint16_t offset;
+ /* Number of bytes that follow the TCP header in this segment. */
+ uint16_t pkt_size;
+ /* Only valid for the first packet in the chain. */
+ struct first_frag first;
+ /* Next segment in offset order, or NULL at the tail. */
+ struct dp_packet *next;
+};
+
+/* The reassembly state has to fit into the packet's context area. */
+BUILD_ASSERT_DECL(DP_PACKET_CONTEXT_SIZE >= sizeof(struct frag_packet_data));
+
+/* FRAG_DATA() yields the reassembly state of 'packet'.
+ * NOTE(review): the assertion above sizes against DP_PACKET_CONTEXT_SIZE
+ * while this macro dereferences the 'data' member -- confirm that 'data' is
+ * the context area and not the packet payload pointer.
+ *
+ * STT_PACKET_DATA() points STT_HEADER_LEN bytes past the start of the L4
+ * header of 'pkt'. */
+#define FRAG_DATA(packet) ((struct frag_packet_data *)(packet)->data)
+#define STT_PACKET_DATA(pkt) ((unsigned char *)dp_packet_l4(pkt) + STT_HEADER_LEN)
+
+/* Reassembly state.  One instance is created per thread by get_reasm() and
+ * stored in thread-specific data. */
+struct stt_reassemble {
+ /* Table of in-progress frames, probed by lookup_frag(). */
+ struct pkt_frag frag_hash[FRAG_HASH_ENTRIES];
+ /* In-use slots; touched slots are moved to the back. */
+ struct ovs_list frag_lru;
+ /* Bytes of packet memory currently held in 'frag_hash'. */
+ unsigned int frag_mem_used;
+ /* ID drawn from the shared ID pool; low 16 bits of generated ack numbers. */
+ uint32_t id;
+ /* Incremented for every ack number generated by ack_seq(). */
+ uint32_t counter;
+};
+
+/* Serializes access to 'id_ppol'. */
+static struct ovs_mutex thread_is_lock;
+/* Pool from which each thread's reassembly state draws its 'id'. */
+static struct id_pool *id_ppol;
+/* Seed mixed into pkt_key_hash(). */
+static uint32_t frag_hash_seed;
+/* Thread-specific-data key that holds each thread's struct stt_reassemble. */
+static ovsthread_key_t per_thread_reasm_data;
+static void evict_frags(struct stt_reassemble *reasm, int mem_limit);
+/* Backing variable of FRAG_EXP_TIME. */
+static uint64_t frag_exp_time;
+
+/* Returns the calling thread's STT reassembly state, creating it on first
+ * use.
+ *
+ * A newly created state starts with an empty fragment table, no memory
+ * accounted, a zero counter, and an ID drawn from the shared ID pool. */
+static struct stt_reassemble *
+get_reasm(void)
+{
+    struct stt_reassemble *reasm;
+    uint32_t id;
+
+    /* After the first call on a thread the state always exists, so this is
+     * the hot path. */
+    reasm = ovsthread_getspecific(per_thread_reasm_data);
+    if (OVS_LIKELY(reasm)) {
+        return reasm;
+    }
+
+    reasm = xmalloc_cacheline(sizeof *reasm);
+    /* Clear everything explicitly: lookup_frag() treats a nonnull 'pkts' as
+     * an in-use slot and 'frag_mem_used' feeds the eviction thresholds, so
+     * neither may start out with indeterminate contents. */
+    memset(reasm, 0, sizeof *reasm);
+    list_init(&reasm->frag_lru);
+
+    /* The pool is shared between threads.  A single allocation attempt is
+     * enough: retrying under the lock cannot change the outcome. */
+    ovs_mutex_lock(&thread_is_lock);
+    if (!id_pool_alloc_id(id_ppol, &id)) {
+        OVS_NOT_REACHED();
+    }
+    ovs_mutex_unlock(&thread_is_lock);
+
+    reasm->id = id;
+    ovsthread_setspecific(per_thread_reasm_data, reasm);
+    return reasm;
+}
+
+/* Thread-specific-data destructor for the state created by get_reasm().
+ *
+ * Drops every pending fragment, returns the thread's ID to the shared pool
+ * and releases the state itself. */
+static void
+reasm_destructor(void *_reasm)
+{
+    struct stt_reassemble *reasm = _reasm;
+
+    evict_frags(reasm, 0);
+
+    ovs_mutex_lock(&thread_is_lock);
+    id_pool_free_id(id_ppol, reasm->id);
+    ovs_mutex_unlock(&thread_is_lock);
+
+    /* The state was obtained from xmalloc_cacheline(), so it has to be
+     * released with the matching deallocator. */
+    free_cacheline(reasm);
+}
+
+/* One-time initialization of the STT tunnel class: creates the
+ * thread-specific-data key for reassembly state, picks the fragment hash
+ * seed, and sets up the ID pool, its mutex and the fragment expiry interval.
+ *
+ * Always returns 0. */
+int
+netdev_stt_class_init(void)
+{
+    ovsthread_key_create(&per_thread_reasm_data, reasm_destructor);
+    /* libc random() is never seeded here and would produce the same "seed"
+     * on every run; use the project's random number generator instead. */
+    frag_hash_seed = random_uint32();
+    ovs_mutex_init(&thread_is_lock);
+    id_ppol = id_pool_create(0, USHRT_MAX);
+    frag_exp_time = 30 * OVS_HZ;
+    return 0;
+}
+
+/* Returns true if 'a' and 'b' are bytewise identical keys. */
+static bool pkt_key_match(const struct pkt_key *a, const struct pkt_key *b)
+{
+    return memcmp(a, b, sizeof *a) == 0;
+}
+
+/* Returns a hash of 'key': the address pair is hashed as bytes and then mixed
+ * with the frame identifier and the hash seed. */
+static uint32_t pkt_key_hash(const struct pkt_key *key)
+{
+    /* Hash the bytes that 'key' points to.  Passing '&key' would hash the
+     * local pointer variable (and read past it) instead of the addresses. */
+    return hash_3words(hash_bytes(key, offsetof(struct pkt_key, pkt_seq), 0),
+                       (uint32_t)(key->pkt_seq), frag_hash_seed);
+}
+
+/* Deletes every packet of the chain that starts at 'pkt', following the
+ * FRAG_DATA()->next links.  'pkt' must not be NULL: the first packet is
+ * dereferenced unconditionally. */
+static inline void list_packet_delete(struct dp_packet *pkt)
+{
+ do {
+ /* Fetch the link before the packet that holds it is deleted. */
+ struct dp_packet *next = FRAG_DATA(pkt)->next;
+
+ dp_packet_delete(pkt);
+ pkt = next;
+ } while (pkt);
+}
+
+/* Releases the packets held by 'frag', subtracts their memory from
+ * 'reasm->frag_mem_used' and marks the slot free.  'frag->pkts' must not be
+ * NULL.
+ *
+ * This function does not touch 'frag->lru_node'. */
+static void free_frag(struct stt_reassemble *reasm, struct pkt_frag *frag)
+{
+ reasm->frag_mem_used -= FRAG_DATA(frag->pkts)->first.mem_used;
+ list_packet_delete(frag->pkts);
+ frag->pkts = NULL;
+}
+
+/* Maps an LRU list node back to the fragment slot that embeds it. */
+static struct pkt_frag *
+pkt_frag_from_node(const struct ovs_list *node)
+{
+    struct pkt_frag *owner = CONTAINER_OF(node, struct pkt_frag, lru_node);
+
+    return owner;
+}
+
+/* Frees pending frames of 'reasm', least recently used first, until at most
+ * 'mem_limit' bytes of packet memory remain accounted or nothing is left. */
+static void evict_frags(struct stt_reassemble *reasm, int mem_limit)
+{
+    while (!list_is_empty(&reasm->frag_lru) &&
+           reasm->frag_mem_used > (unsigned int) mem_limit) {
+        struct pkt_frag *frag;
+
+        /* New and recently touched frames are pushed onto the back of the
+         * list, so the least recently used one sits at the front. */
+        frag = pkt_frag_from_node(list_pop_front(&reasm->frag_lru));
+        free_frag(reasm, frag);
+    }
+
+    /* Update Fragment cache expiration time. */
+    frag_exp_time = 30 * OVS_HZ;
+}
+
+/* Finds the fragment slot to use for the frame identified by 'key', whose
+ * hash is 'hash'.
+ *
+ * Probes FRAG_HASH_SEGS slots, one per FRAG_HASH_SHIFT-bit group of 'hash'.
+ * If one of them holds an unexpired frame with a matching key, returns it
+ * with its packets intact.  Otherwise picks a victim among the probed slots
+ * (a free slot if there is one, else the oldest), discards whatever it holds
+ * and returns it with 'pkts' set to NULL. */
+static struct pkt_frag *
+lookup_frag(struct stt_reassemble *reasm,
+            const struct pkt_key *key, uint32_t hash)
+{
+    struct pkt_frag *victim_frag = NULL;
+    int i;
+
+    for (i = 0; i < FRAG_HASH_SEGS; i++) {
+        struct pkt_frag *frag;
+
+        frag = &reasm->frag_hash[hash & (FRAG_HASH_ENTRIES - 1)];
+
+        /* A match only counts while the frame is still within its expiry
+         * interval; an expired frame is left to be recycled below. */
+        if (frag->pkts &&
+            pkt_key_match(&frag->key, key) &&
+            time_before(cycles_counter(), frag->timestamp + FRAG_EXP_TIME)) {
+            return frag;
+        }
+
+        if (!victim_frag ||
+            (victim_frag->pkts &&
+             (!frag->pkts ||
+              time_before(frag->timestamp, victim_frag->timestamp)))) {
+            victim_frag = frag;
+        }
+
+        hash >>= FRAG_HASH_SHIFT;
+    }
+
+    if (victim_frag->pkts) {
+        /* An in-use slot is on the LRU list.  Unlink it before recycling so
+         * that the caller can link it again without corrupting the list. */
+        list_remove(&victim_frag->lru_node);
+        free_frag(reasm, victim_frag);
+    }
+
+    return victim_frag;
+}
+
+/* Merges the chain of segments that starts at 'pkt' into one new packet.
+ *
+ * The result is a copy of the first segment, headers included, followed by
+ * the payload of every later segment in chain order.  All packets of the
+ * chain are deleted; the caller owns the returned packet. */
+static struct dp_packet *
+packet_merge(struct dp_packet *pkt)
+{
+    struct dp_packet *next;
+    struct dp_packet *m;
+
+    m = dp_packet_clone(pkt);
+    next = FRAG_DATA(pkt)->next;
+    dp_packet_delete(pkt);
+
+    pkt = next;
+    while (pkt) {
+        size_t size = FRAG_DATA(pkt)->pkt_size;
+        const char *payload;
+        void *data;
+
+        /* 'pkt_size' counts the bytes that directly follow the TCP header,
+         * so that is where the copy has to start.  Starting STT_HEADER_LEN
+         * bytes into the L4 data would skip payload and read past the end of
+         * the segment. */
+        payload = (const char *) dp_packet_l4(pkt) + sizeof(struct tcp_header);
+        data = dp_packet_put_uninit(m, size);
+        memcpy(data, payload, size);
+
+        next = FRAG_DATA(pkt)->next;
+        dp_packet_delete(pkt);
+        pkt = next;
+    }
+
+    return m;
+}
+
+/* Feeds one received STT segment into the calling thread's reassembly state.
+ * Takes ownership of 'packet'.
+ *
+ * The upper 16 bits of the TCP sequence number carry the total frame length
+ * and the lower 16 bits carry this segment's offset; the TCP acknowledgment
+ * number, together with the outer IP addresses, identifies the frame.
+ *
+ * Returns 'packet' itself if it is an unfragmented frame, the merged frame
+ * if 'packet' completes a pending one, and NULL if the segment was queued or
+ * dropped. */
+static struct dp_packet *
+reassemble(struct dp_packet *packet)
+{
+    struct tcp_header *tcph = dp_packet_l4(packet);
+    size_t l4_size = dp_packet_l4_size(packet);
+    struct stt_reassemble *reasm;
+    struct dp_packet *last_pkt;
+    struct pkt_frag *frag;
+    struct pkt_key key;
+    uint32_t hash;
+    uint32_t seq;
+    bool is_ipv6;
+    uint8_t tos;
+    int pkt_size;
+    int tot_len;
+
+    /* Validate the length in unsigned arithmetic before touching the TCP
+     * header.  Checking a possibly negative int against a size_t would let
+     * truncated segments through. */
+    if (!tcph || l4_size < sizeof *tcph + STT_BASE_HLEN) {
+        goto out_free;
+    }
+    pkt_size = l4_size - sizeof *tcph;
+
+    seq = ntohl(get_16aligned_be32(&tcph->tcp_seq));
+    tot_len = seq >> STT_SEQ_LEN_SHIFT;
+    FRAG_DATA(packet)->offset = seq & STT_SEQ_OFFSET_MASK;
+    FRAG_DATA(packet)->next = NULL;
+    FRAG_DATA(packet)->pkt_size = pkt_size;
+
+    if (FRAG_DATA(packet)->offset + pkt_size > tot_len) {
+        goto out_free;
+    }
+
+    /* A segment that holds the whole frame needs no reassembly. */
+    if (tot_len == pkt_size) {
+        goto out;
+    }
+
+    reasm = get_reasm();
+    is_ipv6 = is_header_ipv6(dp_packet_data(packet));
+
+    if (is_ipv6) {
+        struct ovs_16aligned_ip6_hdr *ip6 = dp_packet_l3(packet);
+
+        memcpy(&key.ipv6_src, &ip6->ip6_src.be16, sizeof ip6->ip6_src);
+        memcpy(&key.ipv6_dst, &ip6->ip6_dst.be16, sizeof ip6->ip6_dst);
+    } else {
+        struct ip_header *iph = dp_packet_l3(packet);
+
+        in6_addr_set_mapped_ipv4(&key.ipv6_src,
+                                 get_16aligned_be32(&iph->ip_src));
+        in6_addr_set_mapped_ipv4(&key.ipv6_dst,
+                                 get_16aligned_be32(&iph->ip_dst));
+    }
+    key.pkt_seq = get_16aligned_be32(&tcph->tcp_ack);
+    hash = pkt_key_hash(&key);
+
+    if (reasm->frag_mem_used + dp_packet_get_allocated(packet)
+        > REASM_HI_THRESH) {
+        evict_frags(reasm, REASM_LO_THRESH);
+    }
+
+    frag = lookup_frag(reasm, &key, hash);
+    if (!frag->pkts) {
+        /* First segment seen for this frame: it becomes the chain head and
+         * carries the per-frame bookkeeping. */
+        frag->pkts = packet;
+        frag->key = key;
+        frag->timestamp = cycles_counter();
+        FRAG_DATA(packet)->first.last_pkt = packet;
+        FRAG_DATA(packet)->first.mem_used = dp_packet_get_allocated(packet);
+        FRAG_DATA(packet)->first.tot_len = tot_len;
+        FRAG_DATA(packet)->first.rcvd_len = pkt_size;
+        FRAG_DATA(packet)->first.set_ecn_ce = false;
+        list_push_back(&reasm->frag_lru, &frag->lru_node);
+        reasm->frag_mem_used += dp_packet_get_allocated(packet);
+
+        packet = NULL;
+        goto out;
+    }
+
+    /* Optimize for the common case where fragments are received in-order
+     * and not overlapping.
+     */
+    last_pkt = FRAG_DATA(frag->pkts)->first.last_pkt;
+    if (FRAG_DATA(last_pkt)->offset + FRAG_DATA(last_pkt)->pkt_size ==
+        FRAG_DATA(packet)->offset) {
+        FRAG_DATA(last_pkt)->next = packet;
+        FRAG_DATA(frag->pkts)->first.last_pkt = packet;
+    } else {
+        struct dp_packet *prev = NULL, *next;
+
+        /* Find the neighbours of 'packet' in offset order. */
+        for (next = frag->pkts; next; next = FRAG_DATA(next)->next) {
+            if (FRAG_DATA(next)->offset >= FRAG_DATA(packet)->offset) {
+                break;
+            }
+            prev = next;
+        }
+
+        /* Overlapping fragments aren't allowed.  We shouldn't start
+         * before the end of the previous fragment.
+         */
+        if (prev &&
+            FRAG_DATA(prev)->offset + FRAG_DATA(prev)->pkt_size >
+            FRAG_DATA(packet)->offset) {
+            goto out_free;
+        }
+
+        /* We also shouldn't end after the beginning of the next
+         * fragment.
+         */
+        if (next &&
+            FRAG_DATA(packet)->offset + pkt_size > FRAG_DATA(next)->offset) {
+            goto out_free;
+        }
+
+        if (prev) {
+            FRAG_DATA(prev)->next = packet;
+        } else {
+            /* 'packet' becomes the new head, so the per-frame bookkeeping
+             * moves over to it. */
+            FRAG_DATA(packet)->first = FRAG_DATA(frag->pkts)->first;
+            frag->pkts = packet;
+        }
+
+        if (next) {
+            FRAG_DATA(packet)->next = next;
+        } else {
+            FRAG_DATA(frag->pkts)->first.last_pkt = packet;
+        }
+    }
+
+    if (is_ipv6) {
+        struct ovs_16aligned_ip6_hdr *ip6 = dp_packet_l3(packet);
+
+        tos = ntohl(get_16aligned_be32(&ip6->ip6_flow)) >> 20;
+    } else {
+        struct ip_header *iph = dp_packet_l3(packet);
+
+        tos = iph->ip_tos;
+    }
+
+    FRAG_DATA(frag->pkts)->first.set_ecn_ce |= IP_ECN_is_ce(tos);
+    FRAG_DATA(frag->pkts)->first.rcvd_len += pkt_size;
+    FRAG_DATA(frag->pkts)->first.mem_used += dp_packet_get_allocated(packet);
+    reasm->frag_mem_used += dp_packet_get_allocated(packet);
+
+    if (FRAG_DATA(frag->pkts)->first.tot_len ==
+        FRAG_DATA(frag->pkts)->first.rcvd_len) {
+        struct dp_packet *frag_head = frag->pkts;
+
+        /* Every byte has arrived: hand the merged frame to the caller and
+         * release the slot. */
+        if (FRAG_DATA(frag_head)->first.set_ecn_ce) {
+            IP_ECN_set_ce(frag_head, is_ipv6);
+        }
+
+        list_remove(&frag->lru_node);
+        reasm->frag_mem_used -= FRAG_DATA(frag_head)->first.mem_used;
+        frag->pkts = NULL;
+        packet = packet_merge(frag_head);
+    } else {
+        /* Still incomplete: mark the frame as most recently used. */
+        list_remove(&frag->lru_node);
+        list_push_back(&reasm->frag_lru, &frag->lru_node);
+        packet = NULL;
+    }
+
+out:
+    return packet;
+out_free:
+    dp_packet_delete(packet);
+    return NULL;
+}
+
+/* Returns true if the checksum over the pseudo-header and the whole L4
+ * portion of 'packet' folds to zero, false otherwise. */
+static bool
+valid_tcp_checksum(struct dp_packet *packet)
+{
+    uint32_t sum;
+
+    sum = is_header_ipv6(dp_packet_data(packet))
+          ? packet_csum_pseudoheader6(dp_packet_l3(packet))
+          : packet_csum_pseudoheader(dp_packet_l3(packet));
+    sum = csum_continue(sum, dp_packet_l4(packet), dp_packet_l4_size(packet));
+
+    return !csum_finish(sum);
+}
+
+/* Extracts the outer IP metadata of 'packet' into 'tnl' through
+ * ip_extract_tnl_md(), then records the TCP ports and marks the tunnel as
+ * checksummed.
+ *
+ * Returns a pointer to the byte after the TCP header, or NULL if
+ * ip_extract_tnl_md() fails. */
+static void *
+tcp_extract_tnl_md(struct dp_packet *packet, struct flow_tnl *tnl,
+                   unsigned int *hlen)
+{
+    struct tcp_header *tcp = ip_extract_tnl_md(packet, tnl, hlen);
+
+    if (tcp) {
+        tnl->flags |= FLOW_TNL_F_CSUM;
+        tnl->tp_src = tcp->tcp_src;
+        tnl->tp_dst = tcp->tcp_dst;
+        return tcp + 1;
+    }
+    return NULL;
+}
+
+/* Parses the outer headers of the complete STT frame 'packet' into its
+ * tunnel metadata and strips them, leaving 'packet' positioned at the inner
+ * Ethernet frame.
+ *
+ * If the sender set STT_CSUM_PARTIAL, also computes the inner L4 checksum
+ * and stores it in the inner TCP or UDP header.
+ *
+ * Returns 0 on success, EINVAL if the headers cannot be parsed, the STT
+ * version is not 0, or 'l4_offset' is inconsistent with the frame. */
+static int
+stt_extract_tnl_md(struct dp_packet *packet)
+{
+    struct pkt_metadata *md = &packet->md;
+    struct flow_tnl *tnl = &md->tunnel;
+    uint8_t flags, l4_offset;
+    struct stthdr *stth;
+    uint32_t hlen;
+
+    pkt_metadata_init_tnl(md);
+    stth = tcp_extract_tnl_md(packet, tnl, &hlen);
+    if (!stth) {
+        return EINVAL;
+    }
+
+    if (stth->version != 0) {
+        /* No trailing newline, as in other log messages. */
+        VLOG_WARN_RL(&err_rl, "invalid STT version = %d", stth->version);
+        return EINVAL;
+    }
+
+    /* Copy everything needed out of the STT header before the packet is
+     * repositioned; 'stth' is not used after that point. */
+    flags = stth->flags;
+    l4_offset = stth->l4_offset;
+
+    tnl->tun_id = get_32aligned_be64(&stth->key);
+    tnl->flags |= FLOW_TNL_F_KEY;
+
+    dp_packet_reset_packet(packet, hlen + STT_HEADER_LEN);
+
+    if (flags & STT_CSUM_PARTIAL) {
+        bool inner_tcp = (flags & STT_PROTO_TCP) != 0;
+        bool inner_ipv4 = (flags & STT_PROTO_IPV4) != 0;
+        uint16_t csum_offset;
+        int l3_header_size;
+        int l4_header_size;
+        uint32_t l4_csum;
+        ovs_be16 *csum_ptr;
+
+        /* STT_PROTO_TCP selects TCP over UDP; STT_PROTO_IPV4 selects IPv4
+         * over IPv6. */
+        if (inner_tcp) {
+            csum_offset = offsetof(struct tcp_header, tcp_csum);
+            l4_header_size = sizeof(struct tcp_header);
+        } else {
+            csum_offset = offsetof(struct udp_header, udp_csum);
+            l4_header_size = sizeof(struct udp_header);
+        }
+        l3_header_size = inner_ipv4
+                         ? sizeof(struct ip_header)
+                         : sizeof(struct ovs_16aligned_ip6_hdr);
+
+        /* The inner L4 header cannot start inside the Ethernet or IP
+         * header, and must fit into the frame. */
+        if (l4_offset < ETH_HEADER_LEN + l3_header_size) {
+            return EINVAL;
+        }
+        if (dp_packet_size(packet) < l4_offset + l4_header_size) {
+            return EINVAL;
+        }
+
+        csum_ptr = (ovs_be16 *) ((uint16_t *) dp_packet_data(packet)
+                                 + (l4_offset >> 1));
+        l4_csum = csum_continue(0, csum_ptr,
+                                dp_packet_size(packet) - l4_offset);
+        *(csum_ptr + (csum_offset >> 1)) = csum_finish(l4_csum);
+    }
+
+    return 0;
+}
+
+/* Decapsulates the STT segment '*p_packet'.
+ *
+ * Returns EINVAL, leaving '*p_packet' untouched, if the TCP checksum is bad.
+ * Otherwise replaces '*p_packet' with the result of reassembly: NULL while
+ * the frame is incomplete (0 is returned), or the complete frame, in which
+ * case the result of extracting its tunnel metadata is returned. */
+int
+netdev_stt_pop_header(struct dp_packet **p_packet)
+{
+    struct dp_packet *whole;
+
+    if (!valid_tcp_checksum(*p_packet)) {
+        return EINVAL;
+    }
+
+    whole = reassemble(*p_packet);
+    *p_packet = whole;
+
+    return whole ? stt_extract_tnl_md(whole) : 0;
+}
+
+/* Fills in the TCP-level part of the tunnel header template in
+ * 'data->header': sets the IP protocol to TCP, and writes the destination
+ * port from 'tnl_cfg', the ACK|PSH control bits with the data offset of a
+ * bare TCP header, and the maximum window size.
+ *
+ * Stores the combined Ethernet and IP header length in '*hlen' and returns a
+ * pointer to the byte after the TCP header. */
+static void *
+tcp_build_header(struct netdev_tunnel_config *tnl_cfg,
+                 struct ovs_action_push_tnl *data,
+                 unsigned int *hlen)
+{
+    struct tcp_header *tcp;
+
+    if (is_header_ipv6(data->header)) {
+        struct ovs_16aligned_ip6_hdr *ip6 = ipv6_hdr(data->header);
+
+        ip6->ip6_nxt = IPPROTO_TCP;
+        tcp = (struct tcp_header *) (ip6 + 1);
+        *hlen = sizeof(struct eth_header) + IPV6_HEADER_LEN;
+    } else {
+        struct ip_header *ip = ip_hdr(data->header);
+
+        ip->ip_proto = IPPROTO_TCP;
+        tcp = (struct tcp_header *) (ip + 1);
+        *hlen = sizeof(struct eth_header) + IP_HEADER_LEN;
+    }
+
+    tcp->tcp_dst = tnl_cfg->dst_port;
+    tcp->tcp_ctl = TCP_CTL(TCP_ACK | TCP_PSH, sizeof(struct tcp_header) >> 2);
+    tcp->tcp_winsz = htons(USHRT_MAX);
+
+    return tcp + 1;
+}
+
+/* Completes the STT tunnel header template in 'data' for 'netdev': fills in
+ * the TCP part, marks the STT header as checksum-verified with no VLAN tag,
+ * stores the tunnel ID from 'tnl_flow', and records the header length and
+ * tunnel type.
+ *
+ * Always returns 0. */
+int
+netdev_stt_build_header(const struct netdev *netdev,
+                        struct ovs_action_push_tnl *data,
+                        const struct flow *tnl_flow)
+{
+    struct netdev_vport *dev = netdev_vport_cast(netdev);
+    unsigned int outer_len;
+    struct stthdr *stt;
+
+    /* XXX: RCUfy tnl_cfg. */
+    ovs_mutex_lock(&dev->mutex);
+
+    stt = tcp_build_header(&dev->tnl_cfg, data, &outer_len);
+    stt->flags = STT_CSUM_VERIFIED;
+    stt->vlan_tci = 0;
+    put_32aligned_be64(&stt->key, tnl_flow->tunnel.tun_id);
+
+    ovs_mutex_unlock(&dev->mutex);
+
+    data->header_len = outer_len + STT_HEADER_LEN;
+    data->tnl_type = OVS_VPORT_TYPE_STT;
+    return 0;
+}
+
+/* Returns the next acknowledgment number for the calling thread: the
+ * thread's running counter shifted into the upper 16 bits, OR-ed with the
+ * thread's ID.  Advances the counter. */
+static uint32_t ack_seq(void)
+{
+    struct stt_reassemble *reasm = get_reasm();
+    uint32_t count = reasm->counter++;
+
+    return (count << 16) | reasm->id;
+}
+
+/* Prepends the prebuilt tunnel header in 'data' to 'packet' and fills in the
+ * per-packet TCP fields: source port, sequence number (which encodes the STT
+ * frame length), acknowledgment number (which identifies the frame) and
+ * checksum. */
+static void
+push_tcp_header(struct dp_packet *packet,
+ const struct ovs_action_push_tnl *data)
+{
+ struct tcp_header *tcp;
+ uint32_t csum, stt_len;
+ int ip_tot_size;
+
+ tcp = push_ip_header(packet, data->header, data->header_len, &ip_tot_size);
+
+ /* set tcp src port */
+ tcp->tcp_src = get_src_port(packet);
+
+ /* The frame length goes into the upper 16 bits of the sequence number;
+  * the offset in the lower 16 bits stays 0.
+  * NOTE(review): this presumes push_ip_header() reports the L4 length
+  * (TCP header included) in 'ip_tot_size' -- confirm. */
+ stt_len = (ip_tot_size - sizeof(struct tcp_header));
+ put_16aligned_be32(&tcp->tcp_seq, htonl(stt_len << STT_SEQ_LEN_SHIFT));
+ put_16aligned_be32(&tcp->tcp_ack, htonl(ack_seq()));
+
+ /* The checksum is computed last, after every other field is final. */
+ if (is_header_ipv6(dp_packet_data(packet))) {
+ csum = packet_csum_pseudoheader6(ipv6_hdr(dp_packet_data(packet)));
+ } else {
+ csum = packet_csum_pseudoheader(ip_hdr(dp_packet_data(packet)));
+ }
+
+ csum = csum_continue(csum, tcp, ip_tot_size);
+ tcp->tcp_csum = csum_finish(csum);
+}
+
+/* Encapsulates 'packet' in the STT tunnel header described by 'data'.  All
+ * of the work is done by push_tcp_header(). */
+void
+netdev_stt_push_header(struct dp_packet *packet,
+ const struct ovs_action_push_tnl *data)
+{
+ push_tcp_header(packet, data);
+}
+
void
netdev_vport_range(struct unixctl_conn *conn, int argc,
const char *argv[], void *aux OVS_UNUSED)
diff --git a/lib/tnl-push-pop.h b/lib/tnl-push-pop.h
index be84ecd..c1f2a68 100644
--- a/lib/tnl-push-pop.h
+++ b/lib/tnl-push-pop.h
@@ -53,4 +53,15 @@ void
netdev_vport_range(struct unixctl_conn *conn, int argc,
const char *argv[], void *aux OVS_UNUSED);
+/* STT tunnel support: receive-side decapsulation, header template
+ * construction, transmit-side encapsulation and one-time class
+ * initialization. */
+int
+netdev_stt_pop_header(struct dp_packet **packet);
+int
+netdev_stt_build_header(const struct netdev *netdev,
+ struct ovs_action_push_tnl *data,
+ const struct flow *tnl_flow);
+void
+netdev_stt_push_header(struct dp_packet *packet,
+ const struct ovs_action_push_tnl *data);
+int
+netdev_stt_class_init(void);
#endif
diff --git a/tests/tunnel-push-pop.at b/tests/tunnel-push-pop.at
index b04f4a6..b0363f2 100644
--- a/tests/tunnel-push-pop.at
+++ b/tests/tunnel-push-pop.at
@@ -12,6 +12,8 @@ AT_CHECK([ovs-vsctl add-port int-br t2 -- set Interface t2 type=vxlan \
options:remote_ip=1.1.2.93 options:out_key=flow options:csum=true ofport_request=4\
-- add-port int-br t4 -- set Interface t4 type=geneve \
options:remote_ip=flow options:key=123 ofport_request=5\
+ -- add-port int-br t5 -- set Interface t5 type=stt \
+ options:remote_ip=1.1.2.92 options:key=789 ofport_request=6 options:csum=true\
], [0])
AT_CHECK([ovs-appctl dpif/show], [0], [dnl
@@ -25,6 +27,7 @@ dummy at ovs-dummy: hit:0 missed:0
t2 2/4789: (vxlan: key=123, remote_ip=1.1.2.92)
t3 4/4789: (vxlan: csum=true, out_key=flow, remote_ip=1.1.2.93)
t4 5/6081: (geneve: key=123, remote_ip=flow)
+ t5 6/7471: (stt: csum=true, key=789, remote_ip=1.1.2.92)
])
dnl First setup dummy interface IP address, then add the route
@@ -51,6 +54,7 @@ AT_CHECK([ovs-appctl tnl/ports/show |sort], [0], [dnl
Listening ports:
genev_sys_6081 (6081)
gre_sys (3)
+stt_sys_7471 (7471)
vxlan_sys_4789 (4789)
])
@@ -71,6 +75,13 @@ AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(1),eth(src=f8:bc:12:44:34:
AT_CHECK([tail -1 stdout], [0],
[Datapath actions: tnl_pop(6081)
])
+i
+
+dnl Check STT tunnel pop
+AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(1),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:00),eth_type(0x0800),ipv4(src=1.1.2.92,dst=1.1.2.88,proto=6,tos=0,ttl=64,frag=no),tcp(src=51283,dst=7471)'], [0], [stdout])
+AT_CHECK([tail -1 stdout], [0],
+ [Datapath actions: tnl_pop(7471)
+])
dnl Check VXLAN tunnel push
AT_CHECK([ovs-ofctl add-flow int-br action=2])
@@ -108,6 +119,13 @@ AT_CHECK([tail -1 stdout], [0],
[Datapath actions: tnl_push(tnl_port(6081),header(size=58,type=5,eth(dst=f8:bc:12:44:34:b6,src=aa:55:aa:55:00:00,dl_type=0x0800),ipv4(src=1.1.2.88,dst=1.1.2.92,proto=17,tos=0,ttl=64,frag=0x40),udp(src=0,dst=6081,csum=0x0),geneve(crit,vni=0x7b,options({class=0xffff,type=0x80,len=4,0xa}))),out_port(100))
])
+dnl Check STT tunnel push
+AT_CHECK([ovs-ofctl add-flow int-br action=6])
+AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(2),eth_type(0x0800),ipv4(src=1.1.3.88,dst=1.1.3.112,proto=47,tos=0,ttl=64,frag=no)'], [0], [stdout])
+AT_CHECK([tail -1 stdout], [0],
+ [Datapath actions: tnl_push(tnl_port(7471),header(size=72,type=106,eth(dst=f8:bc:12:44:34:b6,src=aa:55:aa:55:00:00,dl_type=0x0800),ipv4(src=1.1.2.88,dst=1.1.2.92,proto=6,tos=0,ttl=64,frag=0x40),tcp(src=0,dst=7471,seq=0x0,ack=0x0,flags=psh|ack|0x5000,csum=0x0,urg=0x0),stt(tun_id=0x315,ver=0x0,flags=0x1,l4_offset=0x0,res=0x0,mss=0x0,vid=0,pcp=0,cfi=0)),out_port(100))
+])
+
dnl Check decapsulation of GRE packet
AT_CHECK([ovs-appctl netdev-dummy/receive p0 'aa55aa550000001b213cab6408004500007e79464000402fba550101025c0101025820006558000001c8fe71d883724fbeb6f4e1494a080045000054ba200000400184861e0000011e00000200004227e75400030af3195500000000f265010000000000101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f3031323334353637'])
ovs-appctl time/warp 1000
@@ -124,6 +142,15 @@ AT_CHECK([ovs-ofctl dump-ports int-br | grep 'port 3'], [0], [dnl
port 3: rx pkts=1, bytes=98, drop=0, errs=0, frame=0, over=0, crc=0
])
+dnl Check STT only accepts encapsulated Ethernet frames
+AT_CHECK([ovs-ofctl del-flows int-br])
+AT_CHECK([ovs-appctl netdev-dummy/receive p0 'aa55aa550000001b213cab6408004500009c00004000400633a70101025c01010258204e1d2f007400000000001c5018ffff7cb8000000010000000000000000000000000315000066b2591a7347427c73ecf94a080045000054b15e400040017e950101045c010104580800a59658cb00018a7b8f560000000016f80a0000000000101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f3031323334353637'])
+ovs-appctl time/warp 1000
+
+AT_CHECK([ovs-ofctl dump-ports int-br | grep 'port 6'], [0], [dnl
+ port 6: rx pkts=1, bytes=98, drop=0, errs=0, frame=0, over=0, crc=0
+])
+
dnl Check decapsulation of Geneve packet with options
AT_CAPTURE_FILE([ofctl_monitor.log])
AT_CHECK([ovs-ofctl monitor int-br 65534 --detach --no-chdir --pidfile 2> ofctl_monitor.log])
--
1.8.3.1
More information about the dev
mailing list