[ovs-dev] [PATCH 04/12] conntrack: New userspace connection tracker.

Daniele Di Proietto diproiettod at vmware.com
Mon Nov 16 06:21:15 UTC 2015


This commit adds the conntrack module.

It is a connection tracker that resides entirely in userspace.  Its
primary user will be the dpif-netdev datapath.

The module's main goal is to provide conntrack_execute(), which offers a
convenient interface to implement the datapath ct() action.

The conntrack module uses two submodules to deal with the l4 protocol
details (conntrack-other for UDP and ICMP, conntrack-tcp for TCP).

The conntrack-tcp submodule implementation is adapted from FreeBSD's pf
subsystem, therefore it's BSD licensed.  It has been slightly altered to
match the OVS coding style and to allow the pickup of already
established connections.

Signed-off-by: Daniele Di Proietto <diproiettod at vmware.com>
---
 lib/automake.mk         |   5 +
 lib/conntrack-other.c   |  91 ++++++
 lib/conntrack-private.h |  77 +++++
 lib/conntrack-tcp.c     | 475 +++++++++++++++++++++++++++
 lib/conntrack.c         | 854 ++++++++++++++++++++++++++++++++++++++++++++++++
 lib/conntrack.h         | 144 ++++++++
 6 files changed, 1646 insertions(+)
 create mode 100644 lib/conntrack-other.c
 create mode 100644 lib/conntrack-private.h
 create mode 100644 lib/conntrack-tcp.c
 create mode 100644 lib/conntrack.c
 create mode 100644 lib/conntrack.h

diff --git a/lib/automake.mk b/lib/automake.mk
index 6a20e55..ef8559e 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -45,6 +45,11 @@ lib_libopenvswitch_la_SOURCES = \
 	lib/compiler.h \
 	lib/connectivity.c \
 	lib/connectivity.h \
+	lib/conntrack-private.h \
+	lib/conntrack-tcp.c \
+	lib/conntrack-other.c \
+	lib/conntrack.c \
+	lib/conntrack.h \
 	lib/coverage.c \
 	lib/coverage.h \
 	lib/crc32c.c \
diff --git a/lib/conntrack-other.c b/lib/conntrack-other.c
new file mode 100644
index 0000000..c0d57a2
--- /dev/null
+++ b/lib/conntrack-other.c
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2015 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+
+#include "conntrack-private.h"
+#include "dp-packet.h"
+
+/* Connection states for non-TCP (UDP, ICMP) connections. */
+enum other_state {
+    OTHERS_FIRST,     /* Only a single packet seen, original direction. */
+    OTHERS_MULTIPLE,  /* Multiple packets seen, original direction only. */
+    OTHERS_BIDIR,     /* Traffic seen in both directions. */
+};
+
+/* A non-TCP connection: the generic 'struct conn' plus the simple
+ * three-value state machine above. */
+struct conn_other {
+    struct conn up;
+    enum other_state state;
+};
+
+/* Expiration timeouts (milliseconds), indexed by connection state. */
+static const long long other_timeouts[] = {
+    [OTHERS_FIRST] = 60 * 1000,
+    [OTHERS_MULTIPLE] = 60 * 1000,
+    [OTHERS_BIDIR] = 30 * 1000,
+};
+
+/* Returns the 'struct conn_other' that embeds 'conn'. */
+static struct conn_other *
+conn_other_cast(const struct conn *conn)
+{
+    return CONTAINER_OF(conn, struct conn_other, up);
+}
+
+/* Resets the connection's expiration to 'now' plus the timeout that
+ * corresponds to its current state. */
+static void
+update_expiration(struct conn_other *conn, long long now)
+{
+    conn->up.expiration = now + other_timeouts[conn->state];
+}
+
+/* Updates 'conn_' with a packet received at time 'now' in the direction
+ * given by 'reply'.  Non-TCP connections only track whether traffic has
+ * been seen in both directions, so every packet is accepted. */
+static enum ct_update_res
+other_conn_update(struct conn *conn_, struct dp_packet *pkt OVS_UNUSED,
+                bool reply, long long now)
+{
+    struct conn_other *conn = conn_other_cast(conn_);
+
+    if (reply && conn->state != OTHERS_BIDIR) {
+        conn->state = OTHERS_BIDIR;
+    } else if (conn->state == OTHERS_FIRST) {
+        conn->state = OTHERS_MULTIPLE;
+    }
+
+    update_expiration(conn, now);
+
+    return CT_UPDATE_VALID;
+}
+
+/* Any packet may start a new non-TCP connection. */
+static bool
+other_valid_new(struct dp_packet *pkt OVS_UNUSED)
+{
+    return true;
+}
+
+/* Allocates and returns a new non-TCP connection in OTHERS_FIRST state,
+ * with its expiration set relative to 'now'.  The caller owns the
+ * returned connection (freed via delete_conn()). */
+static struct conn *
+other_new_conn(struct dp_packet *pkt OVS_UNUSED, long long now)
+{
+    struct conn_other *conn;
+
+    conn = xzalloc(sizeof(struct conn_other));
+    conn->state = OTHERS_FIRST;
+
+    update_expiration(conn, now);
+
+    return &conn->up;
+}
+
+/* L4 operations shared by UDP and ICMP connections. */
+struct ct_l4_proto ct_proto_other = {
+    .new_conn = other_new_conn,
+    .valid_new = other_valid_new,
+    .conn_update = other_conn_update,
+};
diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h
new file mode 100644
index 0000000..62e3003
--- /dev/null
+++ b/lib/conntrack-private.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2015 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef CONNTRACK_PRIVATE_H
+#define CONNTRACK_PRIVATE_H 1
+
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <netinet/ip6.h>
+
+#include "hmap.h"
+#include "openvswitch/types.h"
+#include "packets.h"
+#include "unaligned.h"
+
+/* An IPv4 or IPv6 address.  The *_aligned members alias the same bytes
+ * with natural alignment, for use when the storage is known aligned. */
+struct ct_addr {
+    union {
+        ovs_16aligned_be32 ipv4;
+        union ovs_16aligned_in6_addr ipv6;
+        ovs_be32 ipv4_aligned;
+        struct in6_addr ipv6_aligned;
+    };
+};
+
+/* One endpoint (address plus L4 port) of a connection. */
+struct ct_endpoint {
+    struct ct_addr addr;
+    ovs_be16 port;
+};
+
+/* Identifies a connection in one direction. */
+struct conn_key {
+    struct ct_endpoint src;
+    struct ct_endpoint dst;
+
+    ovs_be16 dl_type;   /* Datalink type (network byte order). */
+    uint8_t nw_proto;   /* IP protocol number (TCP, UDP, ICMP, ...). */
+    uint16_t zone;      /* Conntrack zone, keeps tables independent. */
+};
+
+/* A tracked connection.  L4 submodules embed this at the start of a
+ * larger struct carrying protocol-specific state. */
+struct conn {
+    struct conn_key key;      /* Key in the original direction. */
+    struct conn_key rev_key;  /* Key in the reply direction. */
+    long long expiration;     /* Expiration time, in milliseconds. */
+    struct hmap_node node;
+    uint32_t mark;
+    ovs_u128 label;
+};
+
+/* Result of an L4 submodule's conn_update() callback. */
+enum ct_update_res {
+    CT_UPDATE_INVALID,  /* Packet is invalid for this connection. */
+    CT_UPDATE_VALID,    /* Packet is valid; state has been updated. */
+    CT_UPDATE_NEW,      /* Packet starts a new connection. */
+};
+
+/* Per-L4-protocol operations implemented by the submodules. */
+struct ct_l4_proto {
+    struct conn *(*new_conn)(struct dp_packet *pkt, long long now);
+    bool (*valid_new)(struct dp_packet *pkt);
+    enum ct_update_res (*conn_update)(struct conn *conn, struct dp_packet *pkt,
+                                      bool reply, long long now);
+};
+
+extern struct ct_l4_proto ct_proto_tcp;
+extern struct ct_l4_proto ct_proto_other;
+
+#endif /* conntrack-private.h */
diff --git a/lib/conntrack-tcp.c b/lib/conntrack-tcp.c
new file mode 100644
index 0000000..977c691
--- /dev/null
+++ b/lib/conntrack-tcp.c
@@ -0,0 +1,475 @@
+/*-
+ * Copyright (c) 2001 Daniel Hartmeier
+ * Copyright (c) 2002 - 2008 Henning Brauer
+ * Copyright (c) 2012 Gleb Smirnoff <glebius at FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *    - Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    - Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Effort sponsored in part by the Defense Advanced Research Projects
+ * Agency (DARPA) and Air Force Research Laboratory, Air Force
+ * Materiel Command, USAF, under agreement number F30602-01-2-0537.
+ *
+ *      $OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $
+ */
+
+#include <config.h>
+
+#include "conntrack-private.h"
+#include "ct-dpif.h"
+#include "dp-packet.h"
+#include "util.h"
+
+/* Per-direction TCP tracking state (adapted from pf's state peer). */
+struct tcp_peer {
+    enum ct_dpif_tcp_state state;
+    uint32_t               seqlo;          /* Max sequence number sent     */
+    uint32_t               seqhi;          /* Max the other end ACKd + win */
+    uint16_t               max_win;        /* largest window (pre scaling) */
+    uint8_t                wscale;         /* window scaling factor        */
+};
+
+/* A TCP connection: the generic data plus one tcp_peer per direction
+ * (peer[0] original sender, peer[1] reply sender). */
+struct conn_tcp {
+    struct conn up;
+    struct tcp_peer peer[2];
+};
+
+/* TCP option kinds the option parser cares about. */
+enum {
+    TCPOPT_EOL,
+    TCPOPT_NOP,
+    TCPOPT_WINDOW = 3,
+};
+
+/* TCP sequence numbers are 32 bit integers operated
+ * on with modular arithmetic.  These macros can be
+ * used to compare such integers. */
+#define SEQ_LT(a,b)     ((int)((a)-(b)) < 0)
+#define SEQ_LEQ(a,b)    ((int)((a)-(b)) <= 0)
+#define SEQ_GT(a,b)     ((int)((a)-(b)) > 0)
+#define SEQ_GEQ(a,b)    ((int)((a)-(b)) >= 0)
+
+#define SEQ_MIN(a, b)   ((SEQ_LT(a, b)) ? (a) : (b))
+#define SEQ_MAX(a, b)   ((SEQ_GT(a, b)) ? (a) : (b))
+
+/* Returns the 'struct conn_tcp' that embeds 'conn'. */
+static struct conn_tcp*
+conn_tcp_cast(const struct conn* conn)
+{
+    return CONTAINER_OF(conn, struct conn_tcp, up);
+}
+
+/* Returns true if the combination of TCP 'flags' can never appear in a
+ * legitimate segment, false otherwise.
+ *
+ * pf does this in pf_normalize_tcp(), and it is called only if scrub
+ * is enabled.  We're not scrubbing, but this check seems reasonable.  */
+static bool
+tcp_invalid_flags(uint16_t flags)
+{
+
+    if (flags & TCP_SYN) {
+        /* SYN may not be combined with RST or FIN. */
+        if (flags & TCP_RST) {
+            return true;
+        }
+        if (flags & TCP_FIN) {
+            /* Here pf removes the fin flag.  We simply mark the packet as
+             * invalid */
+            return true;
+        }
+    } else {
+        /* Illegal packet */
+        if (!(flags & (TCP_ACK|TCP_RST))) {
+            return true;
+        }
+    }
+
+    if (!(flags & TCP_ACK)) {
+        /* These flags are only valid if ACK is set */
+        if ((flags & TCP_FIN) || (flags & TCP_PSH) || (flags & TCP_URG)) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+#define TCP_MAX_WSCALE 14
+#define CT_WSCALE_FLAG 0x80
+#define CT_WSCALE_UNKNOWN 0x40
+#define CT_WSCALE_MASK 0xf
+
+/* Walks the TCP options in 'tcp' and returns the window scale factor,
+ * capped at TCP_MAX_WSCALE, with CT_WSCALE_FLAG or'ed in if the window
+ * scale option was present.  Returns 0 if the option is absent. */
+static uint8_t
+tcp_get_wscale(const struct tcp_header *tcp)
+{
+    unsigned len = TCP_OFFSET(tcp->tcp_ctl) * 4 - sizeof *tcp;
+    const uint8_t *opt = (const uint8_t *)(tcp + 1);
+    uint8_t wscale = 0;
+    uint8_t optlen;
+
+    while (len >= 3) {
+        if (*opt == TCPOPT_EOL) {
+            break;
+        }
+        switch (*opt) {
+        case TCPOPT_NOP:
+            /* Single-byte padding option. */
+            opt++;
+            len--;
+            break;
+        case TCPOPT_WINDOW:
+            /* The shift count is the option's single data byte. */
+            wscale = MIN(opt[2], TCP_MAX_WSCALE);
+            wscale |= CT_WSCALE_FLAG;
+            /* fall through */
+        default:
+            /* Every other option carries its total length in its second
+             * byte (RFC 793 kind/length/data layout).  Reading opt[2]
+             * here, as the previous code did, picks up the option's
+             * first *data* byte and desynchronizes the walk.  Clamp
+             * bogus lengths (< 2) so the loop always advances. */
+            optlen = opt[1];
+            if (optlen < 2) {
+                optlen = 2;
+            }
+            len -= optlen;
+            opt += optlen;
+        }
+    }
+
+    return wscale;
+}
+
+/* Returns the number of TCP payload bytes in 'pkt', excluding any L2
+ * padding that follows the IP datagram. */
+static uint32_t
+tcp_payload_length(struct dp_packet *pkt)
+{
+    return (char *) dp_packet_tail(pkt) - dp_packet_l2_pad_size(pkt)
+           - (char *) dp_packet_get_tcp_payload(pkt);
+}
+
+/* Sets the connection to expire 'interval' milliseconds after 'now'. */
+static void
+update_expiration(struct conn_tcp *conn, long long now, long long interval)
+{
+    conn->up.expiration = now + interval;
+}
+
+
+/* Updates TCP connection 'conn_' with 'pkt', received at time 'now' in
+ * the direction given by 'reply'.  Implements the sequence-window
+ * tracking described below (van Rooij).  Returns CT_UPDATE_VALID if the
+ * packet fits the connection, CT_UPDATE_INVALID if it does not, or
+ * CT_UPDATE_NEW if it restarts a closed connection. */
+static enum ct_update_res
+tcp_conn_update(struct conn* conn_, struct dp_packet *pkt, bool reply,
+                long long now)
+{
+    struct conn_tcp *conn = conn_tcp_cast(conn_);
+    struct tcp_header *tcp = dp_packet_l4(pkt);
+    /* The peer that sent 'pkt' */
+    struct tcp_peer *src = &conn->peer[reply ? 1 : 0];
+    /* The peer that should receive 'pkt' */
+    struct tcp_peer *dst = &conn->peer[reply ? 0 : 1];
+    uint8_t sws = 0, dws = 0;
+    uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl);
+
+    uint16_t win = ntohs(tcp->tcp_winsz);
+    uint32_t ack, end, seq, orig_seq;
+    uint32_t p_len = tcp_payload_length(pkt);
+    int ackskew;
+
+    if (tcp_invalid_flags(tcp_flags)) {
+        return CT_UPDATE_INVALID;
+    }
+
+    /* A new SYN on a connection where both sides are past FIN_WAIT_2
+     * restarts the connection. */
+    if (((tcp_flags & (TCP_SYN|TCP_ACK)) == TCP_SYN) &&
+            dst->state >= CT_DPIF_TCPS_FIN_WAIT_2 &&
+            src->state >= CT_DPIF_TCPS_FIN_WAIT_2) {
+        src->state = dst->state = CT_DPIF_TCPS_CLOSED;
+        return CT_UPDATE_NEW;
+    }
+
+    if (src->wscale & CT_WSCALE_FLAG
+        && dst->wscale & CT_WSCALE_FLAG
+        && !(tcp_flags & TCP_SYN)) {
+
+        sws = src->wscale & CT_WSCALE_MASK;
+        dws = dst->wscale & CT_WSCALE_MASK;
+
+    } else if (src->wscale & CT_WSCALE_UNKNOWN
+        && dst->wscale & CT_WSCALE_UNKNOWN
+        && !(tcp_flags & TCP_SYN)) {
+
+        /* Connection picked up mid-stream: be maximally permissive. */
+        sws = TCP_MAX_WSCALE;
+        dws = TCP_MAX_WSCALE;
+    }
+
+    /*
+     * Sequence tracking algorithm from Guido van Rooij's paper:
+     *   http://www.madison-gurkha.com/publications/tcp_filtering/
+     *      tcp_filtering.ps
+     */
+
+    orig_seq = seq = ntohl(get_16aligned_be32(&tcp->tcp_seq));
+    if (src->state < CT_DPIF_TCPS_SYN_SENT) {
+        /* First packet from this end. Set its state */
+
+        ack = ntohl(get_16aligned_be32(&tcp->tcp_ack));
+
+        /* 'end' is the sequence number of the last octet, counting SYN
+         * and FIN as one octet each. */
+        end = seq + p_len;
+        if (tcp_flags & TCP_SYN) {
+            end++;
+            if (dst->wscale & CT_WSCALE_FLAG) {
+                src->wscale = tcp_get_wscale(tcp);
+                if (src->wscale & CT_WSCALE_FLAG) {
+                    /* Remove scale factor from initial window */
+                    sws = src->wscale & CT_WSCALE_MASK;
+                    win = DIV_ROUND_UP((uint32_t) win, 1 << sws);
+                    dws = dst->wscale & CT_WSCALE_MASK;
+                } else {
+                    /* fixup other window */
+                    dst->max_win <<= dst->wscale &
+                        CT_WSCALE_MASK;
+                    /* in case of a retrans SYN|ACK */
+                    dst->wscale = 0;
+                }
+            }
+        }
+        if (tcp_flags & TCP_FIN) {
+            end++;
+        }
+
+        src->seqlo = seq;
+        src->state = CT_DPIF_TCPS_SYN_SENT;
+        /*
+         * May need to slide the window (seqhi may have been set by
+         * the crappy stack check or if we picked up the connection
+         * after establishment)
+         */
+        if (src->seqhi == 1 ||
+                SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi)) {
+            src->seqhi = end + MAX(1, dst->max_win << dws);
+        }
+        if (win > src->max_win) {
+            src->max_win = win;
+        }
+
+    } else {
+        ack = ntohl(get_16aligned_be32(&tcp->tcp_ack));
+        end = seq + p_len;
+        if (tcp_flags & TCP_SYN) {
+            end++;
+        }
+        if (tcp_flags & TCP_FIN) {
+            end++;
+        }
+    }
+
+    if ((tcp_flags & TCP_ACK) == 0) {
+        /* Let it pass through the ack skew check */
+        ack = dst->seqlo;
+    } else if ((ack == 0
+                && (tcp_flags & (TCP_ACK|TCP_RST)) == (TCP_ACK|TCP_RST))
+               /* broken tcp stacks do not set ack */) {
+        /* Many stacks (ours included) will set the ACK number in an
+         * FIN|ACK if the SYN times out -- no sequence to ACK. */
+        ack = dst->seqlo;
+    }
+
+    if (seq == end) {
+        /* Ease sequencing restrictions on no data packets */
+        seq = src->seqlo;
+        end = seq;
+    }
+
+    ackskew = dst->seqlo - ack;
+#define MAXACKWINDOW (0xffff + 1500)    /* 1500 is an arbitrary fudge factor */
+    if (SEQ_GEQ(src->seqhi, end)
+        /* Last octet inside other's window space */
+        && SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws))
+        /* Retrans: not more than one window back */
+        && (ackskew >= -MAXACKWINDOW)
+        /* Acking not more than one reassembled fragment backwards */
+        && (ackskew <= (MAXACKWINDOW << sws))
+        /* Acking not more than one window forward */
+        && ((tcp_flags & TCP_RST) == 0 || orig_seq == src->seqlo
+            || (orig_seq == src->seqlo + 1) || (orig_seq + 1 == src->seqlo))) {
+        /* Require an exact/+1 sequence match on resets when possible */
+
+        /* update max window */
+        if (src->max_win < win) {
+            src->max_win = win;
+        }
+        /* synchronize sequencing */
+        if (SEQ_GT(end, src->seqlo)) {
+            src->seqlo = end;
+        }
+        /* slide the window of what the other end can send */
+        if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) {
+            dst->seqhi = ack + MAX((win << sws), 1);
+        }
+
+        /* update states */
+        if (tcp_flags & TCP_SYN && src->state < CT_DPIF_TCPS_SYN_SENT) {
+                src->state = CT_DPIF_TCPS_SYN_SENT;
+        }
+        if (tcp_flags & TCP_FIN && src->state < CT_DPIF_TCPS_CLOSING) {
+                src->state = CT_DPIF_TCPS_CLOSING;
+        }
+        if (tcp_flags & TCP_ACK) {
+            if (dst->state == CT_DPIF_TCPS_SYN_SENT) {
+                dst->state = CT_DPIF_TCPS_ESTABLISHED;
+            } else if (dst->state == CT_DPIF_TCPS_CLOSING) {
+                dst->state = CT_DPIF_TCPS_FIN_WAIT_2;
+            }
+        }
+        if (tcp_flags & TCP_RST) {
+            src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT;
+        }
+
+        /* Pick the expiration timeout based on how far along the
+         * connection is (values are in milliseconds). */
+        if (src->state >= CT_DPIF_TCPS_FIN_WAIT_2
+            && dst->state >= CT_DPIF_TCPS_FIN_WAIT_2) {
+            update_expiration(conn, now, 30 * 1000);
+        } else if (src->state >= CT_DPIF_TCPS_CLOSING
+                   && dst->state >= CT_DPIF_TCPS_CLOSING) {
+            update_expiration(conn, now, 45 * 1000);
+        } else if (src->state < CT_DPIF_TCPS_ESTABLISHED
+                   || dst->state < CT_DPIF_TCPS_ESTABLISHED) {
+            update_expiration(conn, now, 30 * 1000);
+        } else if (src->state >= CT_DPIF_TCPS_CLOSING
+                   || dst->state >= CT_DPIF_TCPS_CLOSING) {
+            update_expiration(conn, now, 15 * 60 * 1000);
+        } else {
+            update_expiration(conn, now, 24 * 60 * 60 * 1000);
+        }
+    } else if ((dst->state < CT_DPIF_TCPS_SYN_SENT
+                || dst->state >= CT_DPIF_TCPS_FIN_WAIT_2
+                || src->state >= CT_DPIF_TCPS_FIN_WAIT_2)
+               && SEQ_GEQ(src->seqhi + MAXACKWINDOW, end)
+               /* Within a window forward of the originating packet */
+               && SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) {
+               /* Within a window backward of the originating packet */
+
+        /*
+         * This currently handles three situations:
+         *  1) Stupid stacks will shotgun SYNs before their peer
+         *     replies.
+         *  2) When PF catches an already established stream (the
+         *     firewall rebooted, the state table was flushed, routes
+         *     changed...)
+         *  3) Packets get funky immediately after the connection
+         *     closes (this should catch Solaris spurious ACK|FINs
+         *     that web servers like to spew after a close)
+         *
+         * This must be a little more careful than the above code
+         * since packet floods will also be caught here. We don't
+         * update the TTL here to mitigate the damage of a packet
+         * flood and so the same code can handle awkward establishment
+         * and a loosened connection close.
+         * In the establishment case, a correct peer response will
+         * validate the connection, go through the normal state code
+         * and keep updating the state TTL.
+         */
+
+        /* update max window */
+        if (src->max_win < win) {
+            src->max_win = win;
+        }
+        /* synchronize sequencing */
+        if (SEQ_GT(end, src->seqlo)) {
+            src->seqlo = end;
+        }
+        /* slide the window of what the other end can send */
+        if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) {
+            dst->seqhi = ack + MAX((win << sws), 1);
+        }
+
+        /*
+         * Cannot set dst->seqhi here since this could be a shotgunned
+         * SYN and not an already established connection.
+         */
+
+        if (tcp_flags & TCP_FIN && src->state < CT_DPIF_TCPS_CLOSING) {
+            src->state = CT_DPIF_TCPS_CLOSING;
+        }
+
+        if (tcp_flags & TCP_RST) {
+            src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT;
+        }
+    } else {
+        return CT_UPDATE_INVALID;
+    }
+
+    return CT_UPDATE_VALID;
+}
+
+/* Returns true if 'pkt' may legitimately create a new TCP connection:
+ * its flag combination must be valid and it must not be a SYN+ACK. */
+static bool
+tcp_valid_new(struct dp_packet *pkt)
+{
+    struct tcp_header *tcp = dp_packet_l4(pkt);
+    uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl);
+
+    if (tcp_invalid_flags(tcp_flags)) {
+        return false;
+    }
+
+    /* A syn+ack is not allowed to create a connection.  We want to allow
+     * totally new connections (syn) or already established, not partially
+     * open (syn+ack). */
+    if ((tcp_flags & TCP_SYN) && (tcp_flags & TCP_ACK)) {
+        return false;
+    }
+
+    return true;
+}
+
+/* Allocates and returns a new TCP connection for 'pkt', received at
+ * time 'now'.  peer[0] is the sender of 'pkt'; peer[1] is seeded with
+ * sentinel values (seqhi == 1, max_win == 1) that tcp_conn_update()
+ * replaces when the first reply is seen. */
+static struct conn *
+tcp_new_conn(struct dp_packet *pkt, long long now)
+{
+    struct conn_tcp* newconn = NULL;
+    struct tcp_header *tcp = dp_packet_l4(pkt);
+    struct tcp_peer *src, *dst;
+    uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl);
+
+    newconn = xzalloc(sizeof(struct conn_tcp));
+
+    src = &newconn->peer[0];
+    dst = &newconn->peer[1];
+
+    src->seqlo = ntohl(get_16aligned_be32(&tcp->tcp_seq));
+    src->seqhi = src->seqlo + tcp_payload_length(pkt) + 1;
+
+    if (tcp_flags & TCP_SYN) {
+        src->seqhi++;
+        src->wscale = tcp_get_wscale(tcp);
+    } else {
+        /* Picked up mid-stream: the scale factors were never seen. */
+        src->wscale = CT_WSCALE_UNKNOWN;
+        dst->wscale = CT_WSCALE_UNKNOWN;
+    }
+    src->max_win = MAX(ntohs(tcp->tcp_winsz), 1);
+    if (src->wscale & CT_WSCALE_MASK) {
+        /* Remove scale factor from initial window */
+        uint8_t sws = src->wscale & CT_WSCALE_MASK;
+        src->max_win = DIV_ROUND_UP((uint32_t) src->max_win, 1 << sws);
+    }
+    if (tcp_flags & TCP_FIN) {
+        src->seqhi++;
+    }
+    dst->seqhi = 1;
+    dst->max_win = 1;
+    src->state = CT_DPIF_TCPS_SYN_SENT;
+    dst->state = CT_DPIF_TCPS_CLOSED;
+
+    update_expiration(newconn, now, 30 * 1000);
+
+    return &newconn->up;
+}
+
+/* L4 operations for TCP connections. */
+struct ct_l4_proto ct_proto_tcp = {
+    .new_conn = tcp_new_conn,
+    .valid_new = tcp_valid_new,
+    .conn_update = tcp_conn_update,
+};
diff --git a/lib/conntrack.c b/lib/conntrack.c
new file mode 100644
index 0000000..1ac8636
--- /dev/null
+++ b/lib/conntrack.c
@@ -0,0 +1,854 @@
+/*
+ * Copyright (c) 2015 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+#include "conntrack.h"
+
+#include <errno.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <netinet/icmp6.h>
+
+#include "bitmap.h"
+#include "conntrack-private.h"
+#include "dp-packet.h"
+#include "flow.h"
+#include "hmap.h"
+#include "netdev.h"
+#include "odp-netlink.h"
+#include "openvswitch/vlog.h"
+#include "ovs-rcu.h"
+#include "random.h"
+#include "timeval.h"
+
+VLOG_DEFINE_THIS_MODULE(conntrack);
+
+/* Per-packet lookup state shared between extraction, lookup and
+ * processing. */
+struct conn_lookup_ctx {
+    struct conn_key key;    /* Key extracted from the packet. */
+    struct conn *conn;      /* Connection found for 'key', or NULL. */
+    uint32_t hash;          /* Hash of 'key'. */
+    bool reply;             /* True if the packet is in the reply
+                             * direction of 'conn'. */
+    bool related;           /* True if the packet is only related to
+                             * 'conn' (sets CS_RELATED). */
+};
+
+static bool conn_key_extract(struct conntrack *, struct dp_packet *,
+                             struct conn_lookup_ctx *, uint16_t zone);
+static uint32_t conn_key_hash(const struct conn_key *, uint32_t basis);
+static void conn_key_reverse(struct conn_key *);
+static void conn_keys_lookup(struct conntrack *, struct conn_lookup_ctx *,
+                             unsigned long maps, unsigned bucket,
+                             long long now);
+static bool valid_new(struct dp_packet *pkt, struct conn_key *);
+static struct conn *new_conn(struct dp_packet *pkt, struct conn_key *,
+                             long long now);
+static void delete_conn(struct conn *);
+static enum ct_update_res conn_update(struct conn *, struct dp_packet*,
+                                      bool reply, long long now);
+static bool conn_expired(struct conn *, long long now);
+static void set_mark(struct dp_packet *, struct conn *,
+                     uint32_t val, uint32_t mask);
+static void set_label(struct dp_packet *, struct conn *,
+                      const struct ovs_key_ct_labels *val,
+                      const struct ovs_key_ct_labels *mask);
+
+/* L4 protocol handlers, indexed by IP protocol number.  UDP and ICMP
+ * share the generic "other" handler. */
+static struct ct_l4_proto *l4_protos[] = {
+    [IPPROTO_TCP] = &ct_proto_tcp,
+    [IPPROTO_UDP] = &ct_proto_other,
+    [IPPROTO_ICMP] = &ct_proto_other,
+};
+
+/* Initializes the connection tracker 'ct'.  The caller is responsible for
+ * calling 'conntrack_destroy()', when the instance is not needed anymore */
+void
+conntrack_init(struct conntrack *ct)
+{
+    unsigned i;
+
+    for (i = 0; i < CONNTRACK_BUCKETS; i++) {
+        ct_lock_init(&ct->locks[i]);
+        ct_lock_lock(&ct->locks[i]);
+        hmap_init(&ct->connections[i]);
+        ct_lock_unlock(&ct->locks[i]);
+    }
+    /* Random basis makes the bucket placement of connections hard to
+     * predict from outside. */
+    ct->hash_basis = random_uint32();
+    /* Position of the next conntrack_run() sweep. */
+    ct->purge_bucket = 0;
+    ct->purge_inner_bucket = 0;
+    ct->purge_inner_offset = 0;
+}
+
+/* Destroys the connection tracker 'ct' and frees all the allocated memory. */
+void
+conntrack_destroy(struct conntrack *ct)
+{
+    unsigned i;
+
+    for (i = 0; i < CONNTRACK_BUCKETS; i++) {
+        struct conn *conn, *next;
+
+        /* Take the lock so concurrent users cannot touch the bucket
+         * while it is being emptied. */
+        ct_lock_lock(&ct->locks[i]);
+        HMAP_FOR_EACH_SAFE(conn, next, node, &ct->connections[i]) {
+            hmap_remove(&ct->connections[i], &conn->node);
+            delete_conn(conn);
+        }
+        hmap_destroy(&ct->connections[i]);
+        ct_lock_unlock(&ct->locks[i]);
+        ct_lock_destroy(&ct->locks[i]);
+    }
+}
+
+/* Returns the index of the bucket that stores connections with 'hash'. */
+static unsigned hash_to_bucket(uint32_t hash)
+{
+    /* Extract the most significant bits in hash. The least significant bits
+     * are already used internally by the hmap implementation. */
+    BUILD_ASSERT(CONNTRACK_BUCKETS_SHIFT < 32 && CONNTRACK_BUCKETS_SHIFT >= 1);
+
+    return (hash >> (32 - CONNTRACK_BUCKETS_SHIFT)) % CONNTRACK_BUCKETS;
+}
+
+/* Stores the conntrack metadata in 'pkt': the state flags (CS_TRACKED is
+ * always added), zone, mark and label. */
+static void
+write_ct_md(struct dp_packet *pkt, uint8_t state, uint16_t zone,
+            uint32_t mark, ovs_u128 label)
+{
+    pkt->md.ct_state = state | CS_TRACKED;
+    pkt->md.ct_zone = zone;
+    pkt->md.ct_mark = mark;
+    pkt->md.ct_label = label;
+}
+
+/* Handles a packet that matched no existing connection.  Sets CS_INVALID
+ * or CS_NEW in '*state' and, when 'commit' is true and the packet may
+ * start a connection, creates it and inserts it in the bucket for
+ * 'ctx->hash'.  Returns the new connection, or NULL when nothing was
+ * committed.  The caller must hold the bucket's lock. */
+static struct conn *
+conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
+               struct conn_lookup_ctx *ctx, uint8_t *state, bool commit,
+               long long now)
+{
+    unsigned bucket = hash_to_bucket(ctx->hash);
+    struct conn *nc = NULL;
+
+    if (!valid_new(pkt, &ctx->key)) {
+        *state |= CS_INVALID;
+        return nc;
+    }
+
+    *state |= CS_NEW;
+
+    if (commit) {
+        nc = new_conn(pkt, &ctx->key, now);
+
+        /* The reply key is the packet's key with the endpoints swapped. */
+        memcpy(&nc->rev_key, &ctx->key, sizeof nc->rev_key);
+
+        conn_key_reverse(&nc->rev_key);
+        hmap_insert(&ct->connections[bucket], &nc->node, ctx->hash);
+    }
+
+    return nc;
+}
+
+/* Processes one packet after connection lookup ('ctx'): updates the
+ * matched connection (or creates one if 'commit'), and writes the
+ * resulting conntrack metadata into 'pkt'.  Returns the connection the
+ * packet belongs to, or NULL.  The caller must hold the lock of the
+ * bucket for 'ctx->hash'. */
+static struct conn *
+process_one(struct conntrack *ct, struct dp_packet *pkt,
+            struct conn_lookup_ctx *ctx, uint16_t zone,
+            bool commit, long long now)
+{
+    unsigned bucket = hash_to_bucket(ctx->hash);
+    struct conn *conn = ctx->conn;
+    uint8_t state = 0;
+
+    if (conn) {
+        if (ctx->related) {
+            state |= CS_RELATED;
+            if (ctx->reply) {
+                state |= CS_REPLY_DIR;
+            }
+        } else {
+            enum ct_update_res res;
+
+            res = conn_update(conn, pkt, ctx->reply, now);
+
+            switch (res) {
+            case CT_UPDATE_VALID:
+                state |= CS_ESTABLISHED;
+                if (ctx->reply) {
+                    state |= CS_REPLY_DIR;
+                }
+                break;
+            case CT_UPDATE_INVALID:
+                state |= CS_INVALID;
+                break;
+            case CT_UPDATE_NEW:
+                /* Restart the connection.  conn_not_found() may return
+                 * NULL (e.g. if '!commit' or the packet cannot start a
+                 * connection), so 'conn' must not be dereferenced
+                 * unconditionally below. */
+                hmap_remove(&ct->connections[bucket], &conn->node);
+                delete_conn(conn);
+                conn = conn_not_found(ct, pkt, ctx, &state, commit, now);
+                break;
+            }
+        }
+    } else {
+        conn = conn_not_found(ct, pkt, ctx, &state, commit, now);
+    }
+
+    /* write_ct_md() stores mark and label into the packet metadata, so
+     * the previous direct assignments were redundant; worse, they
+     * dereferenced 'conn' even when CT_UPDATE_NEW left it NULL. */
+    if (conn) {
+        write_ct_md(pkt, state, zone, conn->mark, conn->label);
+    } else {
+        write_ct_md(pkt, state, zone, 0, (ovs_u128) {{0}});
+    }
+
+    return conn;
+}
+
+/* Sends a group of 'cnt' packets ('pkts') through the connection tracker
+ * 'ct'.  If 'commit' is true, the packets are allowed to create new entries
+ * in the connection tables.  'setmark', if not NULL, should point to a two
+ * elements array containing a value and a mask to set the connection mark.
+ * 'setlabel' behaves similarly for the connection label.*/
+int
+conntrack_execute(struct conntrack *ct, struct dp_packet **pkts, size_t cnt,
+                  bool commit, uint16_t zone, const uint32_t *setmark,
+                  const struct ovs_key_ct_labels *setlabel,
+                  const char *helper)
+{
+#if !defined(__CHECKER__) && !defined(_WIN32)
+    const size_t KEY_ARRAY_SIZE = cnt;
+#else
+    enum { KEY_ARRAY_SIZE = NETDEV_MAX_BURST };
+#endif
+    struct conn_lookup_ctx ctxs[KEY_ARRAY_SIZE];
+    int8_t bucket_list[CONNTRACK_BUCKETS];
+    /* One entry per bucket touched by this batch: the bucket index and a
+     * bitmap of the packets that hashed to it. */
+    struct {
+        unsigned bucket;
+        unsigned long maps;
+    } arr[KEY_ARRAY_SIZE];
+    long long now = time_msec();
+    size_t i = 0;
+    uint8_t arrcnt = 0;
+
+    BUILD_ASSERT_DECL(sizeof arr[0].maps * CHAR_BIT >= NETDEV_MAX_BURST);
+
+    if (helper) {
+        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
+
+        VLOG_WARN_RL(&rl, "ALG helper \"%s\" not supported", helper);
+        /* Continue without the helper */
+    }
+
+    /* Group the packets by hash bucket, so that each bucket's lock is
+     * taken only once for the whole batch. */
+    memset(bucket_list, INT8_C(-1), sizeof bucket_list);
+    for (i = 0; i < cnt; i++) {
+        unsigned bucket;
+
+        if (!conn_key_extract(ct, pkts[i], &ctxs[i], zone)) {
+            write_ct_md(pkts[i], CS_INVALID, zone, 0, (ovs_u128){{0}});
+            continue;
+        }
+
+        bucket = hash_to_bucket(ctxs[i].hash);
+        if (bucket_list[bucket] == INT8_C(-1)) {
+            bucket_list[bucket] = arrcnt;
+
+            arr[arrcnt].maps = 0;
+            ULLONG_SET1(arr[arrcnt].maps, i);
+            arr[arrcnt++].bucket = bucket;
+        } else {
+            /* ULLONG_SET1() records the packet in the bitmap by itself.
+             * The extra "maps |= 1UL << i" the old code did here was
+             * redundant, and its 1UL shift could be undefined for
+             * i >= 32 on LP32 platforms. */
+            ULLONG_SET1(arr[bucket_list[bucket]].maps, i);
+        }
+    }
+
+    for (i = 0; i < arrcnt; i++) {
+        size_t j;
+
+        ct_lock_lock(&ct->locks[arr[i].bucket]);
+        conn_keys_lookup(ct, ctxs, arr[i].maps, arr[i].bucket, now);
+
+        ULLONG_FOR_EACH_1(j, arr[i].maps) {
+            struct conn *conn;
+
+            conn = process_one(ct, pkts[j], &ctxs[j], zone, commit, now);
+
+            if (conn && setmark) {
+                set_mark(pkts[j], conn, setmark[0], setmark[1]);
+            }
+
+            if (conn && setlabel) {
+                set_label(pkts[j], conn, &setlabel[0], &setlabel[1]);
+            }
+        }
+        ct_lock_unlock(&ct->locks[arr[i].bucket]);
+    }
+
+    return 0;
+}
+
+/* Applies 'val'/'mask' to the packet's connection mark and stores the
+ * result in both the packet metadata and the connection. */
+static void
+set_mark(struct dp_packet *pkt, struct conn *conn, uint32_t val, uint32_t mask)
+{
+    pkt->md.ct_mark = val | (pkt->md.ct_mark & ~(mask));
+    conn->mark = pkt->md.ct_mark;
+}
+
+/* Applies the 128-bit value/mask pair to the packet's ct_label and stores
+ * the resulting label in 'conn' so that it persists for the connection. */
+static void
+set_label(struct dp_packet *pkt, struct conn *conn,
+          const struct ovs_key_ct_labels *val,
+          const struct ovs_key_ct_labels *mask)
+{
+    ovs_u128 value, m;
+
+    /* Copy through memcpy: the ovs_key_ct_labels layout matches ovs_u128. */
+    memcpy(&value, val, sizeof value);
+    memcpy(&m, mask, sizeof m);
+
+    pkt->md.ct_label.u64.lo = (pkt->md.ct_label.u64.lo & ~m.u64.lo)
+                              | value.u64.lo;
+    pkt->md.ct_label.u64.hi = (pkt->md.ct_label.u64.hi & ~m.u64.hi)
+                              | value.u64.hi;
+    conn->label = pkt->md.ct_label;
+}
+
+#define CONNTRACK_PURGE_NUM 256
+
+/* Removes up to '*left' expired connections from 'bucket', resuming the
+ * scan at the hmap position ('*inner_bucket', '*inner_offset') and leaving
+ * the updated position there for the next call.  Decrements '*left' once
+ * per deleted connection.  The caller must hold the bucket's lock.
+ *
+ * NOTE(review): this relies on hmap_at_position() resetting the position
+ * to (0, 0) when it returns NULL, so the caller restarts cleanly at the
+ * next bucket — confirm against lib/hmap.c. */
+static void
+sweep_bucket(struct hmap *bucket, uint32_t *inner_bucket,
+             uint32_t *inner_offset, unsigned *left, long long now)
+{
+    while (*left != 0) {
+        struct hmap_node *node;
+        struct conn *conn;
+
+        node = hmap_at_position(bucket, inner_bucket, inner_offset);
+
+        if (!node) {
+            /* Whole bucket visited: reclaim excess hash table capacity. */
+            hmap_shrink(bucket);
+            break;
+        }
+
+        INIT_CONTAINER(conn, node, node);
+        if (conn_expired(conn, now)) {
+            hmap_remove(bucket, &conn->node);
+            delete_conn(conn);
+            (*left)--;
+        }
+    }
+}
+
+/* Cleans up old connection entries.  Should be called periodically. */
+void
+conntrack_run(struct conntrack *ct)
+{
+    long long now = time_msec();
+    uint32_t inner_bucket = ct->purge_inner_bucket;
+    uint32_t inner_offset = ct->purge_inner_offset;
+    unsigned left = CONNTRACK_PURGE_NUM;
+    unsigned bucket;
+
+    /* Resume the sweep where the previous call stopped, visiting buckets
+     * until the deletion budget is used up or every bucket was seen. */
+    for (bucket = hash_to_bucket(ct->purge_bucket);
+         bucket < CONNTRACK_BUCKETS; bucket++) {
+        ct_lock_lock(&ct->locks[bucket]);
+        sweep_bucket(&ct->connections[bucket], &inner_bucket, &inner_offset,
+                     &left, now);
+        ct_lock_unlock(&ct->locks[bucket]);
+
+        if (left == 0) {
+            break;
+        }
+    }
+
+    /* Remember the position so the next call picks up from here. */
+    ct->purge_bucket = bucket;
+    ct->purge_inner_bucket = inner_bucket;
+    ct->purge_inner_offset = inner_offset;
+}
+
+/* Key extraction */
+
+/* Extracts the IPv4 addresses and protocol from 'data' into 'key'.
+ *
+ * The function stores a pointer to the first byte after the header in
+ * '*new_data', if 'new_data' is not NULL.  If it is NULL, the caller is
+ * not interested in the header's tail,  meaning that the header has
+ * already been parsed (e.g. by flow_extract): we take this as a hint to
+ * save a few checks.
+ *
+ * Returns false on a truncated or malformed header, or if the packet is
+ * an IP fragment (fragments are not tracked). */
+static inline bool
+extract_l3_ipv4(struct conn_key *key, const void *data, size_t size,
+                const char **new_data)
+{
+    const struct ip_header *ip = data;
+
+    if (new_data) {
+        size_t ip_len;
+
+        /* Unparsed header: validate both the fixed header length and the
+         * IHL-derived total header length against the available bytes. */
+        if (OVS_UNLIKELY(size < IP_HEADER_LEN)) {
+            return false;
+        }
+        ip_len = IP_IHL(ip->ip_ihl_ver) * 4;
+
+        if (OVS_UNLIKELY(ip_len < IP_HEADER_LEN)) {
+            return false;
+        }
+        if (OVS_UNLIKELY(size < ip_len)) {
+            return false;
+        }
+
+        *new_data = (char *) data + ip_len;
+    }
+
+    if (IP_IS_FRAGMENT(ip->ip_frag_off)) {
+        return false;
+    }
+
+    key->src.addr.ipv4 = ip->ip_src;
+    key->dst.addr.ipv4 = ip->ip_dst;
+    key->nw_proto = ip->ip_proto;
+
+    return true;
+}
+
+/* Extracts the IPv6 addresses and upper-layer protocol from 'data' into
+ * 'key', skipping any extension headers.
+ *
+ * The function stores a pointer to the first byte after the header in
+ * '*new_data', if 'new_data' is not NULL.  If it is NULL, the caller is
+ * not interested in the header's tail,  meaning that the header has
+ * already been parsed (e.g. by flow_extract): we take this as a hint to
+ * save a few checks.
+ *
+ * NOTE(review): when 'new_data' is NULL the size check is skipped, so the
+ * 'size -= sizeof *ip6' below assumes the caller guarantees at least a
+ * full fixed header — confirm all such callers do. */
+static inline bool
+extract_l3_ipv6(struct conn_key *key, const void *data, size_t size,
+                const char **new_data)
+{
+    const struct ovs_16aligned_ip6_hdr *ip6 = data;
+    uint8_t nw_proto = ip6->ip6_nxt;
+    uint8_t nw_frag = 0;
+
+    if (new_data) {
+        if (OVS_UNLIKELY(size < sizeof *ip6)) {
+            return false;
+        }
+    }
+
+    data = ip6 + 1;
+    size -=  sizeof *ip6;
+
+    /* Walk the extension header chain to find the real l4 protocol. */
+    if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag)) {
+        return false;
+    }
+
+    if (new_data) {
+        *new_data = data;
+    }
+
+    /* Fragments are not tracked. */
+    if (nw_frag) {
+        return false;
+    }
+
+    key->src.addr.ipv6 = ip6->ip6_src;
+    key->dst.addr.ipv6 = ip6->ip6_dst;
+    key->nw_proto = nw_proto;
+
+    return true;
+}
+
+/* Returns true if 'data' starts with a well-formed TCP header within
+ * 'size' bytes: the data-offset field must be at least the minimum header
+ * length and must not point past the available data. */
+static inline bool
+check_l4_tcp(const void *data, size_t size)
+{
+    const struct tcp_header *tcp = data;
+    size_t hdr_len = TCP_OFFSET(tcp->tcp_ctl) * 4;
+
+    return OVS_LIKELY(hdr_len >= TCP_HEADER_LEN && hdr_len <= size);
+}
+
+/* Copies the TCP source and destination ports into 'key'.  Returns false
+ * if 'size' cannot hold even a minimal TCP header. */
+static inline bool
+extract_l4_tcp(struct conn_key *key, const void *data, size_t size)
+{
+    if (OVS_UNLIKELY(size < TCP_HEADER_LEN)) {
+        return false;
+    } else {
+        const struct tcp_header *tcp = data;
+
+        key->src.port = tcp->tcp_src;
+        key->dst.port = tcp->tcp_dst;
+        return true;
+    }
+}
+
+/* Copies the UDP source and destination ports into 'key'.  Returns false
+ * if 'size' cannot hold a full UDP header. */
+static inline bool
+extract_l4_udp(struct conn_key *key, const void *data, size_t size)
+{
+    if (OVS_UNLIKELY(size < UDP_HEADER_LEN)) {
+        return false;
+    } else {
+        const struct udp_header *udp = data;
+
+        key->src.port = udp->udp_src;
+        key->dst.port = udp->udp_dst;
+        return true;
+    }
+}
+
+static inline bool extract_l4(struct conn_key *key, const void *data,
+                              size_t size, bool *related);
+
+/* Extracts the l4 fields for an ICMPv4 packet into 'key'.
+ *
+ * If 'related' is not NULL and the function is processing an ICMP
+ * error packet, extract the l3 and l4 fields from the nested header
+ * instead and set *related to true.  If 'related' is NULL we're
+ * already processing a nested header and no such recursion is
+ * possible.
+ *
+ * Returns bool (not int): the function only ever produces true/false and
+ * its callers and the sibling extract_l4_icmp6() treat it as boolean. */
+static inline bool
+extract_l4_icmp(struct conn_key *key, const void *data, size_t size,
+                bool *related)
+{
+    const struct icmp_header *icmp = data;
+
+    if (OVS_UNLIKELY(size < ICMP_HEADER_LEN)) {
+        return false;
+    }
+
+    switch (icmp->icmp_type) {
+    case ICMP_ECHO_REQUEST:
+    case ICMP_ECHO_REPLY:
+    case ICMP_TIMESTAMP:
+    case ICMP_TIMESTAMPREPLY:
+    case ICMP_INFOREQUEST:
+    case ICMP_INFOREPLY:
+        /* Separate ICMP connection: identified using id */
+        key->src.port = key->dst.port = icmp->icmp_fields.echo.id;
+        break;
+    case ICMP_DST_UNREACH:
+    case ICMP_TIME_EXCEEDED:
+    case ICMP_PARAM_PROB:
+    case ICMP_SOURCEQUENCH:
+    case ICMP_REDIRECT: {
+        /* ICMP packet part of another connection. We should
+         * extract the key from embedded packet header */
+        struct conn_key inner_key;
+        const char *l3 = (const char *) (icmp + 1);
+        const char *tail = (const char *) data + size;
+        const char *l4;
+        bool ok;
+
+        /* An ICMP error nested inside another ICMP error is rejected. */
+        if (!related) {
+            return false;
+        }
+        *related = true;
+
+        memset(&inner_key, 0, sizeof inner_key);
+        inner_key.dl_type = htons(ETH_TYPE_IP);
+        ok = extract_l3_ipv4(&inner_key, l3, tail - l3, &l4);
+        if (!ok) {
+            return false;
+        }
+
+        /* pf doesn't do this, but it seems a good idea */
+        if (inner_key.src.addr.ipv4_aligned != key->dst.addr.ipv4_aligned
+            || inner_key.dst.addr.ipv4_aligned != key->src.addr.ipv4_aligned) {
+            return false;
+        }
+
+        key->src = inner_key.src;
+        key->dst = inner_key.dst;
+        key->nw_proto = inner_key.nw_proto;
+
+        /* Reverse the key so it indexes the original direction of the
+         * connection the error refers to. */
+        ok = extract_l4(key, l4, tail - l4, NULL);
+        if (ok) {
+            conn_key_reverse(key);
+        }
+        return ok;
+    }
+    default:
+        return false;
+    }
+
+    return true;
+}
+
+/* Extracts the l4 fields for an ICMPv6 packet into 'key'.
+ *
+ * If 'related' is not NULL and the function is processing an ICMP
+ * error packet, extract the l3 and l4 fields from the nested header
+ * instead and set *related to true.  If 'related' is NULL we're
+ * already processing a nested header and no such recursion is
+ * possible */
+static inline bool
+extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,
+                 bool *related)
+{
+    const struct icmp6_header *icmp6 = data;
+
+    /* All the messages that we support need at least 4 bytes after
+     * the header */
+    if (size < sizeof *icmp6 + 4) {
+        return false;
+    }
+
+    switch (icmp6->icmp6_type) {
+    case ICMP6_ECHO_REQUEST:
+    case ICMP6_ECHO_REPLY:
+        /* Separate ICMP connection: identified using id */
+        key->src.port = key->dst.port = *(ovs_be16 *) (icmp6 + 1);
+        break;
+    case ICMP6_DST_UNREACH:
+    case ICMP6_PACKET_TOO_BIG:
+    case ICMP6_TIME_EXCEEDED:
+    case ICMP6_PARAM_PROB:{
+        /* ICMP packet part of another connection. We should
+         * extract the key from embedded packet header */
+        struct conn_key inner_key;
+        /* The embedded packet starts 8 bytes into the ICMPv6 message
+         * (4-byte header plus 4 message-specific bytes). */
+        const char *l3 = (const char *) icmp6 + 8;
+        const char *tail = (const char *) data + size;
+        const char *l4 = NULL;
+        bool ok;
+
+        /* An ICMPv6 error nested inside another error is rejected. */
+        if (!related) {
+            return false;
+        }
+        *related = true;
+
+        memset(&inner_key, 0, sizeof inner_key);
+        inner_key.dl_type = htons(ETH_TYPE_IPV6);
+        ok = extract_l3_ipv6(&inner_key, l3, tail - l3, &l4);
+        if (!ok) {
+            return false;
+        }
+
+        /* pf doesn't do this, but it seems a good idea */
+        if (!ipv6_addr_equals(&inner_key.src.addr.ipv6_aligned,
+                              &key->dst.addr.ipv6_aligned)
+            || !ipv6_addr_equals(&inner_key.dst.addr.ipv6_aligned,
+                                 &key->src.addr.ipv6_aligned)) {
+            return false;
+        }
+
+        key->src = inner_key.src;
+        key->dst = inner_key.dst;
+        key->nw_proto = inner_key.nw_proto;
+
+        /* Reverse the key so it indexes the original direction of the
+         * connection the error refers to. */
+        ok = extract_l4(key, l4, tail - l4, NULL);
+        if (ok) {
+            conn_key_reverse(key);
+        }
+        return ok;
+    }
+    default:
+        return false;
+    }
+
+    return true;
+}
+
+/* Extract l4 fields into 'key', which must already contain valid l3
+ * members.  If 'related' is not NULL and an ICMP error packet is being
+ * processed, the function will extract the key from the packet nested
+ * in the ICMP paylod and set '*related' to true.  If 'related' is NULL,
+ * nested parsing isn't allowed.  This is necessary to limit the
+ * recursion level. */
+static inline bool
+extract_l4(struct conn_key *key, const void *data, size_t size, bool *related)
+{
+    switch (key->nw_proto) {
+    case IPPROTO_TCP:
+        /* Validate the full TCP header only for nested (error) payloads,
+         * i.e. when this is the outermost call. */
+        return extract_l4_tcp(key, data, size)
+               && (!related || check_l4_tcp(data, size));
+    case IPPROTO_UDP:
+        return extract_l4_udp(key, data, size);
+    case IPPROTO_ICMP:
+        return key->dl_type == htons(ETH_TYPE_IP)
+               && extract_l4_icmp(key, data, size, related);
+    case IPPROTO_ICMPV6:
+        return key->dl_type == htons(ETH_TYPE_IPV6)
+               && extract_l4_icmp6(key, data, size, related);
+    default:
+        return false;
+    }
+}
+
+/* Initializes '*ctx' from 'pkt': fills 'ctx->key' with the packet's l3/l4
+ * connection key in 'zone', computes 'ctx->hash', and sets 'ctx->related'
+ * if the packet is an ICMP error.  Returns false if the packet cannot be
+ * tracked (missing headers, unsupported l3/l4 type, fragment, or
+ * truncation). */
+static bool
+conn_key_extract(struct conntrack *ct, struct dp_packet *pkt,
+                 struct conn_lookup_ctx *ctx, uint16_t zone)
+{
+    const struct eth_header *l2 = dp_packet_l2(pkt);
+    const struct ip_header *l3 = dp_packet_l3(pkt);
+    const char *l4 = dp_packet_l4(pkt);
+    const char *tail = dp_packet_tail(pkt);
+    bool ok;
+
+    memset(ctx, 0, sizeof *ctx);
+
+    if (!l2 || !l3 || !l4) {
+        return false;
+    }
+
+    ctx->key.zone = zone;
+
+    /* XXX In this function we parse the packet (again, it has already
+     * gone through miniflow_extract()) for two reasons:
+     *
+     * 1) To extract the l3 addresses and l4 ports.
+     *    We already have the l3 and l4 headers' pointers.  Extracting
+     *    the l3 addresses and the l4 ports is really cheap, since they
+     *    can be found at fixed locations.
+     * 2) To extract the l3 and l4 types.
+     *    Extracting the l3 and l4 types (especially the l3[1]) on the
+     *    other hand is quite expensive, because they're not at a
+     *    fixed location.
+     *
+     * Here's a way to avoid (2) with the help of the datapath.
+     * The datapath doesn't keep the packet's extracted flow[2], so
+     * using that is not an option.  We could use the packet's matching
+     * megaflow, but we have to make sure that the l3 and l4 types
+     * are unwildcarded.  This means either:
+     *
+     * a) dpif-netdev unwildcards the l3 (and l4) types when a new flow
+     *    is installed if the actions contains ct().  This is what the
+     *    kernel datapath does.  It is not so straightforward, though.
+     *
+     * b) ofproto-dpif-xlate unwildcards the l3 (and l4) types when
+     *    translating a ct() action.  This is already done in different
+     *    actions and since both the userspace and the kernel datapath
+     *    would benefit from it, it seems an appropriate place to do
+     *    it.
+     *
+     * ---
+     * [1] A simple benchmark (running only the connection tracker
+     *     over and over on the same packets) shows that if the
+     *     l3 type is already provided we are 15% faster (running the
+     *     connection tracker over a couple of DPDK devices with a
+     *     stream of UDP 64-bytes packets shows that we are 4% faster).
+     *
+     * [2] The reasons for this are that keeping the flow increases
+     *     (slightly) the cache footprint and increases computation
+     *     time as we move the packet around. Most importantly, the flow
+     *     should be updated by the actions and this can be slow, as
+     *     we use a sparse representation (miniflow).
+     *
+     */
+    ctx->key.dl_type = parse_dl_type(l2, (char *) l3 - (char *) l2);
+    if (ctx->key.dl_type == htons(ETH_TYPE_IP)) {
+        ok = extract_l3_ipv4(&ctx->key, l3, tail - (char *) l3, NULL);
+    } else if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
+        ok = extract_l3_ipv6(&ctx->key, l3, tail - (char *) l3, NULL);
+    } else {
+        ok = false;
+    }
+
+    if (ok) {
+        if (extract_l4(&ctx->key, l4, tail - l4, &ctx->related)) {
+            ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis);
+            return true;
+        }
+    }
+
+    return false;
+}
+
+/* Symmetric hash of 'key': the src and dst endpoints are hashed
+ * separately and combined with XOR, so a key and its reverse hash to the
+ * same value and both directions of a connection land in the same bucket.
+ * The remaining fields (everything after 'dst') are folded in normally.
+ *
+ * Fix: the tail hash previously started at '(uint32_t *) &key->dst + 1',
+ * which by precedence is one 32-bit word *into* 'dst', not one endpoint
+ * *past* it as the word count assumes.  That hashed the wrong byte range
+ * and broke the symmetry property.  Start at '&key->dst + 1' instead.
+ *
+ * NOTE(review): this hashes the structs' raw bytes, so any padding in
+ * struct conn_key must be zeroed by all initializers — confirm. */
+static uint32_t
+conn_key_hash(const struct conn_key *key, uint32_t basis)
+{
+    uint32_t hsrc, hdst, hash;
+    size_t i;
+
+    hsrc = hdst = basis;
+
+    for (i = 0; i < sizeof(key->src) / sizeof(uint32_t); i++) {
+        hsrc = hash_add(hsrc, ((const uint32_t *) &key->src)[i]);
+        hdst = hash_add(hdst, ((const uint32_t *) &key->dst)[i]);
+    }
+
+    /* XOR commutes, making the src/dst contribution direction-agnostic. */
+    hash = hsrc ^ hdst;
+
+    /* Hash the rest of the key in order: zone, dl_type, nw_proto, ... */
+    hash = hash_words((const uint32_t *) (&key->dst + 1),
+                      (const uint32_t *) (key + 1)
+                      - (const uint32_t *) (&key->dst + 1),
+                      hash);
+
+    return hash;
+}
+
+/* Swaps the source and destination endpoints of 'key' in place, turning
+ * it into the key for the opposite direction of the connection. */
+static void
+conn_key_reverse(struct conn_key *key)
+{
+    struct ct_endpoint swap = key->src;
+
+    key->src = key->dst;
+    key->dst = swap;
+}
+
+/* For each index 'i' set in 'maps', looks up 'keys[i].key' in the
+ * connection hmap for 'bucket', matching either the original or the
+ * reverse direction of an existing connection.  On a hit that has not
+ * expired at 'now', stores the connection in 'keys[i].conn' and sets
+ * 'keys[i].reply' to true if the reverse key matched; otherwise
+ * 'keys[i].conn' is set to NULL.
+ *
+ * The caller must hold the lock for 'bucket'. */
+static void
+conn_keys_lookup(struct conntrack *ct,
+                 struct conn_lookup_ctx *keys,
+                 unsigned long maps,
+                 unsigned bucket,
+                 long long now)
+{
+    size_t i;
+
+    ULLONG_FOR_EACH_1(i, maps) {
+        struct conn *conn, *found = NULL;
+        uint32_t hash = keys[i].hash;
+        bool reply;
+
+        HMAP_FOR_EACH_WITH_HASH(conn, node, hash, &ct->connections[bucket]) {
+            if (!memcmp(&conn->key, &keys[i].key, sizeof(conn->key))) {
+                found = conn;
+                reply = false;
+                break;
+            }
+            if (!memcmp(&conn->rev_key, &keys[i].key, sizeof(conn->rev_key))) {
+                found = conn;
+                reply = true;
+                break;
+            }
+        }
+
+        if (found) {
+            /* Expired entries are treated as misses; the sweeper will
+             * eventually remove them. */
+            if (conn_expired(found, now)) {
+                found = NULL;
+            } else {
+                keys[i].reply = reply;
+            }
+        }
+
+        keys[i].conn = found;
+    }
+}
+
+/* Passes 'pkt' to the l4 protocol module of 'conn' to update the
+ * connection state.  'reply' is true if 'pkt' matched the connection's
+ * reverse key. */
+static enum ct_update_res
+conn_update(struct conn *conn, struct dp_packet *pkt, bool reply,
+            long long now)
+{
+    return l4_protos[conn->key.nw_proto]->conn_update(conn, pkt, reply, now);
+}
+
+/* Returns true if 'conn' has reached or passed its expiration time. */
+static bool
+conn_expired(struct conn *conn, long long now)
+{
+    return conn->expiration <= now;
+}
+
+/* Asks the l4 protocol module of 'key' whether 'pkt' is acceptable as the
+ * first packet of a new connection. */
+static bool
+valid_new(struct dp_packet *pkt, struct conn_key *key)
+{
+    return l4_protos[key->nw_proto]->valid_new(pkt);
+}
+
+/* Asks the l4 protocol module of 'key' to create a new connection for
+ * 'pkt'.  On success the connection's key is initialized from 'key'.
+ * Returns NULL if the module refuses to create one. */
+static struct conn *
+new_conn(struct dp_packet *pkt, struct conn_key *key, long long now)
+{
+    struct conn *conn = l4_protos[key->nw_proto]->new_conn(pkt, now);
+
+    if (conn) {
+        conn->key = *key;
+    }
+    return conn;
+}
+
+/* Releases the storage for 'conn'.  Counterpart of new_conn(). */
+static void
+delete_conn(struct conn *conn)
+{
+    free(conn);
+}
diff --git a/lib/conntrack.h b/lib/conntrack.h
new file mode 100644
index 0000000..f97a7ea
--- /dev/null
+++ b/lib/conntrack.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2015 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef CONNTRACK_H
+#define CONNTRACK_H 1
+
+#include <stdbool.h>
+
+#include "hmap.h"
+#include "netdev-dpdk.h"
+#include "odp-netlink.h"
+#include "openvswitch/thread.h"
+#include "openvswitch/types.h"
+
+
+struct dp_packet;
+
+/* Userspace connection tracker
+ * ============================
+ *
+ * This is a connection tracking module that keeps all the state in userspace.
+ *
+ * Usage
+ * =====
+ *
+ *     struct conntrack ct;
+ *
+ * Initialization:
+ *
+ *     conntrack_init(&ct);
+ *
+ * It is necessary to periodically issue a call to
+ *
+ *     conntrack_run(&ct);
+ *
+ * to allow the module to clean up expired connections.
+ *
+ * To send a group of packets through the connection tracker:
+ *
+ *     conntrack_execute(&ct, pkts, n_pkts, ...);
+ *
+ * Thread-safety
+ * =============
+ *
+ * conntrack_execute() can be called by multiple threads simultaneously.
+ */
+
+struct conntrack;
+
+/* Initializes 'ct'.  Must be called before any other function. */
+void conntrack_init(struct conntrack *);
+/* Performs periodic maintenance (removal of expired connections). */
+void conntrack_run(struct conntrack *);
+/* Releases all resources held by 'ct'. */
+void conntrack_destroy(struct conntrack *);
+
+/* Sends the 'cnt' packets in 'pkts' through the connection tracker in
+ * 'zone' and fills in their ct metadata.  'setmark' and 'setlabel', if
+ * nonnull, point to value/mask pairs applied to the packets' ct_mark and
+ * ct_label.  ALG helpers ('helper') are not supported yet: a nonnull
+ * value is only logged.  'commit' presumably stores new connections —
+ * confirm against process_one(). */
+int conntrack_execute(struct conntrack *, struct dp_packet **, size_t,
+                      bool commit, uint16_t zone, const uint32_t *setmark,
+                      const struct ovs_key_ct_labels *setlabel,
+                      const char *helper);
+
+/* struct ct_lock is a standard mutex or a spinlock when using DPDK */
+
+#ifdef DPDK_NETDEV
+struct OVS_LOCKABLE ct_lock {
+    rte_spinlock_t lock;
+};
+
+static inline void ct_lock_init(struct ct_lock *lock)
+{
+    rte_spinlock_init(&lock->lock);
+}
+
+/* Thread-safety annotations apply to the wrapper 'lock'; the analysis is
+ * disabled for the body, which operates on the inner lock object. */
+static inline void ct_lock_lock(struct ct_lock *lock)
+    OVS_ACQUIRES(lock)
+    OVS_NO_THREAD_SAFETY_ANALYSIS
+{
+    rte_spinlock_lock(&lock->lock);
+}
+
+static inline void ct_lock_unlock(struct ct_lock *lock)
+    OVS_RELEASES(lock)
+    OVS_NO_THREAD_SAFETY_ANALYSIS
+{
+    rte_spinlock_unlock(&lock->lock);
+}
+
+/* DPDK spinlocks need no cleanup. */
+static inline void ct_lock_destroy(struct ct_lock *lock OVS_UNUSED)
+{
+}
+#else
+struct OVS_LOCKABLE ct_lock {
+    struct ovs_mutex lock;
+};
+
+static inline void ct_lock_init(struct ct_lock *lock)
+{
+    ovs_mutex_init(&lock->lock);
+}
+
+static inline void ct_lock_lock(struct ct_lock *lock)
+    OVS_ACQUIRES(lock)
+    OVS_NO_THREAD_SAFETY_ANALYSIS
+{
+    ovs_mutex_lock(&lock->lock);
+}
+
+static inline void ct_lock_unlock(struct ct_lock *lock)
+    OVS_RELEASES(lock)
+    OVS_NO_THREAD_SAFETY_ANALYSIS
+{
+    ovs_mutex_unlock(&lock->lock);
+}
+
+static inline void ct_lock_destroy(struct ct_lock *lock)
+{
+    ovs_mutex_destroy(&lock->lock);
+}
+#endif
+
+#define CONNTRACK_BUCKETS_SHIFT 8
+#define CONNTRACK_BUCKETS (1 << CONNTRACK_BUCKETS_SHIFT)
+
+struct conntrack {
+    /* Each lock guards a 'connections' bucket */
+    struct ct_lock locks[CONNTRACK_BUCKETS];
+    struct hmap connections[CONNTRACK_BUCKETS] OVS_GUARDED;
+    uint32_t hash_basis;           /* Basis for hashing connection keys. */
+    unsigned purge_bucket;         /* Next bucket conntrack_run() sweeps. */
+    uint32_t purge_inner_bucket;   /* Resume position within that bucket, */
+    uint32_t purge_inner_offset;   /* as used by hmap_at_position(). */
+};
+
+#endif /* conntrack.h */
-- 
2.1.4




More information about the dev mailing list