[ovs-dev] [PATCH v4 04/17] conntrack: New userspace connection tracker.

Daniele Di Proietto diproiettod at vmware.com
Fri Jun 10 22:47:30 UTC 2016


This commit adds the conntrack module.

It is a connection tracker that resides entirely in userspace.  Its
primary user will be the dpif-netdev datapath.

The module's main goal is to provide conntrack_execute(), which offers a
convenient interface to implement the datapath ct() action.

The conntrack module uses two submodules to deal with the l4 protocol
details (conntrack-other for UDP and ICMP, conntrack-tcp for TCP).

The conntrack-tcp submodule implementation is adapted from FreeBSD's pf
subsystem, therefore it's BSD licensed.  It has been slightly altered to
match the OVS coding style and to allow the pickup of already
established connections.

Signed-off-by: Daniele Di Proietto <diproiettod at vmware.com>
Acked-by: Antonio Fischetti <antonio.fischetti at intel.com>
---
 COPYING                     |   1 +
 debian/copyright.in         |   4 +
 include/openvswitch/types.h |   4 +
 lib/automake.mk             |   5 +
 lib/conntrack-other.c       |  85 +++++
 lib/conntrack-private.h     |  88 +++++
 lib/conntrack-tcp.c         | 462 +++++++++++++++++++++++
 lib/conntrack.c             | 890 ++++++++++++++++++++++++++++++++++++++++++++
 lib/conntrack.h             | 150 ++++++++
 lib/util.h                  |   9 +
 10 files changed, 1698 insertions(+)
 create mode 100644 lib/conntrack-other.c
 create mode 100644 lib/conntrack-private.h
 create mode 100644 lib/conntrack-tcp.c
 create mode 100644 lib/conntrack.c
 create mode 100644 lib/conntrack.h

diff --git a/COPYING b/COPYING
index 308e3ea..afb98b9 100644
--- a/COPYING
+++ b/COPYING
@@ -25,6 +25,7 @@ License, version 2.
 The following files are licensed under the 2-clause BSD license.
     include/windows/getopt.h
     lib/getopt_long.c
+    lib/conntrack-tcp.c
 
 The following files are licensed under the 3-clause BSD-license
     include/windows/netinet/icmp6.h
diff --git a/debian/copyright.in b/debian/copyright.in
index 57d007a..a15f4dd 100644
--- a/debian/copyright.in
+++ b/debian/copyright.in
@@ -21,6 +21,9 @@ Upstream Copyright Holders:
 	Copyright (c) 2014 Michael Chapman
 	Copyright (c) 2014 WindRiver, Inc.
 	Copyright (c) 2014 Avaya, Inc.
+	Copyright (c) 2001 Daniel Hartmeier
+	Copyright (c) 2002 - 2008 Henning Brauer
+	Copyright (c) 2012 Gleb Smirnoff <glebius at FreeBSD.org>
 
 License:
 
@@ -90,6 +93,7 @@ License:
 	lib/getopt_long.c
 	include/windows/getopt.h
 	datapath-windows/ovsext/Conntrack-tcp.c
+	lib/conntrack-tcp.c
 
 * The following files are licensed under the 3-clause BSD-license
 
diff --git a/include/openvswitch/types.h b/include/openvswitch/types.h
index bc94145..35bde0a 100644
--- a/include/openvswitch/types.h
+++ b/include/openvswitch/types.h
@@ -107,6 +107,10 @@ static const ovs_u128 OVS_U128_MAX = { { UINT32_MAX, UINT32_MAX,
                                          UINT32_MAX, UINT32_MAX } };
 static const ovs_be128 OVS_BE128_MAX OVS_UNUSED = { { OVS_BE32_MAX, OVS_BE32_MAX,
                                            OVS_BE32_MAX, OVS_BE32_MAX } };
+static const ovs_u128 OVS_U128_MIN OVS_UNUSED = { {0, 0, 0, 0} };
+static const ovs_be128 OVS_BE128_MIN OVS_UNUSED = { {0, 0, 0, 0} };
+
+#define OVS_U128_ZERO OVS_U128_MIN
 
 /* A 64-bit value, in network byte order, that is only aligned on a 32-bit
  * boundary. */
diff --git a/lib/automake.mk b/lib/automake.mk
index eabc0e7..4ba1a2c 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -49,6 +49,11 @@ lib_libopenvswitch_la_SOURCES = \
 	lib/compiler.h \
 	lib/connectivity.c \
 	lib/connectivity.h \
+	lib/conntrack-private.h \
+	lib/conntrack-tcp.c \
+	lib/conntrack-other.c \
+	lib/conntrack.c \
+	lib/conntrack.h \
 	lib/coverage.c \
 	lib/coverage.h \
 	lib/crc32c.c \
diff --git a/lib/conntrack-other.c b/lib/conntrack-other.c
new file mode 100644
index 0000000..295cb2c
--- /dev/null
+++ b/lib/conntrack-other.c
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2015, 2016 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+
+#include "conntrack-private.h"
+#include "dp-packet.h"
+
+enum other_state {
+    OTHERS_FIRST,
+    OTHERS_MULTIPLE,
+    OTHERS_BIDIR,
+};
+
+struct conn_other {
+    struct conn up;
+    enum other_state state;
+};
+
+static const enum ct_timeout other_timeouts[] = {
+    [OTHERS_FIRST] = CT_TM_OTHER_FIRST,
+    [OTHERS_MULTIPLE] = CT_TM_OTHER_MULTIPLE,
+    [OTHERS_BIDIR] = CT_TM_OTHER_BIDIR,
+};
+
+static struct conn_other *
+conn_other_cast(const struct conn *conn)
+{
+    return CONTAINER_OF(conn, struct conn_other, up);
+}
+
+static enum ct_update_res
+other_conn_update(struct conn *conn_, struct dp_packet *pkt OVS_UNUSED,
+                  bool reply, long long now)
+{
+    struct conn_other *conn = conn_other_cast(conn_);
+
+    if (reply && conn->state != OTHERS_BIDIR) {
+        conn->state = OTHERS_BIDIR;
+    } else if (conn->state == OTHERS_FIRST) {
+        conn->state = OTHERS_MULTIPLE;
+    }
+
+    update_expiration(conn_, other_timeouts[conn->state], now);
+
+    return CT_UPDATE_VALID;
+}
+
+static bool
+other_valid_new(struct dp_packet *pkt OVS_UNUSED)
+{
+    return true;
+}
+
+static struct conn *
+other_new_conn(struct dp_packet *pkt OVS_UNUSED, long long now)
+{
+    struct conn_other *conn;
+
+    conn = xzalloc(sizeof *conn);
+    conn->state = OTHERS_FIRST;
+
+    update_expiration(&conn->up, other_timeouts[conn->state], now);
+
+    return &conn->up;
+}
+
+struct ct_l4_proto ct_proto_other = {
+    .new_conn = other_new_conn,
+    .valid_new = other_valid_new,
+    .conn_update = other_conn_update,
+};
diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h
new file mode 100644
index 0000000..d3e0099
--- /dev/null
+++ b/lib/conntrack-private.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2015, 2016 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef CONNTRACK_PRIVATE_H
+#define CONNTRACK_PRIVATE_H 1
+
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <netinet/ip6.h>
+
+#include "conntrack.h"
+#include "hmap.h"
+#include "openvswitch/list.h"
+#include "openvswitch/types.h"
+#include "packets.h"
+#include "unaligned.h"
+
+struct ct_addr {
+    union {
+        ovs_16aligned_be32 ipv4;
+        union ovs_16aligned_in6_addr ipv6;
+        ovs_be32 ipv4_aligned;
+        struct in6_addr ipv6_aligned;
+    };
+};
+
+struct ct_endpoint {
+    struct ct_addr addr;
+    ovs_be16 port;
+};
+
+struct conn_key {
+    struct ct_endpoint src;
+    struct ct_endpoint dst;
+
+    ovs_be16 dl_type;
+    uint8_t nw_proto;
+    uint16_t zone;
+};
+
+struct conn {
+    struct conn_key key;
+    struct conn_key rev_key;
+    long long expiration;
+    struct ovs_list exp_node;
+    struct hmap_node node;
+    uint32_t mark;
+    ovs_u128 label;
+};
+
+enum ct_update_res {
+    CT_UPDATE_INVALID,
+    CT_UPDATE_VALID,
+    CT_UPDATE_NEW,
+};
+
+struct ct_l4_proto {
+    struct conn *(*new_conn)(struct dp_packet *pkt, long long now);
+    bool (*valid_new)(struct dp_packet *pkt);
+    enum ct_update_res (*conn_update)(struct conn *conn, struct dp_packet *pkt,
+                                      bool reply, long long now);
+};
+
+extern struct ct_l4_proto ct_proto_tcp;
+extern struct ct_l4_proto ct_proto_other;
+
+extern long long ct_timeout_val[];
+
+static inline void
+update_expiration(struct conn *conn, enum ct_timeout tm, long long now)
+{
+    conn->expiration = now + ct_timeout_val[tm];
+}
+
+#endif /* conntrack-private.h */
diff --git a/lib/conntrack-tcp.c b/lib/conntrack-tcp.c
new file mode 100644
index 0000000..b574eeb
--- /dev/null
+++ b/lib/conntrack-tcp.c
@@ -0,0 +1,462 @@
+/*-
+ * Copyright (c) 2001 Daniel Hartmeier
+ * Copyright (c) 2002 - 2008 Henning Brauer
+ * Copyright (c) 2012 Gleb Smirnoff <glebius at FreeBSD.org>
+ * Copyright (c) 2015, 2016 Nicira, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *    - Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    - Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Effort sponsored in part by the Defense Advanced Research Projects
+ * Agency (DARPA) and Air Force Research Laboratory, Air Force
+ * Materiel Command, USAF, under agreement number F30602-01-2-0537.
+ *
+ *      $OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $
+ */
+
+#include <config.h>
+
+#include "conntrack-private.h"
+#include "ct-dpif.h"
+#include "dp-packet.h"
+#include "util.h"
+
+struct tcp_peer {
+    enum ct_dpif_tcp_state state;
+    uint32_t               seqlo;          /* Max sequence number sent     */
+    uint32_t               seqhi;          /* Max the other end ACKd + win */
+    uint16_t               max_win;        /* largest window (pre scaling) */
+    uint8_t                wscale;         /* window scaling factor        */
+};
+
+struct conn_tcp {
+    struct conn up;
+    struct tcp_peer peer[2];
+};
+
+enum {
+    TCPOPT_EOL,
+    TCPOPT_NOP,
+    TCPOPT_WINDOW = 3,
+};
+
+/* TCP sequence numbers are 32 bit integers operated
+ * on with modular arithmetic.  These macros can be
+ * used to compare such integers. */
+#define SEQ_LT(a,b)     INT_MOD_LT(a, b)
+#define SEQ_LEQ(a,b)    INT_MOD_LEQ(a, b)
+#define SEQ_GT(a,b)     INT_MOD_GT(a, b)
+#define SEQ_GEQ(a,b)    INT_MOD_GEQ(a, b)
+
+#define SEQ_MIN(a, b)   INT_MOD_MIN(a, b)
+#define SEQ_MAX(a, b)   INT_MOD_MAX(a, b)
+
+static struct conn_tcp*
+conn_tcp_cast(const struct conn* conn)
+{
+    return CONTAINER_OF(conn, struct conn_tcp, up);
+}
+
+/* pf does this in in pf_normalize_tcp(), and it is called only if scrub
+ * is enabled.  We're not scrubbing, but this check seems reasonable.  */
+static bool
+tcp_invalid_flags(uint16_t flags)
+{
+
+    if (flags & TCP_SYN) {
+        if (flags & TCP_RST || flags & TCP_FIN) {
+            return true;
+        }
+    } else {
+        /* Illegal packet */
+        if (!(flags & (TCP_ACK|TCP_RST))) {
+            return true;
+        }
+    }
+
+    if (!(flags & TCP_ACK)) {
+        /* These flags are only valid if ACK is set */
+        if ((flags & TCP_FIN) || (flags & TCP_PSH) || (flags & TCP_URG)) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+#define TCP_MAX_WSCALE 14
+#define CT_WSCALE_FLAG 0x80
+#define CT_WSCALE_UNKNOWN 0x40
+#define CT_WSCALE_MASK 0xf
+
+static uint8_t
+tcp_get_wscale(const struct tcp_header *tcp)
+{
+    int len = TCP_OFFSET(tcp->tcp_ctl) * 4 - sizeof *tcp;
+    const uint8_t *opt = (const uint8_t *)(tcp + 1);
+    uint8_t wscale = 0;
+    uint8_t optlen;
+
+    while (len >= 3) {
+        switch (*opt) {
+        case TCPOPT_EOL:
+            return wscale;
+        case TCPOPT_NOP:
+            opt++;
+            len--;
+            break;
+        case TCPOPT_WINDOW:
+            wscale = MIN(opt[2], TCP_MAX_WSCALE);
+            wscale |= CT_WSCALE_FLAG;
+            /* fall through */
+        default:
+            optlen = opt[1];
+            if (optlen < 2) {
+                optlen = 2;
+            }
+            len -= optlen;
+            opt += optlen;
+        }
+    }
+
+    return wscale;
+}
+
+static uint32_t
+tcp_payload_length(struct dp_packet *pkt)
+{
+    return (char *) dp_packet_tail(pkt) - dp_packet_l2_pad_size(pkt)
+           - (char *) dp_packet_get_tcp_payload(pkt);
+}
+
+static enum ct_update_res
+tcp_conn_update(struct conn* conn_, struct dp_packet *pkt, bool reply,
+                long long now)
+{
+    struct conn_tcp *conn = conn_tcp_cast(conn_);
+    struct tcp_header *tcp = dp_packet_l4(pkt);
+    /* The peer that sent 'pkt' */
+    struct tcp_peer *src = &conn->peer[reply ? 1 : 0];
+    /* The peer that should receive 'pkt' */
+    struct tcp_peer *dst = &conn->peer[reply ? 0 : 1];
+    uint8_t sws = 0, dws = 0;
+    uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl);
+
+    uint16_t win = ntohs(tcp->tcp_winsz);
+    uint32_t ack, end, seq, orig_seq;
+    uint32_t p_len = tcp_payload_length(pkt);
+    int ackskew;
+
+    if (tcp_invalid_flags(tcp_flags)) {
+        return CT_UPDATE_INVALID;
+    }
+
+    if (((tcp_flags & (TCP_SYN|TCP_ACK)) == TCP_SYN)
+            && dst->state >= CT_DPIF_TCPS_FIN_WAIT_2
+            && src->state >= CT_DPIF_TCPS_FIN_WAIT_2) {
+        src->state = dst->state = CT_DPIF_TCPS_CLOSED;
+        return CT_UPDATE_NEW;
+    }
+
+    if (src->wscale & CT_WSCALE_FLAG
+        && dst->wscale & CT_WSCALE_FLAG
+        && !(tcp_flags & TCP_SYN)) {
+
+        sws = src->wscale & CT_WSCALE_MASK;
+        dws = dst->wscale & CT_WSCALE_MASK;
+
+    } else if (src->wscale & CT_WSCALE_UNKNOWN
+        && dst->wscale & CT_WSCALE_UNKNOWN
+        && !(tcp_flags & TCP_SYN)) {
+
+        sws = TCP_MAX_WSCALE;
+        dws = TCP_MAX_WSCALE;
+    }
+
+    /*
+     * Sequence tracking algorithm from Guido van Rooij's paper:
+     *   http://www.madison-gurkha.com/publications/tcp_filtering/
+     *      tcp_filtering.ps
+     */
+
+    orig_seq = seq = ntohl(get_16aligned_be32(&tcp->tcp_seq));
+    if (src->state < CT_DPIF_TCPS_SYN_SENT) {
+        /* First packet from this end. Set its state */
+
+        ack = ntohl(get_16aligned_be32(&tcp->tcp_ack));
+
+        end = seq + p_len;
+        if (tcp_flags & TCP_SYN) {
+            end++;
+            if (dst->wscale & CT_WSCALE_FLAG) {
+                src->wscale = tcp_get_wscale(tcp);
+                if (src->wscale & CT_WSCALE_FLAG) {
+                    /* Remove scale factor from initial window */
+                    sws = src->wscale & CT_WSCALE_MASK;
+                    win = DIV_ROUND_UP((uint32_t) win, 1 << sws);
+                    dws = dst->wscale & CT_WSCALE_MASK;
+                } else {
+                    /* fixup other window */
+                    dst->max_win <<= dst->wscale & CT_WSCALE_MASK;
+                    /* in case of a retrans SYN|ACK */
+                    dst->wscale = 0;
+                }
+            }
+        }
+        if (tcp_flags & TCP_FIN) {
+            end++;
+        }
+
+        src->seqlo = seq;
+        src->state = CT_DPIF_TCPS_SYN_SENT;
+        /*
+         * May need to slide the window (seqhi may have been set by
+         * the crappy stack check or if we picked up the connection
+         * after establishment)
+         */
+        if (src->seqhi == 1
+                || SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi)) {
+            src->seqhi = end + MAX(1, dst->max_win << dws);
+        }
+        if (win > src->max_win) {
+            src->max_win = win;
+        }
+
+    } else {
+        ack = ntohl(get_16aligned_be32(&tcp->tcp_ack));
+        end = seq + p_len;
+        if (tcp_flags & TCP_SYN) {
+            end++;
+        }
+        if (tcp_flags & TCP_FIN) {
+            end++;
+        }
+    }
+
+    if ((tcp_flags & TCP_ACK) == 0) {
+        /* Let it pass through the ack skew check */
+        ack = dst->seqlo;
+    } else if ((ack == 0
+                && (tcp_flags & (TCP_ACK|TCP_RST)) == (TCP_ACK|TCP_RST))
+               /* broken tcp stacks do not set ack */) {
+        /* Many stacks (ours included) will set the ACK number in an
+         * FIN|ACK if the SYN times out -- no sequence to ACK. */
+        ack = dst->seqlo;
+    }
+
+    if (seq == end) {
+        /* Ease sequencing restrictions on no data packets */
+        seq = src->seqlo;
+        end = seq;
+    }
+
+    ackskew = dst->seqlo - ack;
+#define MAXACKWINDOW (0xffff + 1500)    /* 1500 is an arbitrary fudge factor */
+    if (SEQ_GEQ(src->seqhi, end)
+        /* Last octet inside other's window space */
+        && SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws))
+        /* Retrans: not more than one window back */
+        && (ackskew >= -MAXACKWINDOW)
+        /* Acking not more than one reassembled fragment backwards */
+        && (ackskew <= (MAXACKWINDOW << sws))
+        /* Acking not more than one window forward */
+        && ((tcp_flags & TCP_RST) == 0 || orig_seq == src->seqlo
+            || (orig_seq == src->seqlo + 1) || (orig_seq + 1 == src->seqlo))) {
+        /* Require an exact/+1 sequence match on resets when possible */
+
+        /* update max window */
+        if (src->max_win < win) {
+            src->max_win = win;
+        }
+        /* synchronize sequencing */
+        if (SEQ_GT(end, src->seqlo)) {
+            src->seqlo = end;
+        }
+        /* slide the window of what the other end can send */
+        if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) {
+            dst->seqhi = ack + MAX((win << sws), 1);
+        }
+
+        /* update states */
+        if (tcp_flags & TCP_SYN && src->state < CT_DPIF_TCPS_SYN_SENT) {
+            src->state = CT_DPIF_TCPS_SYN_SENT;
+        }
+        if (tcp_flags & TCP_FIN && src->state < CT_DPIF_TCPS_CLOSING) {
+            src->state = CT_DPIF_TCPS_CLOSING;
+        }
+        if (tcp_flags & TCP_ACK) {
+            if (dst->state == CT_DPIF_TCPS_SYN_SENT) {
+                dst->state = CT_DPIF_TCPS_ESTABLISHED;
+            } else if (dst->state == CT_DPIF_TCPS_CLOSING) {
+                dst->state = CT_DPIF_TCPS_FIN_WAIT_2;
+            }
+        }
+        if (tcp_flags & TCP_RST) {
+            src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT;
+        }
+
+        if (src->state >= CT_DPIF_TCPS_FIN_WAIT_2
+            && dst->state >= CT_DPIF_TCPS_FIN_WAIT_2) {
+            update_expiration(conn_, CT_TM_TCP_CLOSED, now);
+        } else if (src->state >= CT_DPIF_TCPS_CLOSING
+                   && dst->state >= CT_DPIF_TCPS_CLOSING) {
+            update_expiration(conn_, CT_TM_TCP_FIN_WAIT, now);
+        } else if (src->state < CT_DPIF_TCPS_ESTABLISHED
+                   || dst->state < CT_DPIF_TCPS_ESTABLISHED) {
+            update_expiration(conn_, CT_TM_TCP_OPENING, now);
+        } else if (src->state >= CT_DPIF_TCPS_CLOSING
+                   || dst->state >= CT_DPIF_TCPS_CLOSING) {
+            update_expiration(conn_, CT_TM_TCP_CLOSING, now);
+        } else {
+            update_expiration(conn_, CT_TM_TCP_ESTABLISHED, now);
+        }
+    } else if ((dst->state < CT_DPIF_TCPS_SYN_SENT
+                || dst->state >= CT_DPIF_TCPS_FIN_WAIT_2
+                || src->state >= CT_DPIF_TCPS_FIN_WAIT_2)
+               && SEQ_GEQ(src->seqhi + MAXACKWINDOW, end)
+               /* Within a window forward of the originating packet */
+               && SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) {
+               /* Within a window backward of the originating packet */
+
+        /*
+         * This currently handles three situations:
+         *  1) Stupid stacks will shotgun SYNs before their peer
+         *     replies.
+         *  2) When PF catches an already established stream (the
+         *     firewall rebooted, the state table was flushed, routes
+         *     changed...)
+         *  3) Packets get funky immediately after the connection
+         *     closes (this should catch Solaris spurious ACK|FINs
+         *     that web servers like to spew after a close)
+         *
+         * This must be a little more careful than the above code
+         * since packet floods will also be caught here. We don't
+         * update the TTL here to mitigate the damage of a packet
+         * flood and so the same code can handle awkward establishment
+         * and a loosened connection close.
+         * In the establishment case, a correct peer response will
+         * validate the connection, go through the normal state code
+         * and keep updating the state TTL.
+         */
+
+        /* update max window */
+        if (src->max_win < win) {
+            src->max_win = win;
+        }
+        /* synchronize sequencing */
+        if (SEQ_GT(end, src->seqlo)) {
+            src->seqlo = end;
+        }
+        /* slide the window of what the other end can send */
+        if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) {
+            dst->seqhi = ack + MAX((win << sws), 1);
+        }
+
+        /*
+         * Cannot set dst->seqhi here since this could be a shotgunned
+         * SYN and not an already established connection.
+         */
+
+        if (tcp_flags & TCP_FIN && src->state < CT_DPIF_TCPS_CLOSING) {
+            src->state = CT_DPIF_TCPS_CLOSING;
+        }
+
+        if (tcp_flags & TCP_RST) {
+            src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT;
+        }
+    } else {
+        return CT_UPDATE_INVALID;
+    }
+
+    return CT_UPDATE_VALID;
+}
+
+static bool
+tcp_valid_new(struct dp_packet *pkt)
+{
+    struct tcp_header *tcp = dp_packet_l4(pkt);
+    uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl);
+
+    if (tcp_invalid_flags(tcp_flags)) {
+        return false;
+    }
+
+    /* A syn+ack is not allowed to create a connection.  We want to allow
+     * totally new connections (syn) or already established, not partially
+     * open (syn+ack). */
+    if ((tcp_flags & TCP_SYN) && (tcp_flags & TCP_ACK)) {
+        return false;
+    }
+
+    return true;
+}
+
+static struct conn *
+tcp_new_conn(struct dp_packet *pkt, long long now)
+{
+    struct conn_tcp* newconn = NULL;
+    struct tcp_header *tcp = dp_packet_l4(pkt);
+    struct tcp_peer *src, *dst;
+    uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl);
+
+    newconn = xzalloc(sizeof *newconn);
+
+    src = &newconn->peer[0];
+    dst = &newconn->peer[1];
+
+    src->seqlo = ntohl(get_16aligned_be32(&tcp->tcp_seq));
+    src->seqhi = src->seqlo + tcp_payload_length(pkt) + 1;
+
+    if (tcp_flags & TCP_SYN) {
+        src->seqhi++;
+        src->wscale = tcp_get_wscale(tcp);
+    } else {
+        src->wscale = CT_WSCALE_UNKNOWN;
+        dst->wscale = CT_WSCALE_UNKNOWN;
+    }
+    src->max_win = MAX(ntohs(tcp->tcp_winsz), 1);
+    if (src->wscale & CT_WSCALE_MASK) {
+        /* Remove scale factor from initial window */
+        uint8_t sws = src->wscale & CT_WSCALE_MASK;
+        src->max_win = DIV_ROUND_UP((uint32_t) src->max_win, 1 << sws);
+    }
+    if (tcp_flags & TCP_FIN) {
+        src->seqhi++;
+    }
+    dst->seqhi = 1;
+    dst->max_win = 1;
+    src->state = CT_DPIF_TCPS_SYN_SENT;
+    dst->state = CT_DPIF_TCPS_CLOSED;
+
+    update_expiration(&newconn->up, CT_TM_TCP_FIRST_PACKET, now);
+
+    return &newconn->up;
+}
+
+struct ct_l4_proto ct_proto_tcp = {
+    .new_conn = tcp_new_conn,
+    .valid_new = tcp_valid_new,
+    .conn_update = tcp_conn_update,
+};
diff --git a/lib/conntrack.c b/lib/conntrack.c
new file mode 100644
index 0000000..96935bc
--- /dev/null
+++ b/lib/conntrack.c
@@ -0,0 +1,890 @@
+/*
+ * Copyright (c) 2015, 2016 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+#include "conntrack.h"
+
+#include <errno.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <netinet/icmp6.h>
+
+#include "bitmap.h"
+#include "conntrack-private.h"
+#include "coverage.h"
+#include "csum.h"
+#include "dp-packet.h"
+#include "flow.h"
+#include "hmap.h"
+#include "netdev.h"
+#include "odp-netlink.h"
+#include "openvswitch/vlog.h"
+#include "ovs-rcu.h"
+#include "random.h"
+#include "timeval.h"
+
+VLOG_DEFINE_THIS_MODULE(conntrack);
+
+COVERAGE_DEFINE(conntrack_new_full);
+
+struct conn_lookup_ctx {
+    struct conn_key key;
+    struct conn *conn;
+    uint32_t hash;
+    bool reply;
+    bool related;
+};
+
+static bool conn_key_extract(struct conntrack *, struct dp_packet *,
+                             struct conn_lookup_ctx *, uint16_t zone);
+static uint32_t conn_key_hash(const struct conn_key *, uint32_t basis);
+static void conn_key_reverse(struct conn_key *);
+static void conn_key_lookup(struct conntrack_bucket *ctb,
+                            struct conn_lookup_ctx *ctx,
+                            long long now);
+static bool valid_new(struct dp_packet *pkt, struct conn_key *);
+static struct conn *new_conn(struct dp_packet *pkt, struct conn_key *,
+                             long long now);
+static void delete_conn(struct conn *);
+static enum ct_update_res conn_update(struct conn *, struct dp_packet*,
+                                      bool reply, long long now);
+static bool conn_expired(struct conn *, long long now);
+static void set_mark(struct dp_packet *, struct conn *,
+                     uint32_t val, uint32_t mask);
+static void set_label(struct dp_packet *, struct conn *,
+                      const struct ovs_key_ct_labels *val,
+                      const struct ovs_key_ct_labels *mask);
+
+static struct ct_l4_proto *l4_protos[] = {
+    [IPPROTO_TCP] = &ct_proto_tcp,
+    [IPPROTO_UDP] = &ct_proto_other,
+    [IPPROTO_ICMP] = &ct_proto_other,
+    [IPPROTO_ICMPV6] = &ct_proto_other,
+};
+
+long long ct_timeout_val[] = {
+#define CT_TIMEOUT(NAME, VAL) [CT_TM_##NAME] = VAL,
+    CT_TIMEOUTS
+#undef CT_TIMEOUT
+};
+
+/* If the total number of connections goes above this value, no new connections
+ * are accepted */
+#define DEFAULT_N_CONN_LIMIT 3000000
+
+/* Initializes the connection tracker 'ct'.  The caller is responsible for
+ * calling 'conntrack_destroy()', when the instance is not needed anymore */
+void
+conntrack_init(struct conntrack *ct)
+{
+    unsigned i;
+
+    for (i = 0; i < CONNTRACK_BUCKETS; i++) {
+        struct conntrack_bucket *ctb = &ct->buckets[i];
+
+        ct_lock_init(&ctb->lock);
+        ct_lock_lock(&ctb->lock);
+        hmap_init(&ctb->connections);
+        ct_lock_unlock(&ctb->lock);
+    }
+    ct->hash_basis = random_uint32();
+    atomic_count_init(&ct->n_conn, 0);
+    atomic_init(&ct->n_conn_limit, DEFAULT_N_CONN_LIMIT);
+}
+
+/* Destroys the connection tracker 'ct' and frees all the allocated memory. */
+void
+conntrack_destroy(struct conntrack *ct)
+{
+    unsigned i;
+
+    for (i = 0; i < CONNTRACK_BUCKETS; i++) {
+        struct conntrack_bucket *ctb = &ct->buckets[i];
+        struct conn *conn;
+
+        ct_lock_lock(&ctb->lock);
+        HMAP_FOR_EACH_POP(conn, node, &ctb->connections) {
+            atomic_count_dec(&ct->n_conn);
+            delete_conn(conn);
+        }
+        hmap_destroy(&ctb->connections);
+        ct_lock_unlock(&ctb->lock);
+        ct_lock_destroy(&ctb->lock);
+    }
+}
+
+static unsigned hash_to_bucket(uint32_t hash)
+{
+    /* Extracts the most significant bits in hash. The least significant bits
+     * are already used internally by the hmap implementation. */
+    BUILD_ASSERT(CONNTRACK_BUCKETS_SHIFT < 32 && CONNTRACK_BUCKETS_SHIFT >= 1);
+
+    return (hash >> (32 - CONNTRACK_BUCKETS_SHIFT)) % CONNTRACK_BUCKETS;
+}
+
+static void
+write_ct_md(struct dp_packet *pkt, uint16_t state, uint16_t zone,
+            uint32_t mark, ovs_u128 label)
+{
+    pkt->md.ct_state = state | CS_TRACKED;
+    pkt->md.ct_zone = zone;
+    pkt->md.ct_mark = mark;
+    pkt->md.ct_label = label;
+}
+
+static struct conn *
+conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
+               struct conn_lookup_ctx *ctx, uint16_t *state, bool commit,
+               long long now)
+{
+    unsigned bucket = hash_to_bucket(ctx->hash);
+    struct conn *nc = NULL;
+
+    if (!valid_new(pkt, &ctx->key)) {
+        *state |= CS_INVALID;
+        return nc;
+    }
+
+    *state |= CS_NEW;
+
+    if (commit) {
+        unsigned int n_conn_limit;
+
+        atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);
+
+        if (atomic_count_get(&ct->n_conn) >= n_conn_limit) {
+            COVERAGE_INC(conntrack_new_full);
+            return nc;
+        }
+
+        nc = new_conn(pkt, &ctx->key, now);
+
+        memcpy(&nc->rev_key, &ctx->key, sizeof nc->rev_key);
+
+        conn_key_reverse(&nc->rev_key);
+        hmap_insert(&ct->buckets[bucket].connections, &nc->node, ctx->hash);
+        atomic_count_inc(&ct->n_conn);
+    }
+
+    return nc;
+}
+
+static struct conn *
+process_one(struct conntrack *ct, struct dp_packet *pkt,
+            struct conn_lookup_ctx *ctx, uint16_t zone,
+            bool commit, long long now)
+{
+    unsigned bucket = hash_to_bucket(ctx->hash);
+    struct conn *conn = ctx->conn;
+    uint16_t state = 0;
+
+    if (conn) {
+        if (ctx->related) {
+            state |= CS_RELATED;
+            if (ctx->reply) {
+                state |= CS_REPLY_DIR;
+            }
+        } else {
+            enum ct_update_res res;
+
+            res = conn_update(conn, pkt, ctx->reply, now);
+
+            switch (res) {
+            case CT_UPDATE_VALID:
+                state |= CS_ESTABLISHED;
+                if (ctx->reply) {
+                    state |= CS_REPLY_DIR;
+                }
+                break;
+            case CT_UPDATE_INVALID:
+                state |= CS_INVALID;
+                break;
+            case CT_UPDATE_NEW:
+                hmap_remove(&ct->buckets[bucket].connections, &conn->node);
+                atomic_count_dec(&ct->n_conn);
+                delete_conn(conn);
+                conn = conn_not_found(ct, pkt, ctx, &state, commit, now);
+                break;
+            default:
+                OVS_NOT_REACHED();
+            }
+        }
+    } else {
+        conn = conn_not_found(ct, pkt, ctx, &state, commit, now);
+    }
+
+    write_ct_md(pkt, state, zone, conn ? conn->mark : 0,
+                conn ? conn->label : OVS_U128_ZERO);
+
+    return conn;
+}
+
+/* Sends the packets in '*pkt_batch' through the connection tracker 'ct'.  All
+ * the packets should have the same 'dl_type' (IPv4 or IPv6) and should have
+ * the l3 and l4 offset properly set.
+ *
+ * If 'commit' is true, the packets are allowed to create new entries in the
+ * connection tables.  'setmark', if not NULL, should point to a two
+ * elements array containing a value and a mask to set the connection mark.
+ * 'setlabel' behaves similarly for the connection label.*/
+int
+conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch,
+                  bool commit, uint16_t zone, const uint32_t *setmark,
+                  const struct ovs_key_ct_labels *setlabel,
+                  const char *helper)
+{
+    struct dp_packet **pkts = pkt_batch->packets;
+    size_t cnt = pkt_batch->count;
+#if !defined(__CHECKER__) && !defined(_WIN32)
+    const size_t KEY_ARRAY_SIZE = cnt;
+#else
+    enum { KEY_ARRAY_SIZE = NETDEV_MAX_BURST };
+#endif
+    struct conn_lookup_ctx ctxs[KEY_ARRAY_SIZE];
+    int8_t bucket_list[CONNTRACK_BUCKETS];
+    struct {
+        unsigned bucket;
+        unsigned long maps;
+    } arr[KEY_ARRAY_SIZE];
+    long long now = time_msec();
+    size_t i = 0;
+    uint8_t arrcnt = 0;
+
+    BUILD_ASSERT_DECL(sizeof arr[0].maps * CHAR_BIT >= NETDEV_MAX_BURST);
+
+    if (helper) {
+        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
+
+        VLOG_WARN_RL(&rl, "ALG helper \"%s\" not supported", helper);
+        /* Continue without the helper */
+    }
+
+    /* Pass 1: extract a lookup key from each packet and group the packets
+     * by hash bucket, so that each bucket lock is taken at most once.
+     * 'bucket_list[b]' is the slot of bucket 'b' in 'arr' (-1 if the
+     * bucket has not been seen yet); 'arr[].maps' is the bitmap of batch
+     * indices that hash to that bucket. */
+    memset(bucket_list, INT8_C(-1), sizeof bucket_list);
+    for (i = 0; i < cnt; i++) {
+        unsigned bucket;
+
+        if (!conn_key_extract(ct, pkts[i], &ctxs[i], zone)) {
+            write_ct_md(pkts[i], CS_INVALID, zone, 0, OVS_U128_ZERO);
+            continue;
+        }
+
+        bucket = hash_to_bucket(ctxs[i].hash);
+        if (bucket_list[bucket] == INT8_C(-1)) {
+            bucket_list[bucket] = arrcnt;
+
+            arr[arrcnt].maps = 0;
+            ULLONG_SET1(arr[arrcnt].maps, i);
+            arr[arrcnt++].bucket = bucket;
+        } else {
+            ULLONG_SET1(arr[bucket_list[bucket]].maps, i);
+        }
+    }
+
+    /* Pass 2: process each bucket's packets while holding that bucket's
+     * lock. */
+    for (i = 0; i < arrcnt; i++) {
+        struct conntrack_bucket *ctb = &ct->buckets[arr[i].bucket];
+        size_t j;
+
+        ct_lock_lock(&ctb->lock);
+
+        ULLONG_FOR_EACH_1(j, arr[i].maps) {
+            struct conn *conn;
+
+            conn_key_lookup(ctb, &ctxs[j], now);
+
+            conn = process_one(ct, pkts[j], &ctxs[j], zone, commit, now);
+
+            if (conn && setmark) {
+                set_mark(pkts[j], conn, setmark[0], setmark[1]);
+            }
+
+            if (conn && setlabel) {
+                set_label(pkts[j], conn, &setlabel[0], &setlabel[1]);
+            }
+        }
+        ct_lock_unlock(&ctb->lock);
+    }
+
+    return 0;
+}
+
+/* Updates the connection mark of 'pkt' and 'conn': the bits selected by
+ * 'mask' are replaced with 'val', the remaining bits are preserved. */
+static void
+set_mark(struct dp_packet *pkt, struct conn *conn, uint32_t val, uint32_t mask)
+{
+    uint32_t new_mark = (pkt->md.ct_mark & ~mask) | val;
+
+    pkt->md.ct_mark = new_mark;
+    conn->mark = new_mark;
+}
+
+/* Updates the 128-bit connection label of 'pkt' and 'conn': the bits
+ * selected by 'mask' are replaced with the corresponding bits of 'val',
+ * the remaining bits are preserved. */
+static void
+set_label(struct dp_packet *pkt, struct conn *conn,
+          const struct ovs_key_ct_labels *val,
+          const struct ovs_key_ct_labels *mask)
+{
+    ovs_u128 value, m;
+
+    memcpy(&value, val, sizeof value);
+    memcpy(&m, mask, sizeof m);
+
+    pkt->md.ct_label.u64.lo = (pkt->md.ct_label.u64.lo & ~m.u64.lo)
+                              | value.u64.lo;
+    pkt->md.ct_label.u64.hi = (pkt->md.ct_label.u64.hi & ~m.u64.hi)
+                              | value.u64.hi;
+    conn->label = pkt->md.ct_label;
+}
+
+/* Key extraction */
+
+/* Extracts the IPv4 addresses and protocol from 'data' into 'key'.  If
+ * 'new_data' is not NULL, the header length is validated against 'size'
+ * and a pointer to the first byte past the IPv4 header is stored in
+ * '*new_data'.  A NULL 'new_data' means that the header has already been
+ * parsed (e.g. by flow_extract()), so the length checks are skipped.
+ * Fragments are rejected.  If 'validate_checksum' is true, false is
+ * returned when the IPv4 header checksum is invalid. */
+static inline bool
+extract_l3_ipv4(struct conn_key *key, const void *data, size_t size,
+                const char **new_data, bool validate_checksum)
+{
+    const struct ip_header *ip = data;
+    size_t ip_len;
+
+    /* Before reading any header field, make sure a full minimal header is
+     * present (only needed when the caller hasn't parsed it already). */
+    if (new_data && OVS_UNLIKELY(size < IP_HEADER_LEN)) {
+        return false;
+    }
+
+    ip_len = IP_IHL(ip->ip_ihl_ver) * 4;
+
+    if (new_data) {
+        if (OVS_UNLIKELY(ip_len < IP_HEADER_LEN || size < ip_len)) {
+            return false;
+        }
+        *new_data = (char *) data + ip_len;
+    }
+
+    if (IP_IS_FRAGMENT(ip->ip_frag_off)) {
+        return false;
+    }
+
+    if (validate_checksum && csum(data, ip_len) != 0) {
+        return false;
+    }
+
+    key->src.addr.ipv4 = ip->ip_src;
+    key->dst.addr.ipv4 = ip->ip_dst;
+    key->nw_proto = ip->ip_proto;
+
+    return true;
+}
+
+/* Extracts the IPv6 addresses and the upper-layer protocol from 'data'
+ * into 'key'.  If 'new_data' is not NULL, the size is validated and a
+ * pointer to the first byte past the IPv6 header chain is stored in
+ * '*new_data'.  A NULL 'new_data' means that the header has already been
+ * parsed (e.g. by flow_extract()), so the size check is skipped.
+ * Fragments are rejected. */
+static inline bool
+extract_l3_ipv6(struct conn_key *key, const void *data, size_t size,
+                const char **new_data)
+{
+    const struct ovs_16aligned_ip6_hdr *ip6 = data;
+    uint8_t nw_proto = ip6->ip6_nxt;
+    uint8_t nw_frag = 0;
+
+    if (new_data && OVS_UNLIKELY(size < sizeof *ip6)) {
+        return false;
+    }
+
+    /* Skip the fixed header, then walk any extension headers to find the
+     * real upper-layer protocol. */
+    data = ip6 + 1;
+    size -= sizeof *ip6;
+
+    if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag)) {
+        return false;
+    }
+
+    if (new_data) {
+        *new_data = data;
+    }
+
+    if (nw_frag) {
+        return false;
+    }
+
+    key->src.addr.ipv6 = ip6->ip6_src;
+    key->dst.addr.ipv6 = ip6->ip6_dst;
+    key->nw_proto = nw_proto;
+
+    return true;
+}
+
+/* Returns true if the l4 checksum over 'data' (using the pseudoheader
+ * derived from the l3 header 'l3') is valid.  Returns false for l3 types
+ * other than IPv4 and IPv6. */
+static inline bool
+checksum_valid(const struct conn_key *key, const void *data, size_t size,
+               const void *l3)
+{
+    uint32_t partial;
+
+    if (key->dl_type == htons(ETH_TYPE_IP)) {
+        partial = packet_csum_pseudoheader(l3);
+    } else if (key->dl_type == htons(ETH_TYPE_IPV6)) {
+        partial = packet_csum_pseudoheader6(l3);
+    } else {
+        return false;
+    }
+
+    return csum_finish(csum_continue(partial, data, size)) == 0;
+}
+
+/* Returns true if 'data' holds a TCP header with a sane data offset and a
+ * valid checksum. */
+static inline bool
+check_l4_tcp(const struct conn_key *key, const void *data, size_t size,
+             const void *l3)
+{
+    const struct tcp_header *tcp = data;
+    size_t hdr_len = TCP_OFFSET(tcp->tcp_ctl) * 4;
+
+    if (OVS_LIKELY(hdr_len >= TCP_HEADER_LEN && hdr_len <= size)) {
+        return checksum_valid(key, data, size, l3);
+    }
+    return false;
+}
+
+/* Returns true if 'data' holds a UDP header with a sane length field and
+ * a valid (or legitimately absent) checksum. */
+static inline bool
+check_l4_udp(const struct conn_key *key, const void *data, size_t size,
+             const void *l3)
+{
+    const struct udp_header *udp = data;
+    size_t udp_len = ntohs(udp->udp_len);
+
+    if (OVS_UNLIKELY(udp_len < UDP_HEADER_LEN || udp_len > size)) {
+        return false;
+    }
+
+    /* A zero checksum on IPv4 means "no checksum transmitted": skip the
+     * validation in that case. */
+    if (!udp->udp_csum && key->dl_type == htons(ETH_TYPE_IP)) {
+        return true;
+    }
+    return checksum_valid(key, data, size, l3);
+}
+
+/* Returns true if the ICMPv4 checksum (computed over the whole ICMP
+ * message) is valid. */
+static inline bool
+check_l4_icmp(const void *data, size_t size)
+{
+    return csum(data, size) == 0;
+}
+
+/* Returns true if the ICMPv6 checksum (computed over the pseudoheader and
+ * the message) is valid. */
+static inline bool
+check_l4_icmp6(const struct conn_key *key, const void *data, size_t size,
+               const void *l3)
+{
+    return checksum_valid(key, data, size, l3);
+}
+
+/* Extracts the TCP source and destination ports from 'data' into 'key'.
+ * Returns false if the header is truncated or either port is zero. */
+static inline bool
+extract_l4_tcp(struct conn_key *key, const void *data, size_t size)
+{
+    const struct tcp_header *tcp = data;
+
+    if (OVS_UNLIKELY(size < TCP_HEADER_LEN)) {
+        return false;
+    }
+
+    key->src.port = tcp->tcp_src;
+    key->dst.port = tcp->tcp_dst;
+
+    /* Port 0 is never a valid endpoint. */
+    return key->src.port != 0 && key->dst.port != 0;
+}
+
+/* Extracts the UDP source and destination ports from 'data' into 'key'.
+ * Returns false if the header is truncated or either port is zero. */
+static inline bool
+extract_l4_udp(struct conn_key *key, const void *data, size_t size)
+{
+    const struct udp_header *udp = data;
+
+    if (OVS_UNLIKELY(size < UDP_HEADER_LEN)) {
+        return false;
+    }
+
+    key->src.port = udp->udp_src;
+    key->dst.port = udp->udp_dst;
+
+    /* Port 0 is never a valid endpoint. */
+    return key->src.port != 0 && key->dst.port != 0;
+}
+
+static inline bool extract_l4(struct conn_key *key, const void *data,
+                              size_t size, bool *related, const void *l3);
+
+/* Extracts the ICMPv4 l4 fields into 'key'.
+ *
+ * If 'related' is not NULL and the function is processing an ICMP
+ * error packet, extract the l3 and l4 fields from the nested header
+ * instead and set *related to true.  If 'related' is NULL we're
+ * already processing a nested header and no such recursion is
+ * possible */
+static inline bool
+extract_l4_icmp(struct conn_key *key, const void *data, size_t size,
+                bool *related)
+{
+    const struct icmp_header *icmp = data;
+
+    if (OVS_UNLIKELY(size < ICMP_HEADER_LEN)) {
+        return false;
+    }
+
+    switch (icmp->icmp_type) {
+    case ICMP4_ECHO_REQUEST:
+    case ICMP4_ECHO_REPLY:
+    case ICMP4_TIMESTAMP:
+    case ICMP4_TIMESTAMPREPLY:
+    case ICMP4_INFOREQUEST:
+    case ICMP4_INFOREPLY:
+        /* Separate ICMP connection: identified using id */
+        key->src.port = key->dst.port = icmp->icmp_fields.echo.id;
+        break;
+    case ICMP4_DST_UNREACH:
+    case ICMP4_TIME_EXCEEDED:
+    case ICMP4_PARAM_PROB:
+    case ICMP4_SOURCEQUENCH:
+    case ICMP4_REDIRECT: {
+        /* ICMP packet part of another connection. We should
+         * extract the key from embedded packet header */
+        struct conn_key inner_key;
+        const char *l3 = (const char *) (icmp + 1);
+        const char *tail = (const char *) data + size;
+        const char *l4;
+        bool ok;
+
+        if (!related) {
+            /* Already inside a nested header: refuse to recurse again. */
+            return false;
+        }
+        *related = true;
+
+        memset(&inner_key, 0, sizeof inner_key);
+        inner_key.dl_type = htons(ETH_TYPE_IP);
+        ok = extract_l3_ipv4(&inner_key, l3, tail - l3, &l4, false);
+        if (!ok) {
+            return false;
+        }
+
+        /* pf doesn't do this, but it seems a good idea */
+        if (inner_key.src.addr.ipv4_aligned != key->dst.addr.ipv4_aligned
+            || inner_key.dst.addr.ipv4_aligned != key->src.addr.ipv4_aligned) {
+            return false;
+        }
+
+        key->src = inner_key.src;
+        key->dst = inner_key.dst;
+        key->nw_proto = inner_key.nw_proto;
+
+        ok = extract_l4(key, l4, tail - l4, NULL, l3);
+        if (ok) {
+            /* The embedded packet travels in the original direction:
+             * reverse the key so it matches the connection. */
+            conn_key_reverse(key);
+        }
+        return ok;
+    }
+    default:
+        return false;
+    }
+
+    return true;
+}
+
+/* If 'related' is not NULL and the function is processing an ICMP
+ * error packet, extract the l3 and l4 fields from the nested header
+ * instead and set *related to true.  If 'related' is NULL we're
+ * already processing a nested header and no such recursion is
+ * possible */
+static inline bool
+extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,
+                 bool *related)
+{
+    const struct icmp6_header *icmp6 = data;
+
+    /* All the messages that we support need at least 4 bytes after
+     * the header */
+    if (size < sizeof *icmp6 + 4) {
+        return false;
+    }
+
+    switch (icmp6->icmp6_type) {
+    case ICMP6_ECHO_REQUEST:
+    case ICMP6_ECHO_REPLY:
+        /* Separate ICMP connection: identified using id */
+        key->src.port = key->dst.port = *(ovs_be16 *) (icmp6 + 1);
+        break;
+    case ICMP6_DST_UNREACH:
+    case ICMP6_PACKET_TOO_BIG:
+    case ICMP6_TIME_EXCEEDED:
+    case ICMP6_PARAM_PROB: {
+        /* ICMP packet part of another connection. We should
+         * extract the key from embedded packet header */
+        struct conn_key inner_key;
+        /* The embedded datagram starts 8 bytes into the message: 4 bytes
+         * of ICMPv6 header plus the 4-byte type-specific field. */
+        const char *l3 = (const char *) icmp6 + 8;
+        const char *tail = (const char *) data + size;
+        const char *l4 = NULL;
+        bool ok;
+
+        if (!related) {
+            /* Already inside a nested header: do not recurse again. */
+            return false;
+        }
+        *related = true;
+
+        memset(&inner_key, 0, sizeof inner_key);
+        inner_key.dl_type = htons(ETH_TYPE_IPV6);
+        ok = extract_l3_ipv6(&inner_key, l3, tail - l3, &l4);
+        if (!ok) {
+            return false;
+        }
+
+        /* pf doesn't do this, but it seems a good idea */
+        if (!ipv6_addr_equals(&inner_key.src.addr.ipv6_aligned,
+                              &key->dst.addr.ipv6_aligned)
+            || !ipv6_addr_equals(&inner_key.dst.addr.ipv6_aligned,
+                                 &key->src.addr.ipv6_aligned)) {
+            return false;
+        }
+
+        key->src = inner_key.src;
+        key->dst = inner_key.dst;
+        key->nw_proto = inner_key.nw_proto;
+
+        ok = extract_l4(key, l4, tail - l4, NULL, l3);
+        if (ok) {
+            /* The embedded packet travels in the original direction:
+             * reverse the key so it matches the connection. */
+            conn_key_reverse(key);
+        }
+        return ok;
+    }
+    default:
+        return false;
+    }
+
+    return true;
+}
+
+/* Extract l4 fields into 'key', which must already contain valid l3
+ * members.
+ *
+ * If 'related' is not NULL and an ICMP error packet is being
+ * processed, the function will extract the key from the packet nested
+ * in the ICMP payload and set '*related' to true.
+ *
+ * If 'related' is NULL, it means that we're already parsing a header nested
+ * in an ICMP error.  In this case, we skip checksum and length validation. */
+static inline bool
+extract_l4(struct conn_key *key, const void *data, size_t size, bool *related,
+           const void *l3)
+{
+    switch (key->nw_proto) {
+    case IPPROTO_TCP:
+        return (!related || check_l4_tcp(key, data, size, l3))
+               && extract_l4_tcp(key, data, size);
+    case IPPROTO_UDP:
+        return (!related || check_l4_udp(key, data, size, l3))
+               && extract_l4_udp(key, data, size);
+    case IPPROTO_ICMP:
+        return key->dl_type == htons(ETH_TYPE_IP)
+               && (!related || check_l4_icmp(data, size))
+               && extract_l4_icmp(key, data, size, related);
+    case IPPROTO_ICMPV6:
+        return key->dl_type == htons(ETH_TYPE_IPV6)
+               && (!related || check_l4_icmp6(key, data, size, l3))
+               && extract_l4_icmp6(key, data, size, related);
+    default:
+        return false;
+    }
+}
+
+/* Extracts the connection key for 'pkt' into 'ctx->key' and computes its
+ * hash into 'ctx->hash'.  'ctx->related' is set to true for ICMP error
+ * packets that embed another connection's key.  Returns false if the
+ * packet cannot be tracked (missing or malformed headers, unsupported
+ * l3 type, bad checksum...). */
+static bool
+conn_key_extract(struct conntrack *ct, struct dp_packet *pkt,
+                 struct conn_lookup_ctx *ctx, uint16_t zone)
+{
+    const struct eth_header *l2 = dp_packet_l2(pkt);
+    const struct ip_header *l3 = dp_packet_l3(pkt);
+    const char *l4 = dp_packet_l4(pkt);
+    const char *tail = dp_packet_tail(pkt);
+    bool ok;
+
+    memset(ctx, 0, sizeof *ctx);
+
+    if (!l2 || !l3 || !l4) {
+        return false;
+    }
+
+    ctx->key.zone = zone;
+
+    /* XXX In this function we parse the packet (again, it has already
+     * gone through miniflow_extract()) for two reasons:
+     *
+     * 1) To extract the l3 addresses and l4 ports.
+     *    We already have the l3 and l4 headers' pointers.  Extracting
+     *    the l3 addresses and the l4 ports is really cheap, since they
+     *    can be found at fixed locations.
+     * 2) To extract the l3 and l4 types.
+     *    Extracting the l3 and l4 types (especially the l3[1]) on the
+     *    other hand is quite expensive, because they're not at a
+     *    fixed location.
+     *
+     * Here's a way to avoid (2) with the help of the datapath.
+     * The datapath doesn't keep the packet's extracted flow[2], so
+     * using that is not an option.  We could use the packet's matching
+     * megaflow for l3 type (it's always unwildcarded), and for l4 type
+     * (we have to unwildcard it first).  This means either:
+     *
+     * a) dpif-netdev passes the matching megaflow to dp_execute_cb(), which
+     *    is used to extract the l3 type.  Unfortunately, dp_execute_cb() is
+     *    used also in dpif_netdev_execute(), which doesn't have a matching
+     *    megaflow.
+     *
+     * b) We define an alternative OVS_ACTION_ATTR_CT, used only by the
+     *    userspace datapath, which includes l3 (and l4) type.  The
+     *    alternative action could be generated by ofproto-dpif specifically
+     *    for the userspace datapath. Having a different interface for
+     *    userspace and kernel doesn't seem very clean, though.
+     *
+     * ---
+     * [1] A simple benchmark (running only the connection tracker
+     *     over and over on the same packets) shows that if the
+     *     l3 type is already provided we are 15% faster (running the
+     *     connection tracker over a couple of DPDK devices with a
+     *     stream of UDP 64-bytes packets shows that we are 4% faster).
+     *
+     * [2] The reasons for this are that keeping the flow increases
+     *     (slightly) the cache footprint and increases computation
+     *     time as we move the packet around. Most importantly, the flow
+     *     should be updated by the actions and this can be slow, as
+     *     we use a sparse representation (miniflow).
+     *
+     */
+    ctx->key.dl_type = parse_dl_type(l2, (char *) l3 - (char *) l2);
+    if (ctx->key.dl_type == htons(ETH_TYPE_IP)) {
+        ok = extract_l3_ipv4(&ctx->key, l3, tail - (char *) l3, NULL, true);
+    } else if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
+        ok = extract_l3_ipv6(&ctx->key, l3, tail - (char *) l3, NULL);
+    } else {
+        ok = false;
+    }
+
+    if (ok) {
+        if (extract_l4(&ctx->key, l4, tail - l4, &ctx->related, l3)) {
+            ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis);
+            return true;
+        }
+    }
+
+    return false;
+}
+
+/* Returns a hash of 'key' that is the same if the source and destination
+ * endpoints are swapped, so a packet and its reply hash to the same
+ * value. */
+static uint32_t
+conn_key_hash(const struct conn_key *key, uint32_t basis)
+{
+    uint32_t hsrc, hdst, hash;
+    size_t i;
+
+    hsrc = hdst = basis;
+
+    /* Hash the source and destination tuple */
+    for (i = 0; i < sizeof(key->src) / sizeof(uint32_t); i++) {
+        hsrc = hash_add(hsrc, ((uint32_t *) &key->src)[i]);
+        hdst = hash_add(hdst, ((uint32_t *) &key->dst)[i]);
+    }
+
+    /* Even if source and destination are swapped the hash will be the same. */
+    hash = hsrc ^ hdst;
+
+    /* Hash the rest of the key (L3 and L4 types and zone). */
+    hash = hash_words((uint32_t *) &key->dst + 1,
+                      (uint32_t *) (key + 1) - (uint32_t *) (&key->dst + 1),
+                      hash);
+
+    return hash;
+}
+
+/* Swaps the source and destination endpoints of 'key' in place. */
+static void
+conn_key_reverse(struct conn_key *key)
+{
+    struct ct_endpoint orig_src = key->src;
+
+    key->src = key->dst;
+    key->dst = orig_src;
+}
+
+/* Looks up 'ctx->key' in bucket 'ctb'.  On a match, stores the connection
+ * in 'ctx->conn' and sets 'ctx->reply' to true if the key matched the
+ * reply direction; otherwise leaves 'ctx->conn' NULL.  Expired
+ * connections are ignored.  Must be called with 'ctb''s lock held. */
+static void
+conn_key_lookup(struct conntrack_bucket *ctb,
+                struct conn_lookup_ctx *ctx,
+                long long now)
+{
+    struct conn *conn;
+
+    ctx->conn = NULL;
+
+    HMAP_FOR_EACH_WITH_HASH (conn, node, ctx->hash, &ctb->connections) {
+        if (conn_expired(conn, now)) {
+            continue;
+        }
+        if (!memcmp(&conn->key, &ctx->key, sizeof conn->key)) {
+            ctx->conn = conn;
+            ctx->reply = false;
+            break;
+        }
+        if (!memcmp(&conn->rev_key, &ctx->key, sizeof conn->rev_key)) {
+            ctx->conn = conn;
+            ctx->reply = true;
+            break;
+        }
+    }
+}
+
+/* Lets the L4 protocol module registered for 'conn''s protocol examine
+ * 'pkt' and update the connection state.  'reply' is true if the packet
+ * travels in the reply direction. */
+static enum ct_update_res
+conn_update(struct conn *conn, struct dp_packet *pkt, bool reply,
+            long long now)
+{
+    return l4_protos[conn->key.nw_proto]->conn_update(conn, pkt, reply, now);
+}
+
+/* Returns true if 'conn' has reached or passed its expiration time. */
+static bool
+conn_expired(struct conn *conn, long long now)
+{
+    return conn->expiration <= now;
+}
+
+/* Returns true if 'pkt' may start a new connection, according to the L4
+ * protocol module for 'key->nw_proto'. */
+static bool
+valid_new(struct dp_packet *pkt, struct conn_key *key)
+{
+    return l4_protos[key->nw_proto]->valid_new(pkt);
+}
+
+/* Allocates a new connection for 'pkt' via the L4 protocol module for
+ * 'key->nw_proto' and stores 'key' in it.  Returns NULL on failure. */
+static struct conn *
+new_conn(struct dp_packet *pkt, struct conn_key *key, long long now)
+{
+    struct conn *conn = l4_protos[key->nw_proto]->new_conn(pkt, now);
+
+    if (conn) {
+        conn->key = *key;
+    }
+
+    return conn;
+}
+
+/* Frees a connection previously allocated by new_conn(). */
+static void
+delete_conn(struct conn *conn)
+{
+    free(conn);
+}
diff --git a/lib/conntrack.h b/lib/conntrack.h
new file mode 100644
index 0000000..54731bd
--- /dev/null
+++ b/lib/conntrack.h
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2015, 2016 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef CONNTRACK_H
+#define CONNTRACK_H 1
+
+#include <stdbool.h>
+
+#include "hmap.h"
+#include "odp-netlink.h"
+#include "openvswitch/thread.h"
+#include "openvswitch/types.h"
+#include "ovs-atomic.h"
+
+/* Userspace connection tracker
+ * ============================
+ *
+ * This is a connection tracking module that keeps all the state in userspace.
+ *
+ * Usage
+ * =====
+ *
+ *     struct conntrack ct;
+ *
+ * Initialization:
+ *
+ *     conntrack_init(&ct);
+ *
+ * It is necessary to periodically issue a call to
+ *
+ *     conntrack_run(&ct);
+ *
+ * to allow the module to clean up expired connections.
+ *
+ * To send a group of packets through the connection tracker:
+ *
+ *     conntrack_execute(&ct, pkt_batch, ...);
+ *
+ * Thread-safety
+ * =============
+ *
+ * conntrack_execute() can be called by multiple threads simultaneously.
+ */
+
+struct dp_packet_batch;
+
+struct conntrack;
+
+void conntrack_init(struct conntrack *);
+void conntrack_run(struct conntrack *);
+void conntrack_destroy(struct conntrack *);
+
+int conntrack_execute(struct conntrack *, struct dp_packet_batch *,
+                      bool commit, uint16_t zone, const uint32_t *setmark,
+                      const struct ovs_key_ct_labels *setlabel,
+                      const char *helper);
+
+/* 'struct ct_lock' is a wrapper for an adaptive mutex.  It's useful to try
+ * different types of locks (e.g. spinlocks) */
+
+struct OVS_LOCKABLE ct_lock {
+    struct ovs_mutex lock;
+};
+
+/* Initializes 'lock' as an adaptive mutex. */
+static inline void ct_lock_init(struct ct_lock *lock)
+{
+    ovs_mutex_init_adaptive(&lock->lock);
+}
+
+/* Acquires 'lock'.  The thread-safety analysis annotations are on this
+ * wrapper, so the underlying lock call opts out of the analysis. */
+static inline void ct_lock_lock(struct ct_lock *lock)
+    OVS_ACQUIRES(lock)
+    OVS_NO_THREAD_SAFETY_ANALYSIS
+{
+    ovs_mutex_lock(&lock->lock);
+}
+
+/* Releases 'lock'. */
+static inline void ct_lock_unlock(struct ct_lock *lock)
+    OVS_RELEASES(lock)
+    OVS_NO_THREAD_SAFETY_ANALYSIS
+{
+    ovs_mutex_unlock(&lock->lock);
+}
+
+/* Destroys 'lock', which must not be held. */
+static inline void ct_lock_destroy(struct ct_lock *lock)
+{
+    ovs_mutex_destroy(&lock->lock);
+}
+
+/* Timeouts: all the possible timeout states passed to update_expiration()
+ * are listed here.  Each name is prefixed with CT_TM_ in 'enum ct_timeout'
+ * below, and the value is the timeout in milliseconds. */
+#define CT_TIMEOUTS \
+    CT_TIMEOUT(TCP_FIRST_PACKET, 30 * 1000) \
+    CT_TIMEOUT(TCP_OPENING, 30 * 1000) \
+    CT_TIMEOUT(TCP_ESTABLISHED, 24 * 60 * 60 * 1000) \
+    CT_TIMEOUT(TCP_CLOSING, 15 * 60 * 1000) \
+    CT_TIMEOUT(TCP_FIN_WAIT, 45 * 1000) \
+    CT_TIMEOUT(TCP_CLOSED, 30 * 1000) \
+    CT_TIMEOUT(OTHER_FIRST, 60 * 1000) \
+    CT_TIMEOUT(OTHER_MULTIPLE, 60 * 1000) \
+    CT_TIMEOUT(OTHER_BIDIR, 30 * 1000) \
+
+enum ct_timeout {
+#define CT_TIMEOUT(NAME, VALUE) CT_TM_##NAME,
+    CT_TIMEOUTS
+#undef CT_TIMEOUT
+    N_CT_TM /* Number of timeout states. */
+};
+
+/* Locking:
+ *
+ * The connections are kept in different buckets, which are completely
+ * independent. The connection bucket is determined by the hash of its key.
+ */
+struct conntrack_bucket {
+    struct ct_lock lock;
+    /* Connections in this bucket, indexed by connection key hash. */
+    struct hmap connections OVS_GUARDED;
+};
+
+#define CONNTRACK_BUCKETS_SHIFT 8
+#define CONNTRACK_BUCKETS (1 << CONNTRACK_BUCKETS_SHIFT)
+
+struct conntrack {
+    /* Independent buckets containing the connections */
+    struct conntrack_bucket buckets[CONNTRACK_BUCKETS];
+
+    /* Salt for hashing a connection key. */
+    uint32_t hash_basis;
+
+    /* Number of connections currently in the connection tracker. */
+    atomic_count n_conn;
+    /* Connections limit. When this limit is reached, no new connection
+     * will be accepted. */
+    atomic_uint n_conn_limit;
+};
+
+#endif /* conntrack.h */
diff --git a/lib/util.h b/lib/util.h
index 7be4a30..ad31e74 100644
--- a/lib/util.h
+++ b/lib/util.h
@@ -69,6 +69,15 @@ ovs_prefetch_range(const void *start, size_t size)
 #define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
 #endif
 
+/* Comparisons for ints with modular arithmetic: the operands are
+ * subtracted before the comparison, so the result is meaningful even if
+ * one of them has recently wrapped around (e.g. for sequence numbers),
+ * as long as the two values are less than half the integer range apart. */
+#define INT_MOD_LT(a,b)     ((int) ((a)-(b)) < 0)
+#define INT_MOD_LEQ(a,b)    ((int) ((a)-(b)) <= 0)
+#define INT_MOD_GT(a,b)     ((int) ((a)-(b)) > 0)
+#define INT_MOD_GEQ(a,b)    ((int) ((a)-(b)) >= 0)
+
+/* Wraparound-aware minimum and maximum, built on the comparisons above. */
+#define INT_MOD_MIN(a, b)   ((INT_MOD_LT(a, b)) ? (a) : (b))
+#define INT_MOD_MAX(a, b)   ((INT_MOD_GT(a, b)) ? (a) : (b))
+
 #define OVS_NOT_REACHED() abort()
 
 /* Use "%"PRIuSIZE to format size_t with printf(). */
-- 
2.8.1




More information about the dev mailing list