[ovs-dev] [RFC flow tunnels 5/8] tunnel: Userspace implementation of tunnel manipulation.

Ethan Jackson ethan at nicira.com
Wed Jan 9 23:43:45 UTC 2013


From: Jesse Gross <jesse at nicira.com>

The kernel tunneling code currently needs to handle a large number
of operations when tunnel packets are encapsulated and
decapsulated.  Some examples of this are: finding the correct
tunnel port on receive, TTL and ToS inheritance, ECN handling, etc.
All of these can be done on a per-flow basis in userspace now that
we have both the inner and outer header information, which allows
us to both simplify the kernel and take advantage of userspace's
information.  Once tunnel packets are redirected into this code,
the redundant pieces can be removed from other places.

Signed-off-by: Jesse Gross <jesse at nicira.com>
Signed-off-by: Ethan Jackson <ethan at nicira.com>
---
 ofproto/automake.mk |    4 +-
 ofproto/tunnel.c    |  521 +++++++++++++++++++++++++++++++++++++++++++++++++++
 ofproto/tunnel.h    |   46 +++++
 3 files changed, 570 insertions(+), 1 deletion(-)
 create mode 100644 ofproto/tunnel.c
 create mode 100644 ofproto/tunnel.h

diff --git a/ofproto/automake.mk b/ofproto/automake.mk
index 9088292..69f014f 100644
--- a/ofproto/automake.mk
+++ b/ofproto/automake.mk
@@ -29,6 +29,8 @@ ofproto_libofproto_a_SOURCES = \
 	ofproto/pktbuf.c \
 	ofproto/pktbuf.h \
 	ofproto/pinsched.c \
-	ofproto/pinsched.h
+	ofproto/pinsched.h \
+	ofproto/tunnel.c \
+	ofproto/tunnel.h
 
 MAN_FRAGMENTS += ofproto/ofproto-unixctl.man ofproto/ofproto-dpif-unixctl.man
diff --git a/ofproto/tunnel.c b/ofproto/tunnel.c
new file mode 100644
index 0000000..2e52e22
--- /dev/null
+++ b/ofproto/tunnel.c
@@ -0,0 +1,521 @@
+/* Copyright (c) 2013 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#include <config.h>
+#include "tunnel.h"
+
+#include <errno.h>
+
+#include "ofproto/ofproto-provider.h"
+#include "byte-order.h"
+#include "daemon.h"
+#include "dirs.h"
+#include "dynamic-string.h"
+#include "hash.h"
+#include "hmap.h"
+#include "netdev-vport.h"
+#include "odp-util.h"
+#include "packets.h"
+#include "smap.h"
+#include "socket-util.h"
+#include "tunnel.h"
+#include "unixctl.h"
+#include "vlog.h"
+
+/* TODO:
+ *
+ * Ability to generate actions on input for ECN
+ * Port stats
+ * Ability to generate metadata for packet-outs
+ * IPsec using skb mark.
+ * VXLAN.
+ * Multicast group management (possibly).
+ * Disallow netdevs with names like "gre64" to prevent collisions. */
+
+VLOG_DEFINE_THIS_MODULE(tunnel);
+
+struct tnl_match {          /* Port lookup key, hashed/compared as bytes. */
+    ovs_be64 in_key;        /* Key (cfg->in_key, or tun_id on rx). */
+    ovs_be32 ip_src;        /* Local IP (0 when local_ip is ignored). */
+    ovs_be32 ip_dst;        /* Remote IP. */
+    uint32_t odp_port;      /* Datapath port number. */
+    bool in_key_present;    /* Key present (cfg, or FLOW_TNL_F_KEY on rx). */
+    bool in_key_flow;       /* Printed as "key=flow" by tnl_match_fmt(). */
+};
+BUILD_ASSERT_DECL(sizeof(struct tnl_match) == 24);
+
+struct tnl_port {
+    struct hmap_node match_node;    /* In 'tnl_match_map', by tnl_hash(). */
+    struct hmap_node ofport_node;   /* In 'ofport_map', by hash_pointer(). */
+
+    const struct ofport *ofport;    /* Port this entry was added for. */
+    unsigned int netdev_seq;        /* netdev_change_seq() seen at add time. */
+    struct tnl_match match;         /* Key built from the tunnel config. */
+};
+
+/* Port indexes (by match hash and by ofport pointer) and log rate limits. */
+static struct hmap tnl_match_map = HMAP_INITIALIZER(&tnl_match_map);
+static struct hmap ofport_map = HMAP_INITIALIZER(&ofport_map);
+static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
+static struct vlog_rate_limit dbg_rl = VLOG_RATE_LIMIT_INIT(60, 60);
+
+static struct tnl_port *tnl_find(struct tnl_match *);
+static struct tnl_port *tnl_find_exact(struct tnl_match *);
+static uint32_t tnl_hash(struct tnl_match *);
+static void tnl_match_fmt(const struct tnl_match *, struct ds *);
+static void tnl_port_fmt(const struct tnl_port *, struct ds *);
+static void tnl_port_mod_log(const struct tnl_port *, const char *action);
+static struct tnl_port *find_ofport(const struct ofport *);
+static const char *tnl_port_get_name(const struct tnl_port *);
+static void tnl_port_del__(struct tnl_port *);
+
+/* Performs periodic maintenance work: removes and re-adds every tunnel port
+ * whose netdev change sequence number no longer matches the one recorded when
+ * the port was added.  Returns true if at least one port was stale. */
+bool
+tnl_run(void)
+{
+    struct tnl_port *port, *next_port;
+    bool changed = false;
+
+    HMAP_FOR_EACH_SAFE (port, next_port, ofport_node, &ofport_map) {
+        const struct ofport *ofport = port->ofport;
+        uint32_t odp_port = port->match.odp_port;
+
+        if (port->netdev_seq == netdev_change_seq(ofport->netdev)) {
+            continue;
+        }
+        VLOG_DBG("reconfiguring %s", tnl_port_get_name(port));
+        tnl_port_del__(port);
+        tnl_port_add(ofport, odp_port);
+        changed = true;
+    }
+    return changed;
+}
+
+/* Adds 'ofport' to the module with datapath port number 'odp_port'. 'ofport's
+ * must be added before they can be used by the module. If 'ofport' is not a
+ * tunnel, does nothing and returns 0. Returns 0 on success, or EEXIST if a
+ * port with an identical match is already registered. */
+int
+tnl_port_add(const struct ofport *ofport, uint32_t odp_port)
+{
+    const struct netdev_tunnel_config *cfg;
+    struct tnl_port *existing_port;
+    struct tnl_port *tnl_port;
+
+    cfg = netdev_get_tunnel_config(ofport->netdev);
+    if (!cfg) {
+        return 0;
+    }
+
+    /* Zero-filled allocation: tnl_hash()/memcmp() also read 'match' padding. */
+    tnl_port = xzalloc(sizeof *tnl_port);
+    tnl_port->ofport = ofport;
+    tnl_port->netdev_seq = netdev_change_seq(tnl_port->ofport->netdev);
+
+    tnl_port->match.in_key = cfg->in_key;
+    tnl_port->match.ip_src = cfg->ip_src;
+    tnl_port->match.ip_dst = cfg->ip_dst;
+    tnl_port->match.in_key_present = cfg->in_key_present;
+    tnl_port->match.in_key_flow = cfg->in_key_flow;
+    tnl_port->match.odp_port = odp_port;
+
+    existing_port = tnl_find_exact(&tnl_port->match);
+    if (existing_port) {
+        struct ds ds = DS_EMPTY_INITIALIZER;
+
+        tnl_match_fmt(&tnl_port->match, &ds);
+        VLOG_WARN("%s: attempting to add tunnel port with same config as "
+                  "port '%s' (%s)", tnl_port_get_name(tnl_port),
+                  tnl_port_get_name(existing_port), ds_cstr(&ds));
+        ds_destroy(&ds);
+        free(tnl_port);
+        return EEXIST;
+    }
+
+    hmap_insert(&tnl_match_map, &tnl_port->match_node,
+                tnl_hash(&tnl_port->match));
+    hmap_insert(&ofport_map, &tnl_port->ofport_node,
+                hash_pointer(tnl_port->ofport, 0));
+    tnl_port_mod_log(tnl_port, "adding");
+    return 0;
+}
+
+static void
+tnl_port_del__(struct tnl_port *tnl_port)
+{
+    tnl_port_mod_log(tnl_port, "removing");     /* Log before freeing. */
+    hmap_remove(&tnl_match_map, &tnl_port->match_node);
+    hmap_remove(&ofport_map, &tnl_port->ofport_node);
+    free(tnl_port);             /* Both map nodes are embedded in it. */
+}
+
+/* Removes 'ofport' from the module; does nothing if it was never added. */
+void
+tnl_port_del(const struct ofport *ofport)
+{
+    struct tnl_port *tnl_port = find_ofport(ofport);
+    if (tnl_port) {
+        tnl_port_del__(tnl_port);
+    }
+}
+
+static bool
+is_ip(const struct flow *flow)
+{
+    return flow->dl_type == htons(ETH_TYPE_IP) ||       /* IPv4, or... */
+           flow->dl_type == htons(ETH_TYPE_IPV6);       /* ...IPv6. */
+}
+
+/* Returns true if 'flow' should be submitted to tnl_port_receive(). */
+bool
+tnl_port_should_receive(const struct flow *flow)
+{
+    return flow->tunnel.ip_dst != 0;    /* I.e. a tunnel dst IP is set. */
+}
+
+/* Transforms 'flow' so that it appears to have been received by a tunnel
+ * OpenFlow port controlled by this module instead of the datapath port it
+ * actually came in on.  Sets 'flow''s in_port to the appropriate OpenFlow port
+ * number.  Returns the 'ofport' corresponding to the new in_port.
+ *
+ * Callers should first check 'flow' with tnl_port_should_receive().
+ *
+ * Leaves 'flow' untouched and returns null if no tunnel port matches, or if
+ * the outer header is marked ECN CE but the inner one is not ECN capable. */
+const struct ofport *
+tnl_port_receive(struct flow *flow)
+{
+    char *pre_flow_str = NULL;
+    struct tnl_port *tnl_port;
+    struct tnl_match match;
+
+    memset(&match, 0, sizeof match);
+    match.odp_port = flow->in_port;
+    match.ip_src = flow->tunnel.ip_dst;     /* Swapped: 'match' is in the */
+    match.ip_dst = flow->tunnel.ip_src;     /* port's transmit direction. */
+    match.in_key = flow->tunnel.tun_id;
+    match.in_key_present = flow->tunnel.flags & FLOW_TNL_F_KEY;
+
+    tnl_port = tnl_find(&match);
+    if (!tnl_port) {
+        struct ds ds = DS_EMPTY_INITIALIZER;
+
+        tnl_match_fmt(&match, &ds);
+        VLOG_WARN_RL(&rl, "receive tunnel port not found (%s)", ds_cstr(&ds));
+        ds_destroy(&ds);
+
+        return NULL;
+    }
+
+    if (is_ip(flow)
+        && ((flow->tunnel.ip_tos & IP_ECN_MASK) == IP_ECN_CE)
+        && (flow->nw_tos & IP_ECN_MASK) == IP_ECN_NOT_ECT) {
+        VLOG_WARN_RL(&rl, "dropping tunnel packet marked ECN CE but is not ECN"
+                     " capable");
+        return NULL;
+    }
+
+    if (!VLOG_DROP_DBG(&dbg_rl)) {
+        pre_flow_str = flow_to_string(flow);
+    }
+
+    flow->in_port = tnl_port->ofport->ofp_port;
+
+    memset(&flow->tunnel, 0, sizeof flow->tunnel);
+    flow->tunnel.tun_id = match.in_key;     /* Only the key is kept. */
+
+    if (pre_flow_str) {
+        char *post_flow_str = flow_to_string(flow);
+        struct ds ds = DS_EMPTY_INITIALIZER;
+
+        tnl_port_fmt(tnl_port, &ds);
+        VLOG_DBG("flow received\n"
+                 "%s"
+                 " pre: %s\n"
+                 "post: %s", ds_cstr(&ds),
+                 pre_flow_str, post_flow_str);
+        ds_destroy(&ds);
+        free(pre_flow_str);
+        free(post_flow_str);
+    }
+    return tnl_port->ofport;
+}
+
+/* Given that 'flow' should be output to 'ofport', updates 'flow''s tunnel
+ * headers and sets '*odp_port' to the datapath port to output on.  Returns
+ * true on success.  Returns false, leaving 'flow' and '*odp_port' untouched,
+ * if 'ofport' is not a registered tunnel or has lost its tunnel config. */
+bool
+tnl_port_send(const struct ofport *ofport, struct flow *flow,
+              uint32_t *odp_port)
+{
+    const struct netdev_tunnel_config *cfg;
+    struct tnl_port *tnl_port;
+    char *pre_flow_str = NULL;
+
+    tnl_port = find_ofport(ofport);
+    if (!tnl_port) {
+        return false;
+    }
+
+    cfg = netdev_get_tunnel_config(tnl_port->ofport->netdev);
+    if (!cfg) {
+        VLOG_ERR("tnl_port missing its configuration");
+        return false;
+    }
+
+    if (!VLOG_DROP_DBG(&dbg_rl)) {
+        pre_flow_str = flow_to_string(flow);
+    }
+
+    flow->tunnel.ip_src = tnl_port->match.ip_src;
+    flow->tunnel.ip_dst = tnl_port->match.ip_dst;
+
+    /* With out_key_flow the tun_id already in 'flow' is sent unchanged. */
+    if (!cfg->out_key_flow) {
+        flow->tunnel.tun_id = cfg->out_key;
+    }
+
+    if (cfg->ttl_inherit && is_ip(flow)) {
+        flow->tunnel.ip_ttl = flow->nw_ttl;
+    } else {
+        flow->tunnel.ip_ttl = cfg->ttl;
+    }
+
+    if (cfg->tos_inherit && is_ip(flow)) {
+        flow->tunnel.ip_tos = flow->nw_tos & IP_DSCP_MASK;
+    } else {
+        flow->tunnel.ip_tos = cfg->tos;
+    }
+    /* Outer ECN mirrors the inner ECN, except that CE is sent as ECT(0). */
+    if ((flow->nw_tos & IP_ECN_MASK) == IP_ECN_CE) {
+        flow->tunnel.ip_tos |= IP_ECN_ECT_0;
+    } else {
+        flow->tunnel.ip_tos |= flow->nw_tos & IP_ECN_MASK;
+    }
+
+    flow->tunnel.flags = (cfg->dont_fragment ? FLOW_TNL_F_DONT_FRAGMENT : 0)
+        | (cfg->csum ? FLOW_TNL_F_CSUM : 0)
+        | (cfg->out_key_present ? FLOW_TNL_F_KEY : 0);
+
+    if (pre_flow_str) {
+        struct ds ds = DS_EMPTY_INITIALIZER;
+        char *post_flow_str;
+
+        post_flow_str = flow_to_string(flow);
+        tnl_port_fmt(tnl_port, &ds);
+
+        VLOG_DBG("flow sent\n"
+                 "%s"
+                 " pre: %s\n"
+                 "post: %s", ds_cstr(&ds),
+                 pre_flow_str, post_flow_str);
+
+        ds_destroy(&ds);
+        free(pre_flow_str);
+        free(post_flow_str);
+    }
+
+    *odp_port = tnl_port->match.odp_port;
+    return true;
+}
+
+/* Returns the tnl_port registered for 'ofport', or NULL if there is none. */
+static struct tnl_port *
+find_ofport(const struct ofport *ofport)
+{
+    uint32_t hash = hash_pointer(ofport, 0);
+    struct tnl_port *port;
+
+    HMAP_FOR_EACH_IN_BUCKET (port, ofport_node, hash, &ofport_map) {
+        if (port->ofport == ofport) {
+            return port;
+        }
+    }
+    return NULL;
+}
+
+static uint32_t
+tnl_hash(struct tnl_match *match)
+{
+    return hash_bytes(match, sizeof *match, 0); /* Includes any padding. */
+}
+
+/* Returns the port whose match is byte-for-byte equal to 'match', or NULL. */
+static struct tnl_port *
+tnl_find_exact(struct tnl_match *match)
+{
+    uint32_t hash = tnl_hash(match);
+    struct tnl_port *port;
+
+    HMAP_FOR_EACH_WITH_HASH (port, match_node, hash, &tnl_match_map) {
+        if (!memcmp(match, &port->match, sizeof *match)) {
+            return port;
+        }
+    }
+    return NULL;
+}
+
+/* Returns the most specific port matching '*match_', or NULL.  Candidates are
+ * tried in the order given by the comments below.  '*match_' is duplicated
+ * with memcpy() because a struct assignment need not copy padding bytes, which
+ * tnl_hash() and tnl_find_exact() read. */
+static struct tnl_port *
+tnl_find(struct tnl_match *match_)
+{
+    bool is_multicast = ip_is_multicast(match_->ip_src);
+    struct tnl_port *tnl_port;
+    struct tnl_match match;
+
+    memcpy(&match, match_, sizeof match);
+
+    /* remote_ip, local_ip, in_key */
+    if (!is_multicast) {
+        tnl_port = tnl_find_exact(&match);
+        if (tnl_port) {
+            return tnl_port;
+        }
+    }
+
+    /* remote_ip, in_key */
+    match.ip_src = 0;
+    tnl_port = tnl_find_exact(&match);
+    if (tnl_port) {
+        return tnl_port;
+    }
+    match.ip_src = match_->ip_src;
+
+    /* remote_ip, local_ip */
+    if (!is_multicast) {
+        match.in_key = 0;
+        match.in_key_flow = true;
+        tnl_port = tnl_find_exact(&match);
+        if (tnl_port) {
+            return tnl_port;
+        }
+        match.in_key = match_->in_key;
+        match.in_key_flow = false;
+    }
+
+    /* remote_ip */
+    match.ip_src = 0;
+    match.in_key = 0;
+    match.in_key_flow = true;
+    tnl_port = tnl_find_exact(&match);
+    if (tnl_port) {
+        return tnl_port;
+    }
+    match.in_key = match_->in_key;
+    match.in_key_flow = false;
+
+    if (is_multicast) {
+        match.ip_dst = match_->ip_src;      /* ip_src is still 0 here. */
+
+        /* multicast remote_ip, in_key */
+        tnl_port = tnl_find_exact(&match);
+        if (tnl_port) {
+            return tnl_port;
+        }
+
+        /* multicast remote_ip */
+        match.in_key = 0;
+        match.in_key_flow = true;
+        return tnl_find_exact(&match);
+    }
+    return NULL;
+}
+
+/* Appends 'match' to 'ds' as "SRC->DST[, key=KEY], dp port=PORT".  KEY is
+ * omitted when no key is present and is "flow" for in_key_flow matches. */
+static void
+tnl_match_fmt(const struct tnl_match *match, struct ds *ds)
+{
+    ds_put_format(ds, IP_FMT"->"IP_FMT, IP_ARGS(match->ip_src),
+                  IP_ARGS(match->ip_dst));
+
+    if (match->in_key_present && match->in_key_flow) {
+        ds_put_cstr(ds, ", key=flow");
+    } else if (match->in_key_present) {
+        ds_put_format(ds, ", key=%#"PRIx64, ntohll(match->in_key));
+    }
+
+    ds_put_format(ds, ", dp port=%"PRIu32, match->odp_port);
+}
+
+/* Logs 'action' for 'tnl_port', with its match, when debug logging is on. */
+static void
+tnl_port_mod_log(const struct tnl_port *tnl_port, const char *action)
+{
+    if (VLOG_IS_DBG_ENABLED()) {
+        struct ds ds = DS_EMPTY_INITIALIZER;
+        tnl_match_fmt(&tnl_port->match, &ds);
+        VLOG_DBG("%s tunnel port %s (%s)", action, tnl_port_get_name(tnl_port),
+                 ds_cstr(&ds));
+        ds_destroy(&ds);
+    }
+}
+
+/* Appends to 'ds' a description of 'tnl_port' and its tunnel configuration,
+ * ending in a newline.  Assumes the netdev still has a tunnel config. */
+static void
+tnl_port_fmt(const struct tnl_port *tnl_port, struct ds *ds)
+{
+    const struct netdev_tunnel_config *cfg =
+        netdev_get_tunnel_config(tnl_port->ofport->netdev);
+    ds_put_format(ds, "port %"PRIu32": %s (%s: ", tnl_port->match.odp_port,
+                  tnl_port_get_name(tnl_port),
+                  netdev_get_type(tnl_port->ofport->netdev));
+    tnl_match_fmt(&tnl_port->match, ds);
+
+    if (cfg->out_key != cfg->in_key ||
+        cfg->out_key_present != cfg->in_key_present ||
+        cfg->out_key_flow != cfg->in_key_flow) {
+        ds_put_cstr(ds, ", out_key=");
+        if (!cfg->out_key_present) {
+            ds_put_cstr(ds, "none");
+        } else if (cfg->out_key_flow) {
+            ds_put_cstr(ds, "flow");
+        } else {
+            ds_put_format(ds, "%#"PRIx64, ntohll(cfg->out_key));
+        }
+    }
+
+    if (cfg->ttl_inherit) {
+        ds_put_cstr(ds, ", ttl=inherit");
+    } else {
+        ds_put_format(ds, ", ttl=%"PRIu8, cfg->ttl);
+    }
+
+    if (cfg->tos_inherit) {
+        ds_put_cstr(ds, ", tos=inherit");
+    } else if (cfg->tos) {
+        ds_put_format(ds, ", tos=%#"PRIx8, cfg->tos);
+    }
+    if (!cfg->dont_fragment) {
+        ds_put_cstr(ds, ", df=false");
+    }
+    if (cfg->csum) {
+        ds_put_cstr(ds, ", csum=true");
+    }
+
+    ds_put_cstr(ds, ")\n");
+}
+
+static const char *             /* Name of the netdev behind 'tnl_port'. */
+tnl_port_get_name(const struct tnl_port *tnl_port)
+{
+    return netdev_get_name(tnl_port->ofport->netdev);
+}
diff --git a/ofproto/tunnel.h b/ofproto/tunnel.h
new file mode 100644
index 0000000..02f1882
--- /dev/null
+++ b/ofproto/tunnel.h
@@ -0,0 +1,46 @@
+/* Copyright (c) 2013 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TUNNEL_H
+#define TUNNEL_H 1
+
+#include <config.h>
+#include <stdbool.h>
+
+#include "flow.h"
+#include "netdev.h"
+
+/* Tunnel port emulation layer.
+ *
+ * These functions emulate tunnel virtual ports based on the outer header
+ * information from the kernel.  It is legal to pass any port or packet to
+ * this code; the functions are no-ops for those not tied to a tunnel.
+ *
+ * NOTE(review): tnl_stats_get() has no definition in tunnel.c yet. */
+
+struct ofport;
+
+bool tnl_run(void);
+
+int tnl_port_add(const struct ofport *, uint32_t odp_port);
+void tnl_port_del(const struct ofport *);
+
+int tnl_stats_get(const struct ofport *, struct netdev_stats *);
+
+bool tnl_port_should_receive(const struct flow *);
+const struct ofport *tnl_port_receive(struct flow *);
+bool tnl_port_send(const struct ofport *, struct flow *, uint32_t *odp_port);
+
+#endif /* tunnel.h */
-- 
1.7.9.5




More information about the dev mailing list