[ovs-dev] [RFC PATCH 05/11] dpif: add 'dpif-bpf' provider.

William Tu u9012063 at gmail.com
Sat Jun 23 12:16:37 UTC 2018


From: Joe Stringer <joe at ovn.org>

Implement a new datapath interface for use with BPF datapaths.

Like dpif-netlink, dpif-bpf is backed by an implementation which resides
within the kernel. It uses the BPF functionality available in recent
versions of Linux to create the datapath. Unlike dpif-netlink there is no
datapath notion of a bridge with ports attached; dpif-bpf is implemented
by attaching BPF programs directly to individual devices using TC.
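
For reference, this attachment is roughly the moral equivalent of the
following tc commands (the object and section names here are illustrative;
the actual attachment happens programmatically via netdev_set_filter() in
the code below):

    tc qdisc add dev eth0 clsact
    tc filter add dev eth0 ingress bpf direct-action obj datapath.o sec ingress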

Upcalls are implemented using a perf event ringbuffer, which is polled
by handler threads. Flow execution is implemented by sending the packet
plus metadata on a dedicated tap device, where there is a BPF program
that understands the format of the packet coming from userspace. When
this device receives a message, it strips the metadata, uses it to
determine how to execute the packet, then forwards the packet onwards.
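
As a rough sketch of that framing (names as used in the patch below; the
authoritative definitions live in bpf/odp-bpf.h):

    /* Userspace appends this metadata after the packet data and sends the
     * result on the tap device; the BPF program attached there strips the
     * metadata and uses it to execute or output the packet. */
    struct bpf_downcall md = {
        .type = OVS_BPF_DOWNCALL_EXECUTE,   /* or OVS_BPF_DOWNCALL_OUTPUT */
        .ifindex = ifindex,                 /* target device */
    };
    dp_packet_put(packet, &md, sizeof md);
    netdev_send(datapath.outport, 0, &batch, false);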

This initial implementation has a number of limitations which are
expected to go away over time:
	* The set of matches and actions supported by the datapath is not
	  as wide as the full set known by OVS, so if a flow cannot be
	  expressed in the current eBPF API, OVS logs and returns an error
	  during flow put.
	* Only the input port and packet length are passed as metadata from
	  the datapath to userspace during upcall. Key extraction is done
	  purely from the packet provided by the datapath.
	* Conversely, only the output port is sent down during execution.
	  No other actions are currently supported, and only a single
	  output is supported.
	* Ingress policing cannot be configured on BPF
	  datapath devices.
	* On startup, if the OVS BPF datapath is already loaded into the
	  kernel and pinned to the filesystem, it will reuse this datapath,
	  even if the datapath is out-of-date.

Documentation/intro/install/bpf.rst contains further information on how
to build and use the bpf datapath.
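
Assuming that setup, selecting the datapath should look like the usual
datapath_type switch (the class registers itself under the name "bpf"):

    ovs-vsctl add-br br0 -- set bridge br0 datapath_type=bpf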

For more details on the design and implementation, see our OSR paper [1]
and our OVS Conference presentation [2]:
[1] https://dl.acm.org/citation.cfm?id=3139657
[2] http://openvswitch.org/support/ovscon2016/7/1120-tu.pdf

Signed-off-by: Joe Stringer <joe at ovn.org>
Signed-off-by: William Tu <u9012063 at gmail.com>
Signed-off-by: Yifeng Sun <pkusunyifeng at gmail.com>
Co-authored-by: William Tu <u9012063 at gmail.com>
Co-authored-by: Yifeng Sun <pkusunyifeng at gmail.com>
---
 lib/dpif-bpf.c      | 1995 +++++++++++++++++++++++++++++++++++++++++++++++++++
 lib/dpif-provider.h |    1 +
 lib/dpif.c          |    3 +
 3 files changed, 1999 insertions(+)
 create mode 100644 lib/dpif-bpf.c

diff --git a/lib/dpif-bpf.c b/lib/dpif-bpf.c
new file mode 100644
index 000000000000..d0931af78278
--- /dev/null
+++ b/lib/dpif-bpf.c
@@ -0,0 +1,1995 @@
+/*
+ * Copyright (c) 2016, 2017, 2018 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+
+#include <errno.h>
+#include <openvswitch/hmap.h>
+#include <openvswitch/types.h>
+#include <openvswitch/vlog.h>
+#include <unistd.h>
+#include <bpf/bpf.h>
+
+#include "bpf.h"
+#include "bpf/odp-bpf.h"
+#include "dirs.h"
+#include "dpif.h"
+#include "dpif-provider.h"
+#include "dpif-bpf-odp.h"
+#include "dpif-netlink-rtnl.h"
+#include "fat-rwlock.h"
+#include "netdev.h"
+#include "netdev-provider.h"
+#include "netdev-vport.h"
+#include "odp-util.h"
+#include "ovs-numa.h"
+#include "perf-event.h"
+#include "sset.h"
+#include "openvswitch/poll-loop.h"
+
+VLOG_DEFINE_THIS_MODULE(dpif_bpf);
+static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(60, 60);
+
+/* Protects against changes to 'bpf_datapaths'. */
+static struct ovs_mutex bpf_datapath_mutex = OVS_MUTEX_INITIALIZER;
+
+/* Contains all 'struct dpif_bpf_dp's. */
+static struct shash bpf_datapaths OVS_GUARDED_BY(bpf_datapath_mutex)
+    = SHASH_INITIALIZER(&bpf_datapaths);
+
+struct bpf_handler {
+    /* Slice of the owning dpif_bpf_dp's 'channels' array: this handler
+     * polls channels [offset, offset + count). */
+    int offset;
+    int count;
+    int index;         /* Next channel within the slice to read from. */
+};
+
+struct dpif_bpf_dp {
+    struct dpif *dpif;
+    const char *const name;
+    struct ovs_refcount ref_cnt;
+    atomic_flag destroyed;
+
+    /* Ports.
+     *
+     * Any lookup into 'ports' requires taking 'port_mutex'. */
+    struct ovs_mutex port_mutex;
+    struct hmap ports_by_odp OVS_GUARDED;
+    struct hmap ports_by_ifindex OVS_GUARDED;
+    struct seq *port_seq;       /* Incremented whenever a port changes. */
+    uint64_t last_seq;
+
+    /* Handlers */
+    struct fat_rwlock upcall_lock;
+    uint32_t n_handlers;
+    struct bpf_handler *handlers;
+
+    /* Upcall channels. */
+    size_t page_size;
+    int n_pages;
+    int n_channels;
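+    /* One ring buffer channel per CPU; flexible array member sized to
+     * 'n_channels' when the datapath is allocated. */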
+    struct perf_channel channels[];
+};
+
+struct dpif_bpf {
+    struct dpif dpif;
+    struct dpif_bpf_dp *dp;
+};
+
+struct dpif_bpf_port {
+    struct hmap_node odp_node;  /* Node in dpif_bpf_dp 'ports_by_odp'. */
+    struct hmap_node if_node;   /* Node in dpif_bpf_dp 'ports_by_ifindex'. */
+    struct netdev *netdev;
+    odp_port_t port_no;
+    int ifindex;
+    char *type;                 /* Port type as requested by user. */
+    struct netdev_saved_flags *sf;
+
+    unsigned n_rxq;
+    struct netdev_rxq **rxqs;
+};
+
+static void
+vlog_hex_dump(const uint8_t *buf, size_t count)
+{
+    struct ds ds = DS_EMPTY_INITIALIZER;
+    ds_put_hex_dump(&ds, buf, count, 0, false);
+    VLOG_DBG("\n%s", ds_cstr(&ds));
+    ds_destroy(&ds);
+}
+
+int create_dp_bpf(const char *name, struct dpif_bpf_dp **dp);
+static void dpif_bpf_close(struct dpif *dpif);
+static int do_add_port(struct dpif_bpf_dp *dp, const char *devname,
+                       const char *type, odp_port_t port_no)
+    OVS_REQUIRES(dp->port_mutex);
+static void do_del_port(struct dpif_bpf_dp *dp, struct dpif_bpf_port *port)
+    OVS_REQUIRES(dp->port_mutex);
+static int dpif_bpf_delete_all_flow(void);
+
+static struct dpif_bpf *
+dpif_bpf_cast(const struct dpif *dpif)
+{
+    ovs_assert(dpif->dpif_class == &dpif_bpf_class);
+    return CONTAINER_OF(dpif, struct dpif_bpf, dpif);
+}
+
+static struct dpif_bpf_dp *
+get_dpif_bpf_dp(const struct dpif *dpif)
+{
+    return dpif_bpf_cast(dpif)->dp;
+}
+
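+/* Singleton BPF state: the datapath programs and maps are pinned
+ * system-wide, so one copy is shared by every dpif-bpf datapath. */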
+static struct dp_bpf {
+    struct bpf_state bpf;
+    struct netdev *outport; /* Used for downcall. */
+} datapath;
+
+static int
+configure_outport(struct netdev *outport)
+{
+    int error;
+
+    error = netdev_set_filter(outport, &datapath.bpf.downcall);
+    if (error) {
+        return error;
+    }
+
+    error = netdev_set_flags(outport, NETDEV_UP, NULL);
+    if (error) {
+        return error;
+    }
+
+    return 0;
+}
+
+static int
+dpif_bpf_init(void)
+{
+    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
+    static int error = 0;
+
+    if (ovsthread_once_start(&once)) {
+        struct netdev *outport;
+
+        error = bpf_get(&datapath.bpf, true);
+        if (!error) {
+            /* FIXME: should this device be named ovs-system? */
+            error = netdev_open("ovs-system", "tap", &outport);
+            if (!error) {
+                VLOG_INFO("%s: created BPF tap downcall device %s",
+                          __func__, netdev_get_name(outport));
+
+                error = configure_outport(outport);
+                if (error) {
+                    VLOG_ERR("%s: configure downcall device failed", __func__);
+                    netdev_close(outport);
+                } else {
+                    datapath.outport = outport;
+                }
+            }
+        }
+
+        if (!error) {
+            dpif_bpf_delete_all_flow();
+        }
+        ovsthread_once_done(&once);
+    }
+    return error;
+}
+
+static int
+dpif_bpf_enumerate(struct sset *all_dps,
+                   const struct dpif_class *dpif_class OVS_UNUSED)
+{
+    struct shash_node *node;
+
+    ovs_mutex_lock(&bpf_datapath_mutex);
+    SHASH_FOR_EACH(node, &bpf_datapaths) {
+        sset_add(all_dps, node->name);
+    }
+    ovs_mutex_unlock(&bpf_datapath_mutex);
+
+    return 0;
+}
+
+static const char *
+dpif_bpf_port_open_type(const struct dpif_class *dpif_class OVS_UNUSED,
+                         const char *type)
+{
+    return strcmp(type, "internal") ? type : "tap";
+}
+
+static struct dpif *
+create_dpif_bpf(struct dpif_bpf_dp *dp)
+    OVS_REQUIRES(bpf_datapath_mutex)
+{
+    uint16_t netflow_id = hash_string(dp->name, 0);
+    struct dpif_bpf *dpif;
+
+    ovs_refcount_ref(&dp->ref_cnt);
+
+    dpif = xmalloc(sizeof *dpif);
+    dpif_init(&dpif->dpif, &dpif_bpf_class, dp->name, netflow_id >> 8, netflow_id);
+    dpif->dp = dp;
+
+    return &dpif->dpif;
+}
+
+static int
+dpif_bpf_open(const struct dpif_class *dpif_class OVS_UNUSED,
+              const char *name, bool create OVS_UNUSED, struct dpif **dpifp)
+{
+    struct dpif_bpf_dp *dp;
+    int error;
+
+    error = dpif_bpf_init();
+    if (error) {
+        VLOG_ERR("dpif_bpf_init failed");
+        return error;
+    }
+
+    ovs_mutex_lock(&bpf_datapath_mutex);
+    dp = shash_find_data(&bpf_datapaths, name);
+    if (!dp) {
+        error = create ? create_dp_bpf(name, &dp) : ENODEV;
+    } else {
+        ovs_assert(dpif_class == &dpif_bpf_class);
+        error = create ? EEXIST : 0;
+    }
+    if (!error) {
+        *dpifp = create_dpif_bpf(dp);
+        if (create) { /* XXX */
+            dp->dpif = *dpifp;
+        }
+    }
+    ovs_mutex_unlock(&bpf_datapath_mutex);
+
+    return error;
+}
+
+static int
+perf_event_channels_init(struct dpif_bpf_dp *dp)
+{
+    size_t length = dp->page_size * (dp->n_pages + 1);
+    int error = 0;
+    int i, cpu;
+
+    for (cpu = 0; cpu < dp->n_channels; cpu++) {
+        struct perf_channel *channel = &dp->channels[cpu];
+
+        error = perf_channel_open(channel, cpu, length);
+        if (error) {
+            goto error;
+        }
+    }
+
+error:
+    if (error) {
+        for (i = 0; i < cpu; i++) {
+            perf_channel_close(&dp->channels[i]);
+        }
+    }
+
+    return error;
+}
+
+static void
+dpif_bpf_free(struct dpif_bpf_dp *dp)
+    OVS_REQUIRES(bpf_datapath_mutex)
+{
+    shash_find_and_delete(&bpf_datapaths, dp->name);
+
+    if (ovs_refcount_read(&dp->ref_cnt) == 0) {
+        ovs_mutex_destroy(&dp->port_mutex);
+        seq_destroy(dp->port_seq);
+        fat_rwlock_destroy(&dp->upcall_lock);
+        hmap_destroy(&dp->ports_by_ifindex);
+        hmap_destroy(&dp->ports_by_odp);
+        if (dp->n_handlers) {
+            free(dp->handlers);
+        }
+        free(dp);
+    }
+}
+
+int
+create_dp_bpf(const char *name, struct dpif_bpf_dp **dp_)
+    OVS_REQUIRES(bpf_datapath_mutex)
+{
+    int max_cpu;
+    struct dpif_bpf_dp *dp;
+    int i, error;
+
+    max_cpu = ovs_numa_get_n_cores();
+
+    dp = xzalloc(sizeof *dp + max_cpu * sizeof(struct perf_channel));
+    ovs_refcount_init(&dp->ref_cnt);
+    atomic_flag_clear(&dp->destroyed);
+    hmap_init(&dp->ports_by_odp);
+    hmap_init(&dp->ports_by_ifindex);
+    fat_rwlock_init(&dp->upcall_lock);
+    dp->port_seq = seq_create();
+    ovs_mutex_init(&dp->port_mutex);
+    dp->n_pages = 8;
+    dp->page_size = sysconf(_SC_PAGESIZE);
+    dp->n_channels = max_cpu;
+    dp->last_seq = seq_read(dp->port_seq);
+
+    *CONST_CAST(const char **, &dp->name) = xstrdup(name);
+    shash_add(&bpf_datapaths, name, dp); /* XXX */
+
+    error = perf_event_channels_init(dp);
+    if (error) {
+        dpif_bpf_free(dp);
+        return error;
+    }
+
+    ovs_assert(datapath.bpf.upcalls.fd != -1);
+
+    for (i = 0; i < dp->n_channels; i++) {
+        error = bpf_map_update_elem(datapath.bpf.upcalls.fd, &i,
+                                    &dp->channels[i].fd, 0);
+        if (error) {
+            error = errno;
+            VLOG_WARN("failed to insert channel fd on cpu=%d: %s",
+                      i, ovs_strerror(error));
+            goto out;
+        }
+    }
+
+out:
+    if (error) {
+        dpif_bpf_free(dp);
+    } else {
+        *dp_ = dp;
+    }
+    return error;
+}
+
+static void
+dpif_bpf_close(struct dpif *dpif_)
+{
+    struct dpif_bpf_dp *dp = get_dpif_bpf_dp(dpif_);
+
+    ovs_mutex_lock(&bpf_datapath_mutex);
+    if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
+        struct dpif_bpf_port *port, *next;
+        int i;
+
+        fat_rwlock_wrlock(&dp->upcall_lock);
+        for (i = 0; i < dp->n_channels; i++) {
+            struct perf_channel *channel = &dp->channels[i];
+
+            perf_channel_close(channel);
+        }
+        fat_rwlock_unlock(&dp->upcall_lock);
+
+        ovs_mutex_lock(&dp->port_mutex);
+        HMAP_FOR_EACH_SAFE (port, next, odp_node, &dp->ports_by_odp) {
+            do_del_port(dp, port);
+        }
+        ovs_mutex_unlock(&dp->port_mutex);
+        dpif_bpf_free(dp);
+    }
+    ovs_mutex_unlock(&bpf_datapath_mutex);
+
+    free(dpif_bpf_cast(dpif_));
+}
+
+static int
+dpif_bpf_destroy(struct dpif *dpif_)
+{
+    struct dpif_bpf_dp *dp = get_dpif_bpf_dp(dpif_);
+
+    if (!atomic_flag_test_and_set(&dp->destroyed)) {
+        if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
+            /* Can't happen: 'dpif' still owns a reference to 'dp'.
+             * The workflow is first call dpif_class->destroy() then
+             * dpif->close(). */
+            OVS_NOT_REACHED();
+        }
+    }
+#if 0
+    if (datapath.outport) {
+        netdev_close(datapath.outport);
+    }
+#endif
+
+    return 0;
+}
+
+static int
+dpif_bpf_get_stats(const struct dpif *dpif OVS_UNUSED,
+                   struct dpif_dp_stats *stats)
+{
+    uint32_t key, n_flows = 0;
+    struct bpf_flow_key flow_key;
+    int err = 0;
+
+    memset(stats, 0, sizeof(*stats));
+    key = OVS_DP_STATS_HIT;
+    if (bpf_map_lookup_elem(datapath.bpf.datapath_stats.fd, &key,
+                            &stats->n_hit)) {
+        VLOG_INFO("datapath_stats lookup failed (%d): %s", key,
+                  ovs_strerror(errno));
+    }
+    key = OVS_DP_STATS_MISSED;
+    if (bpf_map_lookup_elem(datapath.bpf.datapath_stats.fd, &key,
+                            &stats->n_missed)) {
+        VLOG_INFO("datapath_stats lookup failed (%d): %s", key,
+                  ovs_strerror(errno));
+    }
+
+    /* Count the number of datapath flow entries */
+    memset(&flow_key, 0, sizeof flow_key);
+    do {
+        err = bpf_map_get_next_key(datapath.bpf.flow_table.fd,
+                                   &flow_key, &flow_key);
+        if (!err) {
+            n_flows++;
+        }
+    } while (!err);
+
+    stats->n_flows = n_flows;
+
+    /* XXX: Other missing stats */
+    return 0;
+}
+
+static struct dpif_bpf_port *
+bpf_lookup_port(const struct dpif_bpf_dp *dp, odp_port_t port_no)
+    OVS_REQUIRES(dp->port_mutex)
+{
+    struct dpif_bpf_port *port;
+
+    HMAP_FOR_EACH_WITH_HASH (port, odp_node, netdev_hash_port_no(port_no),
+                             &dp->ports_by_odp) {
+        if (port->port_no == port_no) {
+            return port;
+        }
+    }
+    return NULL;
+}
+
+static odp_port_t
+choose_port(struct dpif_bpf_dp *dp)
+    OVS_REQUIRES(dp->port_mutex)
+{
+    uint32_t port_no;
+
+    for (port_no = 1; port_no <= UINT16_MAX; port_no++) {
+        if (!bpf_lookup_port(dp, u32_to_odp(port_no))) {
+            return u32_to_odp(port_no);
+        }
+    }
+
+    return ODPP_NONE;
+}
+
+static int
+get_port_by_name(struct dpif_bpf_dp *dp, const char *devname,
+                 struct dpif_bpf_port **portp)
+    OVS_REQUIRES(dp->port_mutex)
+{
+    struct dpif_bpf_port *port;
+
+    HMAP_FOR_EACH (port, odp_node, &dp->ports_by_odp) {
+        if (!strcmp(netdev_get_name(port->netdev), devname)) {
+            *portp = port;
+            return 0;
+        }
+    }
+
+    *portp = NULL;
+    return ENOENT;
+}
+
+static uint32_t
+hash_ifindex(int ifindex)
+{
+    return hash_int(ifindex, 0);
+}
+
+static int
+get_port_by_ifindex(struct dpif_bpf_dp *dp, int ifindex,
+                    struct dpif_bpf_port **portp)
+    OVS_REQUIRES(dp->port_mutex)
+{
+    struct dpif_bpf_port *port;
+
+    HMAP_FOR_EACH_WITH_HASH (port, if_node, hash_ifindex(ifindex),
+                             &dp->ports_by_ifindex) {
+        if (port->ifindex == ifindex) {
+            *portp = port;
+            return 0;
+        }
+    }
+
+    *portp = NULL;
+    return ENOENT;
+}
+
+static odp_port_t
+ifindex_to_odp(struct dpif_bpf_dp *dp, int ifindex)
+    OVS_REQUIRES(dp->port_mutex)
+{
+    struct dpif_bpf_port *port;
+
+    if (get_port_by_ifindex(dp, ifindex, &port)) {
+        return ODPP_NONE;
+    }
+
+    return port->port_no;
+}
+
+static bool
+output_to_local_stack(struct netdev *netdev)
+{
+    return !strcmp(netdev_get_type(netdev), "tap");
+}
+
+static bool
+netdev_support_xdp(struct netdev *netdev OVS_UNUSED)
+{
+    return true;
+}
+
+static uint32_t
+get_port_flags(struct netdev *netdev)
+{
+    return output_to_local_stack(netdev) ? OVS_BPF_FLAGS_TX_STACK : 0;
+}
+
+static uint16_t
+odp_port_to_ifindex(struct dpif_bpf_dp *dp, odp_port_t port_no, uint32_t *flags)
+    OVS_REQUIRES(dp->port_mutex)
+{
+    struct dpif_bpf_port *port = bpf_lookup_port(dp, port_no);
+
+    if (port) {
+        if (flags) {
+            *flags = get_port_flags(port->netdev);
+        }
+        return port->ifindex;
+    }
+    return 0;
+}
+
+/* Modelled after dpif-netdev 'port_create', minus pmd and txq logic, plus bpf
+ * filter set. */
+static int
+port_create(const char *devname, const char *type,
+            odp_port_t port_no, struct dpif_bpf_port **portp)
+{
+    struct netdev_saved_flags *sf;
+    struct dpif_bpf_port *port;
+    enum netdev_flags flags;
+    struct netdev *netdev;
+    int n_open_rxqs = 0;
+    int i, error;
+    int ifindex;
+
+    *portp = NULL;
+
+    /* Open and validate network device. */
+    error = netdev_open(devname, type, &netdev);
+
+    VLOG_DBG("%s %s type %s error %d", __func__, devname, type, error);
+    if (error) {
+        return error;
+    }
+    /* XXX reject non-Ethernet devices */
+
+    netdev_get_flags(netdev, &flags);
+    if (flags & NETDEV_LOOPBACK) {
+        VLOG_ERR_RL(&rl, "%s: cannot add a loopback device", devname);
+        error = EINVAL;
+        goto out;
+    }
+
+    if (netdev_is_reconf_required(netdev)) {
+        error = netdev_reconfigure(netdev);
+        if (error) {
+            goto out;
+        }
+    }
+
+    ifindex = netdev_get_ifindex(netdev);
+    if (ifindex < 0) {
+        VLOG_WARN_RL(&rl, "%s: Failed to get ifindex", devname);
+        error = -ifindex;
+        goto out;
+    }
+
+    VLOG_DBG("%s ifindex = %d", devname, ifindex);
+
+    /* For internal ports, e.g. br0, br-underlay, br-int, we attach the BPF
+     * program only to the egress queue, due to the nature of tap devices.
+     * For other types, e.g. eth0, vxlan_sys, we attach the BPF program to
+     * the ingress queue.
+     *
+     * A tap device's egress queue is tied to a socket from which userspace
+     * receives packets by opening /dev/tun0.  Conversely, a send to that
+     * socket shows up on the tap device's ingress queue. */
+    if (output_to_local_stack(netdev)) {
+        error = netdev_set_filter(netdev, &datapath.bpf.egress);
+    } else {
+        error = netdev_set_filter(netdev, &datapath.bpf.ingress);
+    }
+    if (error) {
+        goto out;
+    }
+
+    if (netdev_support_xdp(netdev)) {
+        error = netdev_set_xdp(netdev, &datapath.bpf.xdp);
+        if (error) {
+            VLOG_WARN("%s XDP set failed", __func__);
+            goto out;
+        }
+        VLOG_DBG("%s %s XDP set done", __func__, netdev->name);
+    }
+
+    port = xzalloc(sizeof *port);
+    port->port_no = port_no;
+    port->ifindex = ifindex;
+    port->netdev = netdev;
+    port->n_rxq = netdev_n_rxq(netdev);
+    port->rxqs = xcalloc(port->n_rxq, sizeof *port->rxqs);
+    port->type = xstrdup(type);
+
+    for (i = 0; i < port->n_rxq; i++) {
+        error = netdev_rxq_open(netdev, &port->rxqs[i], i);
+        if (error) {
+            VLOG_ERR("%s: cannot receive packets on this network device (queue %d) (%s)",
+                     devname, i, ovs_strerror(error));
+            goto out_rxq_close;
+        }
+        n_open_rxqs++;
+    }
+
+    error = netdev_turn_flags_on(netdev, NETDEV_PROMISC, &sf);
+    if (error) {
+        goto out_rxq_close;
+    }
+    port->sf = sf;
+
+    *portp = port;
+    return 0;
+
+out_rxq_close:
+    for (i = 0; i < n_open_rxqs; i++) {
+        netdev_rxq_close(port->rxqs[i]);
+    }
+    free(port->type);
+    free(port->rxqs);
+    free(port);
+
+out:
+    netdev_close(netdev);
+    return error;
+}
+
+static int
+do_add_port(struct dpif_bpf_dp *dp, const char *devname,
+            const char *type, odp_port_t port_no)
+    OVS_REQUIRES(dp->port_mutex)
+{
+    struct dpif_bpf_port *port;
+    int error;
+
+    if (!get_port_by_name(dp, devname, &port)) {
+        return EEXIST;
+    }
+
+    error = port_create(devname, type, port_no, &port);
+    if (error) {
+        VLOG_ERR("port_create return %d", error);
+        return error;
+    }
+
+    hmap_insert(&dp->ports_by_odp, &port->odp_node,
+                netdev_hash_port_no(port->port_no));
+    hmap_insert(&dp->ports_by_ifindex, &port->if_node,
+                hash_ifindex(port->ifindex));
+    seq_change(dp->port_seq);
+
+    return 0;
+}
+
+static int
+dpif_bpf_port_add(struct dpif *dpif, struct netdev *netdev,
+                  odp_port_t *port_nop)
+{
+    struct dpif_bpf_dp *dp = get_dpif_bpf_dp(dpif);
+    char namebuf[NETDEV_VPORT_NAME_BUFSIZE];
+    const char *dpif_port;
+    odp_port_t port_no;
+    int error;
+
+    if (!strcmp(netdev_get_type(netdev), "vxlan") ||
+        !strcmp(netdev_get_type(netdev), "gre") ||
+        !strcmp(netdev_get_type(netdev), "geneve")) {
+
+        VLOG_INFO("Creating %s device", netdev_get_type(netdev));
+        error = dpif_netlink_rtnl_port_create(netdev);
+        if (error) {
+            if (error != EOPNOTSUPP) {
+                VLOG_WARN_RL(&rl, "Failed to create %s with rtnetlink: %s",
+                             netdev_get_name(netdev), ovs_strerror(error));
+            }
+            return error;
+        }
+    }
+
+    ovs_mutex_lock(&dp->port_mutex);
+    dpif_port = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
+    if (*port_nop != ODPP_NONE) {
+        port_no = *port_nop;
+        error = bpf_lookup_port(dp, *port_nop) ? EBUSY : 0;
+    } else {
+        port_no = choose_port(dp);
+        error = port_no == ODPP_NONE ? EFBIG : 0;
+    }
+    if (error) {
+        goto unlock;
+    }
+
+    *port_nop = port_no;
+    error = do_add_port(dp, dpif_port, netdev_get_type(netdev), port_no);
+
+unlock:
+    ovs_mutex_unlock(&dp->port_mutex);
+    return error;
+}
+
+static void
+do_del_port(struct dpif_bpf_dp *dp, struct dpif_bpf_port *port)
+    OVS_REQUIRES(dp->port_mutex)
+{
+    int i, error;
+
+    seq_change(dp->port_seq);
+    hmap_remove(&dp->ports_by_odp, &port->odp_node);
+    hmap_remove(&dp->ports_by_ifindex, &port->if_node);
+
+    error = netdev_set_filter(port->netdev, NULL);
+    if (error) {
+        VLOG_WARN("%s: Failed to clear filter from netdev",
+                  netdev_get_name(port->netdev));
+    }
+
+    if (netdev_support_xdp(port->netdev)) {
+        error = netdev_set_xdp(port->netdev, NULL);
+        if (error) {
+            VLOG_WARN("%s: Failed to clear XDP from netdev",
+                      netdev_get_name(port->netdev));
+        }
+    }
+
+    netdev_close(port->netdev);
+    netdev_restore_flags(port->sf);
+    for (i = 0; i < port->n_rxq; i++) {
+        netdev_rxq_close(port->rxqs[i]);
+    }
+
+    free(port->type);
+    free(port->rxqs);
+    free(port);
+}
+
+static int
+dpif_bpf_port_del(struct dpif *dpif, odp_port_t port_no)
+{
+    struct dpif_bpf_dp *dp = get_dpif_bpf_dp(dpif);
+    struct dpif_bpf_port *port;
+    int error = 0;
+
+    ovs_mutex_lock(&dp->port_mutex);
+    port = bpf_lookup_port(dp, port_no);
+    if (!port) {
+        VLOG_WARN("deleting port %"PRIu32", but it doesn't exist",
+                  odp_to_u32(port_no));
+        error = EINVAL;
+    } else {
+        do_del_port(dp, port);
+    }
+    ovs_mutex_unlock(&dp->port_mutex);
+
+    return error;
+}
+
+static void
+answer_port_query(const struct dpif_bpf_port *port,
+                  struct dpif_port *dpif_port)
+{
+    dpif_port->name = xstrdup(netdev_get_name(port->netdev));
+    dpif_port->type = xstrdup(port->type);
+    dpif_port->port_no = port->port_no;
+}
+
+static int
+dpif_bpf_port_query_by_number(const struct dpif *dpif_, odp_port_t port_no,
+                              struct dpif_port *port_)
+{
+    struct dpif_bpf_dp *dp = get_dpif_bpf_dp(dpif_);
+    struct dpif_bpf_port *port;
+    int error = 0;
+
+    ovs_mutex_lock(&dp->port_mutex);
+    port = bpf_lookup_port(dp, port_no);
+    if (!port) {
+        error = ENOENT;
+        goto out;
+    }
+    answer_port_query(port, port_);
+
+out:
+    ovs_mutex_unlock(&dp->port_mutex);
+    return error;
+}
+
+static int
+dpif_bpf_port_query_by_name(const struct dpif *dpif_, const char *devname,
+                            struct dpif_port *dpif_port)
+{
+    struct dpif_bpf_dp *dp = get_dpif_bpf_dp(dpif_);
+    struct dpif_bpf_port *port;
+    int error;
+
+    ovs_mutex_lock(&dp->port_mutex);
+    error = get_port_by_name(dp, devname, &port);
+    if (!error && dpif_port) {
+        answer_port_query(port, dpif_port);
+    }
+    ovs_mutex_unlock(&dp->port_mutex);
+
+    return error;
+}
+
+struct dpif_bpf_port_state {
+    struct hmap_position position;
+    char *name;
+};
+
+static int
+dpif_bpf_port_dump_start(const struct dpif *dpif OVS_UNUSED, void **statep)
+{
+    *statep = xzalloc(sizeof(struct dpif_bpf_port_state));
+    return 0;
+}
+
+static int
+dpif_bpf_port_dump_next(const struct dpif *dpif_, void *state_,
+                        struct dpif_port *dpif_port)
+{
+    struct dpif_bpf_port_state *state = state_;
+    struct dpif_bpf_dp *dp = get_dpif_bpf_dp(dpif_);
+    struct hmap_node *node;
+    int retval;
+
+    ovs_mutex_lock(&dp->port_mutex);
+    node = hmap_at_position(&dp->ports_by_odp, &state->position);
+    if (node) {
+        struct dpif_bpf_port *port;
+
+        port = CONTAINER_OF(node, struct dpif_bpf_port, odp_node);
+
+        free(state->name);
+        state->name = xstrdup(netdev_get_name(port->netdev));
+        dpif_port->name = state->name;
+        dpif_port->type = port->type;
+        dpif_port->port_no = port->port_no;
+
+        retval = 0;
+    } else {
+        retval = EOF;
+    }
+    ovs_mutex_unlock(&dp->port_mutex);
+
+    return retval;
+}
+
+static int
+dpif_bpf_port_dump_done(const struct dpif *dpif OVS_UNUSED,
+                        void *state_)
+{
+    struct dpif_bpf_port_state *state = state_;
+
+    free(state->name);
+    free(state);
+    return 0;
+}
+
+static int
+dpif_bpf_port_poll(const struct dpif *dpif_, char **devnamep OVS_UNUSED)
+{
+    struct dpif_bpf_dp *dp = get_dpif_bpf_dp(dpif_);
+    uint64_t new_port_seq;
+
+    new_port_seq = seq_read(dp->port_seq);
+    if (dp->last_seq != new_port_seq) {
+        dp->last_seq = new_port_seq;
+        return ENOBUFS;
+    }
+
+    return EAGAIN;
+}
+
+static void
+dpif_bpf_port_poll_wait(const struct dpif *dpif_)
+{
+    struct dpif_bpf_dp *dp = get_dpif_bpf_dp(dpif_);
+
+    seq_wait(dp->port_seq, dp->last_seq);
+}
+
+static int
+dpif_bpf_flow_flush(struct dpif *dpif OVS_UNUSED)
+{
+    struct bpf_flow_key key;
+    int err = 0;
+
+    /* Flow Entry Table */
+    memset(&key, 0, sizeof key);
+    do {
+        err = bpf_map_get_next_key(datapath.bpf.flow_table.fd, &key, &key);
+        if (!err) {
+            bpf_map_delete_elem(datapath.bpf.flow_table.fd, &key);
+        }
+    } while (!err);
+
+    /* Flow Stats Table */
+    memset(&key, 0, sizeof key);
+    do {
+        err = bpf_map_get_next_key(datapath.bpf.dp_flow_stats.fd, &key, &key);
+        if (!err) {
+            bpf_map_delete_elem(datapath.bpf.dp_flow_stats.fd, &key);
+        }
+    } while (!err);
+
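+    /* bpf_map_get_next_key() sets errno to ENOENT once iteration has
+     * passed the last entry; treat that as a successful, complete flush. */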
+    return errno == ENOENT ? 0 : errno;
+}
+
+struct dpif_bpf_flow_dump {
+    struct dpif_flow_dump up;
+    int status;
+    struct bpf_flow_key pos;
+    struct ovs_mutex mutex;
+};
+
+static struct dpif_bpf_flow_dump *
+dpif_bpf_flow_dump_cast(struct dpif_flow_dump *dump)
+{
+    return CONTAINER_OF(dump, struct dpif_bpf_flow_dump, up);
+}
+
+static struct dpif_flow_dump *
+dpif_bpf_flow_dump_create(const struct dpif *dpif_, bool terse,
+                          char *type OVS_UNUSED)
+{
+    struct dpif_bpf_flow_dump *dump;
+
+    dump = xzalloc(sizeof *dump);
+    dpif_flow_dump_init(&dump->up, dpif_);
+    dump->up.terse = terse;
+    ovs_mutex_init(&dump->mutex);
+
+    return &dump->up;
+}
+
+static int
+dpif_bpf_flow_dump_destroy(struct dpif_flow_dump *dump_)
+{
+    struct dpif_bpf_flow_dump *dump = dpif_bpf_flow_dump_cast(dump_);
+    int status = dump->status;
+
+    ovs_mutex_destroy(&dump->mutex);
+    free(dump);
+
+    return status == ENOENT ? 0 : status;
+}
+
+struct dpif_bpf_flow_dump_thread {
+    struct dpif_flow_dump_thread up;
+    struct dpif_bpf_flow_dump *dump;
+    struct ofpbuf buf;  /* Stores key,mask,acts for a particular dump. */
+};
+
+static struct dpif_bpf_flow_dump_thread *
+dpif_bpf_flow_dump_thread_cast(struct dpif_flow_dump_thread *thread)
+{
+    return CONTAINER_OF(thread, struct dpif_bpf_flow_dump_thread, up);
+}
+
+static struct dpif_flow_dump_thread *
+dpif_bpf_flow_dump_thread_create(struct dpif_flow_dump *dump_)
+{
+    struct dpif_bpf_flow_dump *dump = dpif_bpf_flow_dump_cast(dump_);
+    struct dpif_bpf_flow_dump_thread *thread;
+
+    thread = xmalloc(sizeof *thread);
+    dpif_flow_dump_thread_init(&thread->up, &dump->up);
+    thread->dump = dump;
+    ofpbuf_init(&thread->buf, 1024);
+    return &thread->up;
+}
+
+static void
+dpif_bpf_flow_dump_thread_destroy(struct dpif_flow_dump_thread *thread_)
+{
+    struct dpif_bpf_flow_dump_thread *thread =
+        dpif_bpf_flow_dump_thread_cast(thread_);
+    ofpbuf_uninit(&thread->buf);
+    free(thread);
+}
+
+static int
+fetch_flow(struct dpif_bpf_dp *dp, struct dpif_flow *flow,
+           struct ofpbuf *out, const struct bpf_flow_key *key)
+{
+    struct flow f;
+    struct odp_flow_key_parms parms = {
+        .flow = &f,
+    };
+    struct bpf_action_batch action;
+    struct bpf_flow_stats stats;
+    int err;
+
+    memset(flow, 0, sizeof *flow);
+
+    err = bpf_map_lookup_elem(datapath.bpf.flow_table.fd, key, &action);
+    if (err) {
+        return errno;
+    }
+
+    /* XXX: Extract 'dp_flow' into 'flow'. */
+    if (bpf_flow_key_to_flow(key, &f) == ODP_FIT_ERROR) {
+        VLOG_WARN("%s: bpf flow key parsing error", __func__);
+        return EINVAL;
+    }
+    f.in_port.odp_port = ifindex_to_odp(dp,
+                                        odp_to_u32(f.in_port.odp_port));
+
+    /* Translate BPF flow into netlink format. */
+    ofpbuf_clear(out);
+
+    /* Use 'out->header' to point to the flow key, 'out->msg' for actions */
+    out->header = out->data;
+    odp_flow_key_from_flow(&parms, out);
+    out->msg = ofpbuf_tail(out);
+    err = bpf_actions_to_odp_actions(&action, out);
+    if (err) {
+        VLOG_ERR("%s: bpf_actions to odp actions fails", __func__);
+        return err;
+    }
+
+    flow->key = out->header;
+    flow->key_len = ofpbuf_headersize(out);
+    flow->actions = out->msg;
+    flow->actions_len = ofpbuf_msgsize(out);
+
+    dpif_flow_hash(dp->dpif, flow->key, flow->key_len, &flow->ufid);
+    flow->ufid_present = false; /* XXX */
+
+    /* Fetch datapath flow stats */
+    err = bpf_map_lookup_elem(datapath.bpf.dp_flow_stats.fd, key, &stats);
+    if (err) {
+        VLOG_DBG("flow stats lookup fails, fd %d err = %d %s",
+                 datapath.bpf.dp_flow_stats.fd, err, ovs_strerror(errno));
+        return errno;
+    } else {
+        VLOG_DBG("flow stats lookup OK");
+        memcpy(&flow->stats, &stats, 3 * sizeof(uint64_t));
+    }
+
+    return 0;
+}
+
+static int
+dpif_bpf_insert_flow(struct bpf_flow_key *flow_key,
+                     struct bpf_action_batch *actions)
+{
+    int err;
+    struct bpf_flow_stats flow_stats;
+
+    VLOG_DBG("Insert bof_flow_key:");
+    vlog_hex_dump((unsigned char *)flow_key, sizeof *flow_key);
+
+    VLOG_DBG("Insert action:");
+    vlog_hex_dump((unsigned char *)actions, sizeof actions[0]);
+
+    ovs_assert(datapath.bpf.flow_table.fd != -1);
+    err = bpf_map_update_elem(datapath.bpf.flow_table.fd,
+                              flow_key,
+                              actions, BPF_ANY);
+    if (err) {
+        VLOG_ERR("Failed to add flow into flow table, map fd %d, error %s",
+                 datapath.bpf.flow_table.fd, ovs_strerror(errno));
+        return errno;
+    }
+
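+    /* Seed the stats with the packet that triggered this flow install. */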
+    flow_stats.packet_count = 1;
+    flow_stats.byte_count = flow_key->mds.md.packet_length;
+    flow_stats.used = 0;
+
+    err = bpf_map_update_elem(datapath.bpf.dp_flow_stats.fd,
+                              flow_key,
+                              &flow_stats, BPF_ANY);
+    if (err) {
+        VLOG_ERR("Failed to add flow into flow stats table, map fd %d, error %s",
+                 datapath.bpf.dp_flow_stats.fd, ovs_strerror(errno));
+        return errno;
+    }
+
+    return 0;
+}
+
+static int
+dpif_bpf_delete_flow(struct bpf_flow_key *flow_key,
+                     struct dpif_flow_stats *stats)
+{
+    int err;
+    struct bpf_action_batch actions;
+
+    ovs_assert(datapath.bpf.flow_table.fd != -1);
+
+    err = bpf_map_lookup_elem(datapath.bpf.flow_table.fd, flow_key, &actions);
+    if (err != 0) {
+        VLOG_ERR("Failed to find flow into flow table, map fd %d: %s",
+                 datapath.bpf.flow_table.fd, ovs_strerror(errno));
+        VLOG_WARN("bpf_flow_key not found\n");
+        vlog_hex_dump((unsigned char *)flow_key, sizeof *flow_key);
+
+        goto delete_stats;
+    }
+
+    err = bpf_map_delete_elem(datapath.bpf.flow_table.fd, flow_key);
+    if (err) {
+        VLOG_ERR("Failed to del flow into flow table, map fd %d: %s",
+                 datapath.bpf.flow_table.fd, ovs_strerror(errno));
+        return errno;
+    }
+
+    if (stats) {
+        /* XXX: Stats */
+        memset(stats, 0, sizeof *stats);
+
+delete_stats:
+        err = bpf_map_delete_elem(datapath.bpf.dp_flow_stats.fd, flow_key);
+        if (err) {
+            VLOG_ERR("Failed to del flow into flow stat table, map fd %d: %s",
+                      datapath.bpf.flow_table.fd, ovs_strerror(errno));
+            /* Skip when element is not found */
+            return 0;
+        }
+    }
+    return 0;
+}
+
+static int
+dpif_bpf_delete_all_flow(void)
+{
+    int err;
+    struct bpf_flow_key key;
+
+    do {
+        err = bpf_map_get_next_key(datapath.bpf.flow_table.fd, NULL, &key);
+        if (err) {
+            return err;
+        }
+
+        err = bpf_map_delete_elem(datapath.bpf.flow_table.fd, &key);
+    } while (!err);
+
+    return err;
+}
+
+static int
+dpif_bpf_flow_dump_next(struct dpif_flow_dump_thread *thread_,
+                        struct dpif_flow *flows, int max_flows)
+{
+    struct dpif_bpf_flow_dump_thread *thread =
+        dpif_bpf_flow_dump_thread_cast(thread_);
+    struct dpif_bpf_flow_dump *dump = thread->dump;
+    int n = 0;
+    int err;
+
+    ovs_mutex_lock(&dump->mutex);
+    err = dump->status;
+    if (err) {
+        goto unlock;
+    }
+
+    while (n < max_flows) {
+        struct dpif_bpf_dp *dp = get_dpif_bpf_dp(dump->up.dpif);
+
+        err = bpf_map_get_next_key(datapath.bpf.flow_table.fd,
+                                   &dump->pos, &dump->pos);
+        if (err) {
+            err = errno;
+            break;
+        }
+        err = fetch_flow(dp, &flows[n], &thread->buf, &dump->pos);
+        if (err == ENOENT) {
+            /* Flow disappeared. Oh well, we tried. */
+            continue;
+        } else if (err) {
+            break;
+        }
+        n++;
+    }
+    dump->status = err;
+unlock:
+    ovs_mutex_unlock(&dump->mutex);
+    return n;
+}
+
+struct dpif_bpf_downcall_parms {
+    uint32_t type;
+    odp_port_t port_no;
+    struct bpf_action_batch *action_batch;
+};
+
+static int
+dpif_bpf_downcall(struct dpif *dpif_, struct dp_packet *packet,
+                  const struct flow *flow,
+                  struct dpif_bpf_downcall_parms *parms)
+{
+    struct dp_packet_batch batch;
+    struct bpf_downcall md = {
+        .type = parms->type,
+        .debug = 0xC0FFEEEE,
+    };
+    uint32_t ifindex;
+    uint32_t flags;
+    int error;
+    int queue = 0;
+    struct dp_packet *clone_pkt;
+
+    ovs_assert(datapath.bpf.execute_actions.fd != -1);
+
+    bpf_metadata_from_flow(flow, &md.md);
+
+    ifindex = odp_port_to_ifindex(get_dpif_bpf_dp(dpif_),
+                                  flow->in_port.odp_port, &flags);
+#if 0
+    /* this is ok at check_support time */
+    if (!ifindex) {
+        VLOG_WARN("%s: in_port.odp_port %d found",
+                 __func__, flow->in_port.odp_port);
+        return ENODEV;
+    }
+#endif
+
+    md.md.md.in_port = ifindex;
+    md.ifindex = ifindex;
+
+    if (parms->action_batch) {
+        int zero_index = 0;
+        error = bpf_map_update_elem(datapath.bpf.execute_actions.fd,
+                                    &zero_index, parms->action_batch, 0);
+        if (error) {
+            VLOG_ERR("%s: map update failed", __func__);
+            return error;
+        }
+    }
+
+    /* XXX: Check that ovs-system device MTU is large enough to include md. */
+    dp_packet_put(packet, &md, sizeof md);
+    clone_pkt = dp_packet_clone(packet);
+    dp_packet_batch_init_packet(&batch, clone_pkt);
+
+    VLOG_INFO("send downcall (%d)", parms->type);
+    error = netdev_send(datapath.outport, queue, &batch, false);
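+    /* Strip the metadata we appended; the caller still owns 'packet'. */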
+    dp_packet_set_size(packet, dp_packet_size(packet) - sizeof md);
+
+    return error;
+}
+
+static int OVS_UNUSED
+dpif_bpf_output(struct dpif *dpif_, struct dp_packet *packet,
+                const struct flow *flow, odp_port_t port_no,
+                uint32_t flags OVS_UNUSED)
+{
+    struct dpif_bpf_downcall_parms parms = {
+        .port_no = port_no,
+        .type = OVS_BPF_DOWNCALL_OUTPUT,
+        .action_batch = NULL
+    };
+    return dpif_bpf_downcall(dpif_, packet, flow, &parms);
+}
+
+static int
+dpif_bpf_execute_(struct dpif *dpif_, struct dp_packet *packet,
+                  const struct flow *flow,
+                  struct bpf_action_batch *action_batch)
+{
+    struct dpif_bpf_downcall_parms parms = {
+        .type = OVS_BPF_DOWNCALL_EXECUTE,
+        .action_batch = action_batch,
+    };
+    return dpif_bpf_downcall(dpif_, packet, flow, &parms);
+}
+
+static int
+dpif_bpf_serialize_actions(struct dpif_bpf_dp *dp,
+                           struct bpf_action_batch *action_batch,
+                           const struct nlattr *nlactions,
+                           size_t actions_len)
+{
+    const struct nlattr *a;
+    unsigned int left, count = 0, skipped = 0;
+    struct bpf_action *actions;
+
+    memset(action_batch, 0, sizeof(*action_batch));
+    actions = action_batch->actions;
+
+    NL_ATTR_FOR_EACH_UNSAFE (a, left, nlactions, actions_len) {
+        enum ovs_action_attr type = nl_attr_type(a);
+        actions[count].type = type;
+
+        if (type == OVS_ACTION_ATTR_OUTPUT) {
+            struct dpif_bpf_port *port;
+            odp_port_t port_no = nl_attr_get_odp_port(a);
+
+            ovs_mutex_lock(&dp->port_mutex);
+            port = bpf_lookup_port(dp, port_no);
+            if (port) {
+                VLOG_INFO("output action to port %d ifindex %d", port_no,
+                          port->ifindex);
+                actions[count].u.out.port = port->ifindex;
+                actions[count].u.out.flags = get_port_flags(port->netdev);
+            }
+            ovs_mutex_unlock(&dp->port_mutex);
+        } else {
+            if (odp_action_to_bpf_action(a, &actions[count])) {
+                skipped++;
+            }
+        }
+        count++;
+    }
+
+    VLOG_INFO("Processing flow actions (%d/%d skipped)", skipped, count);
+    if (skipped) {
+        /* XXX: VLOG actions that couldn't be processed */
+    }
+    return 0;
+}
+
+static int
+dpif_bpf_execute(struct dpif *dpif_, struct dpif_execute *execute)
+{
+    struct bpf_action_batch batch;
+    int error = 0;
+
+    error = dpif_bpf_serialize_actions(get_dpif_bpf_dp(dpif_), &batch, execute->actions,
+                                       execute->actions_len);
+    if (error) {
+        return error;
+    }
+
+    error = dpif_bpf_execute_(dpif_, execute->packet,
+                              execute->flow, &batch);
+    return error;
+}
+
+/* Translates 'port' into an ifindex and sets it inside 'key'.
+ *
+ * Returns 0 on success, or a positive errno otherwise. */
+static int
+set_in_port(struct dpif_bpf_dp *dp, struct bpf_flow_key *key, odp_port_t port)
+{
+    uint16_t ifindex;
+
+    ifindex = odp_port_to_ifindex(dp, port, NULL);
+    if (!ifindex && port) {
+        VLOG_WARN("Could not find ifindex corresponding to port %"PRIu32,
+                  port);
+        return ENODEV;
+    }
+
+    key->mds.md.in_port = ifindex;
+    return 0;
+}
+
+/* Converts 'key' (of size 'key_len') into a bpf flow key in 'key_out', and
+ * optionally 'actions' (of size 'actions_len') into 'batch'. 'mask' (of size
+ * 'mask_len') may optionally be used for logging, of which the verbosity is
+ * controlled by 'verbose'.
+ *
+ * Returns 0 on success, or a positive errno otherwise.
+ */
+static int
+prepare_bpf_flow__(struct dpif_bpf_dp *dp,
+                   const struct nlattr *key, size_t key_len,
+                   const struct nlattr *mask, size_t mask_len,
+                   const struct nlattr *actions, size_t actions_len,
+                   struct bpf_flow_key *key_out, struct bpf_action_batch *batch,
+                   bool verbose)
+{
+    odp_port_t in_port;
+    int err = EINVAL;
+
+    if (1) {
+        struct ds ds = DS_EMPTY_INITIALIZER;
+
+        /* XXX: Use dpif_format_flow()? */
+        odp_flow_format(key, key_len, mask, mask_len, NULL, &ds, true);
+        ds_put_cstr(&ds, ", actions=");
+        format_odp_actions(&ds, actions, actions_len, NULL);
+        VLOG_WARN("Translating odp key to bpf key:\n%s", ds_cstr(&ds));
+        ds_destroy(&ds);
+    }
+
+    memset(key_out, 0, sizeof *key_out);
+    if (odp_key_to_bpf_flow_key(key, key_len, key_out,
+                                &in_port, false, verbose)) {
+        if (verbose) {
+            struct ds ds = DS_EMPTY_INITIALIZER;
+
+            /* XXX: Use dpif_format_flow()? */
+            odp_flow_format(key, key_len, mask, mask_len, NULL, &ds,
+                            true);
+            VLOG_WARN("Failed to translate odp key to bpf key:\n%s",
+                      ds_cstr(&ds));
+            ds_destroy(&ds);
+        }
+        return err;
+    }
+
+    err = set_in_port(dp, key_out, in_port);
+    if (err) {
+        return err;
+    }
+    if (batch) {
+        err = dpif_bpf_serialize_actions(dp, batch, actions, actions_len);
+        if (err) {
+            return err;
+        }
+    }
+
+    /* Translate back to a flow to check that the conversion round-trips. */
+    if (1) {
+        struct flow flow;
+        enum odp_key_fitness res;
+
+        res = bpf_flow_key_to_flow(key_out, &flow);
+        if (res != ODP_FIT_PERFECT) {
+            VLOG_ERR("transfer bpf key back to flow failed");
+        } else {
+            struct ds ds = DS_EMPTY_INITIALIZER;
+
+            flow_format(&ds, &flow, NULL);
+            ds_put_cstr(&ds, ", actions=");
+            format_odp_actions(&ds, actions, actions_len, NULL);
+            VLOG_WARN("Translating back:\n%s", ds_cstr(&ds));
+            ds_destroy(&ds);
+        }
+    }
+
+    return 0;
+}
+
+static int
+prepare_bpf_flow(struct dpif_bpf_dp *dp, const struct nlattr *key,
+                 size_t key_len, struct bpf_flow_key *key_out, bool verbose)
+{
+    return prepare_bpf_flow__(dp, key, key_len, NULL, 0, NULL, 0, key_out,
+                              NULL, verbose);
+}
+
+static void
+dpif_bpf_operate(struct dpif *dpif_, struct dpif_op **ops, size_t n_ops)
+{
+    struct dpif_bpf_dp *dp = get_dpif_bpf_dp(dpif_);
+
+    for (size_t i = 0; i < n_ops; i++) {
+        struct dpif_op *op = ops[i];
+
+        switch (op->type) {
+        case DPIF_OP_EXECUTE:
+            op->error = dpif_bpf_execute(dpif_, &op->u.execute);
+            break;
+        case DPIF_OP_FLOW_PUT: {
+            struct dpif_flow_put *put = &op->u.flow_put;
+            bool verbose = !(put->flags & DPIF_FP_PROBE);
+            struct bpf_action_batch action_batch;
+            struct bpf_flow_key key;
+            int err;
+
+            err = prepare_bpf_flow__(dp, put->key, put->key_len,
+                                     put->mask, put->mask_len,
+                                     put->actions, put->actions_len,
+                                     &key, &action_batch, verbose);
+            if (!err) {
+                err = dpif_bpf_insert_flow(&key, &action_batch);
+            }
+            op->error = err;
+            break;
+        }
+        case DPIF_OP_FLOW_GET: {
+            struct dpif_flow_get *get = &op->u.flow_get;
+            struct bpf_flow_key key;
+            int err;
+
+            err = prepare_bpf_flow(dp, get->key, get->key_len, &key, true);
+            if (!err) {
+                err = fetch_flow(dp, get->flow, get->buffer, &key);
+            }
+            op->error = err;
+            break;
+        }
+        case DPIF_OP_FLOW_DEL: {
+            struct dpif_flow_del *del = &op->u.flow_del;
+            struct bpf_flow_key key;
+            int err;
+
+            err = prepare_bpf_flow(dp, del->key, del->key_len, &key, true);
+            if (!err) {
+                err = dpif_bpf_delete_flow(&key, del->stats);
+            }
+            op->error = err;
+            break;
+        }
+        default:
+            OVS_NOT_REACHED();
+        }
+    }
+}
+
+static int
+dpif_bpf_recv_set(struct dpif *dpif_, bool enable)
+{
+    struct dpif_bpf_dp *dpif = get_dpif_bpf_dp(dpif_);
+    int stored_error = 0;
+
+    for (int i = 0; i < dpif->n_channels; i++) {
+        int error = perf_channel_set(&dpif->channels[i], enable);
+        if (error) {
+            VLOG_ERR("failed to set recv_set %s (%s)",
+                     enable ? "true": "false", ovs_strerror(error));
+            stored_error = error;
+        }
+    }
+
+    return stored_error;
+}
+
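+/* Distributes the per-CPU channels across 'n_handlers' handler threads:
+ * each handler receives a contiguous slice of 'dp->channels', and any
+ * remainder channels are spread across the last handlers. */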
+static int
+dpif_bpf_handlers_set__(struct dpif_bpf_dp *dp, uint32_t n_handlers)
+    OVS_REQUIRES(&dp->upcall_lock)
+{
+    struct bpf_handler prev;
+    int i, extra;
+
+    memset(&prev, 0, sizeof prev);
+    if (dp->n_handlers) {
+        free(dp->handlers);
+        dp->handlers = NULL;
+        dp->n_handlers = 0;
+    }
+
+    if (!n_handlers) {
+        return 0;
+    }
+
+    dp->handlers = xzalloc(sizeof *dp->handlers * n_handlers);
+    for (i = 0; i < n_handlers; i++) {
+        struct bpf_handler *curr = dp->handlers + i;
+
+        if (i >= dp->n_channels) {
+            VLOG_INFO("Ignoring extraneous handlers (%d for %d channels)",
+                      n_handlers, dp->n_channels);
+            break;
+        }
+
+        curr->offset = prev.offset + prev.count;
+        curr->count = dp->n_channels / n_handlers;
+        prev = *curr;
+    }
+    extra = dp->n_channels % n_handlers;
+    if (extra) {
+        VLOG_INFO("Extra %d channels; distributing across handlers", extra);
+        for (i = 0; i < extra; i++) {
+            struct bpf_handler *curr = dp->handlers + n_handlers - i - 1;
+
+            curr->offset = curr->offset + extra - i - 1;
+            curr->count++;
+        }
+    }
+
+    dp->n_handlers = n_handlers;
+    return 0;
+}
+
+static int
+dpif_bpf_handlers_set(struct dpif *dpif_, uint32_t n_handlers)
+{
+    struct dpif_bpf_dp *dpif = get_dpif_bpf_dp(dpif_);
+    int error;
+
+    fat_rwlock_wrlock(&dpif->upcall_lock);
+    error = dpif_bpf_handlers_set__(dpif, n_handlers);
+    fat_rwlock_unlock(&dpif->upcall_lock);
+
+    return error;
+}
+
+static int
+extract_key(struct dpif_bpf_dp *dpif, const struct bpf_flow_key *key,
+            struct dp_packet *packet, struct ofpbuf *buf)
+{
+    struct flow flow;
+    struct odp_flow_key_parms parms = {
+        .flow = &flow,
+    };
+    parms.support.recirc = true;
+
+    {
+        struct ds ds = DS_EMPTY_INITIALIZER;
+
+        bpf_flow_key_format(&ds, key);
+        VLOG_INFO("bpf_flow_key_format\n%s", ds_cstr(&ds));
+        ds_destroy(&ds);
+    }
+
+    /* This function goes first because it zeros out flow. */
+    flow_extract(packet, &flow);
+
+    bpf_flow_key_extract_metadata(key, &flow);
+
+    VLOG_INFO("packet.md.port = %d", packet->md.in_port.odp_port);
+
+    if (flow.in_port.odp_port != 0) {
+        flow.in_port.odp_port = ifindex_to_odp(dpif,
+                                    odp_to_u32(flow.in_port.odp_port));
+    } else {
+        flow.in_port.odp_port = packet->md.in_port.odp_port;
+    }
+    VLOG_INFO("flow.in_port.odp_port %d", flow.in_port.odp_port);
+
+    if (1) {
+        struct ds ds = DS_EMPTY_INITIALIZER;
+
+        flow_format(&ds, &flow, NULL);
+        VLOG_WARN("Upcall flow:\n%s",
+                  ds_cstr(&ds));
+        ds_destroy(&ds);
+
+    }
+
+    odp_flow_key_from_flow(&parms, buf);
+
+    return 0;
+}
+
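+/* An upcall as it appears in the perf ring buffer: the raw perf sample
+ * header, then the BPF datapath's upcall metadata, then the packet bytes. */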
+struct ovs_ebpf_event {
+    struct perf_event_raw sample;
+    struct bpf_upcall header;
+    uint8_t data[];
+};
+
+static void OVS_UNUSED
+dpif_bpf_flow_dump_all(struct dpif_bpf_dp *dp OVS_UNUSED)
+{
+    struct dpif_bpf_flow_dump dump;
+    int err;
+
+    memset(&dump, 0, sizeof dump);
+    while (1) {
+        err = bpf_map_get_next_key(datapath.bpf.flow_table.fd,
+                                   &dump.pos, &dump.pos);
+        if (err) {
+            VLOG_INFO("err is %d", err);
+            break;
+        }
+        vlog_hex_dump((unsigned char *)&dump.pos, sizeof dump.pos);
+    }
+}
+
+/* perf_channel_read() fills the first part of 'buffer' with the full event.
+ * Here, the key will be extracted immediately following it, and 'upcall'
+ * will be initialized to point within 'buffer'.
+ */
+static int
+perf_sample_to_upcall__(struct dpif_bpf_dp *dp, struct ovs_ebpf_event *e,
+                        struct dpif_upcall *upcall, struct ofpbuf *buffer)
+{
+    size_t sample_len = e->sample.size - sizeof e->header;
+    size_t pkt_len = e->header.skb_len;
+    size_t pre_key_len;
+    odp_port_t port_no;
+    int err;
+
+    if (pkt_len < ETH_HEADER_LEN) {
+        VLOG_WARN_RL(&rl, "Unexpectedly short packet (%"PRIuSIZE")", pkt_len);
+        return EINVAL;
+    }
+    if (sample_len < pkt_len) {
+        VLOG_WARN_RL(&rl,
+            "Packet longer than sample (pkt=%"PRIuSIZE", sample=%"PRIuSIZE")",
+            pkt_len, sample_len);
+        return EINVAL;
+    }
+
+    port_no = ifindex_to_odp(dp, e->header.ifindex);
+    VLOG_INFO("ifindex %d odp %d", e->header.ifindex, port_no);
+    if (port_no == ODPP_NONE) {
+        VLOG_WARN_RL(&rl, "failed to map upcall ifindex=%d to odp",
+                     e->header.ifindex);
+        return EINVAL;
+    }
+
+    memset(upcall, 0, sizeof *upcall);
+
+    /* Use buffer->header to point to the packet, and buffer->msg to point to
+     * the extracted flow key. Therefore, when extract_key() reallocates
+     * 'buffer', we can easily get pointers back to the packet and start of
+     * extracted key. */
+    buffer->header = e->data;
+    buffer->msg = ofpbuf_tail(buffer);
+    pre_key_len = buffer->size;
+
+    VLOG_INFO("upcall key hex\n");
+    vlog_hex_dump((unsigned char *)&e->header.key, sizeof e->header.key);
+    //VLOG_INFO("list of bpf keys\n");
+    //dpif_bpf_flow_dump_all(dp);
+    VLOG_INFO("raw packet data in e->data");
+    vlog_hex_dump(e->data, MIN(pkt_len, 100));
+
+    dp_packet_use_stub(&upcall->packet, e->data, pkt_len);
+    dp_packet_set_size(&upcall->packet, pkt_len);
+    pkt_metadata_init(&upcall->packet.md, port_no);
+
+    err = extract_key(dp, &e->header.key, &upcall->packet, buffer);
+    if (err) {
+        return err;
+    }
+
+    upcall->key = buffer->msg;
+    upcall->key_len = buffer->size - pre_key_len;
+    dpif_flow_hash(dp->dpif, upcall->key, upcall->key_len, &upcall->ufid);
+
+    return 0;
+}
+
+/* perf_channel_read() fills the first part of 'buffer' with the full event.
+ * Here, the key will be extracted immediately following it, and 'upcall'
+ * will be initialized to point within 'buffer'.
+ */
+static int
+perf_sample_to_upcall_miss(struct dpif_bpf_dp *dp, struct ovs_ebpf_event *e,
+                           struct dpif_upcall *upcall, struct ofpbuf *buffer)
+{
+    int err;
+
+    err = perf_sample_to_upcall__(dp, e, upcall, buffer);
+    if (err) {
+        return err;
+    }
+
+    ofpbuf_prealloc_tailroom(buffer, sizeof(struct bpf_downcall));
+    upcall->type = DPIF_UC_MISS;
+
+    return 0;
+}
+
+/* Modified from perf_sample_to_upcall.
+ */
+static int
+perf_sample_to_upcall_userspace(struct dpif_bpf_dp *dp, struct ovs_ebpf_event *e,
+                                struct dpif_upcall *upcall,
+                                struct ofpbuf *buffer)
+{
+    const struct nlattr *actions = (struct nlattr *)e->header.uactions;
+    const struct nlattr *a;
+    unsigned int left;
+    int err;
+
+    err = perf_sample_to_upcall__(dp, e, upcall, buffer);
+    if (err) {
+        return err;
+    }
+
+    NL_ATTR_FOR_EACH_UNSAFE (a, left, actions, e->header.uactions_len) {
+        switch (nl_attr_type(a)) {
+        case OVS_USERSPACE_ATTR_PID:
+            /* XXX: PID attribute is currently ignored. */
+            break;
+        case OVS_USERSPACE_ATTR_USERDATA:
+            upcall->userdata = CONST_CAST(struct nlattr *, a);
+            break;
+        default:
+            VLOG_INFO("%s unsupported userspace action. %d",
+                      __func__, nl_attr_type(a));
+            return EOPNOTSUPP;
+        }
+    }
+
+    upcall->type = DPIF_UC_ACTION;
+    return 0;
+}
+
+static void
+bpf_debug_print(int subtype, int error)
+{
+    int level = error ? VLL_WARN : VLL_DBG;
+    struct ds ds = DS_EMPTY_INITIALIZER;
+
+    if (subtype >= 0 && subtype < ARRAY_SIZE(bpf_upcall_subtypes)) {
+        ds_put_cstr(&ds, bpf_upcall_subtypes[subtype]);
+    } else {
+        ds_put_format(&ds, "Unknown subtype %d", subtype);
+    }
+    ds_put_format(&ds, " reports: %s", ovs_strerror(error));
+
+    VLOG_RL(&rl, level, "%s", ds_cstr(&ds));
+    ds_destroy(&ds);
+}
+
+static int
+recv_perf_sample(struct dpif_bpf_dp *dpif, struct ovs_ebpf_event *e,
+                 struct dpif_upcall *upcall, struct ofpbuf *buffer)
+{
+    if (e->sample.header.size < sizeof *e
+        || e->sample.size < sizeof e->header) {
+        VLOG_WARN_RL(&rl, "Unexpectedly short sample (%"PRIu32")",
+                     e->sample.size);
+        return EINVAL;
+    }
+
+    VLOG_INFO("\nreceived upcall %d", e->header.type);
+
+    switch (e->header.type) {
+    case OVS_UPCALL_MISS:
+        return perf_sample_to_upcall_miss(dpif, e, upcall, buffer);
+    case OVS_UPCALL_DEBUG:
+        bpf_debug_print(e->header.subtype, e->header.error);
+        return EAGAIN;
+    case OVS_UPCALL_ACTION:
+        return perf_sample_to_upcall_userspace(dpif, e, upcall, buffer);
+    default:
+        break;
+    }
+
+    VLOG_WARN_RL(&rl, "Unfamiliar upcall type %d", e->header.type);
+    return EINVAL;
+}
+
+static int
+dpif_bpf_recv(struct dpif *dpif_, uint32_t handler_id,
+              struct dpif_upcall *upcall, struct ofpbuf *buffer)
+{
+    struct dpif_bpf_dp *dpif = get_dpif_bpf_dp(dpif_);
+    struct bpf_handler *handler;
+    int error = EAGAIN;
+    int i;
+
+    fat_rwlock_rdlock(&dpif->upcall_lock);
+    handler = dpif->handlers + handler_id;
+    for (i = 0; i < handler->count; i++) {
+        int channel_idx = (handler->index + i) % handler->count;
+        struct perf_channel *channel;
+
+        channel = &dpif->channels[handler->offset + channel_idx];
+        error = perf_channel_read(channel, buffer);
+        if (!error) {
+            error = recv_perf_sample(dpif, buffer->header, upcall, buffer);
+        }
+        if (error != EAGAIN) {
+            break;
+        }
+    }
+    if (handler->count) {
+        handler->index = (handler->index + 1) % handler->count;
+    }
+    fat_rwlock_unlock(&dpif->upcall_lock);
+
+    return error;
+}
+
+static char *
+dpif_bpf_get_datapath_version(void)
+{
+    return xstrdup("<built-in>");
+}
+
+static void
+dpif_bpf_recv_wait(struct dpif *dpif_, uint32_t handler_id)
+{
+    struct dpif_bpf_dp *dpif = get_dpif_bpf_dp(dpif_);
+    struct bpf_handler *handler;
+    int i;
+
+    fat_rwlock_rdlock(&dpif->upcall_lock);
+    handler = dpif->handlers + handler_id;
+    for (i = 0; i < handler->count; i++) {
+        poll_fd_wait(dpif->channels[handler->offset + i].fd, POLLIN);
+    }
+    fat_rwlock_unlock(&dpif->upcall_lock);
+}
+
+static void
+dpif_bpf_recv_purge(struct dpif *dpif_)
+{
+    struct dpif_bpf_dp *dpif = get_dpif_bpf_dp(dpif_);
+    int i;
+
+    fat_rwlock_rdlock(&dpif->upcall_lock);
+    for (i = 0; i < dpif->n_channels; i++) {
+        struct perf_channel *channel = &dpif->channels[i];
+
+        perf_channel_flush(channel);
+    }
+    fat_rwlock_unlock(&dpif->upcall_lock);
+}
+
+const struct dpif_class dpif_bpf_class = {
+    "bpf",
+    dpif_bpf_init,
+    dpif_bpf_enumerate,
+    dpif_bpf_port_open_type,
+    dpif_bpf_open,
+    dpif_bpf_close,
+    dpif_bpf_destroy,
+    NULL,                       /* run */
+    NULL,                       /* wait */
+    dpif_bpf_get_stats,
+    dpif_bpf_port_add,
+    dpif_bpf_port_del,
+    NULL,                       /* port_set_config */
+    dpif_bpf_port_query_by_number,
+    dpif_bpf_port_query_by_name,
+    NULL,                       /* port_get_pid */
+    dpif_bpf_port_dump_start,
+    dpif_bpf_port_dump_next,
+    dpif_bpf_port_dump_done,
+    dpif_bpf_port_poll,
+    dpif_bpf_port_poll_wait,
+    dpif_bpf_flow_flush,
+    dpif_bpf_flow_dump_create,
+    dpif_bpf_flow_dump_destroy,
+    dpif_bpf_flow_dump_thread_create,
+    dpif_bpf_flow_dump_thread_destroy,
+    dpif_bpf_flow_dump_next,
+    dpif_bpf_operate,
+    dpif_bpf_recv_set,
+    dpif_bpf_handlers_set,
+    NULL,                       /* set_config */
+    NULL,                       /* queue_to_priority */
+    dpif_bpf_recv,
+    dpif_bpf_recv_wait,
+    dpif_bpf_recv_purge,
+    NULL,                       /* register_dp_purge_cb */
+    NULL,                       /* register_upcall_cb */
+    NULL,                       /* enable_upcall */
+    NULL,                       /* disable_upcall */
+    dpif_bpf_get_datapath_version,
+    NULL,                       /* ct_dump_start */
+    NULL,                       /* ct_dump_next */
+    NULL,                       /* ct_dump_done */
+    NULL,                       /* ct_flush */
+    NULL,                       /* ct_set_maxconns */
+    NULL,                       /* ct_get_maxconns */
+    NULL,                       /* ct_get_nconns */
+    NULL,                       /* meter_get_features */
+    NULL,                       /* meter_set */
+    NULL,                       /* meter_get */
+    NULL,                       /* meter_del */
+};
diff --git a/lib/dpif-provider.h b/lib/dpif-provider.h
index 62b3598acfc5..ae21593ab1b2 100644
--- a/lib/dpif-provider.h
+++ b/lib/dpif-provider.h
@@ -476,6 +476,7 @@ struct dpif_class {
 
 extern const struct dpif_class dpif_netlink_class;
 extern const struct dpif_class dpif_netdev_class;
+extern const struct dpif_class dpif_bpf_class;
 
 #ifdef  __cplusplus
 }
diff --git a/lib/dpif.c b/lib/dpif.c
index f03763ec55b4..43d97ec1582a 100644
--- a/lib/dpif.c
+++ b/lib/dpif.c
@@ -71,6 +71,9 @@ static const struct dpif_class *base_dpif_classes[] = {
 #if defined(__linux__) || defined(_WIN32)
     &dpif_netlink_class,
 #endif
+#if HAVE_BPF  /* XXX: Linux 4.9+ */
+    &dpif_bpf_class,
+#endif
     &dpif_netdev_class,
 };
 
-- 
2.7.4


