[ovs-dev] [RFC PATCH 02/11] netdev: add ebpf support for netdev provider.

William Tu u9012063 at gmail.com
Sat Jun 23 12:16:34 UTC 2018


From: Joe Stringer <joe at ovn.org>

To receive packets, an eBPF program has to be attached to a netdev
through tc ingress/egress, and an XDP program has to be attached to
a netdev's XDP hook point.  This patch introduces two new netdev_class
functions for that purpose: set_filter and set_xdp.  Currently two netdev
types, netdev-linux and netdev-vport, provide actual implementations.

Signed-off-by: William Tu <u9012063 at gmail.com>
Co-authored-by: William Tu <u9012063 at gmail.com>
Co-authored-by: Yifeng Sun <pkusunyifeng at gmail.com>
---
 include/linux/pkt_cls.h |  21 +++
 lib/dpif-netdev.c       |  29 ++--
 lib/netdev-bsd.c        |   2 +
 lib/netdev-dpdk.c       |   2 +
 lib/netdev-dummy.c      |   2 +
 lib/netdev-linux.c      | 436 +++++++++++++++++++++++++++++++++++++++++++++++-
 lib/netdev-linux.h      |   2 +
 lib/netdev-provider.h   |  11 ++
 lib/netdev-vport.c      | 145 +++++++++++++++-
 lib/netdev.c            |  25 +++
 lib/netdev.h            |   4 +
 11 files changed, 655 insertions(+), 24 deletions(-)

diff --git a/include/linux/pkt_cls.h b/include/linux/pkt_cls.h
index f7bc7ea708d7..770af90a5c64 100644
--- a/include/linux/pkt_cls.h
+++ b/include/linux/pkt_cls.h
@@ -104,6 +104,27 @@ enum {
 	__TCA_BASIC_MAX
 };
 
+/* BPF classifier */
+
+#define TCA_BPF_FLAG_ACT_DIRECT		(1 << 0)
+
+enum {
+	TCA_BPF_UNSPEC,
+	TCA_BPF_ACT,
+	TCA_BPF_POLICE,
+	TCA_BPF_CLASSID,
+	TCA_BPF_OPS_LEN,
+	TCA_BPF_OPS,
+	TCA_BPF_FD,
+	TCA_BPF_NAME,
+	TCA_BPF_FLAGS,
+	TCA_BPF_FLAGS_GEN,
+	TCA_BPF_TAG,
+	__TCA_BPF_MAX,
+};
+
+#define TCA_BPF_MAX (__TCA_BPF_MAX - 1)
+
 /* Flower classifier */
 
 enum {
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index ba62128c758c..baff020fe3d0 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -1505,12 +1505,6 @@ dp_netdev_reload_pmd__(struct dp_netdev_pmd_thread *pmd)
     ovs_mutex_unlock(&pmd->cond_mutex);
 }
 
-static uint32_t
-hash_port_no(odp_port_t port_no)
-{
-    return hash_int(odp_to_u32(port_no), 0);
-}
-
 static int
 port_create(const char *devname, const char *type,
             odp_port_t port_no, struct dp_netdev_port **portp)
@@ -1525,6 +1519,7 @@ port_create(const char *devname, const char *type,
 
     /* Open and validate network device. */
     error = netdev_open(devname, type, &netdev);
+    VLOG_INFO("%s %s error %d", __func__, devname, error);
     if (error) {
         return error;
     }
@@ -1578,7 +1573,7 @@ do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
         return error;
     }
 
-    hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
+    hmap_insert(&dp->ports, &port->node, netdev_hash_port_no(port_no));
     seq_change(dp->port_seq);
 
     reconfigure_datapath(dp);
@@ -1596,6 +1591,8 @@ dpif_netdev_port_add(struct dpif *dpif, struct netdev *netdev,
     odp_port_t port_no;
     int error;
 
+    VLOG_INFO("%s", __func__);
+
     ovs_mutex_lock(&dp->port_mutex);
     dpif_port = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
     if (*port_nop != ODPP_NONE) {
@@ -1648,7 +1645,8 @@ dp_netdev_lookup_port(const struct dp_netdev *dp, odp_port_t port_no)
 {
     struct dp_netdev_port *port;
 
-    HMAP_FOR_EACH_WITH_HASH (port, node, hash_port_no(port_no), &dp->ports) {
+    HMAP_FOR_EACH_WITH_HASH (port, node, netdev_hash_port_no(port_no),
+                             &dp->ports) {
         if (port->port_no == port_no) {
             return port;
         }
@@ -1808,7 +1806,7 @@ dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
                            odp_port_t in_port)
 {
     struct dpcls *cls;
-    uint32_t hash = hash_port_no(in_port);
+    uint32_t hash = netdev_hash_port_no(in_port);
     CMAP_FOR_EACH_WITH_HASH (cls, node, hash, &pmd->classifiers) {
         if (cls->in_port == in_port) {
             /* Port classifier exists already */
@@ -1824,7 +1822,7 @@ dp_netdev_pmd_find_dpcls(struct dp_netdev_pmd_thread *pmd,
     OVS_REQUIRES(pmd->flow_mutex)
 {
     struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
-    uint32_t hash = hash_port_no(in_port);
+    uint32_t hash = netdev_hash_port_no(in_port);
 
     if (!cls) {
         /* Create new classifier for in_port */
@@ -3311,7 +3309,7 @@ tx_port_lookup(const struct hmap *hmap, odp_port_t port_no)
 {
     struct tx_port *tx;
 
-    HMAP_FOR_EACH_IN_BUCKET (tx, node, hash_port_no(port_no), hmap) {
+    HMAP_FOR_EACH_IN_BUCKET (tx, node, netdev_hash_port_no(port_no), hmap) {
         if (tx->port->port_no == port_no) {
             return tx;
         }
@@ -4034,13 +4032,13 @@ pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
         if (netdev_has_tunnel_push_pop(tx_port->port->netdev)) {
             tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
             hmap_insert(&pmd->tnl_port_cache, &tx_port_cached->node,
-                        hash_port_no(tx_port_cached->port->port_no));
+                        netdev_hash_port_no(tx_port_cached->port->port_no));
         }
 
         if (netdev_n_txq(tx_port->port->netdev)) {
             tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
             hmap_insert(&pmd->send_port_cache, &tx_port_cached->node,
-                        hash_port_no(tx_port_cached->port->port_no));
+                        netdev_hash_port_no(tx_port_cached->port->port_no));
         }
     }
 }
@@ -4793,7 +4791,8 @@ dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
     tx->flush_time = 0LL;
     dp_packet_batch_init(&tx->output_pkts);
 
-    hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port->port_no));
+    hmap_insert(&pmd->tx_ports, &tx->node,
+                netdev_hash_port_no(tx->port->port_no));
     pmd->need_reload = true;
 }
 
@@ -5965,7 +5964,7 @@ dpif_dummy_change_port_number(struct unixctl_conn *conn, int argc OVS_UNUSED,
 
     /* Reinsert with new port number. */
     port->port_no = port_no;
-    hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
+    hmap_insert(&dp->ports, &port->node, netdev_hash_port_no(port_no));
     reconfigure_datapath(dp);
 
     seq_change(dp->port_seq);
diff --git a/lib/netdev-bsd.c b/lib/netdev-bsd.c
index 05974c100895..1460ae2504c5 100644
--- a/lib/netdev-bsd.c
+++ b/lib/netdev-bsd.c
@@ -1516,6 +1516,8 @@ netdev_bsd_update_flags(struct netdev *netdev_, enum netdev_flags off,
     NULL, /* set_advertisement */                    \
     NULL, /* get_pt_mode */                          \
     NULL, /* set_policing */                         \
+    NULL, /* set_filter */                           \
+    NULL, /* set_xdp */                              \
     NULL, /* get_qos_type */                         \
     NULL, /* get_qos_capabilities */                 \
     NULL, /* get_qos */                              \
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 52d8fe6b7ac2..20116c22137e 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -3854,6 +3854,8 @@ unlock:
     NULL,                       /* get_pt_mode */             \
                                                               \
     netdev_dpdk_set_policing,                                 \
+    NULL,                       /* set_filter */              \
+    NULL,                       /* set_xdp */                 \
     netdev_dpdk_get_qos_types,                                \
     NULL,                       /* get_qos_capabilities */    \
     netdev_dpdk_get_qos,                                      \
diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c
index 4246af3b9c86..44c9458a9a22 100644
--- a/lib/netdev-dummy.c
+++ b/lib/netdev-dummy.c
@@ -1427,6 +1427,8 @@ netdev_dummy_update_flags(struct netdev *netdev_,
     NULL,                       /* get_pt_mode */               \
                                                                 \
     NULL,                       /* set_policing */              \
+    NULL,                       /* set_filter */                \
+    NULL,                       /* set_xdp */                   \
     NULL,                       /* get_qos_types */             \
     NULL,                       /* get_qos_capabilities */      \
     NULL,                       /* get_qos */                   \
diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
index 4e0473cf331f..121dd3bc738e 100644
--- a/lib/netdev-linux.c
+++ b/lib/netdev-linux.c
@@ -46,6 +46,9 @@
 #include <string.h>
 #include <unistd.h>
 
+#include <bpf/libbpf.h> /* linux/tools/bpf/libbpf.h */
+
+#include "bpf.h"
 #include "coverage.h"
 #include "dp-packet.h"
 #include "dpif-netlink.h"
@@ -227,6 +230,9 @@ enum {
     VALID_VPORT_STAT_ERROR  = 1 << 5,
     VALID_DRVINFO           = 1 << 6,
     VALID_FEATURES          = 1 << 7,
+    VALID_INGRESS_FILTER    = 1 << 8,
+    VALID_EGRESS_FILTER     = 1 << 9,
+    VALID_XDP_FILTER        = 1 << 10,
 };
 
 /* Traffic control. */
@@ -421,6 +427,7 @@ static const struct tc_ops tc_ops_sfq;
 static const struct tc_ops tc_ops_default;
 static const struct tc_ops tc_ops_noop;
 static const struct tc_ops tc_ops_other;
+static const struct tc_ops tc_ops_clsact;
 
 static const struct tc_ops *const tcs[] = {
     &tc_ops_htb,                /* Hierarchy token bucket (see tc-htb(8)). */
@@ -431,6 +438,7 @@ static const struct tc_ops *const tcs[] = {
     &tc_ops_noop,               /* Non operating qos type. */
     &tc_ops_default,            /* Default qdisc (see tc-pfifo_fast(8)). */
     &tc_ops_other,              /* Some other qdisc. */
+    &tc_ops_clsact,             /* Classifier with nested action. */
     NULL
 };
 
@@ -442,8 +450,12 @@ static struct tcmsg *netdev_linux_tc_make_request(const struct netdev *,
                                                   int type,
                                                   unsigned int flags,
                                                   struct ofpbuf *);
+static int clsact_install__(struct netdev *netdev_);
 static int tc_add_policer(struct netdev *,
                           uint32_t kbits_rate, uint32_t kbits_burst);
+static int tc_add_filter(struct netdev *, int fd, uint32_t parent,
+                         const char *name);
+static bool tc_is_clsact(const struct tc *tc);
 
 static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
                           struct nlattr **options);
@@ -485,13 +497,19 @@ struct netdev_linux {
     long long int carrier_resets;
     uint32_t kbits_rate;        /* Policing data. */
     uint32_t kbits_burst;
+    uint32_t ingress_filter;    /* BPF ingress filter fd. */
+    uint32_t egress_filter;     /* BPF egress filter fd. */
+    uint32_t ingress_xdp_filter;/* XDP ingress filter fd. */
     int vport_stats_error;      /* Cached error code from vport_get_stats().
                                    0 or an errno value. */
     int netdev_mtu_error;       /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
     int ether_addr_error;       /* Cached error code from set/get etheraddr. */
     int netdev_policing_error;  /* Cached error code from set policing. */
+    int ingress_filter_error;   /* Cached error code from set filter. */
+    int egress_filter_error;    /* Cached error code from set filter. */
     int get_features_error;     /* Cached error code from ETHTOOL_GSET. */
     int get_ifindex_error;      /* Cached error code from SIOCGIFINDEX. */
+    int ingress_xdp_error;
 
     enum netdev_features current;    /* Cached from ETHTOOL_GSET. */
     enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
@@ -2159,8 +2177,14 @@ netdev_linux_set_policing(struct netdev *netdev_,
     if (kbits_rate) {
         error = tc_add_del_ingress_qdisc(ifindex, true);
         if (error) {
-            VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
-                         netdev_name, ovs_strerror(error));
+            const char *bpf_conflict = "";
+
+            if (error == EEXIST && (netdev->ingress_filter
+                                    || netdev->egress_filter)) {
+                bpf_conflict = " (conflicts with BPF)";
+            }
+            VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s%s",
+                         netdev_name, ovs_strerror(error), bpf_conflict);
             goto out;
         }
 
@@ -2184,6 +2208,268 @@ out:
     return error;
 }
 
+/* Attempts to set BPF filter 'prog' on the device; a NULL 'prog' adds no
+ * filter.  Returns 0 if successful, otherwise a positive errno value. */
+static int
+netdev_linux_set_filter__(struct netdev *netdev_, const struct bpf_prog *prog,
+                          unsigned int valid_bit, int *filter_error,
+                          uint32_t *netdev_filter)
+{
+    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
+    const char *netdev_name = netdev_get_name(netdev_);
+    int error = 0;  /* Paths with nothing to do must report success. */
+
+    if (prog) {
+        VLOG_DBG("Setting %s filter %d on %s (handle %08"PRIx32")", prog->name,
+                 prog->fd, netdev_name, prog->handle);
+    }
+
+    if (netdev->cache_valid & valid_bit) {
+        error = *filter_error;
+        if (error || (prog && prog->fd == *netdev_filter)) {
+            /* Assume that settings haven't changed since we last set them. */
+            goto out;
+        }
+        netdev->cache_valid &= ~valid_bit;
+    }
+
+    /* Remove non-clsact qdiscs. */
+    if (netdev->tc && !tc_is_clsact(netdev->tc)) {
+        error = tc_del_qdisc(netdev_);
+        if (error) {
+            VLOG_WARN_RL(&rl, "%s: removing qdisc failed: %s",
+                         netdev_name, ovs_strerror(error));
+            goto out;
+        }
+    }
+
+    if (prog) {
+        if (!netdev->tc || !tc_is_clsact(netdev->tc)) {
+            error = clsact_install__(netdev_);
+            if (error && error != EEXIST) {
+                VLOG_WARN_RL(&rl, "%s: clsact qdisc setup failed: %s",
+                             netdev_name, ovs_strerror(error));
+                goto out;
+            }
+        }
+        error = tc_add_filter(netdev_, prog->fd, prog->handle, prog->name);
+        if (error) {
+            VLOG_WARN_RL(&rl, "%s: adding filter %s failed: %s",
+                         netdev_name, prog->name, ovs_strerror(error));
+            goto out;
+        }
+    }
+    *netdev_filter = prog ? prog->fd : 0;
+
+out:
+    if (!error || error == ENODEV) {
+        *filter_error = error;
+        netdev->cache_valid |= valid_bit;
+    }
+    return error;
+}
+
+static int
+netdev_linux_set_filter(struct netdev *netdev_, const struct bpf_prog *prog)
+{
+    struct netdev_linux *dev = netdev_linux_cast(netdev_);
+    bool ingress = !prog || prog->handle == INGRESS_HANDLE;
+    int error;
+
+    /* A NULL 'prog' is handled on the ingress side. */
+    ovs_mutex_lock(&dev->mutex);
+    error = (ingress
+             ? netdev_linux_set_filter__(netdev_, prog, VALID_INGRESS_FILTER,
+                                         &dev->ingress_filter_error,
+                                         &dev->ingress_filter)
+             : netdev_linux_set_filter__(netdev_, prog, VALID_EGRESS_FILTER,
+                                         &dev->egress_filter_error,
+                                         &dev->egress_filter));
+    ovs_mutex_unlock(&dev->mutex);
+
+    return error;
+}
+
+#ifndef SOL_NETLINK
+#define SOL_NETLINK 270
+#endif
+
+/* Sets XDP program 'fd' on link 'ifindex' via rtnetlink; taken from libbpf. */
+int
+bpf_set_link_xdp_fd(int ifindex, int fd, uint32_t flags)
+{
+    /* Returns 0 on success, else -errno, -1, or the netlink error code. */
+    struct sockaddr_nl sa;
+    int sock, seq = 0, len, ret = -1;
+    char buf[4096];
+    struct nlattr *nla, *nla_xdp;
+    struct {
+        struct nlmsghdr nh;
+        struct ifinfomsg ifinfo;
+        char attrbuf[64];
+    } req;
+    struct nlmsghdr *nh;
+    struct nlmsgerr *err;
+    socklen_t addrlen;
+    int one = 1;
+
+    memset(&sa, 0, sizeof(sa));
+    sa.nl_family = AF_NETLINK;
+
+    sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+    if (sock < 0) {
+        return -errno;
+    }
+
+    if (setsockopt(sock, SOL_NETLINK, NETLINK_EXT_ACK,
+                   &one, sizeof(one)) < 0) {
+        VLOG_WARN_RL(&rl, "Netlink error reporting not supported");
+    }
+
+    if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
+        ret = -errno;
+        goto cleanup;
+    }
+
+    addrlen = sizeof(sa);
+    if (getsockname(sock, (struct sockaddr *)&sa, &addrlen) < 0) {
+        ret = -errno;
+        goto cleanup;
+    }
+
+    if (addrlen != sizeof(sa)) {
+        goto cleanup;           /* 'ret' is still -1 here. */
+    }
+
+    memset(&req, 0, sizeof(req));
+    req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
+    req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+    req.nh.nlmsg_type = RTM_SETLINK;
+    req.nh.nlmsg_pid = 0;
+    req.nh.nlmsg_seq = ++seq;
+    req.ifinfo.ifi_family = AF_UNSPEC;
+    req.ifinfo.ifi_index = ifindex;
+
+    /* Start the nested IFLA_XDP attribute right after the ifinfomsg. */
+    nla = (struct nlattr *)(((char *)&req)
+                           + NLMSG_ALIGN(req.nh.nlmsg_len));
+    nla->nla_type = NLA_F_NESTED | IFLA_XDP;
+    nla->nla_len = NLA_HDRLEN;
+
+    /* Add the XDP program fd as IFLA_XDP_FD inside the nest. */
+    nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
+    nla_xdp->nla_type = IFLA_XDP_FD;
+    nla_xdp->nla_len = NLA_HDRLEN + sizeof(int);
+    memcpy((char *)nla_xdp + NLA_HDRLEN, &fd, sizeof(fd));
+            nla->nla_len += nla_xdp->nla_len;
+
+    /* If the caller passed any flags, add them as IFLA_XDP_FLAGS too. */
+    if (flags) {
+        nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
+        nla_xdp->nla_type = IFLA_XDP_FLAGS;
+        nla_xdp->nla_len = NLA_HDRLEN + sizeof(flags);
+        memcpy((char *)nla_xdp + NLA_HDRLEN, &flags, sizeof(flags));
+        nla->nla_len += nla_xdp->nla_len;
+    }
+
+    req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len);
+
+    /* Send the request. */
+    if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) {
+        ret = -errno;
+        goto cleanup;
+    }
+
+    /* Receive the reply with a single recv() into 'buf'. */
+    len = recv(sock, buf, sizeof(buf), 0);
+    if (len < 0) {
+        ret = -errno;
+        goto cleanup;
+    }
+
+    for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len);
+         nh = NLMSG_NEXT(nh, len)) {
+        if (nh->nlmsg_pid != sa.nl_pid) {
+            ret = -1;
+            goto cleanup;
+        }
+        if (nh->nlmsg_seq != seq) {
+            ret = -1;
+            goto cleanup;
+        }
+        switch (nh->nlmsg_type) {
+        case NLMSG_ERROR:
+            err = (struct nlmsgerr *)NLMSG_DATA(nh);
+            if (!err->error)
+                continue;       /* Error code 0 is an ACK. */
+            ret = err->error;
+            /* nla_dump_errormsg(nh); */
+            goto cleanup;
+        case NLMSG_DONE:
+            break;
+        default:
+            break;
+        }
+    }
+
+    ret = 0;
+
+cleanup:
+    close(sock);
+    return ret;
+}
+
+/* Attaches XDP program 'prog' to the device in SKB mode.  Returns 0 if
+ * successful, otherwise a positive errno value (EINVAL if 'prog' is NULL). */
+static int
+netdev_linux_set_xdp__(struct netdev *netdev_, const struct bpf_prog *prog,
+                       unsigned int valid_bit, int *filter_error,
+                       uint32_t *netdev_filter)
+{
+    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
+    int ifindex = netdev->ifindex;
+    int error;
+
+    if (!prog) {
+        return EINVAL;
+    }
+    VLOG_DBG("Setting %s XDP filter %d on %s (ifindex %d)", prog->name,
+             prog->fd, netdev_get_name(netdev_), ifindex);
+    if ((netdev->cache_valid & valid_bit)
+        && (*filter_error || prog->fd == *netdev_filter)) {
+        return *filter_error;   /* Assume nothing changed since then. */
+    }
+    netdev->cache_valid &= ~valid_bit;
+
+    /* bpf_set_link_xdp_fd() reports failure as a negative value. */
+    error = -bpf_set_link_xdp_fd(ifindex, prog->fd, XDP_FLAGS_SKB_MODE);
+    if (error) {
+        VLOG_WARN_RL(&rl, "%s: adding XDP filter %s failed: %s",
+                     netdev_get_name(netdev_), prog->name, ovs_strerror(error));
+    } else {
+        *netdev_filter = prog->fd;
+    }
+    if (!error || error == ENODEV) {
+        *filter_error = error;
+        netdev->cache_valid |= valid_bit;
+    }
+    return error;
+}
+
+static int
+netdev_linux_set_xdp(struct netdev *netdev_, const struct bpf_prog *prog)
+{
+    struct netdev_linux *dev = netdev_linux_cast(netdev_);
+    int retval;
+
+    /* Hold the device mutex around the update. */
+    ovs_mutex_lock(&dev->mutex);
+    retval = netdev_linux_set_xdp__(netdev_, prog, VALID_XDP_FILTER,
+                                    &dev->ingress_xdp_error,
+                                    &dev->ingress_xdp_filter);
+    ovs_mutex_unlock(&dev->mutex);
+    return retval;
+}
+
 static int
 netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
                            struct sset *types)
@@ -2879,6 +3165,8 @@ netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
     NULL,                       /* get_pt_mode */               \
                                                                 \
     netdev_linux_set_policing,                                  \
+    netdev_linux_set_filter,                                    \
+    netdev_linux_set_xdp,                                       \
     netdev_linux_get_qos_types,                                 \
     netdev_linux_get_qos_capabilities,                          \
     netdev_linux_get_qos,                                       \
@@ -4671,6 +4959,74 @@ static const struct tc_ops tc_ops_other = {
     NULL                        /* class_dump_stats */
 };
 
+/* "linux-clsact" traffic control class. */
+static int
+clsact_setup_qdisc(struct netdev *netdev)
+{
+    struct ofpbuf msg;
+    struct tcmsg *tcm = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
+                                                     NLM_F_EXCL | NLM_F_CREATE,
+                                                     &msg);
+
+    if (!tcm) {
+        return ENODEV;
+    }
+    /* Request a "clsact" qdisc, without options, under the ingress parent. */
+    tcm->tcm_parent = TC_H_INGRESS;
+    tcm->tcm_handle = tc_make_handle(0xFFFF, 0);
+    nl_msg_put_string(&msg, TCA_KIND, "clsact");
+    nl_msg_put_unspec(&msg, TCA_OPTIONS, NULL, 0);
+    return tc_transact(&msg, NULL);
+}
+
+/* Creates the clsact qdisc on 'netdev_' and, if that succeeds, points the
+ * netdev's tc at the shared clsact tc object.  Returns 0 if successful,
+ * otherwise the error from clsact_setup_qdisc(). */
+static int
+clsact_install__(struct netdev *netdev_)
+{
+    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_clsact);
+    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
+    int error = clsact_setup_qdisc(netdev_);
+
+    if (!error) {
+        /* Nothing but a tc class implementation is allowed to write to a tc,
+         * and this class never does, so a const tc object is legitimate. */
+        netdev->tc = CONST_CAST(struct tc *, &tc);
+    }
+
+    return error;
+}
+
+static int
+clsact_tc_install(struct netdev *netdev,
+                   const struct smap *details OVS_UNUSED)
+{
+    return clsact_install__(netdev);    /* 'details' is ignored. */
+}
+
+static int
+clsact_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
+{
+    return clsact_install__(netdev);    /* As for install; 'nlmsg' unused. */
+}
+
+static const struct tc_ops tc_ops_clsact = {
+    "clsact",                   /* linux_name */
+    "linux-clsact",             /* ovs_name */
+    0,                          /* n_queues */
+    clsact_tc_install,          /* tc_install */
+    clsact_tc_load,             /* tc_load */
+    NULL,                       /* tc_destroy */
+    NULL,                       /* qdisc_get */
+    NULL,                       /* qdisc_set */
+    NULL,                       /* class_get */
+    NULL,                       /* class_set */
+    NULL,                       /* class_delete */
+    NULL,                       /* class_get_stats */
+    NULL                        /* class_dump_stats */
+};
+
 /* Traffic control. */
 
 /* Number of kernel "tc" ticks per second. */
@@ -4775,6 +5131,49 @@ tc_add_policer(struct netdev *netdev,
     return 0;
 }
 
+/* Adds a "bpf" classifier in direct-action mode to 'netdev', under 'parent',
+ * for the BPF program associated with 'fd'.  'name' is passed along as the
+ * filter's name.
+ *
+ * This function is equivalent to running:
+ *     /sbin/tc filter add dev <devname> <parent> bpf da object-pinned <path>
+ *
+ * The configuration and stats may be seen with the following command:
+ *     /sbin/tc -s filter show dev <devname> <parent>
+ *
+ * Returns 0 if successful, otherwise a positive errno value.
+ */
+static int
+tc_add_filter(struct netdev *netdev, int fd, uint32_t parent, const char *name)
+{
+    struct ofpbuf msg;
+    struct tcmsg *tcm;
+    size_t nest;
+
+    tcm = netdev_linux_tc_make_request(netdev, RTM_NEWTFILTER,
+                                       NLM_F_EXCL | NLM_F_CREATE, &msg);
+    if (!tcm) {
+        return ENODEV;
+    }
+
+    /* Filter header: fixed handle, caller's parent, all protocols. */
+    tcm->tcm_parent = parent;
+    tcm->tcm_handle = tc_make_handle(0, 0x1);
+    tcm->tcm_info = tc_make_handle(0, /* preference */
+                                   (OVS_FORCE uint16_t) htons(ETH_P_ALL));
+
+    /* Classifier kind, then its options as one nested attribute. */
+    nl_msg_put_string(&msg, TCA_KIND, "bpf");
+    nest = nl_msg_start_nested(&msg, TCA_OPTIONS);
+    nl_msg_put_u32(&msg, TCA_BPF_FLAGS, TCA_BPF_FLAG_ACT_DIRECT);
+    nl_msg_put_u32(&msg, TCA_BPF_FD, fd);
+    nl_msg_put_string(&msg, TCA_BPF_NAME, name);
+    nl_msg_end_nested(&msg, nest);
+
+    /* The transaction result is returned as is: 0 or an error value. */
+    return tc_transact(&msg, NULL);
+}
+
 static void
 read_psched(void)
 {
@@ -5060,21 +5459,21 @@ tc_delete_class(const struct netdev *netdev, unsigned int handle)
     return error;
 }
 
-/* Equivalent to "tc qdisc del dev <name> root". */
+/* Equivalent to "tc qdisc del dev <name> handle <handle> <parent>". */
 static int
-tc_del_qdisc(struct netdev *netdev_)
+tc_del_qdisc__(struct netdev_linux *netdev, uint32_t parent, uint32_t handle)
 {
-    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
     struct ofpbuf request;
     struct tcmsg *tcmsg;
     int error;
 
-    tcmsg = netdev_linux_tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
+    tcmsg = netdev_linux_tc_make_request(&netdev->up, RTM_DELQDISC, 0,
+                                         &request);
     if (!tcmsg) {
         return ENODEV;
     }
-    tcmsg->tcm_handle = tc_make_handle(1, 0);
-    tcmsg->tcm_parent = TC_H_ROOT;
+    tcmsg->tcm_handle = handle;
+    tcmsg->tcm_parent = parent;
 
     error = tc_transact(&request, NULL);
     if (error == EINVAL) {
@@ -5092,6 +5491,27 @@ tc_del_qdisc(struct netdev *netdev_)
 }
 
 static bool
+tc_is_clsact(const struct tc *tc)
+{
+    /* True only for a tc whose ops have the linux_name "clsact". */
+    const char *kind = tc ? tc->ops->linux_name : NULL;
+
+    return kind && !strcmp(kind, "clsact");
+}
+
+static int
+tc_del_qdisc(struct netdev *netdev_)
+{
+    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
+
+    /* A clsact qdisc is deleted by the handle clsact_setup_qdisc() gave it;
+     * anything else is deleted as the root qdisc. */
+    return (tc_is_clsact(netdev->tc)
+            ? tc_del_qdisc__(netdev, TC_H_INGRESS, tc_make_handle(0xFFFF, 0))
+            : tc_del_qdisc__(netdev, TC_H_ROOT, tc_make_handle(1, 0)));
+}
+
+static bool
 getqdisc_is_safe(void)
 {
     static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
diff --git a/lib/netdev-linux.h b/lib/netdev-linux.h
index 880f86402a1e..8257d4c695f9 100644
--- a/lib/netdev-linux.h
+++ b/lib/netdev-linux.h
@@ -29,6 +29,8 @@ int netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
                                   const char *flag_name, bool enable);
 int linux_get_ifindex(const char *netdev_name);
 
+int bpf_set_link_xdp_fd(int ifindex, int fd, uint32_t flags);
+
 #define LINUX_FLOW_OFFLOAD_API                                  \
             netdev_tc_flow_flush,                               \
             netdev_tc_flow_dump_create,                         \
diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h
index 25bd671c1382..3e53a5b76272 100644
--- a/lib/netdev-provider.h
+++ b/lib/netdev-provider.h
@@ -32,6 +32,7 @@
 extern "C" {
 #endif
 
+struct bpf_prog;
 struct netdev_tnl_build_header_params;
 #define NETDEV_NUMA_UNSPEC OVS_NUMA_UNSPEC
 
@@ -505,6 +506,16 @@ struct netdev_class {
     int (*set_policing)(struct netdev *netdev, unsigned int kbits_rate,
                         unsigned int kbits_burst);
 
+    /* Attempts to attach a traffic filter in the form of an (e)BPF program.
+     *
+     * This function may be set to null if filters are not supported. */
+    int (*set_filter)(struct netdev *netdev, const struct bpf_prog *);
+
+    /* Attempts to attach a XDP eBPF program.
+     *
+     * This function may be set to null if filters are not supported. */
+    int (*set_xdp)(struct netdev *netdev, const struct bpf_prog *);
+
     /* Adds to 'types' all of the forms of QoS supported by 'netdev', or leaves
      * it empty if 'netdev' does not support QoS.  Any names added to 'types'
      * should be documented as valid for the "type" column in the "QoS" table
diff --git a/lib/netdev-vport.c b/lib/netdev-vport.c
index 52aa12d79933..4341c89894a3 100644
--- a/lib/netdev-vport.c
+++ b/lib/netdev-vport.c
@@ -22,12 +22,14 @@
 #include <errno.h>
 #include <fcntl.h>
 #include <sys/socket.h>
+#include <linux/rtnetlink.h>
 #include <net/if.h>
 #include <sys/types.h>
 #include <netinet/in.h>
 #include <netinet/ip6.h>
 #include <sys/ioctl.h>
 
+#include "bpf.h"
 #include "byte-order.h"
 #include "daemon.h"
 #include "dirs.h"
@@ -43,6 +45,7 @@
 #include "route-table.h"
 #include "smap.h"
 #include "socket-util.h"
+#include "tc.h"
 #include "unaligned.h"
 #include "unixctl.h"
 #include "openvswitch/vlog.h"
@@ -72,6 +75,10 @@ struct vport_class {
     struct netdev_class netdev_class;
 };
 
+/* This is set pretty low because we probably won't learn anything from the
+ * additional log messages. */
+static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
+
 bool
 netdev_vport_is_vport_class(const struct netdev_class *class)
 {
@@ -866,6 +873,140 @@ netdev_vport_get_ifindex(const struct netdev *netdev_)
     return linux_get_ifindex(name);
 }
 
+/* "linux-clsact" traffic control class. */
+static int
+clsact_setup_qdisc(struct netdev *netdev)
+{
+    int ifindex = netdev_vport_get_ifindex(netdev);
+    struct ofpbuf msg;
+    struct tcmsg *tcm;
+
+    /* Ask for a new qdisc; callers treat an EEXIST result as harmless. */
+    tcm = tc_make_request(ifindex, RTM_NEWQDISC, NLM_F_EXCL | NLM_F_CREATE,
+                          &msg);
+    if (!tcm) {
+        return ENODEV;
+    }
+
+    /* A "clsact" qdisc with no options, under the ingress parent. */
+    tcm->tcm_parent = TC_H_INGRESS;
+    tcm->tcm_handle = tc_make_handle(0xFFFF, 0);
+    nl_msg_put_string(&msg, TCA_KIND, "clsact");
+    nl_msg_put_unspec(&msg, TCA_OPTIONS, NULL, 0);
+    return tc_transact(&msg, NULL);
+}
+
+#ifndef ETH_P_ALL
+#define ETH_P_ALL 0x0003        /* Every protocol, as in <linux/if_ether.h>. */
+#endif
+
+/* Adds a "bpf" direct-action filter for the program open on 'fd', named
+ * 'name', to 'netdev' under 'parent'.  Returns 0 or a positive errno value. */
+static int
+tc_add_filter(struct netdev *netdev, int fd, uint32_t parent, const char *name)
+{
+    int ifindex = netdev_vport_get_ifindex(netdev);
+    struct ofpbuf request;
+    struct tcmsg *tcmsg;
+    size_t opts_offset;
+
+    tcmsg = tc_make_request(ifindex, RTM_NEWTFILTER, NLM_F_EXCL | NLM_F_CREATE,
+                            &request);
+    if (!tcmsg) {
+        return ENODEV;
+    }
+
+    /* Filter header: fixed handle, caller's parent, all protocols. */
+    tcmsg->tcm_handle = tc_make_handle(0, 0x1);
+    tcmsg->tcm_parent = parent;
+    tcmsg->tcm_info = tc_make_handle(0, /* preference */
+                                     (OVS_FORCE uint16_t) htons(ETH_P_ALL));
+
+    /* Classifier kind, then its options as one nested attribute. */
+    nl_msg_put_string(&request, TCA_KIND, "bpf");
+    opts_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
+    nl_msg_put_u32(&request, TCA_BPF_FLAGS, TCA_BPF_FLAG_ACT_DIRECT);
+    nl_msg_put_u32(&request, TCA_BPF_FD, fd);
+    nl_msg_put_string(&request, TCA_BPF_NAME, name);
+    nl_msg_end_nested(&request, opts_offset);
+
+    return tc_transact(&request, NULL);
+}
+
+/* Attempts to set BPF filter 'prog' on the device; a NULL 'prog' is a no-op.
+ * 'valid_bit', 'filter_error' and 'netdev_filter' are currently unused.
+ * Returns 0 if successful, otherwise a positive errno value. */
+static int
+netdev_vport_set_filter__(struct netdev *netdev_, const struct bpf_prog *prog,
+                          unsigned int OVS_UNUSED valid_bit,
+                          int OVS_UNUSED *filter_error,
+                          uint32_t OVS_UNUSED *netdev_filter)
+{
+    const char *netdev_name = netdev_get_name(netdev_);
+    int error;
+
+    if (!prog) {
+        return 0;
+    }
+
+    VLOG_DBG("Setting %s filter %d on %s (handle %08"PRIx32")", prog->name,
+             prog->fd, netdev_name, prog->handle);
+
+    /* The filter hangs off a clsact qdisc.  EEXIST is not a failure: the
+     * qdisc is requested with NLM_F_EXCL and may already be there. */
+    error = clsact_setup_qdisc(netdev_);
+    if (error && error != EEXIST) {
+        VLOG_WARN_RL(&rl, "%s: clsact qdisc setup failed: %s",
+                     netdev_name, ovs_strerror(error));
+        return error;
+    }
+
+    error = tc_add_filter(netdev_, prog->fd, prog->handle, prog->name);
+    if (error) {
+        VLOG_WARN_RL(&rl, "%s: adding filter %s failed: %s",
+                     netdev_name, prog->name, ovs_strerror(error));
+    }
+
+    return error;
+}
+
+/* Sets BPF filter 'prog' on vport 'netdev_'.  Only a NULL 'prog' or one with
+ * INGRESS_HANDLE is acted on; any other is skipped and 0 is returned. */
+static int
+netdev_vport_set_filter(struct netdev *netdev_, const struct bpf_prog *prog)
+{
+    struct netdev_vport *netdev = netdev_vport_cast(netdev_);
+    int error = 0;
+
+    ovs_mutex_lock(&netdev->mutex);
+    if (!prog || prog->handle == INGRESS_HANDLE) {
+        error = netdev_vport_set_filter__(netdev_, prog, 0, NULL, NULL);
+    }
+    ovs_mutex_unlock(&netdev->mutex);
+    VLOG_DBG("%s %d", __func__, error);
+    return error;
+}
+
+int bpf_set_link_xdp_fd(int ifindex, int fd, uint32_t flags);
+
+/* Attaches XDP program 'prog' in SKB mode.  Returns 0 or a positive errno. */
+static int
+netdev_vport_set_xdp(struct netdev *netdev_, const struct bpf_prog *prog)
+{
+    struct netdev_vport *netdev = netdev_vport_cast(netdev_);
+    int error;
+
+    if (!prog) {
+        return EINVAL;          /* Nothing to attach. */
+    }
+    ovs_mutex_lock(&netdev->mutex);
+    /* bpf_set_link_xdp_fd() reports failure as a negative value. */
+    error = -bpf_set_link_xdp_fd(netdev_vport_get_ifindex(netdev_), prog->fd,
+                                 XDP_FLAGS_SKB_MODE);
+    ovs_mutex_unlock(&netdev->mutex);
+    return error;
+}
+
 #define NETDEV_VPORT_GET_IFINDEX netdev_vport_get_ifindex
 #define NETDEV_FLOW_OFFLOAD_API LINUX_FLOW_OFFLOAD_API
 #else /* !__linux__ */
@@ -914,6 +1055,8 @@ netdev_vport_get_ifindex(const struct netdev *netdev_)
     get_pt_mode,                                            \
                                                             \
     NULL,                       /* set_policing */          \
+    netdev_vport_set_filter,    /* set_filter */            \
+    netdev_vport_set_xdp,       /* set_xdp */               \
     NULL,                       /* get_qos_types */         \
     NULL,                       /* get_qos_capabilities */  \
     NULL,                       /* get_qos */               \
@@ -972,7 +1115,7 @@ netdev_vport_tunnel_register(void)
         TUNNEL_CLASS("gre", "gre_sys", netdev_gre_build_header,
                                        netdev_gre_push_header,
                                        netdev_gre_pop_header,
-                                       NULL),
+                                       NETDEV_VPORT_GET_IFINDEX),
         TUNNEL_CLASS("vxlan", "vxlan_sys", netdev_vxlan_build_header,
                                            netdev_tnl_push_udp_header,
                                            netdev_vxlan_pop_header,
diff --git a/lib/netdev.c b/lib/netdev.c
index be05dc64024a..c44a1a683b92 100644
--- a/lib/netdev.c
+++ b/lib/netdev.c
@@ -759,6 +759,13 @@ netdev_get_pt_mode(const struct netdev *netdev)
             : NETDEV_PT_LEGACY_L2);
 }
 
+/* Returns the 32-bit hash_int() hash of 'port_no' converted to a u32. */
+uint32_t
+netdev_hash_port_no(odp_port_t port_no)
+{
+    return hash_int(odp_to_u32(port_no), 0);
+}
+
 /* Sends 'batch' on 'netdev'.  Returns 0 if successful (for every packet),
  * otherwise a positive errno value.  Returns EAGAIN without blocking if
  * at least one the packets cannot be queued immediately.  Returns EMSGSIZE
@@ -1449,6 +1456,24 @@ netdev_set_policing(struct netdev *netdev, uint32_t kbits_rate,
             : EOPNOTSUPP);
 }
 
+/* Applies (e)BPF filter 'prog' to 'netdev'; EOPNOTSUPP if unsupported. */
+int
+netdev_set_filter(struct netdev *netdev, struct bpf_prog *prog)
+{
+    const struct netdev_class *class = netdev->netdev_class;
+
+    return class->set_filter ? class->set_filter(netdev, prog) : EOPNOTSUPP;
+}
+
+/* Applies XDP program 'prog' to 'netdev'; EOPNOTSUPP if unsupported. */
+int
+netdev_set_xdp(struct netdev *netdev, struct bpf_prog *prog)
+{
+    const struct netdev_class *class = netdev->netdev_class;
+
+    return class->set_xdp ? class->set_xdp(netdev, prog) : EOPNOTSUPP;
+}
+
 /* Adds to 'types' all of the forms of QoS supported by 'netdev', or leaves it
  * empty if 'netdev' does not support QoS.  Any names added to 'types' should
  * be documented as valid for the "type" column in the "QoS" table in
diff --git a/lib/netdev.h b/lib/netdev.h
index ff1b604b24e2..3388504d85c9 100644
--- a/lib/netdev.h
+++ b/lib/netdev.h
@@ -59,6 +59,7 @@ extern "C" {
  *      netdev and access each of those from a different thread.)
  */
 
+struct bpf_prog;
 struct dp_packet_batch;
 struct dp_packet;
 struct netdev_class;
@@ -167,6 +168,7 @@ bool netdev_mtu_is_user_config(struct netdev *);
 int netdev_get_ifindex(const struct netdev *);
 int netdev_set_tx_multiq(struct netdev *, unsigned int n_txq);
 enum netdev_pt_mode netdev_get_pt_mode(const struct netdev *);
+uint32_t netdev_hash_port_no(odp_port_t port_no);
 
 /* Packet reception. */
 int netdev_rxq_open(struct netdev *, struct netdev_rxq **, int id);
@@ -316,6 +318,8 @@ struct netdev_queue_stats {
 
 int netdev_set_policing(struct netdev *, uint32_t kbits_rate,
                         uint32_t kbits_burst);
+int netdev_set_filter(struct netdev *netdev, struct bpf_prog *prog);
+int netdev_set_xdp(struct netdev *netdev, struct bpf_prog *prog);
 
 int netdev_get_qos_types(const struct netdev *, struct sset *types);
 int netdev_get_qos_capabilities(const struct netdev *,
-- 
2.7.4



More information about the dev mailing list