[ovs-dev] [PATCH ovs v1 3/4] tnl-neigh-cache: Allow openvswitch learning neigh entries.

xiangxia.m.yue at gmail.com xiangxia.m.yue at gmail.com
Mon Dec 14 02:20:00 UTC 2020


From: Tonghao Zhang <xiangxia.m.yue at gmail.com>

In the flow bifurcation case, the system has only one IP address
(for example, an IPv4 address).  It is assigned to the PF netdevice,
not to the Open vSwitch bridge.  We want to steer tunnel packets from
the PF to Open vSwitch and have Open vSwitch build the outgoing
tunnel packets.  When building the tunnel packets, Open vSwitch can
use the neigh entries learned from the system.

Signed-off-by: Tonghao Zhang <xiangxia.m.yue at gmail.com>
---
 lib/dpif-netdev.c     |   1 +
 lib/tnl-neigh-cache.c | 312 +++++++++++++++++++++++++++++++++++++++++++++++---
 lib/tnl-neigh-cache.h |   2 +
 vswitchd/bridge.c     |   2 +
 vswitchd/vswitch.xml  |  17 +++
 5 files changed, 321 insertions(+), 13 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 300861ca5..edc4122af 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -5804,6 +5804,7 @@ dpif_netdev_wait(struct dpif *dpif)
     ovs_mutex_unlock(&dp->port_mutex);
     ovs_mutex_unlock(&dp_netdev_mutex);
     seq_wait(tnl_conf_seq, dp->last_tnl_conf_seq);
+    tnl_neigh_cache_wait();
 }
 
 static void
diff --git a/lib/tnl-neigh-cache.c b/lib/tnl-neigh-cache.c
index 5bda4af7e..8f346ba78 100644
--- a/lib/tnl-neigh-cache.c
+++ b/lib/tnl-neigh-cache.c
@@ -22,6 +22,8 @@
 #include <sys/types.h>
 #include <netinet/in.h>
 #include <netinet/icmp6.h>
+#include <linux/rtnetlink.h>
+#include <linux/if_ether.h>
 #include <stdlib.h>
 
 #include "bitmap.h"
@@ -35,6 +37,7 @@
 #include "ovs-thread.h"
 #include "packets.h"
 #include "openvswitch/poll-loop.h"
+#include "openvswitch/ofpbuf.h"
 #include "seq.h"
 #include "socket-util.h"
 #include "timeval.h"
@@ -42,10 +45,16 @@
 #include "unixctl.h"
 #include "util.h"
 #include "openvswitch/vlog.h"
+#include "netlink-notifier.h"
+#include "netlink-socket.h"
+#include "netlink.h"
+#include "smap.h"
 
+VLOG_DEFINE_THIS_MODULE(tnl_neigh_cache);
 
 /* In seconds */
 #define NEIGH_ENTRY_DEFAULT_IDLE_TIME  (15 * 60)
+#define NUD_VALID (NUD_PERMANENT|NUD_REACHABLE|NUD_PROBE|NUD_STALE|NUD_DELAY)
 
 struct tnl_neigh_entry {
     struct cmap_node cmap_node;
@@ -53,10 +62,30 @@ struct tnl_neigh_entry {
     struct eth_addr mac;
     time_t expires;             /* Expiration time. */
     char br_name[IFNAMSIZ];
+    bool event;
 };
 
+enum tnl_neigh_nlmsg_op {
+    TNL_NEIGH_NLMSG_ADD = 1,    /* Parsed from RTM_NEWNEIGH. */
+    TNL_NEIGH_NLMSG_DEL,        /* Parsed from RTM_DELNEIGH. */
+};
+
+struct tnl_neigh_nlmsg {
+    struct in6_addr ip;         /* Neighbor IP; IPv4 is stored v4-mapped. */
+    struct eth_addr mac;        /* Link-layer address; set for ADD only. */
+    char br_name[IFNAMSIZ];     /* Interface name looked up from ndm_ifindex. */
+    enum tnl_neigh_nlmsg_op op; /* Whether to add or delete the entry. */
+};
+
+static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
 static struct cmap table = CMAP_INITIALIZER;
 static struct ovs_mutex mutex = OVS_MUTEX_INITIALIZER;
+static struct nln_notifier *neigh_notifier = NULL;
+static struct nln *neigh_nln = NULL;
+static struct tnl_neigh_nlmsg tnmsg;
+
+static int tnl_neigh_event_parse(struct ofpbuf *, struct tnl_neigh_nlmsg *);
+static void tnl_neigh_event_change(const struct tnl_neigh_nlmsg *, void *);
 
 static uint32_t
 tnl_neigh_hash(const struct in6_addr *ip)
@@ -72,7 +101,8 @@ tnl_neigh_lookup__(const char br_name[IFNAMSIZ], const struct in6_addr *dst)
 
     hash = tnl_neigh_hash(dst);
     CMAP_FOR_EACH_WITH_HASH (neigh, cmap_node, hash, &table) {
-        if (ipv6_addr_equals(&neigh->ip, dst) && !strcmp(neigh->br_name, br_name)) {
+        if (ipv6_addr_equals(&neigh->ip, dst) &&
+            !strcmp(neigh->br_name, br_name) && !neigh->event) {
             if (neigh->expires <= time_now()) {
                 return NULL;
             }
@@ -81,6 +111,15 @@ tnl_neigh_lookup__(const char br_name[IFNAMSIZ], const struct in6_addr *dst)
             return neigh;
         }
     }
+
+    /* Fall back to an entry learned from the system, whatever its bridge. */
+    CMAP_FOR_EACH_WITH_HASH (neigh, cmap_node, hash, &table) {
+        if (ipv6_addr_equals(&neigh->ip, dst) &&
+            neigh->event) {
+            return neigh;
+        }
+    }
+
     return NULL;
 }
 
@@ -114,15 +153,13 @@ tnl_neigh_delete(struct tnl_neigh_entry *neigh)
 }
 
 static void
-tnl_neigh_set__(const char name[IFNAMSIZ], const struct in6_addr *dst,
-                const struct eth_addr mac)
+tnl_neigh_set_nolock(const char name[IFNAMSIZ], const struct in6_addr *dst,
+                     const struct eth_addr mac, bool event)
 {
-    ovs_mutex_lock(&mutex);
     struct tnl_neigh_entry *neigh = tnl_neigh_lookup__(name, dst);
     if (neigh) {
         if (eth_addr_equals(neigh->mac, mac)) {
             neigh->expires = time_now() + NEIGH_ENTRY_DEFAULT_IDLE_TIME;
-            ovs_mutex_unlock(&mutex);
             return;
         }
         tnl_neigh_delete(neigh);
@@ -130,12 +167,39 @@ tnl_neigh_set__(const char name[IFNAMSIZ], const struct in6_addr *dst,
     seq_change(tnl_conf_seq);
 
     neigh = xmalloc(sizeof *neigh);
-
     neigh->ip = *dst;
     neigh->mac = mac;
+    neigh->event = event;
     neigh->expires = time_now() + NEIGH_ENTRY_DEFAULT_IDLE_TIME;
     ovs_strlcpy(neigh->br_name, name, sizeof neigh->br_name);
     cmap_insert(&table, &neigh->cmap_node, tnl_neigh_hash(&neigh->ip));
+}
+
+/* Drops the system-learned entries for 'dst' on 'name'; takes no lock. */
+static void
+tnl_neigh_unset_nolock(const char name[IFNAMSIZ], const struct in6_addr *dst)
+{
+    struct tnl_neigh_entry *neigh;
+    bool changed = false;
+
+    CMAP_FOR_EACH_WITH_HASH (neigh, cmap_node, tnl_neigh_hash(dst), &table) {
+        if (neigh->event && ipv6_addr_equals(&neigh->ip, dst)
+            && !strcmp(neigh->br_name, name)) {
+            tnl_neigh_delete(neigh);
+            changed = true;
+        }
+    }
+    if (changed) {
+        seq_change(tnl_conf_seq);
+    }
+}
+
+static void
+tnl_neigh_set__(const char name[IFNAMSIZ], const struct in6_addr *dst,
+                const struct eth_addr mac)
+{
+    ovs_mutex_lock(&mutex);
+    tnl_neigh_set_nolock(name, dst, mac, false);
     ovs_mutex_unlock(&mutex);
 }
 
@@ -208,11 +272,16 @@ tnl_neigh_cache_run(void)
 
     ovs_mutex_lock(&mutex);
     CMAP_FOR_EACH(neigh, cmap_node, &table) {
-        if (neigh->expires <= time_now()) {
+        if (!neigh->event && neigh->expires <= time_now()) {
             tnl_neigh_delete(neigh);
             changed = true;
         }
     }
+
+    if (neigh_nln) {
+        nln_run(neigh_nln);
+    }
+
     ovs_mutex_unlock(&mutex);
 
     if (changed) {
@@ -220,6 +289,16 @@ tnl_neigh_cache_run(void)
     }
 }
 
+void
+tnl_neigh_cache_wait(void)
+{
+    ovs_mutex_lock(&mutex);
+    if (neigh_nln) {            /* NULL unless event learning is enabled. */
+        nln_wait(neigh_nln);
+    }
+    ovs_mutex_unlock(&mutex);
+}
+
 void
 tnl_neigh_flush(const char br_name[IFNAMSIZ])
 {
@@ -241,21 +320,29 @@ tnl_neigh_flush(const char br_name[IFNAMSIZ])
 }
 
 static void
-tnl_neigh_cache_flush(struct unixctl_conn *conn, int argc OVS_UNUSED,
-                    const char *argv[] OVS_UNUSED, void *aux OVS_UNUSED)
+tnl_neigh_flush__(bool event)
 {
     struct tnl_neigh_entry *neigh;
     bool changed = false;
 
     ovs_mutex_lock(&mutex);
-    CMAP_FOR_EACH(neigh, cmap_node, &table) {
-        tnl_neigh_delete(neigh);
-        changed = true;
+    CMAP_FOR_EACH (neigh, cmap_node, &table) {
+        if (!event || neigh->event) {
+            tnl_neigh_delete(neigh);
+            changed = true;
+        }
     }
     ovs_mutex_unlock(&mutex);
     if (changed) {
         seq_change(tnl_conf_seq);
     }
+}
+
+static void
+tnl_neigh_cache_flush(struct unixctl_conn *conn, int argc OVS_UNUSED,
+                    const char *argv[] OVS_UNUSED, void *aux OVS_UNUSED)
+{
+    tnl_neigh_flush__(false);
     unixctl_command_reply(conn, "OK");
 }
 
@@ -319,7 +406,7 @@ tnl_neigh_cache_show(struct unixctl_conn *conn, int argc OVS_UNUSED,
 
         ds_put_format(&ds, ETH_ADDR_FMT"   %s",
                       ETH_ADDR_ARGS(neigh->mac), neigh->br_name);
-        if (neigh->expires <= time_now()) {
+        if (!neigh->event && neigh->expires <= time_now()) {
             ds_put_format(&ds, " STALE");
         }
         ds_put_char(&ds, '\n');
@@ -330,6 +417,205 @@ tnl_neigh_cache_show(struct unixctl_conn *conn, int argc OVS_UNUSED,
     ds_destroy(&ds);
 }
 
+/* nln parse callback: decodes an RTM_NEWNEIGH or RTM_DELNEIGH message in
+ * 'buf' into 'change'.  Returns RTNLGRP_NEIGH if 'change' was filled in, 0
+ * if the message is malformed, irrelevant or describes an unusable entry. */
+static int
+tnl_neigh_event_parse(struct ofpbuf *buf, struct tnl_neigh_nlmsg *change)
+{
+    static const struct nl_policy policy[] = {
+        [NDA_DST] = { .type = NL_A_UNSPEC,
+                      .min_len = sizeof(struct in_addr),
+                      .optional = false, },
+        [NDA_LLADDR] = { .type = NL_A_UNSPEC,
+                         .min_len = ETH_ALEN,
+                         .optional = true, },
+    };
+    struct nlattr *attrs[ARRAY_SIZE(policy)];
+    const struct nlmsghdr *nlmsg = buf->data;
+    const struct ndmsg *ndm;
+    char namebuf[IFNAMSIZ];
+
+    /* Process RTM_NEWNEIGH or RTM_DELNEIGH events only. */
+    if (nlmsg->nlmsg_type != RTM_NEWNEIGH &&
+        nlmsg->nlmsg_type != RTM_DELNEIGH) {
+        return 0;
+    }
+
+    /* ofpbuf_at() returns NULL if 'buf' is too short to hold a ndmsg. */
+    ndm = ofpbuf_at(buf, NLMSG_HDRLEN, sizeof *ndm);
+    if (!ndm || (ndm->ndm_family != AF_INET && ndm->ndm_family != AF_INET6)) {
+        return 0;
+    }
+
+    if (!nl_policy_parse(buf, NLMSG_HDRLEN + sizeof *ndm,
+                         policy, attrs, ARRAY_SIZE(policy))) {
+        VLOG_DBG_RL(&rl, "The tnl neigh event parse failed");
+        return 0;
+    }
+
+    /* The policy only guarantees an IPv4-sized NDA_DST. */
+    if (ndm->ndm_family == AF_INET6
+        && nl_attr_get_size(attrs[NDA_DST]) < sizeof(struct in6_addr)) {
+        return 0;
+    }
+
+    if (!if_indextoname(ndm->ndm_ifindex, namebuf)) {
+        return 0;
+    }
+
+    memset(change, 0, sizeof *change);
+    ovs_strlcpy(change->br_name, namebuf, sizeof change->br_name);
+    if (ndm->ndm_family == AF_INET) {
+        const ovs_be32 *ip4 = nl_attr_get_unspec(attrs[NDA_DST], sizeof *ip4);
+        change->ip = in6_addr_mapped_ipv4(*ip4);
+    } else {
+        const struct in6_addr *ip6;
+        ip6 = nl_attr_get_unspec(attrs[NDA_DST], sizeof *ip6);
+        change->ip = *ip6;
+    }
+    change->op = TNL_NEIGH_NLMSG_DEL;
+    if (nlmsg->nlmsg_type == RTM_NEWNEIGH) {
+        /* Do not cache an entry that is not resolved yet. */
+        if (!(ndm->ndm_state & NUD_VALID) || !attrs[NDA_LLADDR]) {
+            return 0;
+        }
+        const struct eth_addr *mac;
+        mac = nl_attr_get_unspec(attrs[NDA_LLADDR], ETH_ALEN);
+        change->mac = *mac;
+        change->op = TNL_NEIGH_NLMSG_ADD;
+    }
+
+    return RTNLGRP_NEIGH;
+}
+
+/* Applies one decoded neighbor 'change' to the cache; a NULL 'change' is
+ * ignored.  Registered as the nln notifier callback and also invoked
+ * directly for entries of the initial dump.  The *_nolock helpers used
+ * here do not take 'mutex' themselves. */
+static void
+tnl_neigh_event_change(const struct tnl_neigh_nlmsg *change,
+                       void *aux OVS_UNUSED)
+{
+    if (!change) {
+        return;
+    }
+
+    if (change->op == TNL_NEIGH_NLMSG_ADD) {
+        VLOG_DBG("Add neigh entry: %s "ETH_ADDR_FMT,
+                 change->br_name, ETH_ADDR_ARGS(change->mac));
+        /* 'true' marks the entry as learned from the system. */
+        tnl_neigh_set_nolock(change->br_name, &change->ip,
+                             change->mac, true);
+    } else if (change->op == TNL_NEIGH_NLMSG_DEL) {
+        char addr_str[INET6_ADDRSTRLEN];
+
+        ipv6_string_mapped(addr_str, &change->ip);
+        VLOG_DBG("Del neigh entry: %s %s", change->br_name, addr_str);
+        tnl_neigh_unset_nolock(change->br_name, &change->ip);
+    } else {
+        /* Not expected: the parser only produces ADD or DEL. */
+        VLOG_ERR_RL(&rl, "The message ops of neigh netlink is unknown");
+    }
+}
+
+/* Destroys the neighbor notifier, then its nln handle; safe to repeat. */
+static void
+tnl_neigh_event_uninit(void)
+{
+    if (neigh_notifier) {
+        nln_notifier_destroy(neigh_notifier);
+        neigh_notifier = NULL;
+    }
+    if (neigh_nln) {
+        nln_destroy(neigh_nln);
+        neigh_nln = NULL;
+    }
+}
+
+/* Sets up 'neigh_nln' and 'neigh_notifier'; returns 0, or -1 on failure. */
+static int
+tnl_neigh_event_init(void)
+{
+    neigh_nln = nln_create(NETLINK_ROUTE,
+                           (nln_parse_func *) tnl_neigh_event_parse,
+                           &tnmsg);
+    if (!neigh_nln) {
+        return -1;
+    }
+
+    neigh_notifier =
+        nln_notifier_create(neigh_nln, RTNLGRP_NEIGH,
+                            (nln_notify_func *) tnl_neigh_event_change,
+                            NULL);
+    if (!neigh_notifier) {
+        tnl_neigh_event_uninit();
+        return -1;
+    }
+    return 0;
+}
+
+/* Seeds the cache from a kernel RTM_GETNEIGH dump, holding 'mutex' around
+ * each cache update.  Returns the status of nl_dump_done(). */
+static int
+tnl_neigh_event_dump(void)
+{
+    uint64_t reply_stub[NL_DUMP_BUFSIZE / 8];
+    struct ofpbuf request, reply, buf;
+    struct nl_dump dump;
+    struct ndmsg *ndmsg;
+
+    ofpbuf_init(&request, 0);
+    nl_msg_put_nlmsghdr(&request, sizeof *ndmsg, RTM_GETNEIGH,
+                        NLM_F_REQUEST | NLM_F_DUMP);
+    ndmsg = ofpbuf_put_zeros(&request, sizeof *ndmsg);
+    ndmsg->ndm_family = AF_UNSPEC;
+    nl_dump_start(&dump, NETLINK_ROUTE, &request);
+    ofpbuf_uninit(&request);
+    ofpbuf_use_stub(&buf, reply_stub, sizeof reply_stub);
+    while (nl_dump_next(&dump, &reply, &buf)) {
+        struct tnl_neigh_nlmsg msg;
+        if (tnl_neigh_event_parse(&reply, &msg)) {
+            ovs_mutex_lock(&mutex);
+            tnl_neigh_event_change(&msg, NULL);
+            ovs_mutex_unlock(&mutex);
+        }
+    }
+    ofpbuf_uninit(&buf);
+    return nl_dump_done(&dump);
+}
+
+/* Applies other_config:tnl-neigh-event-enabled from 'ovs_other_config'.
+ * Enabling starts the netlink listener and seeds the cache with a dump of
+ * the kernel neighbor table; disabling flushes the learned entries and
+ * stops the listener.  Does nothing if already in the requested state. */
+void
+tnl_neigh_event_enabled(const struct smap *ovs_other_config)
+{
+    if (smap_get_bool(ovs_other_config, "tnl-neigh-event-enabled", false)) {
+        if (neigh_nln || neigh_notifier) {
+            return;
+        }
+
+        if (tnl_neigh_event_init()) {
+            VLOG_ERR("Can't create nln handle or notifier for "
+                     "neighboring subsystem");
+            return;
+        }
+
+        if (tnl_neigh_event_dump()) {
+            /* Entries kept from a partial dump would never expire. */
+            tnl_neigh_flush__(true);
+            tnl_neigh_event_uninit();
+            VLOG_ERR("Can't dump neigh entries");
+        }
+    } else if (neigh_nln || neigh_notifier) {
+        /* Forget what was learned before tearing the listener down. */
+        tnl_neigh_flush__(true);
+        tnl_neigh_event_uninit();
+    }
+}
+
 void
 tnl_neigh_cache_init(void)
 {
diff --git a/lib/tnl-neigh-cache.h b/lib/tnl-neigh-cache.h
index ded9c2f86..f98743d06 100644
--- a/lib/tnl-neigh-cache.h
+++ b/lib/tnl-neigh-cache.h
@@ -37,6 +37,8 @@ int tnl_neigh_lookup(const char dev_name[], const struct in6_addr *dst,
                      struct eth_addr *mac);
 void tnl_neigh_cache_init(void);
 void tnl_neigh_cache_run(void);
+void tnl_neigh_cache_wait(void);
 void tnl_neigh_flush(const char dev_name[]);
+void tnl_neigh_event_enabled(const struct smap *ovs_other_config);
 
 #endif
diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c
index 5ed7e8234..161bb5f8b 100644
--- a/vswitchd/bridge.c
+++ b/vswitchd/bridge.c
@@ -69,6 +69,7 @@
 #include "util.h"
 #include "unixctl.h"
 #include "lib/vswitch-idl.h"
+#include "tnl-neigh-cache.h"
 #include "xenserver.h"
 #include "vlan-bitmap.h"
 
@@ -3292,6 +3293,7 @@ bridge_run(void)
         netdev_set_flow_api_enabled(&cfg->other_config);
         dpdk_init(&cfg->other_config);
         userspace_tso_init(&cfg->other_config);
+        tnl_neigh_event_enabled(&cfg->other_config);
     }
 
     /* Initialize the ofproto library.  This only needs to run once, but
diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
index 89a876796..b0f22b534 100644
--- a/vswitchd/vswitch.xml
+++ b/vswitchd/vswitch.xml
@@ -222,6 +222,23 @@
         </p>
       </column>
 
+      <column name="other_config" key="tnl-neigh-event-enabled"
+              type='{"type": "boolean"}'>
+        <p>
+          Set this value to <code>true</code> to learn neighbor entries from
+          the system.  The default value is <code>false</code>.
+        </p>
+        <p>
+          If enabled, Open vSwitch learns neighbor entries from the system, so the
+          tunnel IP address need not be configured on an Open vSwitch bridge: when
+          encapsulating tunnel packets (e.g. in native_tunnel_output), the entries
+          learned from the system are used.  This is useful for flow bifurcation,
+          a mechanism that uses hardware-capable Ethernet devices to split traffic
+          between Linux user space and kernel space.  More details:
+          http://git.dpdk.org/next/dpdk-next-net/tree/doc/guides/howto/flow_bifurcation.rst
+        </p>
+      </column>
+
       <column name="other_config" key="hw-offload"
               type='{"type": "boolean"}'>
         <p>
-- 
2.14.1



More information about the dev mailing list