[ovs-dev] [PATCH] route-table: Use classifier versioning to replace all routes atomically.

Ben Pfaff blp at ovn.org
Sat Mar 10 01:00:27 UTC 2018


The route table is implemented in terms of a classifier structure, which
supports versioning with atomic transactional updates, but OVS didn't
actually use it for the route table.  This commit starts using that feature
for the route table and makes OVS atomically replace one set of routes with
another.  This should fix a reported bug in which the routing table was
periodically empty due to refreshes.

Reported-by: Yinpeijun <yinpeijun at huawei.com>
Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2018-February/046144.html
Signed-off-by: Ben Pfaff <blp at ovn.org>
---
 lib/ovs-router.c  | 177 ++++++++++++++++++++++++++++++++++++++----------------
 lib/ovs-router.h  |  18 +++++-
 lib/route-table.c | 106 ++++++++++++++++----------------
 3 files changed, 191 insertions(+), 110 deletions(-)

diff --git a/lib/ovs-router.c b/lib/ovs-router.c
index 0f1103b0ed18..70cf2068d30b 100644
--- a/lib/ovs-router.c
+++ b/lib/ovs-router.c
@@ -53,6 +53,7 @@ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
 
 static struct ovs_mutex mutex = OVS_MUTEX_INITIALIZER;
 static struct classifier cls;
+static ATOMIC(ovs_version_t) cls_version = ATOMIC_VAR_INIT(OVS_VERSION_MIN);
 
 struct ovs_router_entry {
     struct cls_rule cr;
@@ -71,6 +72,28 @@ ovs_router_entry_cast(const struct cls_rule *cr)
     return cr ? CONTAINER_OF(cr, struct ovs_router_entry, cr) : NULL;
 }
 
+static ovs_version_t
+ovs_router_get_version(void)
+{
+    /* Use memory_order_acquire to signify that any following memory accesses
+     * can not be reordered to happen before this atomic read.  This makes sure
+     * all following reads relate to this or a newer version, but never to an
+     * older version. */
+    ovs_version_t version;
+    atomic_read_explicit(&cls_version, &version, memory_order_acquire);
+    return version;
+}
+
+static void
+ovs_router_set_version(ovs_version_t version)
+{
+    /* Use memory_order_release to signify that any prior memory accesses can
+     * not be reordered to happen after this atomic store.  This makes sure the
+     * new version is properly set up when the readers can read this 'version'
+     * value. */
+    atomic_store_explicit(&cls_version, version, memory_order_release);
+}
+
 static bool
 ovs_router_lookup_fallback(const struct in6_addr *ip6_dst, char output_bridge[],
                            struct in6_addr *src6, struct in6_addr *gw6)
@@ -97,7 +120,7 @@ ovs_router_lookup(uint32_t mark, const struct in6_addr *ip6_dst,
     const struct cls_rule *cr;
     struct flow flow = {.ipv6_dst = *ip6_dst, .pkt_mark = mark};
 
-    cr = classifier_lookup(&cls, OVS_VERSION_MAX, &flow, NULL);
+    cr = classifier_lookup(&cls, ovs_router_get_version(), &flow, NULL);
     if (cr) {
         struct ovs_router_entry *p = ovs_router_entry_cast(cr);
 
@@ -114,6 +137,10 @@ ovs_router_lookup(uint32_t mark, const struct in6_addr *ip6_dst,
 static void
 rt_entry_free(struct ovs_router_entry *p)
 {
+    ovs_mutex_lock(&mutex);
+    classifier_remove(&cls, &p->cr);
+    ovs_mutex_unlock(&mutex);
+
     cls_rule_destroy(&p->cr);
     free(p);
 }
@@ -184,71 +211,78 @@ out:
 }
 
 static int
-ovs_router_insert__(uint32_t mark, uint8_t priority,
-                    const struct in6_addr *ip6_dst,
-                    uint8_t plen, const char output_bridge[],
-                    const struct in6_addr *gw)
+ovs_router_entry_create(const struct ovs_route *route, int priority,
+                        struct ovs_router_entry **entryp)
 {
-    const struct cls_rule *cr;
-    struct ovs_router_entry *p;
+    struct ovs_router_entry *entry;
     struct match match;
     int err;
 
-    rt_init_match(&match, mark, ip6_dst, plen);
+    rt_init_match(&match, route->mark, &route->ip_dst, route->plen);
 
-    p = xzalloc(sizeof *p);
-    ovs_strlcpy(p->output_bridge, output_bridge, sizeof p->output_bridge);
-    if (ipv6_addr_is_set(gw)) {
-        p->gw = *gw;
+    entry = xzalloc(sizeof *entry);
+    ovs_strlcpy(entry->output_bridge, route->output_bridge,
+                sizeof entry->output_bridge);
+    if (ipv6_addr_is_set(&route->gw)) {
+        entry->gw = route->gw;
     }
-    p->mark = mark;
-    p->nw_addr = match.flow.ipv6_dst;
-    p->plen = plen;
-    p->priority = priority;
-    err = get_src_addr(ip6_dst, output_bridge, &p->src_addr);
-    if (err && ipv6_addr_is_set(gw)) {
-        err = get_src_addr(gw, output_bridge, &p->src_addr);
+    entry->mark = route->mark;
+    entry->nw_addr = match.flow.ipv6_dst;
+    entry->plen = route->plen;
+    entry->priority = priority;
+    err = get_src_addr(&route->ip_dst, route->output_bridge, &entry->src_addr);
+    if (err && ipv6_addr_is_set(&route->gw)) {
+        err = get_src_addr(&route->gw, route->output_bridge, &entry->src_addr);
     }
     if (err) {
         struct ds ds = DS_EMPTY_INITIALIZER;
 
-        ipv6_format_mapped(ip6_dst, &ds);
+        ipv6_format_mapped(&route->ip_dst, &ds);
         VLOG_DBG_RL(&rl, "src addr not available for route %s", ds_cstr(&ds));
-        free(p);
+        free(entry);
         ds_destroy(&ds);
         return err;
     }
     /* Longest prefix matches first. */
-    cls_rule_init(&p->cr, &match, priority);
+    cls_rule_init(&entry->cr, &match, priority);
+
+    *entryp = entry;
+    return 0;
+}
+
+static int
+ovs_router_insert__(const struct ovs_route *route, int priority)
+{
+    struct ovs_router_entry *entry;
+    int error = ovs_router_entry_create(route, priority, &entry);
+    if (error) {
+        return error;
+    }
 
     ovs_mutex_lock(&mutex);
-    cr = classifier_replace(&cls, &p->cr, OVS_VERSION_MIN, NULL, 0);
+    ovs_version_t version = ovs_router_get_version();
+    const struct cls_rule *cr = classifier_find_rule_exactly(&cls, &entry->cr,
+                                                             version);
+    if (cr) {
+        cls_rule_make_invisible_in_version(cr, version + 1);
+    }
+    classifier_insert(&cls, &entry->cr, version + 1, NULL, 0);
+    ovs_router_set_version(version + 1);
     ovs_mutex_unlock(&mutex);
 
     if (cr) {
         /* An old rule with the same match was displaced. */
         ovsrcu_postpone(rt_entry_free, ovs_router_entry_cast(cr));
     }
-    tnl_port_map_insert_ipdev(output_bridge);
+    tnl_port_map_insert_ipdev(route->output_bridge);
     seq_change(tnl_conf_seq);
     return 0;
 }
 
 void
-ovs_router_insert(uint32_t mark, const struct in6_addr *ip_dst, uint8_t plen,
-                  const char output_bridge[], const struct in6_addr *gw)
-{
-    ovs_router_insert__(mark, plen, ip_dst, plen, output_bridge, gw);
-}
-
-static void
-rt_entry_delete__(const struct cls_rule *cr)
+ovs_router_insert(const struct ovs_route *route)
 {
-    struct ovs_router_entry *p = ovs_router_entry_cast(cr);
-
-    tnl_port_map_delete_ipdev(p->output_bridge);
-    classifier_remove_assert(&cls, cr);
-    ovsrcu_postpone(rt_entry_free, ovs_router_entry_cast(cr));
+    ovs_router_insert__(route, route->plen);
 }
 
 static bool
@@ -258,24 +292,26 @@ rt_entry_delete(uint32_t mark, uint8_t priority,
     const struct cls_rule *cr;
     struct cls_rule rule;
     struct match match;
-    bool res = false;
 
     rt_init_match(&match, mark, ip6_dst, plen);
 
     cls_rule_init(&rule, &match, priority);
 
     /* Find the exact rule. */
-    cr = classifier_find_rule_exactly(&cls, &rule, OVS_VERSION_MAX);
+    ovs_mutex_lock(&mutex);
+    ovs_version_t version = ovs_router_get_version();
+    cr = classifier_find_rule_exactly(&cls, &rule, version);
     if (cr) {
-        ovs_mutex_lock(&mutex);
-        rt_entry_delete__(cr);
-        ovs_mutex_unlock(&mutex);
+        cls_rule_make_invisible_in_version(cr, version + 1);
+        ovs_router_set_version(version + 1);
+    }
+    ovs_mutex_unlock(&mutex);
 
-        res = true;
+    if (cr) {
+        ovsrcu_postpone(rt_entry_free, ovs_router_entry_cast(cr));
     }
 
-    cls_rule_destroy(&rule);
-    return res;
+    return cr != NULL;
 }
 
 static bool
@@ -345,7 +381,14 @@ ovs_router_add(struct unixctl_conn *conn, int argc,
         }
     }
 
-    err = ovs_router_insert__(mark, plen + 32, &ip6, plen, argv[2], &gw6);
+    struct ovs_route route = {
+        .mark = mark,
+        .ip_dst = ip6,
+        .plen = plen,
+        .gw = gw6,
+    };
+    ovs_strlcpy(route.output_bridge, argv[2], sizeof route.output_bridge);
+    err = ovs_router_insert__(&route, plen + 32);
     if (err) {
         unixctl_command_reply_error(conn, "Error while inserting route.");
     } else {
@@ -392,7 +435,7 @@ ovs_router_show(struct unixctl_conn *conn, int argc OVS_UNUSED,
     struct ds ds = DS_EMPTY_INITIALIZER;
 
     ds_put_format(&ds, "Route Table:\n");
-    CLS_FOR_EACH(rt, cr, &cls) {
+    CLS_FOR_EACH_TARGET (rt, cr, &cls, NULL, ovs_router_get_version()) {
         uint8_t plen;
         if (rt->priority == rt->plen) {
             ds_put_format(&ds, "Cached: ");
@@ -461,22 +504,52 @@ ovs_router_lookup_cmd(struct unixctl_conn *conn, int argc,
 }
 
 void
-ovs_router_flush(void)
+ovs_router_replace(const struct ovs_route routes[], size_t n)
 {
-    struct ovs_router_entry *rt;
-
     ovs_mutex_lock(&mutex);
+    ovs_version_t version = ovs_router_get_version();
     classifier_defer(&cls);
-    CLS_FOR_EACH(rt, cr, &cls) {
+
+    /* Delete existing (non-user) routes. */
+    struct ovs_router_entry *rt;
+    CLS_FOR_EACH_TARGET (rt, cr, &cls, NULL, version) {
         if (rt->priority == rt->plen) {
-            rt_entry_delete__(&rt->cr);
+            tnl_port_map_delete_ipdev(rt->output_bridge);
+            cls_rule_make_invisible_in_version(&rt->cr, version + 1);
+            ovsrcu_postpone(rt_entry_free, rt);
         }
     }
+
+    /* Add new routes. */
+    for (const struct ovs_route *route = routes; route < &routes[n]; route++) {
+        int error = ovs_router_entry_create(route, route->plen, &rt);
+        if (error) {
+            continue;
+        }
+
+        /* Drop duplicates in 'routes'. */
+        const struct cls_rule *cr = classifier_find_rule_exactly(&cls, &rt->cr,
+                                                                 version + 1);
+        if (cr) {
+            free(rt);
+            continue;
+        }
+
+        /* Insert. */
+        classifier_insert(&cls, &rt->cr, version + 1, NULL, 0);
+    }
+    ovs_router_set_version(version + 1);
     classifier_publish(&cls);
     ovs_mutex_unlock(&mutex);
     seq_change(tnl_conf_seq);
 }
 
+void
+ovs_router_flush(void)
+{
+    ovs_router_replace(NULL, 0);
+}
+
 static void
 ovs_router_flush_handler(void *aux OVS_UNUSED)
 {
diff --git a/lib/ovs-router.h b/lib/ovs-router.h
index b55b1a50b146..1b27cceadce3 100644
--- a/lib/ovs-router.h
+++ b/lib/ovs-router.h
@@ -18,6 +18,7 @@
 #define OVS_TNL_ROUTER_H 1
 
 #include <sys/types.h>
+#include <net/if.h>
 #include <netinet/in.h>
 
 #include "util.h"
@@ -30,10 +31,21 @@ bool ovs_router_lookup(uint32_t mark, const struct in6_addr *ip_dst,
                        char out_dev[],
                        struct in6_addr *src, struct in6_addr *gw);
 void ovs_router_init(void);
-void ovs_router_insert(uint32_t mark, const struct in6_addr *ip_dst,
-                       uint8_t plen,
-                       const char output_bridge[], const struct in6_addr *gw);
+
+struct ovs_route {
+    uint32_t mark;
+
+    struct in6_addr ip_dst;
+    uint8_t plen;
+
+    char output_bridge[IFNAMSIZ];
+    struct in6_addr gw;
+};
+void ovs_router_insert(const struct ovs_route *);
+void ovs_router_replace(const struct ovs_route[], size_t n);
+
 void ovs_router_flush(void);
+
 #ifdef  __cplusplus
 }
 #endif
diff --git a/lib/route-table.c b/lib/route-table.c
index 2ee45e5c9dd5..88046350546c 100644
--- a/lib/route-table.c
+++ b/lib/route-table.c
@@ -44,23 +44,12 @@
 
 VLOG_DEFINE_THIS_MODULE(route_table);
 
-struct route_data {
-    /* Copied from struct rtmsg. */
-    unsigned char rtm_dst_len;
-
-    /* Extracted from Netlink attributes. */
-    struct in6_addr rta_dst; /* 0 if missing. */
-    struct in6_addr rta_gw;
-    char ifname[IFNAMSIZ]; /* Interface name. */
-    uint32_t mark;
-};
-
 /* A digested version of a route message sent down by the kernel to indicate
  * that a route has changed. */
 struct route_table_msg {
     bool relevant;        /* Should this message be processed? */
     int nlmsg_type;       /* e.g. RTM_NEWROUTE, RTM_DELROUTE. */
-    struct route_data rd; /* Data parsed from this message. */
+    struct ovs_route rd;  /* Data parsed from this message. */
 };
 
 static struct ovs_mutex route_table_mutex = OVS_MUTEX_INITIALIZER;
@@ -79,10 +68,8 @@ static struct nln_notifier *name_notifier = NULL;
 static bool route_table_valid = false;
 
 static int route_table_reset(void);
-static void route_table_handle_msg(const struct route_table_msg *);
 static int route_table_parse(struct ofpbuf *, struct route_table_msg *);
 static void route_table_change(const struct route_table_msg *, void *);
-static void route_map_clear(void);
 
 static void name_table_init(void);
 static void name_table_change(const struct rtnetlink_change *, void *);
@@ -152,40 +139,66 @@ route_table_wait(void)
 }
 
 static int
-route_table_reset(void)
+route_table_dump(struct ovs_route **routesp, size_t *n_routesp)
 {
     struct nl_dump dump;
-    struct rtgenmsg *rtgenmsg;
-    uint64_t reply_stub[NL_DUMP_BUFSIZE / 8];
-    struct ofpbuf request, reply, buf;
-
-    route_map_clear();
-    netdev_get_addrs_list_flush();
-    route_table_valid = true;
-    rt_change_seq++;
 
+    /* Compose and send route table dump request. */
+    struct ofpbuf request;
     ofpbuf_init(&request, 0);
-
+    struct rtgenmsg *rtgenmsg;
     nl_msg_put_nlmsghdr(&request, sizeof *rtgenmsg, RTM_GETROUTE,
                         NLM_F_REQUEST);
-
     rtgenmsg = ofpbuf_put_zeros(&request, sizeof *rtgenmsg);
     rtgenmsg->rtgen_family = AF_UNSPEC;
-
     nl_dump_start(&dump, NETLINK_ROUTE, &request);
     ofpbuf_uninit(&request);
 
-    ofpbuf_use_stub(&buf, reply_stub, sizeof reply_stub);
+    /* Retrieve all of the dump results. */
+    struct ovs_route *routes = NULL;
+    size_t allocated_routes = 0;
+    size_t n_routes = 0;
+    uint64_t stub[NL_DUMP_BUFSIZE / 8];
+    struct ofpbuf buf = OFPBUF_STUB_INITIALIZER(stub);
+    struct ofpbuf reply;
     while (nl_dump_next(&dump, &reply, &buf)) {
         struct route_table_msg msg;
-
-        if (route_table_parse(&reply, &msg)) {
-            route_table_handle_msg(&msg);
+        if (route_table_parse(&reply, &msg)
+            && msg.relevant && msg.nlmsg_type == RTM_NEWROUTE) {
+            if (n_routes >= allocated_routes) {
+                routes = x2nrealloc(routes, &allocated_routes, sizeof *routes);
+            }
+            routes[n_routes++] = msg.rd;
         }
     }
     ofpbuf_uninit(&buf);
 
-    return nl_dump_done(&dump);
+    int error = nl_dump_done(&dump);
+    if (error) {
+        free(routes);
+        routes = NULL;
+        n_routes = 0;
+    }
+    *routesp = routes;
+    *n_routesp = n_routes;
+    return error;
+}
+
+static int
+route_table_reset(void)
+{
+    struct ovs_route *routes;
+    size_t n_routes;
+    int error = route_table_dump(&routes, &n_routes);
+    if (!error) {
+        netdev_get_addrs_list_flush();
+        ovs_router_replace(routes, n_routes);
+        route_table_valid = true;
+        rt_change_seq++;
+
+        free(routes);
+    }
+    return error;
 }
 
 /* Return RTNLGRP_IPV4_ROUTE or RTNLGRP_IPV6_ROUTE on success, 0 on parse
@@ -244,11 +257,11 @@ route_table_parse(struct ofpbuf *buf, struct route_table_msg *change)
             change->relevant = false;
         }
         change->nlmsg_type     = nlmsg->nlmsg_type;
-        change->rd.rtm_dst_len = rtm->rtm_dst_len + (ipv4 ? 96 : 0);
+        change->rd.plen = rtm->rtm_dst_len + (ipv4 ? 96 : 0);
         if (attrs[RTA_OIF]) {
             rta_oif = nl_attr_get_u32(attrs[RTA_OIF]);
 
-            if (!if_indextoname(rta_oif, change->rd.ifname)) {
+            if (!if_indextoname(rta_oif, change->rd.output_bridge)) {
                 int error = errno;
 
                 VLOG_DBG_RL(&rl, "Could not find interface name[%u]: %s",
@@ -265,20 +278,20 @@ route_table_parse(struct ofpbuf *buf, struct route_table_msg *change)
             if (ipv4) {
                 ovs_be32 dst;
                 dst = nl_attr_get_be32(attrs[RTA_DST]);
-                in6_addr_set_mapped_ipv4(&change->rd.rta_dst, dst);
+                in6_addr_set_mapped_ipv4(&change->rd.ip_dst, dst);
             } else {
-                change->rd.rta_dst = nl_attr_get_in6_addr(attrs[RTA_DST]);
+                change->rd.ip_dst = nl_attr_get_in6_addr(attrs[RTA_DST]);
             }
         } else if (ipv4) {
-            in6_addr_set_mapped_ipv4(&change->rd.rta_dst, 0);
+            in6_addr_set_mapped_ipv4(&change->rd.ip_dst, 0);
         }
         if (attrs[RTA_GATEWAY]) {
             if (ipv4) {
                 ovs_be32 gw;
                 gw = nl_attr_get_be32(attrs[RTA_GATEWAY]);
-                in6_addr_set_mapped_ipv4(&change->rd.rta_gw, gw);
+                in6_addr_set_mapped_ipv4(&change->rd.gw, gw);
             } else {
-                change->rd.rta_gw = nl_attr_get_in6_addr(attrs[RTA_GATEWAY]);
+                change->rd.gw = nl_attr_get_in6_addr(attrs[RTA_GATEWAY]);
             }
         }
         if (attrs[RTA_MARK]) {
@@ -300,23 +313,6 @@ route_table_change(const struct route_table_msg *change OVS_UNUSED,
     route_table_valid = false;
 }
 
-static void
-route_table_handle_msg(const struct route_table_msg *change)
-{
-    if (change->relevant && change->nlmsg_type == RTM_NEWROUTE) {
-        const struct route_data *rd = &change->rd;
-
-        ovs_router_insert(rd->mark, &rd->rta_dst, rd->rtm_dst_len,
-                          rd->ifname, &rd->rta_gw);
-    }
-}
-
-static void
-route_map_clear(void)
-{
-    ovs_router_flush();
-}
-
 bool
 route_table_fallback_lookup(const struct in6_addr *ip6_dst OVS_UNUSED,
                             char name[] OVS_UNUSED,
-- 
2.16.1



More information about the dev mailing list