[ovs-dev] [PATCH 5/5] dpif-linux: Prevent a single port from monopolizing upcalls.

Jesse Gross jesse at nicira.com
Mon Sep 19 22:00:08 UTC 2011


Currently it is possible for a client on a single port to generate
a huge number of packets that miss in the kernel flow table and
monopolize the userspace/kernel communication path.  This
effectively DoS's the machine because no new flow setups can take
place.  This adds some additional fairness by separating each upcall
type for each object in the datapath onto a separate socket, each
with its own queue.  Userspace then reads round-robin from each
socket so other flow setups can still succeed.

Since the number of objects can potentially be large, we don't always
have a unique socket for each.  Instead, we create 16 sockets and
spread the load around them in a round-robin fashion.  It's theoretically
possible to do better than this with some kind of active load-balancing
scheme, but this seems like a good place to start.

Feature #6485
---
 NEWS             |    3 +
 lib/dpif-linux.c |  180 +++++++++++++++++++++++++++++++++++++++---------------
 2 files changed, 134 insertions(+), 49 deletions(-)

diff --git a/NEWS b/NEWS
index 186b2f6..7f28986 100644
--- a/NEWS
+++ b/NEWS
@@ -15,6 +15,9 @@ Post-v1.2.0
     - CAPWAP tunneling now supports an extension to transport a 64-key.  By
       default it remains compatible with the old version and other
       standards-based implementations.
+    - Flow setups are now processed in a round-robin manner across ports
+      to prevent any single client from monopolizing the CPU and conducting
+      a denial of service attack.
 
 v1.2.0 - 03 Aug 2011
 ------------------------
diff --git a/lib/dpif-linux.c b/lib/dpif-linux.c
index 1088d0e..5fbe694 100644
--- a/lib/dpif-linux.c
+++ b/lib/dpif-linux.c
@@ -61,6 +61,9 @@ enum { LRU_MAX_PORTS = 1024 };
 enum { LRU_MASK = LRU_MAX_PORTS - 1};
 BUILD_ASSERT_DECL(IS_POW2(LRU_MAX_PORTS));
 
+enum {N_UPCALL_SOCKS = 16 };
+BUILD_ASSERT_DECL(IS_POW2(N_UPCALL_SOCKS));
+
 /* This ethtool flag was introduced in Linux 2.6.24, so it might be
  * missing if we have old headers. */
 #define ETH_FLAG_LRO      (1 << 15)    /* LRO is enabled */
@@ -135,7 +138,9 @@ struct dpif_linux {
     struct hmap_node hmap_node;
 
     /* Upcall messages. */
-    struct nl_sock *upcall_sock;
+    struct nl_sock *upcall_socks[N_UPCALL_SOCKS];
+    int last_read_upcall;
+    int last_assigned_upcall;
     unsigned int listen_mask;
 
     /* Change notification. */
@@ -151,6 +156,7 @@ struct dpif_linux {
 };
 
 static struct vlog_rate_limit error_rl = VLOG_RATE_LIMIT_INIT(9999, 5);
+static struct vlog_rate_limit dbg_rl = VLOG_RATE_LIMIT_INIT(60, 60);
 
 /* Generic Netlink family numbers for OVS. */
 static int ovs_datapath_family;
@@ -169,6 +175,7 @@ static int dpif_linux_init(void);
 static void open_dpif(const struct dpif_linux_dp *, struct dpif **);
 static bool dpif_linux_nln_parse(struct ofpbuf *, void *);
 static void dpif_linux_port_changed(const void *vport, void *dpif);
+static uint32_t get_upcall_pid(struct dpif_linux *);
 
 static void dpif_linux_vport_to_ofpbuf(const struct dpif_linux_vport *,
                                        struct ofpbuf *);
@@ -261,22 +268,18 @@ open_dpif(const struct dpif_linux_dp *dp, struct dpif **dpifp)
     struct dpif_linux *dpif;
     int i;
 
-    dpif = xmalloc(sizeof *dpif);
+    dpif = xzalloc(sizeof *dpif);
     dpif->port_notifier = nln_notifier_create(nln, dpif_linux_port_changed,
                                               dpif);
 
     dpif_init(&dpif->dpif, &dpif_linux_class, dp->name,
               dp->dp_ifindex, dp->dp_ifindex);
 
-    dpif->upcall_sock = NULL;
-    dpif->listen_mask = 0;
     dpif->dp_ifindex = dp->dp_ifindex;
     hmap_insert(&dp_map, &dpif->hmap_node, hash_int(dpif->dp_ifindex, 0));
     sset_init(&dpif->changed_ports);
-    dpif->change_error = false;
     *dpifp = &dpif->dpif;
 
-    dpif->lru_head = dpif->lru_tail = 0;
     dpif->lru_bitmap = bitmap_allocate(LRU_MAX_PORTS);
     bitmap_set1(dpif->lru_bitmap, OVSP_LOCAL);
     for (i = 1; i < LRU_MAX_PORTS; i++) {
@@ -285,12 +288,24 @@ open_dpif(const struct dpif_linux_dp *dp, struct dpif **dpifp)
 }
 
 static void
+destroy_upcall_socks(struct dpif_linux *dpif)
+{
+    int i;
+
+    /* Destroy each upcall socket and clear its slot as we go. */
+    for (i = 0; i < N_UPCALL_SOCKS; i++) {
+        nl_sock_destroy(dpif->upcall_socks[i]);
+        dpif->upcall_socks[i] = NULL;
+    }
+}
+
+static void
 dpif_linux_close(struct dpif *dpif_)
 {
     struct dpif_linux *dpif = dpif_linux_cast(dpif_);
 
     nln_notifier_destroy(dpif->port_notifier);
-    nl_sock_destroy(dpif->upcall_sock);
+    destroy_upcall_socks(dpif);
     hmap_remove(&dp_map, &dpif->hmap_node);
     sset_destroy(&dpif->changed_ports);
     free(dpif->lru_bitmap);
@@ -405,11 +420,15 @@ dpif_linux_port_add(struct dpif *dpif_, struct netdev *netdev,
     /* Loop until we find a port that isn't used. */
     do {
         request.port_no = dpif_linux_pop_port(dpif);
-        request.upcall_pid = nl_sock_pid(dpif->upcall_sock);
+        request.upcall_pid = get_upcall_pid(dpif);
         error = dpif_linux_vport_transact(&request, &reply, &buf);
 
         if (!error) {
             *port_nop = reply.port_no;
+            VLOG_DBG_RL(&dbg_rl, "%s: assigning port %"PRIu32" to netlink "
+                        "pid %"PRIu32,
+                        dpif_name(dpif_), request.port_no,
+                        request.upcall_pid);
         }
         ofpbuf_delete(buf);
     } while (request.port_no != UINT32_MAX
@@ -664,7 +683,7 @@ dpif_linux_flow_put(struct dpif *dpif_, enum dpif_flow_put_flags flags,
     /* Ensure that OVS_FLOW_ATTR_ACTIONS will always be included. */
     request.actions = actions ? actions : &dummy_action;
     request.actions_len = actions_len;
-    request.upcall_pid = nl_sock_pid(dpif->upcall_sock);
+    request.upcall_pid = get_upcall_pid(dpif);
     if (flags & DPIF_FP_ZERO_STATS) {
         request.clear = true;
     }
@@ -672,9 +691,21 @@ dpif_linux_flow_put(struct dpif *dpif_, enum dpif_flow_put_flags flags,
     error = dpif_linux_flow_transact(&request,
                                      stats ? &reply : NULL,
                                      stats ? &buf : NULL);
-    if (!error && stats) {
-        dpif_linux_flow_get_stats(&reply, stats);
-        ofpbuf_delete(buf);
+    if (!error) {
+        if (!VLOG_DROP_DBG(&dbg_rl)) {
+            struct ds flow;
+
+            ds_init(&flow);
+            odp_flow_key_format(key, key_len, &flow);
+            VLOG_DBG("%s: assigning flow %s to netlink pid %"PRIu32,
+                     dpif_name(dpif_), ds_cstr(&flow),
+                     request.upcall_pid);
+            ds_destroy(&flow);
+        }
+        if (stats) {
+            dpif_linux_flow_get_stats(&reply, stats);
+            ofpbuf_delete(buf);
+        }
     }
     return error;
 }
@@ -816,7 +847,7 @@ dpif_linux_execute__(struct dpif_linux *dpif,
     nl_msg_put_unspec(buf, OVS_PACKET_ATTR_PACKET, packet->data, packet->size);
     nl_msg_put_unspec(buf, OVS_PACKET_ATTR_KEY, key, key_len);
     nl_msg_put_unspec(buf, OVS_PACKET_ATTR_ACTIONS, actions, actions_len);
-    nl_msg_put_u32(buf, OVS_PACKET_ATTR_UPCALL_PID, nl_sock_pid(dpif->upcall_sock));
+    nl_msg_put_u32(buf, OVS_PACKET_ATTR_UPCALL_PID, get_upcall_pid(dpif));
 
     error = nl_sock_transact(genl_sock, buf, NULL);
     ofpbuf_delete(buf);
@@ -843,6 +874,14 @@ dpif_linux_recv_get_mask(const struct dpif *dpif_, int *listen_mask)
     return 0;
 }
 
+static uint32_t
+get_upcall_pid(struct dpif_linux *dpif)
+{
+    int *cursor = &dpif->last_assigned_upcall;  /* Round-robin position. */
+    *cursor = (*cursor + 1) & (N_UPCALL_SOCKS - 1);
+    return nl_sock_pid(dpif->upcall_socks[*cursor]);
+}
+
 static int
 dpif_linux_recv_set_mask(struct dpif *dpif_, int listen_mask)
 {
@@ -852,9 +891,9 @@ dpif_linux_recv_set_mask(struct dpif *dpif_, int listen_mask)
     if (listen_mask == dpif->listen_mask) {
         return 0;
     } else if (!listen_mask) {
-        nl_sock_destroy(dpif->upcall_sock);
-        dpif->upcall_sock = NULL;
-    } else if (!dpif->upcall_sock) {
+        destroy_upcall_socks(dpif);
+    } else if (!dpif->listen_mask) {
+        int i;
         struct dpif_port port;
         struct dpif_port_dump port_dump;
         struct dpif_flow_dump flow_dump;
@@ -864,9 +903,11 @@ dpif_linux_recv_set_mask(struct dpif *dpif_, int listen_mask)
         size_t actions_len;
         const struct dpif_flow_stats *flow_stats;
 
-        error = nl_sock_create(NETLINK_GENERIC, &dpif->upcall_sock);
-        if (error) {
-            goto error;
+        for (i = 0; i < N_UPCALL_SOCKS; i++) {
+            error = nl_sock_create(NETLINK_GENERIC, &dpif->upcall_socks[i]);
+            if (error) {
+                goto error;
+            }
         }
 
         DPIF_PORT_FOR_EACH (&port, &port_dump, dpif_) {
@@ -876,12 +917,16 @@ dpif_linux_recv_set_mask(struct dpif *dpif_, int listen_mask)
             vport_request.cmd = OVS_VPORT_CMD_SET;
             vport_request.dp_ifindex = dpif->dp_ifindex;
             vport_request.port_no = port.port_no;
-            vport_request.upcall_pid = nl_sock_pid(dpif->upcall_sock);
+            vport_request.upcall_pid = get_upcall_pid(dpif);
             error = dpif_linux_vport_transact(&vport_request, NULL, NULL);
             if (error) {
                 dpif_port_dump_done(&port_dump);
                 goto error;
             }
+            VLOG_DBG_RL(&dbg_rl, "%s: assigning port %"PRIu32" to netlink "
+                        "pid %"PRIu32,
+                        dpif_name(dpif_), vport_request.port_no,
+                        vport_request.upcall_pid);
         }
 
         dpif_flow_dump_start(&flow_dump, dpif_);
@@ -892,12 +937,22 @@ dpif_linux_recv_set_mask(struct dpif *dpif_, int listen_mask)
             dpif_linux_flow_init(&flow_request);
             flow_request.cmd = OVS_FLOW_CMD_SET;
             flow_request.dp_ifindex = dpif->dp_ifindex;
-            flow_request.upcall_pid = nl_sock_pid(dpif->upcall_sock);
+            flow_request.upcall_pid = get_upcall_pid(dpif);
             error = dpif_linux_flow_transact(&flow_request, NULL, NULL);
             if (error) {
                 dpif_flow_dump_done(&flow_dump);
                 goto error;
             }
+            if (!VLOG_DROP_DBG(&dbg_rl)) {
+                struct ds flow;
+
+                ds_init(&flow);
+                odp_flow_key_format(key, key_len, &flow);
+                VLOG_DBG("%s: assigning flow %s to netlink pid %"PRIu32,
+                         dpif_name(dpif_), ds_cstr(&flow),
+                         flow_request.upcall_pid);
+                ds_destroy(&flow);
+            }
         }
         dpif_flow_dump_done(&flow_dump);
     }
@@ -906,8 +961,7 @@ dpif_linux_recv_set_mask(struct dpif *dpif_, int listen_mask)
     return 0;
 
 error:
-    nl_sock_destroy(dpif->upcall_sock);
-    dpif->upcall_sock = NULL;
+    destroy_upcall_socks(dpif);
     return error;
 }
 
@@ -1024,32 +1078,49 @@ static int
 dpif_linux_recv(struct dpif *dpif_, struct dpif_upcall *upcall)
 {
     struct dpif_linux *dpif = dpif_linux_cast(dpif_);
-    struct ofpbuf *buf;
-    int error;
     int i;
-
-    if (!dpif->upcall_sock) {
-        return EAGAIN;
-    }
-
-    for (i = 0; i < 50; i++) {
-        int dp_ifindex;
-
-        error = nl_sock_recv(dpif->upcall_sock, &buf, false);
-        if (error) {
-            return error;
-        }
-
-        error = parse_odp_packet(buf, upcall, &dp_ifindex);
-        if (!error
-            && dp_ifindex == dpif->dp_ifindex
-            && dpif->listen_mask & (1u << upcall->type)) {
-            return 0;
-        }
-
-        ofpbuf_delete(buf);
-        if (error) {
-            return error;
+    int read_tries = 0;
+
+    if (!dpif->listen_mask) {
+        return EAGAIN;
+    }
+
+    for (i = 0; i < N_UPCALL_SOCKS; i++) {
+        struct nl_sock *upcall_sock;
+        dpif->last_read_upcall = (dpif->last_read_upcall + 1) &
+                                 (N_UPCALL_SOCKS - 1);
+        upcall_sock = dpif->upcall_socks[dpif->last_read_upcall];
+
+        if (nl_sock_woke(upcall_sock)) {
+            int dp_ifindex;
+
+            for (;;) {
+                struct ofpbuf *buf;
+                int error;
+
+                if (++read_tries > 50) {
+                    return EAGAIN;
+                }
+
+                error = nl_sock_recv(upcall_sock, &buf, false);
+                if (error == EAGAIN) {
+                    break;
+                } else if (error) {
+                    return error;
+                }
+
+                error = parse_odp_packet(buf, upcall, &dp_ifindex);
+                if (!error
+                    && dp_ifindex == dpif->dp_ifindex
+                    && dpif->listen_mask & (1u << upcall->type)) {
+                    return 0;
+                }
+
+                ofpbuf_delete(buf);
+                if (error) {
+                    return error;
+                }
+            }
         }
     }
 
@@ -1060,8 +1131,14 @@ static void
 dpif_linux_recv_wait(struct dpif *dpif_)
 {
     struct dpif_linux *dpif = dpif_linux_cast(dpif_);
-    if (dpif->upcall_sock) {
-        nl_sock_wait(dpif->upcall_sock, POLLIN);
+    int i;
+
+    if (!dpif->listen_mask) {
+        return;
+    }
+
+    for (i = 0; i < N_UPCALL_SOCKS; i++) {
+        nl_sock_wait(dpif->upcall_socks[i], POLLIN);
     }
 }
 
@@ -1069,9 +1146,14 @@ static void
 dpif_linux_recv_purge(struct dpif *dpif_)
 {
     struct dpif_linux *dpif = dpif_linux_cast(dpif_);
+    int i;
+
+    if (!dpif->listen_mask) {
+        return;
+    }
 
-    if (dpif->upcall_sock) {
-        nl_sock_drain(dpif->upcall_sock);
+    for (i = 0; i < N_UPCALL_SOCKS; i++) {
+        nl_sock_drain(dpif->upcall_socks[i]);
     }
 }
 
-- 
1.7.4.1




More information about the dev mailing list