[ovs-dev] [recirc fix branch-2.3 V2] recirculation: Map recirc_id to ofproto_dpif.

Alex Wang alexw at nicira.com
Tue Dec 23 18:42:51 UTC 2014


After commit 0c7812e5e (recirculation: Do not drop packet when
there is no match from internal table.), if flow keys are modified
before the recirculation action (e.g. set vlan ID), the miss
handling of recirc'ed packets may not reach the intended
'ofproto_dpif' which has rules looking up the 'recirc_id's,
causing drops.

This commit adds an unittest that captures this bug.  Moreover,
to solve this bug, this commit checks mapping between 'recirc_id'
and the corresponding 'ofproto_dpif', and makes sure that the
miss handling of recirc'ed packets are done with the correct
'ofproto_dpif'.

Signed-off-by: Alex Wang <alexw at nicira.com>
Acked-by: Andy Zhou <azhou at nicira.com>
Acked-by: Jarno Rajahalme <jrajahalme at nicira.com>

---
PATCH->V2:
- ovsrcu postpone the free of dpif_backer_recirc_node.
- fix unittest interminttent failure.

Diff between backport & master:
- move + adjust the fix logic in xlate_lookup() (on master) to xlate_receive().
- change the cmap (for mapping between 'recirc_id' and 'ofproto') to hmap,
  and protect the lookup with mutex.
- unittest adjust to work with branch-2.3.
---
 ofproto/ofproto-dpif-xlate.c |   45 ++++++++++++++++++++--
 ofproto/ofproto-dpif.c       |   85 ++++++++++++++++++++++++++++++++++++++++--
 ofproto/ofproto-dpif.h       |   12 ++++++
 tests/ofproto-dpif.at        |   68 +++++++++++++++++++++++++++++++++
 4 files changed, 203 insertions(+), 7 deletions(-)

diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c
index 0fc5443..ee353e3 100644
--- a/ofproto/ofproto-dpif-xlate.c
+++ b/ofproto/ofproto-dpif-xlate.c
@@ -634,6 +634,8 @@ xlate_receive(const struct dpif_backer *backer, struct ofpbuf *packet,
               struct dpif_sflow **sflow, struct netflow **netflow,
               odp_port_t *odp_in_port)
 {
+    struct ofproto_dpif *recv_ofproto = NULL;
+    struct ofproto_dpif *recirc_ofproto = NULL;
     const struct xport *xport;
     int error = ENODEV;
 
@@ -655,6 +657,7 @@ xlate_receive(const struct dpif_backer *backer, struct ofpbuf *packet,
     if (!xport) {
         goto exit;
     }
+    recv_ofproto = xport->xbridge->ofproto;
 
     if (vsp_adjust_flow(xport->xbridge->ofproto, flow)) {
         if (packet) {
@@ -667,20 +670,54 @@ xlate_receive(const struct dpif_backer *backer, struct ofpbuf *packet,
     }
     error = 0;
 
+    /* When recirc_id is set in 'flow', checks whether the ofproto_dpif that
+     * corresponds to the recirc_id is same as the receiving bridge.  If they
+     * are the same, uses the 'recv_ofproto' and keeps the 'ofp_in_port' as
+     * assigned.  Otherwise, uses the 'recirc_ofproto' that owns recirc_id and
+     * assigns OFPP_NONE to 'ofp_in_port'.  Doing this is in that, the
+     * recirculated flow must be processced by the ofproto which originates
+     * the recirculation, and as bridges can only see their own ports, the
+     * in_port of the 'recv_ofproto' should not be passed to the
+     * 'recirc_ofproto'.
+     *
+     * Admittedly, setting the 'ofp_in_port' to OFPP_NONE limits the
+     * 'recirc_ofproto' from meaningfully matching on in_port of recirculated
+     * flow, and should be fixed in the near future.
+     *
+     * TODO: Restore the original patch port.
+     */
+    if (flow->recirc_id) {
+        recirc_ofproto = ofproto_dpif_recirc_get_ofproto(backer,
+                                                         flow->recirc_id);
+        /* Returns error if could not find recirculation bridge */
+        if (!recirc_ofproto) {
+            error = ENOENT;
+            goto exit;
+        }
+
+        if (recv_ofproto != recirc_ofproto) {
+            xport = NULL;
+            flow->in_port.ofp_port = OFPP_NONE;
+            if (odp_in_port) {
+                *odp_in_port = ODPP_NONE;
+            }
+        }
+    }
+
     if (ofproto) {
-        *ofproto = xport->xbridge->ofproto;
+        *ofproto = xport ? recv_ofproto : recirc_ofproto;
     }
 
     if (ipfix) {
-        *ipfix = dpif_ipfix_ref(xport->xbridge->ipfix);
+        *ipfix = xport ? dpif_ipfix_ref(xport->xbridge->ipfix) : NULL;
     }
 
     if (sflow) {
-        *sflow = dpif_sflow_ref(xport->xbridge->sflow);
+        *sflow = xport ? dpif_sflow_ref(xport->xbridge->sflow) : NULL;
     }
 
     if (netflow) {
-        *netflow = netflow_ref(xport->xbridge->netflow);
+        *netflow = xport ? netflow_ref(xport->xbridge->netflow) : NULL;
     }
 
 exit:
diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c
index 7204ccf..6df6b83 100644
--- a/ofproto/ofproto-dpif.c
+++ b/ofproto/ofproto-dpif.c
@@ -57,6 +57,7 @@
 #include "ofproto-dpif-sflow.h"
 #include "ofproto-dpif-upcall.h"
 #include "ofproto-dpif-xlate.h"
+#include "ovs-rcu.h"
 #include "poll-loop.h"
 #include "seq.h"
 #include "simap.h"
@@ -238,6 +239,13 @@ COVERAGE_DEFINE(rev_port_toggled);
 COVERAGE_DEFINE(rev_flow_table);
 COVERAGE_DEFINE(rev_mac_learning);
 
+/* Stores mapping between 'recirc_id' and 'ofproto-dpif'. */
+struct dpif_backer_recirc_node {
+    struct hmap_node hmap_node;
+    struct ofproto_dpif *ofproto;
+    uint32_t recirc_id;
+};
+
 /* All datapaths of a given type share a single dpif backer instance. */
 struct dpif_backer {
     char *type;
@@ -256,6 +264,8 @@ struct dpif_backer {
 
     /* Recirculation. */
     struct recirc_id_pool *rid_pool;       /* Recirculation ID pool. */
+    struct hmap recirc_map;         /* Map of 'recirc_id's to 'ofproto's. */
+    struct ovs_mutex recirc_mutex;  /* Protects 'recirc_map'. */
     bool enable_recirc;   /* True if the datapath supports recirculation */
 
     /* True if the datapath supports variable-length
@@ -794,6 +804,30 @@ dealloc(struct ofproto *ofproto_)
     free(ofproto);
 }
 
+/* Called when 'ofproto' is destructed.  Checks for and clears any
+ * recirc_id leak. */
+static void
+dpif_backer_recirc_clear_ofproto(struct dpif_backer *backer,
+                                 struct ofproto_dpif *ofproto)
+{
+    struct dpif_backer_recirc_node *node;
+
+    ovs_mutex_lock(&backer->recirc_mutex);
+    HMAP_FOR_EACH (node, hmap_node, &backer->recirc_map) {
+        if (node->ofproto == ofproto) {
+            VLOG_ERR("recirc_id %"PRIu32", not freed when ofproto (%s) "
+                     "is destructed", node->recirc_id, ofproto->up.name);
+            hmap_remove(&backer->recirc_map, &node->hmap_node);
+            /* Does not matter whether directly free or use ovsrcu_postpone,
+             * since all datapath flows are already purged before calling this
+             * function, and no 'recirc_id' could be associated to 'ofproto'.
+             */
+            ovsrcu_postpone(free, node);
+        }
+    }
+    ovs_mutex_unlock(&backer->recirc_mutex);
+}
+
 static void
 close_dpif_backer(struct dpif_backer *backer)
 {
@@ -810,6 +844,8 @@ close_dpif_backer(struct dpif_backer *backer)
     hmap_destroy(&backer->odp_to_ofport_map);
     shash_find_and_delete(&all_dpif_backers, backer->type);
     recirc_id_pool_destroy(backer->rid_pool);
+    hmap_destroy(&backer->recirc_map);
+    ovs_mutex_destroy(&backer->recirc_mutex);
     free(backer->type);
     dpif_close(backer->dpif);
     free(backer);
@@ -921,6 +957,8 @@ open_dpif_backer(const char *type, struct dpif_backer **backerp)
     backer->variable_length_userdata = check_variable_length_userdata(backer);
     backer->max_mpls_depth = check_max_mpls_depth(backer);
     backer->rid_pool = recirc_id_pool_create();
+    ovs_mutex_init(&backer->recirc_mutex);
+    hmap_init(&backer->recirc_map);
 
     error = dpif_recv_set(backer->dpif, backer->recv_set_enable);
     if (error) {
@@ -1306,6 +1344,8 @@ destruct(struct ofproto *ofproto_)
     }
     guarded_list_destroy(&ofproto->pins);
 
+    dpif_backer_recirc_clear_ofproto(ofproto->backer, ofproto);
+
     mbridge_unref(ofproto->mbridge);
 
     netflow_unref(ofproto->netflow);
@@ -4758,20 +4798,59 @@ odp_port_to_ofp_port(const struct ofproto_dpif *ofproto, odp_port_t odp_port)
     }
 }
 
+struct ofproto_dpif *
+ofproto_dpif_recirc_get_ofproto(const struct dpif_backer *backer,
+                                uint32_t recirc_id)
+{
+    struct dpif_backer_recirc_node *node;
+
+    ovs_mutex_lock(&backer->recirc_mutex);
+    node = CONTAINER_OF(hmap_first_with_hash(&backer->recirc_map, recirc_id),
+                        struct dpif_backer_recirc_node, hmap_node);
+    ovs_mutex_unlock(&backer->recirc_mutex);
+
+    return node ? node->ofproto : NULL;
+}
+
 uint32_t
 ofproto_dpif_alloc_recirc_id(struct ofproto_dpif *ofproto)
 {
     struct dpif_backer *backer = ofproto->backer;
+    uint32_t recirc_id = recirc_id_alloc(backer->rid_pool);
+
+    if (recirc_id) {
+        struct dpif_backer_recirc_node *node = xmalloc(sizeof *node);
+
+        node->recirc_id = recirc_id;
+        node->ofproto = ofproto;
 
-    return  recirc_id_alloc(backer->rid_pool);
+        ovs_mutex_lock(&backer->recirc_mutex);
+        hmap_insert(&backer->recirc_map, &node->hmap_node, node->recirc_id);
+        ovs_mutex_unlock(&backer->recirc_mutex);
+    }
+
+    return recirc_id;
 }
 
 void
 ofproto_dpif_free_recirc_id(struct ofproto_dpif *ofproto, uint32_t recirc_id)
 {
     struct dpif_backer *backer = ofproto->backer;
-
-    recirc_id_free(backer->rid_pool, recirc_id);
+    struct dpif_backer_recirc_node *node;
+
+    ovs_mutex_lock(&backer->recirc_mutex);
+    node = CONTAINER_OF(hmap_first_with_hash(&backer->recirc_map, recirc_id),
+                        struct dpif_backer_recirc_node, hmap_node);
+    if (node) {
+        hmap_remove(&backer->recirc_map, &node->hmap_node);
+        ovs_mutex_unlock(&backer->recirc_mutex);
+        recirc_id_free(backer->rid_pool, node->recirc_id);
+        /* RCU postpone the free, since other threads may be referring
+         * to 'node' at same time. */
+        ovsrcu_postpone(free, node);
+    } else {
+        ovs_mutex_unlock(&backer->recirc_mutex);
+    }
 }
 
 int
diff --git a/ofproto/ofproto-dpif.h b/ofproto/ofproto-dpif.h
index 6f77b1a..cee4723 100644
--- a/ofproto/ofproto-dpif.h
+++ b/ofproto/ofproto-dpif.h
@@ -226,8 +226,20 @@ struct ofport_dpif *odp_port_to_ofport(const struct dpif_backer *, odp_port_t);
  * Post recirculation data path flows are managed like other data path flows.
  * They are created on demand. Miss handling, stats collection and revalidation
  * work the same way as regular flows.
+ *
+ * If the bridge which originates the recirculation is different from the bridge
+ * that receives the post recirculation packet (e.g. when patch port is used),
+ * the packet will be processed directly by the recirculation bridge with
+ * in_port set to OFPP_NONE.  Admittedly, doing this limits the recirculation
+ * bridge from matching on in_port of post recirculation packets, and will be
+ * fixed in the near future.
+ *
+ * TODO: Always restore the correct in_port.
+ *
  */
 
+struct ofproto_dpif *ofproto_dpif_recirc_get_ofproto(const struct dpif_backer *ofproto,
+                                                     uint32_t recirc_id);
 uint32_t ofproto_dpif_alloc_recirc_id(struct ofproto_dpif *ofproto);
 void ofproto_dpif_free_recirc_id(struct ofproto_dpif *ofproto, uint32_t recirc_id);
 int ofproto_dpif_add_internal_flow(struct ofproto_dpif *,
diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at
index 1fcd937..6fb0dce 100644
--- a/tests/ofproto-dpif.at
+++ b/tests/ofproto-dpif.at
@@ -195,6 +195,74 @@ AT_CHECK([ovs-appctl dpif/dump-flows br1 |grep tcp > br1_flows.txt])
 AT_CHECK([test `grep in_port.4 br1_flows.txt |wc -l` -gt 24])
 AT_CHECK([test `grep in_port.5 br1_flows.txt |wc -l` -gt 24])
 AT_CHECK([test `grep in_port.6 br1_flows.txt |wc -l` -gt 24])
+
+OVS_VSWITCHD_STOP()
+AT_CLEANUP
+
+# Makes sure recirculation does not change the way packet is handled.
+AT_SETUP([ofproto-dpif, balance-tcp bonding, different recirc flow ])
+OVS_VSWITCHD_START(
+  [add-bond br0 bond0 p1 p2 bond_mode=balance-tcp lacp=active \
+       other-config:lacp-time=fast other-config:bond-rebalance-interval=0 -- \
+   set interface p1 type=dummy options:pstream=punix:$OVS_RUNDIR/p1.sock ofport_request=1 -- \
+   set interface p2 type=dummy options:pstream=punix:$OVS_RUNDIR/p2.sock ofport_request=2 -- \
+   add-br br1 -- \
+   set bridge br1 other-config:hwaddr=aa:66:aa:66:00:00 -- \
+   set bridge br1 datapath-type=dummy other-config:datapath-id=1234 \
+       fail-mode=standalone -- \
+   add-bond br1 bond1 p3 p4 bond_mode=balance-tcp lacp=active \
+       other-config:lacp-time=fast other-config:bond-rebalance-interval=0 -- \
+   set interface p3 type=dummy options:stream=unix:$OVS_RUNDIR/p1.sock ofport_request=3 -- \
+   set interface p4 type=dummy options:stream=unix:$OVS_RUNDIR/p2.sock ofport_request=4 -- \
+   add-port br1 br1- -- set interface br1- type=patch options:peer=br1+ ofport_request=100 -- \
+   add-br br-int -- \
+   set bridge br-int other-config:hwaddr=aa:77:aa:77:00:00 -- \
+   set bridge br-int datapath-type=dummy other-config:datapath-id=1235 \
+       fail-mode=secure -- \
+   add-port br-int br1+ -- set interface br1+ type=patch options:peer=br1- ofport_request=101 -- \
+   add-port br-int p5 -- set interface p5 ofport_request=5 type=dummy
+])
+AT_CHECK([ovs-appctl netdev-dummy/set-admin-state up], 0, [OK
+])
+
+# Waits for all ifaces enabled.
+OVS_WAIT_UNTIL([test `ovs-appctl bond/show | grep -- "may_enable: true" |  wc -l` -ge 4])
+
+# The dl_vlan flow should not be ever matched,
+# since recirculation should not change the flow handling.
+AT_DATA([flows.txt], [dnl
+table=0 priority=1 in_port=5 actions=mod_vlan_vid:1,output(101)
+table=0 priority=2 in_port=5 dl_vlan=1 actions=drop
+])
+AT_CHECK([ovs-ofctl add-flows br-int flows.txt])
+
+# Sends a packet to trigger recirculation.
+# Should generate recirc_id(0x12d),dp_hash(0xcf/0xff).
+AT_CHECK([ovs-appctl netdev-dummy/receive p5 "in_port(5),eth(src=50:54:00:00:00:05,dst=50:54:00:00:01:00),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no)"])
+
+ovs-appctl time/warp 5000 100
+
+# Forces revalidators to update all stats.
+AT_CHECK([ovs-appctl upcall/disable-megaflows], [0], [dnl
+megaflows disabled
+])
+AT_CHECK([ovs-appctl upcall/enable-megaflows], [0], [dnl
+megaflows enabled
+])
+
+# Checks the flow stats in br1, should only be one flow with non-zero
+# 'n_packets' from internal table.
+AT_CHECK([ovs-appctl bridge/dump-flows br1 | ofctl_strip | grep -- "n_packets" | grep -- "table_id" | sed -e 's/output:[[0-9]]\+/output/'], [0], [dnl
+table_id=254, n_packets=1, n_bytes=64, priority=20,recirc_id=0x12d,dp_hash=0xcf/0xff,actions=output
+])
+
+# Checks the flow stats in br-int, should be only one match.
+AT_CHECK([ovs-ofctl dump-flows br-int | ofctl_strip | sort], [0], [dnl
+ n_packets=1, n_bytes=60, priority=1,in_port=5 actions=mod_vlan_vid:1,output:101
+ priority=2,in_port=5,dl_vlan=1 actions=drop
+NXST_FLOW reply:
+])
+
 OVS_VSWITCHD_STOP()
 AT_CLEANUP
 
-- 
1.7.9.5




More information about the dev mailing list