[ovs-dev] [PATCH v4 ovn] Fix basic multicast flows for vxlan (non-vtep) tunnels

Ihar Hrachyshka ihrachys at redhat.com
Mon Oct 4 16:37:20 UTC 2021


The 15-bit port key range used for multicast groups can't be covered
by 12-bit key space available for port keys in VXLAN. To make
multicast keys work, we have to transform 16-bit multicast port keys
to 12-bit keys before fanning out packets through VXLAN tunnels.
Otherwise significant bits are not retained, and multicast / broadcast
traffic does not reach ports located on other chassis.

This patch introduces a mapping scheme between core 16-bit multicast
port keys and 12-bit key range available in VXLAN. The scheme is as
follows:

1) Before sending a packet through VXLAN tunnel, the most significant
   bit of a 16-bit port key is copied into the most significant bit of
   12-bit VXLAN key. The 11 least significant bits of a 16-bit port
   key are copied to the least significant bits of 12-bit VXLAN key.

2) When receiving a packet through VXLAN tunnel, the most significant
   bit of a VXLAN 12-bit port key is copied into the most significant
   bit of 16-bit port key used in core. The 11 least significant bits
   of a VXLAN 12-bit port key are copied into the least significant
   bits of a 16-bit port key used in core.

This change also implies that the range available for multicast port
keys is more limited and fits into 11-bit space. The same rule should
be enforced for unicast port keys, like we already do for tunnel keys
when a VXLAN encap is present in a cluster. This enforcement is
implied here but missing in master and will be implemented in a
separate patch. (The missing enforcement is an oversight of the
original patch that added support for VXLAN tunnels.)

Fixes: b07f1bc3d068 ("Add VXLAN support for non-VTEP datapath bindings")
Signed-off-by: Ihar Hrachyshka <ihrachys at redhat.com>

--

v1: initial commit
v2: updated docs
v2: removed newly added but unused macros
v3: rebase
v4: fix memory leak in consider_mc_group
---
 controller-vtep/gateway.c |   2 +
 controller/physical.c     | 102 ++++++++++++++++++++++++++++++--------
 lib/mcast-group-index.h   |   2 +
 ovn-architecture.7.xml    |  13 +++--
 tests/ovn.at              |  23 +++++++--
 5 files changed, 112 insertions(+), 30 deletions(-)

diff --git a/controller-vtep/gateway.c b/controller-vtep/gateway.c
index e9419138b..288772dc4 100644
--- a/controller-vtep/gateway.c
+++ b/controller-vtep/gateway.c
@@ -61,6 +61,8 @@ create_chassis_rec(struct ovsdb_idl_txn *txn, const char *name,
     sbrec_encap_set_options(encap_rec, &options);
     sbrec_encap_set_chassis_name(encap_rec, name);
     sbrec_chassis_set_encaps(chassis_rec, &encap_rec, 1);
+    const struct smap oc = SMAP_CONST1(&oc, "is-vtep", "true");
+    sbrec_chassis_set_other_config(chassis_rec, &oc);
 
     return chassis_rec;
 }
diff --git a/controller/physical.c b/controller/physical.c
index 0cfb158c8..58e4157f1 100644
--- a/controller/physical.c
+++ b/controller/physical.c
@@ -37,6 +37,7 @@
 #include "openvswitch/ofp-parse.h"
 #include "ovn-controller.h"
 #include "lib/chassis-index.h"
+#include "lib/mcast-group-index.h"
 #include "lib/ovn-sb-idl.h"
 #include "lib/ovn-util.h"
 #include "physical.h"
@@ -63,6 +64,7 @@ static void
 load_logical_ingress_metadata(const struct sbrec_port_binding *binding,
                               const struct zone_ids *zone_ids,
                               struct ofpbuf *ofpacts_p);
+static int64_t get_vxlan_port_key(int64_t port_key);
 
 /* UUID to identify OF flows not associated with ovsdb rows. */
 static struct uuid *hc_uuid = NULL;
@@ -160,8 +162,9 @@ put_encapsulation(enum mf_field_id mff_ovn_geneve,
     } else if (tun->type == VXLAN) {
         uint64_t vni = datapath->tunnel_key;
         if (!is_ramp_switch) {
-            /* Only some bits are used for regular tunnels. */
-            vni |= (uint64_t) outport << 12;
+            /* Map southbound 16-bit port key to limited 12-bit range
+             * available for VXLAN, which differs for multicast groups. */
+            vni |= get_vxlan_port_key(outport) << 12;
         }
         put_load(vni, MFF_TUN_ID, 0, 24, ofpacts);
     } else {
@@ -1372,6 +1375,43 @@ out:
     }
 }
 
+static int64_t
+get_vxlan_port_key(int64_t port_key)
+{
+    if (port_key >= OVN_MIN_MULTICAST) {
+        /* 0b1<11 least significant bits> */
+        return OVN_VXLAN_MIN_MULTICAST |
+            (port_key & (OVN_VXLAN_MIN_MULTICAST - 1));
+    }
+    return port_key;
+}
+
+static void
+fanout_to_chassis(enum mf_field_id mff_ovn_geneve,
+                  struct sset *remote_chassis,
+                  const struct hmap *chassis_tunnels,
+                  const struct sbrec_datapath_binding *datapath,
+                  uint16_t outport, bool is_ramp_switch,
+                  struct ofpbuf *remote_ofpacts)
+{
+    const char *chassis_name;
+    const struct chassis_tunnel *prev = NULL;
+    SSET_FOR_EACH (chassis_name, remote_chassis) {
+        const struct chassis_tunnel *tun
+            = chassis_tunnel_find(chassis_tunnels, chassis_name, NULL);
+        if (!tun) {
+            continue;
+        }
+
+        if (!prev || tun->type != prev->type) {
+            put_encapsulation(mff_ovn_geneve, tun, datapath,
+                              outport, is_ramp_switch, remote_ofpacts);
+            prev = tun;
+        }
+        ofpact_put_OUTPUT(remote_ofpacts)->port = tun->ofport;
+    }
+}
+
 static void
 consider_mc_group(struct ovsdb_idl_index *sbrec_port_binding_by_name,
                   enum mf_field_id mff_ovn_geneve,
@@ -1390,6 +1430,7 @@ consider_mc_group(struct ovsdb_idl_index *sbrec_port_binding_by_name,
     }
 
     struct sset remote_chassis = SSET_INITIALIZER(&remote_chassis);
+    struct sset vtep_chassis = SSET_INITIALIZER(&vtep_chassis);
     struct match match;
 
     match_init_catchall(&match);
@@ -1398,7 +1439,8 @@ consider_mc_group(struct ovsdb_idl_index *sbrec_port_binding_by_name,
 
     /* Go through all of the ports in the multicast group:
      *
-     *    - For remote ports, add the chassis to 'remote_chassis'.
+     *    - For remote ports, add the chassis to 'remote_chassis' or
+     *      'vtep_chassis'.
      *
      *    - For local ports (other than logical patch ports), add actions
      *      to 'ofpacts' to set the output port and resubmit.
@@ -1465,7 +1507,12 @@ consider_mc_group(struct ovsdb_idl_index *sbrec_port_binding_by_name,
             /* Add remote chassis only when localnet port not exist,
              * otherwise multicast will reach remote ports through localnet
              * port. */
-            sset_add(&remote_chassis, port->chassis->name);
+            if (smap_get_bool(&port->chassis->other_config,
+                              "is-vtep", false)) {
+                sset_add(&vtep_chassis, port->chassis->name);
+            } else {
+                sset_add(&remote_chassis, port->chassis->name);
+            }
         }
     }
 
@@ -1490,7 +1537,8 @@ consider_mc_group(struct ovsdb_idl_index *sbrec_port_binding_by_name,
      *
      * Handle output to the remote chassis in the multicast group, if
      * any. */
-    if (!sset_is_empty(&remote_chassis) || remote_ofpacts.size > 0) {
+    if (!sset_is_empty(&remote_chassis) ||
+            !sset_is_empty(&vtep_chassis) || remote_ofpacts.size > 0) {
         if (remote_ofpacts.size > 0) {
             /* Following delivery to logical patch ports, restore the
              * multicast group as the logical output port. */
@@ -1498,22 +1546,12 @@ consider_mc_group(struct ovsdb_idl_index *sbrec_port_binding_by_name,
                      &remote_ofpacts);
         }
 
-        const char *chassis_name;
-        const struct chassis_tunnel *prev = NULL;
-        SSET_FOR_EACH (chassis_name, &remote_chassis) {
-            const struct chassis_tunnel *tun
-                = chassis_tunnel_find(chassis_tunnels, chassis_name, NULL);
-            if (!tun) {
-                continue;
-            }
-
-            if (!prev || tun->type != prev->type) {
-                put_encapsulation(mff_ovn_geneve, tun, mc->datapath,
-                                  mc->tunnel_key, true, &remote_ofpacts);
-                prev = tun;
-            }
-            ofpact_put_OUTPUT(&remote_ofpacts)->port = tun->ofport;
-        }
+        fanout_to_chassis(mff_ovn_geneve, &remote_chassis, chassis_tunnels,
+                          mc->datapath, mc->tunnel_key, false,
+                          &remote_ofpacts);
+        fanout_to_chassis(mff_ovn_geneve, &vtep_chassis, chassis_tunnels,
+                          mc->datapath, mc->tunnel_key, true,
+                          &remote_ofpacts);
 
         if (remote_ofpacts.size) {
             if (local_ports) {
@@ -1527,6 +1565,7 @@ consider_mc_group(struct ovsdb_idl_index *sbrec_port_binding_by_name,
     ofpbuf_uninit(&ofpacts);
     ofpbuf_uninit(&remote_ofpacts);
     sset_destroy(&remote_chassis);
+    sset_destroy(&vtep_chassis);
 }
 
 bool
@@ -1689,6 +1728,27 @@ physical_run(struct physical_ctx *p_ctx,
                         &ofpacts, hc_uuid);
     }
 
+    /* Add VXLAN specific rules to transform port keys
+     * from 12 bits to 16 bits used elsewhere. */
+    HMAP_FOR_EACH (tun, hmap_node, p_ctx->chassis_tunnels) {
+        if (tun->type == VXLAN) {
+            ofpbuf_clear(&ofpacts);
+
+            struct match match = MATCH_CATCHALL_INITIALIZER;
+            match_set_in_port(&match, tun->ofport);
+            ovs_be64 mcast_bits = htonll((OVN_VXLAN_MIN_MULTICAST << 12));
+            match_set_tun_id_masked(&match, mcast_bits, mcast_bits);
+
+            put_load(1, MFF_LOG_OUTPORT, 15, 1, &ofpacts);
+            put_move(MFF_TUN_ID, 12, MFF_LOG_OUTPORT,  0, 11, &ofpacts);
+            put_move(MFF_TUN_ID, 0, MFF_LOG_DATAPATH, 0, 12, &ofpacts);
+            put_resubmit(OFTABLE_LOCAL_OUTPUT, &ofpacts);
+
+            ofctrl_add_flow(flow_table, OFTABLE_PHY_TO_LOG, 105, 0,
+                            &match, &ofpacts, hc_uuid);
+        }
+    }
+
     /* Handle ramp switch encapsulations. */
     HMAP_FOR_EACH (tun, hmap_node, p_ctx->chassis_tunnels) {
         if (tun->type != VXLAN) {
diff --git a/lib/mcast-group-index.h b/lib/mcast-group-index.h
index 72af117a4..5bc725451 100644
--- a/lib/mcast-group-index.h
+++ b/lib/mcast-group-index.h
@@ -23,6 +23,8 @@ struct sbrec_datapath_binding;
 #define OVN_MIN_MULTICAST 32768
 #define OVN_MAX_MULTICAST 65535
 
+#define OVN_VXLAN_MIN_MULTICAST 2048
+
 enum ovn_mcast_tunnel_keys {
 
     OVN_MCAST_FLOOD_TUNNEL_KEY = OVN_MIN_MULTICAST,
diff --git a/ovn-architecture.7.xml b/ovn-architecture.7.xml
index 3d2910358..5c5acfaab 100644
--- a/ovn-architecture.7.xml
+++ b/ovn-architecture.7.xml
@@ -2797,11 +2797,14 @@
     </li>
 
     <li>
-      12-bit logical egress port identifier.  IDs 0 through 32767 have the same
-      meaning as for logical ingress ports.  IDs 32768 through 65535,
-      inclusive, may be assigned to logical multicast groups (see the
-      <code>tunnel_key</code> column in the OVN Southbound
-      <code>Multicast_Group</code> table).
+      12-bit logical egress port identifier. IDs 0 through 2047 are used for
+      unicast output ports. IDs 2048 through 4095, inclusive, may be assigned
+      to logical multicast groups (see the <code>tunnel_key</code> column in
+      the OVN Southbound <code>Multicast_Group</code> table). For multicast
+      group tunnel keys, a special mapping scheme is used to internally
+      transform from internal OVN 16-bit keys to 12-bit values before sending
+      packets through a VXLAN tunnel, and back from 12-bit tunnel keys to
+      16-bit values when receiving packets from a VXLAN tunnel.
     </li>
 
     <li>
diff --git a/tests/ovn.at b/tests/ovn.at
index 172b5c713..3ae4ef752 100644
--- a/tests/ovn.at
+++ b/tests/ovn.at
@@ -3271,8 +3271,13 @@ AT_SETUP([VLAN transparency, passthru=true, ARP responder disabled])
 ovn_start
 
 net_add net
-check ovs-vsctl add-br br-phys
-ovn_attach net br-phys 192.168.0.1
+for i in 1 2; do
+    sim_add hv-$i
+    as hv-$i
+    check ovs-vsctl add-br br-phys
+    check ovs-vsctl set open . external-ids:ovn-bridge-mappings=phys:br-phys
+    ovn_attach net br-phys 192.168.0.$i 24 vxlan
+done
 
 check ovn-nbctl ls-add ls
 check ovn-nbctl --wait=sb add Logical-Switch ls other_config vlan-passthru=true
@@ -3283,6 +3288,7 @@ for i in 1 2; do
 done
 
 for i in 1 2; do
+    as hv-$i
     check ovs-vsctl add-port br-int vif$i -- set Interface vif$i external-ids:iface-id=lsp$i \
                                   options:tx_pcap=vif$i-tx.pcap \
                                   options:rxq_pcap=vif$i-rx.pcap \
@@ -3298,18 +3304,27 @@ AT_CHECK([grep -w "ls_in_arp_rsp" lsflows | sort], [0], [dnl
   table=16(ls_in_arp_rsp      ), priority=0    , match=(1), action=(next;)
 ])
 
+for i in 1 2; do
+    : > $i.expected
+done
+
 test_arp() {
     local inport=$1 outport=$2 sha=$3 spa=$4 tpa=$5 reply_ha=$6
     tag=8100fefe
     local request=ffffffffffff${sha}${tag}08060001080006040001${sha}${spa}ffffffffffff${tpa}
-    ovs-appctl netdev-dummy/receive vif$inport $request
+    as hv-$inport ovs-appctl netdev-dummy/receive vif$inport $request
     echo $request >> $outport.expected
 
     local reply=${sha}${reply_ha}${tag}08060001080006040002${reply_ha}${tpa}${sha}${spa}
-    ovs-appctl netdev-dummy/receive vif$outport $reply
+    as hv-$outport ovs-appctl netdev-dummy/receive vif$outport $reply
     echo $reply >> $inport.expected
 }
 
+# Pre-populate the hypervisors' ARP tables so that we don't lose any
+# packets for ARP resolution (native tunneling doesn't queue packets
+# for ARP resolution).
+OVN_POPULATE_ARP
+
 test_arp 1 2 f00000000001 0a000001 0a000002 f00000000002
 test_arp 2 1 f00000000002 0a000002 0a000001 f00000000001
 
-- 
2.31.1



More information about the dev mailing list