[ovs-dev] [PATCH v5 ovn] Replace chassis mac with router port mac on destination chassis

Ankur Sharma ankur.sharma at nutanix.com
Tue Sep 10 21:22:44 UTC 2019


During E-W routing for vlan backed networks, we replace router port
mac with chassis mac, when packet leaves the source hypervisor.

As a result, the destination VM (on remote hypervisor) will see
chassis mac as source mac in received packet.

Although, functionality wise this does not cause any issue,
however chassis mac being see as source on VM, will
lead to following:
a. INCONSISTENT SOURCE MAC:
   If the destination VM moves to same hypervisor as sender,
   then it will see router port mac as source mac. Whereas, on
   a remote hypervisor, source mac will be the sender chassis mac.

   This will cause inconsistency in packet headers for the same
   flow and could be confusing for someone looking at packet
   captures inside the vm.

b. SYSTEM MAC BEING EXPOSED TO VM:
   Chassis mac is a CMS provided mac, i.e it is an infrastructure
   mac. It is not a good practice to expose such values to VM,
   which should not be seeing them in first place.

In order to replace chassis mac with router port mac, we will
do following.

a. Create conjunction for each chassis mac and router port vlan
   id combination. For example, for a 2 node chassis setup, where
   we have a logical router, connected to 4 logical switches with
   vlan ids: 2000, 1000, 0 and 24, the conjunction flow will look
   like following:

   cookie=0x0, duration=9094.608s, table=0, n_packets=0, n_bytes=0, idle_age=9094, priority=180,dl_src=aa:bb:cc:dd:ee:22 actions=conjunction(100,1/2)
   cookie=0x0, duration=9094.608s, table=0, n_packets=0, n_bytes=0, idle_age=9094, priority=180,dl_src=aa:bb:cc:dd:ff:ee actions=conjunction(100,1/2)

   cookie=0x0, duration=9094.552s, table=0, n_packets=0, n_bytes=0, idle_age=9094, priority=180,dl_vlan=2000 actions=conjunction(100,2/2)
   cookie=0x0, duration=9094.552s, table=0, n_packets=0, n_bytes=0, idle_age=9094, priority=180,dl_vlan=1000 actions=conjunction(100,2/2)
   cookie=0x0, duration=9094.552s, table=0, n_packets=0, n_bytes=0, idle_age=9094, priority=180,vlan_tci=0x0000/0x1fff actions=conjunction(100,2/2)
   cookie=0x0, duration=9094.552s, table=0, n_packets=0, n_bytes=0, idle_age=9094, priority=180,dl_vlan=24 actions=conjunction(100,2/2)

b. Using this conjunction as match, we can identify if packet entering destination
   hypervisor was routed at the source or not. This will be done in table=0 (Physical to logical)
   at priority=180.
   For example:
   cookie=0x0, duration=9795.957s, table=0, n_packets=1391, n_bytes=141882, idle_age=8396, priority=180,conj_id=100,in_port=146,dl_vlan=1000 actions=.........,mod_dl_src:00:00:01:01:02:03,...

c. We use conjunction, as it will ensure that we do not end up having lot of flows
   as we scale up. If we do not use conjunction, then we will have
   N (number of chassis macs) X M (number of router vlans) number of ovs flows.
   Conjunction converts it to N + M.
   Consider a setup, with 500 Chassis and 500 routed vlans.
   Without conjunction we will need 25000 (500 * 500) flows,
   whereas with conjunction that number comes down to 1000 (500 + 500).

Signed-off-by: Ankur Sharma <ankur.sharma at nutanix.com>
---
 controller/chassis.c        |   2 +-
 controller/chassis.h        |   3 +
 controller/ovn-controller.c |   5 +
 controller/physical.c       | 222 ++++++++++++++++++++++++++++++++++++++++++--
 controller/physical.h       |   1 +
 ovn-architecture.7.xml      |  10 +-
 tests/ovn.at                |  14 ++-
 7 files changed, 244 insertions(+), 13 deletions(-)

diff --git a/controller/chassis.c b/controller/chassis.c
index 937c557..699b662 100644
--- a/controller/chassis.c
+++ b/controller/chassis.c
@@ -144,7 +144,7 @@ get_bridge_mappings(const struct smap *ext_ids)
     return smap_get_def(ext_ids, "ovn-bridge-mappings", "");
 }
 
-static const char *
+const char *
 get_chassis_mac_mappings(const struct smap *ext_ids)
 {
     return smap_get_def(ext_ids, "ovn-chassis-mac-mappings", "");
diff --git a/controller/chassis.h b/controller/chassis.h
index eb46ca3..178d295 100644
--- a/controller/chassis.h
+++ b/controller/chassis.h
@@ -27,6 +27,7 @@ struct sbrec_chassis;
 struct sbrec_chassis_table;
 struct sset;
 struct eth_addr;
+struct smap;
 
 void chassis_register_ovs_idl(struct ovsdb_idl *);
 const struct sbrec_chassis *chassis_run(
@@ -42,5 +43,7 @@ bool chassis_get_mac(const struct sbrec_chassis *chassis,
                      const char *bridge_mapping,
                      struct eth_addr *chassis_mac);
 const char *chassis_get_id(void);
+const char * get_chassis_mac_mappings(const struct smap *ext_ids);
+
 
 #endif /* controller/chassis.h */
diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c
index e27b56b..b5449b8 100644
--- a/controller/ovn-controller.c
+++ b/controller/ovn-controller.c
@@ -1268,9 +1268,14 @@ en_flow_output_run(struct engine_node *node)
         (struct sbrec_port_binding_table *)EN_OVSDB_GET(
             engine_get_input("SB_port_binding", node));
 
+    struct sbrec_chassis_table *chassis_table =
+        (struct sbrec_chassis_table *)EN_OVSDB_GET(
+            engine_get_input("SB_chassis", node));
+
     physical_run(sbrec_port_binding_by_name,
                  multicast_group_table,
                  port_binding_table,
+                 chassis_table,
                  mff_ovn_geneve,
                  br_int, chassis, ct_zones,
                  local_datapaths, local_lports,
diff --git a/controller/physical.c b/controller/physical.c
index f28c5f0..43113e4 100644
--- a/controller/physical.c
+++ b/controller/physical.c
@@ -47,9 +47,23 @@
 
 VLOG_DEFINE_THIS_MODULE(physical);
 
+/* Datapath zone IDs for connection tracking and NAT */
+struct zone_ids {
+    int ct;                     /* MFF_LOG_CT_ZONE. */
+    int dnat;                   /* MFF_LOG_DNAT_ZONE. */
+    int snat;                   /* MFF_LOG_SNAT_ZONE. */
+};
+
+static void
+load_logical_ingress_metadata(const struct sbrec_port_binding *binding,
+                              const struct zone_ids *zone_ids,
+                              struct ofpbuf *ofpacts_p);
+
 /* UUID to identify OF flows not associated with ovsdb rows. */
 static struct uuid *hc_uuid = NULL;
 
+#define CHASSIS_MAC_TO_ROUTER_MAC_CONJID        100
+
 void
 physical_register_ovs_idl(struct ovsdb_idl *ovs_idl)
 {
@@ -199,12 +213,6 @@ get_localnet_port(const struct hmap *local_datapaths, int64_t tunnel_key)
     return ld ? ld->localnet_port : NULL;
 }
 
-/* Datapath zone IDs for connection tracking and NAT */
-struct zone_ids {
-    int ct;                     /* MFF_LOG_CT_ZONE. */
-    int dnat;                   /* MFF_LOG_DNAT_ZONE. */
-    int snat;                   /* MFF_LOG_SNAT_ZONE. */
-};
 
 static struct zone_ids
 get_zone_ids(const struct sbrec_port_binding *binding,
@@ -388,6 +396,200 @@ put_remote_port_redirect_overlay(const struct
                     match, ofpacts_p, &binding->header_.uuid);
 }
 
+
+static struct hmap remote_chassis_macs =
+              HMAP_INITIALIZER(&remote_chassis_macs);
+
+/* Maps from a physical network name to the chassis macs of remote chassis. */
+struct remote_chassis_mac {
+    struct hmap_node hmap_node;
+    char *chassis_mac;
+    char *chassis_id;
+};
+
+static void
+populate_remote_chassis_macs(const struct sbrec_chassis *my_chassis,
+                             const struct sbrec_chassis_table *chassis_table)
+{
+    const struct sbrec_chassis *chassis;
+    SBREC_CHASSIS_TABLE_FOR_EACH (chassis, chassis_table) {
+
+        /* We want only remote chassis macs. */
+        if (!strcmp(my_chassis->name, chassis->name)) {
+            continue;
+        }
+
+        const char *tokens
+            = get_chassis_mac_mappings(&chassis->external_ids);
+
+        if (!strlen(tokens)) {
+            continue;
+        }
+
+        char *save_ptr = NULL;
+        char *token;
+        char *tokstr = xstrdup(tokens);
+
+        /* Format for a chassis mac configuration is:
+         * ovn-chassis-mac-mappings="bridge-name1:MAC1,bridge-name2:MAC2"
+         */
+        for (token = strtok_r(tokstr, ",", &save_ptr);
+             token != NULL;
+             token = strtok_r(NULL, ",", &save_ptr)) {
+            char *save_ptr2 = NULL;
+            char *chassis_mac_bridge = strtok_r(token, ":", &save_ptr2);
+            char *chassis_mac_str = strtok_r(NULL, "", &save_ptr2);
+            struct remote_chassis_mac *remote_chassis_mac = NULL;
+            remote_chassis_mac = xmalloc(sizeof *remote_chassis_mac);
+            hmap_insert(&remote_chassis_macs, &remote_chassis_mac->hmap_node,
+                        hash_string(chassis_mac_bridge, 0));
+            remote_chassis_mac->chassis_mac = xstrdup(chassis_mac_str);
+            remote_chassis_mac->chassis_id = xstrdup(chassis->name);
+        }
+        free(tokstr);
+    }
+}
+
+static void
+free_remote_chassis_macs(void)
+{
+    struct remote_chassis_mac *mac, *next_mac;
+
+    HMAP_FOR_EACH_SAFE (mac, next_mac, hmap_node, &remote_chassis_macs) {
+        hmap_remove(&remote_chassis_macs, &mac->hmap_node);
+        free(mac->chassis_mac);
+        free(mac->chassis_id);
+        free(mac);
+    }
+}
+
+static void
+put_chassis_mac_conj_id_flow(const struct sbrec_chassis_table *chassis_table,
+                             const struct sbrec_chassis *chassis,
+                             struct ofpbuf *ofpacts_p,
+                             struct ovn_desired_flow_table *flow_table)
+{
+    struct match match;
+    struct remote_chassis_mac *mac;
+
+    populate_remote_chassis_macs(chassis, chassis_table);
+
+    HMAP_FOR_EACH (mac, hmap_node, &remote_chassis_macs) {
+        struct eth_addr chassis_mac;
+        char *err_str = NULL;
+        struct ofpact_conjunction *conj;
+
+        if ((err_str = str_to_mac(mac->chassis_mac, &chassis_mac))) {
+            free(err_str);
+            free_remote_chassis_macs();
+            return;
+        }
+
+        ofpbuf_clear(ofpacts_p);
+        match_init_catchall(&match);
+
+
+        match_set_dl_src(&match, chassis_mac);
+
+        conj = ofpact_put_CONJUNCTION(ofpacts_p);
+        conj->id = CHASSIS_MAC_TO_ROUTER_MAC_CONJID;
+        conj->n_clauses = 2;
+        conj->clause = 0;
+        ofctrl_add_flow(flow_table, OFTABLE_PHY_TO_LOG, 180, 0,
+                        &match, ofpacts_p, hc_uuid);
+    }
+
+    free_remote_chassis_macs();
+}
+
+static void
+put_replace_chassis_mac_flows(const struct simap *ct_zones,
+                              const struct
+                              sbrec_port_binding *localnet_port,
+                              const struct hmap *local_datapaths,
+                              struct ofpbuf *ofpacts_p,
+                              ofp_port_t ofport,
+                              struct ovn_desired_flow_table *flow_table)
+{
+    /* Packets arriving on localnet port, could have been routed on
+     * source chassis and hence will have a chassis mac.
+     * conj_match will match source mac with chassis macs conjunction
+     * and replace it with corresponding router port mac.
+     */
+    struct local_datapath *ld = get_local_datapath(local_datapaths,
+                                                   localnet_port->datapath->
+                                                   tunnel_key);
+    ovs_assert(ld);
+
+    int tag = localnet_port->tag ? *localnet_port->tag : 0;
+    struct zone_ids zone_ids = get_zone_ids(localnet_port, ct_zones);
+
+    for (int i = 0; i < ld->n_peer_ports; i++) {
+        const struct sbrec_port_binding *rport_binding = ld->peer_ports[i];
+        struct eth_addr router_port_mac;
+        char *err_str = NULL;
+        struct match match;
+        struct ofpact_mac *replace_mac;
+
+        ovs_assert(rport_binding->n_mac == 1);
+        if ((err_str = str_to_mac(rport_binding->mac[0], &router_port_mac))) {
+            /* Parsing of mac failed. */
+            VLOG_WARN("Parsing or router port mac failed for router port: %s, "
+                    "with error: %s", rport_binding->logical_port, err_str);
+            free(err_str);
+            return;
+        }
+        ofpbuf_clear(ofpacts_p);
+        match_init_catchall(&match);
+
+        /* Add flow, which will match on conjunction id and will
+         * replace source with router port mac */
+
+        /* Match on ingress port, vlan_id and conjunction id */
+        match_set_in_port(&match, ofport);
+        match_set_conj_id(&match, CHASSIS_MAC_TO_ROUTER_MAC_CONJID);
+
+        if (tag) {
+            match_set_dl_vlan(&match, htons(tag), 0);
+        } else {
+            match_set_dl_tci_masked(&match, 0, htons(VLAN_CFI));
+        }
+
+        /* Actions */
+
+        if (tag) {
+            ofpact_put_STRIP_VLAN(ofpacts_p);
+        }
+        load_logical_ingress_metadata(localnet_port, &zone_ids, ofpacts_p);
+        replace_mac = ofpact_put_SET_ETH_SRC(ofpacts_p);
+        replace_mac->mac = router_port_mac;
+
+        /* Resubmit to first logical ingress pipeline table. */
+        put_resubmit(OFTABLE_LOG_INGRESS_PIPELINE, ofpacts_p);
+        ofctrl_add_flow(flow_table, OFTABLE_PHY_TO_LOG,
+                        180, 0, &match, ofpacts_p, hc_uuid);
+
+        /* Provide second search criteria, i.e localnet port's
+         * vlan ID for conjunction flow */
+        struct ofpact_conjunction *conj;
+        ofpbuf_clear(ofpacts_p);
+        match_init_catchall(&match);
+
+        if (tag) {
+            match_set_dl_vlan(&match, htons(tag), 0);
+        } else {
+            match_set_dl_tci_masked(&match, 0, htons(VLAN_CFI));
+        }
+
+        conj = ofpact_put_CONJUNCTION(ofpacts_p);
+        conj->id = CHASSIS_MAC_TO_ROUTER_MAC_CONJID;
+        conj->n_clauses = 2;
+        conj->clause = 1;
+        ofctrl_add_flow(flow_table, OFTABLE_PHY_TO_LOG, 180, 0, &match,
+                        ofpacts_p, hc_uuid);
+    }
+}
+
 static void
 put_replace_router_port_mac_flows(struct ovsdb_idl_index
                                   *sbrec_port_binding_by_name,
@@ -934,6 +1136,11 @@ consider_port_binding(struct ovsdb_idl_index *sbrec_port_binding_by_name,
                             &binding->header_.uuid);
         }
 
+        if (!strcmp(binding->type, "localnet")) {
+            put_replace_chassis_mac_flows(ct_zones, binding, local_datapaths,
+                                          ofpacts_p, ofport, flow_table);
+        }
+
         /* Table 65, Priority 100.
          * =======================
          *
@@ -1227,6 +1434,7 @@ void
 physical_run(struct ovsdb_idl_index *sbrec_port_binding_by_name,
              const struct sbrec_multicast_group_table *multicast_group_table,
              const struct sbrec_port_binding_table *port_binding_table,
+             const struct sbrec_chassis_table *chassis_table,
              enum mf_field_id mff_ovn_geneve,
              const struct ovsrec_bridge *br_int,
              const struct sbrec_chassis *chassis,
@@ -1372,6 +1580,8 @@ physical_run(struct ovsdb_idl_index *sbrec_port_binding_by_name,
     struct ofpbuf ofpacts;
     ofpbuf_init(&ofpacts, 0);
 
+    put_chassis_mac_conj_id_flow(chassis_table, chassis, &ofpacts, flow_table);
+
     /* Set up flows in table 0 for physical-to-logical translation and in table
      * 64 for logical-to-physical translation. */
     const struct sbrec_port_binding *binding;
diff --git a/controller/physical.h b/controller/physical.h
index a85e297..c93f6b1 100644
--- a/controller/physical.h
+++ b/controller/physical.h
@@ -46,6 +46,7 @@ void physical_register_ovs_idl(struct ovsdb_idl *);
 void physical_run(struct ovsdb_idl_index *sbrec_port_binding_by_name,
                   const struct sbrec_multicast_group_table *,
                   const struct sbrec_port_binding_table *,
+                  const struct sbrec_chassis_table *chassis_table,
                   enum mf_field_id mff_ovn_geneve,
                   const struct ovsrec_bridge *br_int,
                   const struct sbrec_chassis *chassis,
diff --git a/ovn-architecture.7.xml b/ovn-architecture.7.xml
index 415da59..6115e84 100644
--- a/ovn-architecture.7.xml
+++ b/ovn-architecture.7.xml
@@ -1426,7 +1426,7 @@
         (MAC,VLAN) tuple will seen by physical network from other chassis as
         well, which could cause these issues:
       </p>
-      
+
       <ul>
         <li>
           Continuous MAC moves in top-of-rack switch (ToR).
@@ -1442,9 +1442,11 @@
 
     <li>
       The destination chassis receives the packet via the localnet port and
-      sends it to the integration bridge. The packet enters the
-      ingress pipeline and then egress pipeline of the destination localnet
-      logical switch and finally gets delivered to the destination VM port.
+      sends it to the integration bridge. Before entering the integration
+      bridge the source mac of the packet will be replaced with
+      router port mac again. The packet enters the ingress pipeline and
+      then egress pipeline of the destination localnet logical switch and
+      finally gets delivered to the destination VM port.
     </li>
   </ol>
 
diff --git a/tests/ovn.at b/tests/ovn.at
index de1b3b3..2a35b4e 100644
--- a/tests/ovn.at
+++ b/tests/ovn.at
@@ -14278,6 +14278,14 @@ hv_to_chassis_mac () {
     esac
 }
 
+lrp_to_lrp_mac () {
+     case $1 in dnl (
+        router-to-ls[[1]]) echo 000001010203 ;; dnl (
+        router-to-ls[[2]]) echo 000001010205 ;; dnl (
+        *) AT_FAIL_IF([:]) ;;
+    esac
+}
+
 ip_to_hex() {
        printf "%02x%02x%02x%02x" "$@"
 }
@@ -14345,7 +14353,9 @@ test_ip() {
             # (and checksum).
             outport_num=`vif_to_num $outport`
             out_lrp=`vif_to_lrp $outport`
-            echo f000000000${outport_num}aabbccddee${hv_num}${hv_num}08004500001c00000000"3f1101"00${src_ip}${dst_ip}0035111100080000
+            lrp_mac=`lrp_to_lrp_mac $out_lrp`
+            echo f000000000${outport_num}${lrp_mac}08004500001c00000000"3f1101"00${src_ip}${dst_ip}0035111100080000
+
         fi >> $outport.expected
     done
 }
@@ -15720,7 +15730,7 @@ test_ip() {
                 out_lrp=`vif_to_lrp $outport`
                 # For North-South, packet will come via gateway chassis, i.e hv3
                 if test $inport = vif-north; then
-                    echo f00000000011aabbccddee3308004500001c000000003f110100${src_ip}${dst_ip}0035111100080000 >> $outport.expected
+                    echo f0000000001100000101020308004500001c000000003f110100${src_ip}${dst_ip}0035111100080000 >> $outport.expected
                 fi
                 if test $outport = vif-north; then
                     echo f0f00000001100000101020708004500001c000000003e110200${src_ip}${dst_ip}0035111100080000 >> $outport.expected
-- 
1.8.3.1



More information about the dev mailing list