[ovs-dev] [PATCH v4 2/3] userspace: add layer 3 flow and switching support

Lorand Jakab lojakab at cisco.com
Fri Jun 27 13:21:54 UTC 2014


This commit relaxes the assumption that all packets have an Ethernet
header, and adds support for layer 3 flows.  For each packet received on
the Linux kernel datapath the l2 and l3 members of struct ofpbuf are
initialized appropriately, and some functions now expect this (notably
flow_extract()), in order to differentiate between layer 2 and layer 3
packets.  struct flow now has a new 'base_layer' member, because we
cannot assume that a flow has no Ethernet header when eth_src and
eth_dst are 0.  For layer 3 packets, the protocol type is still stored
in the eth_type member.

Switching L2->L3 and L3->L2 are both implemented by adding the pop_eth
and push_eth actions respectively when a transition is detected.  The
push_eth action puts 0s on both source and destination MACs.  These
addresses can be modified with mod_dl_dst and mod_dl_src actions.

Added new prerequisite MFP_ETHERNET for fields MFF_ETH_SRC, MFF_ETH_DST,
MFF_VLAN_TCI, MFF_DL_VLAN, MFF_VLAN_VID and MFF_DL_VLAN_PCP.

Signed-off-by: Lorand Jakab <lojakab at cisco.com>
---
 lib/bfd.c                    |   1 +
 lib/dpif-linux.c             |   8 +++
 lib/dpif.c                   |   6 ++-
 lib/flow.c                   | 113 +++++++++++++++++++++++++++----------------
 lib/flow.h                   |  16 ++++--
 lib/match.c                  |  12 +++--
 lib/meta-flow.c              |  17 ++++---
 lib/meta-flow.h              |   1 +
 lib/netdev-dummy.c           |   1 +
 lib/netdev-linux.c           |   1 +
 lib/nx-match.c               |   2 +-
 lib/odp-util.c               |  34 +++++++++----
 lib/odp-util.h               |   2 +-
 lib/ofp-print.c              |  19 +++++---
 lib/ofp-print.h              |   3 +-
 lib/ofp-util.c               |   2 +-
 lib/ofpbuf.h                 |  12 +++--
 lib/packets.c                |   2 +
 lib/pcap-file.c              |   1 +
 ofproto/ofproto-dpif-xlate.c |  28 ++++++++---
 ofproto/ofproto-dpif-xlate.h |   2 +-
 ofproto/ofproto-dpif.c       |   4 +-
 ofproto/ofproto.c            |   1 +
 tests/ofproto-dpif.at        |   6 +--
 tests/vlan-splinters.at      |   4 +-
 25 files changed, 204 insertions(+), 94 deletions(-)

diff --git a/lib/bfd.c b/lib/bfd.c
index 4cbe999..a2eba47 100644
--- a/lib/bfd.c
+++ b/lib/bfd.c
@@ -599,6 +599,7 @@ bfd_put_packet(struct bfd *bfd, struct ofpbuf *p,
     ovs_assert(!(bfd->flags & FLAG_POLL) || !(bfd->flags & FLAG_FINAL));
 
     ofpbuf_reserve(p, 2); /* Properly align after the ethernet header. */
+    ofpbuf_set_frame(p, ofpbuf_data(p));
     eth = ofpbuf_put_uninit(p, sizeof *eth);
     memcpy(eth->eth_src, eth_src, ETH_ADDR_LEN);
     memcpy(eth->eth_dst, bfd->eth_dst, ETH_ADDR_LEN);
diff --git a/lib/dpif-linux.c b/lib/dpif-linux.c
index 0eac3e7..244b1cd 100644
--- a/lib/dpif-linux.c
+++ b/lib/dpif-linux.c
@@ -1744,6 +1744,14 @@ parse_odp_packet(struct ofpbuf *buf, struct dpif_upcall *upcall,
     ofpbuf_set_data(&upcall->packet,
                     (char *)ofpbuf_data(&upcall->packet) + sizeof(struct nlattr));
     ofpbuf_set_size(&upcall->packet, nl_attr_get_size(a[OVS_PACKET_ATTR_PACKET]));
+    ofpbuf_set_frame(&upcall->packet, ofpbuf_data(&upcall->packet));
+
+    /* Set the correct layer based on the presence of OVS_KEY_ATTR_ETHERNET */
+    if (nl_attr_find__(upcall->key, upcall->key_len, OVS_KEY_ATTR_ETHERNET)) {
+	ofpbuf_set_l3(&upcall->packet, NULL);
+    } else {
+        upcall->packet.l3_ofs = 0;
+    }
 
     *dp_ifindex = ovs_header->dp_ifindex;
 
diff --git a/lib/dpif.c b/lib/dpif.c
index befe7c4..db76644 100644
--- a/lib/dpif.c
+++ b/lib/dpif.c
@@ -1361,7 +1361,8 @@ dpif_recv(struct dpif *dpif, uint32_t handler_id, struct dpif_upcall *upcall,
         char *packet;
 
         packet = ofp_packet_to_string(ofpbuf_data(&upcall->packet),
-                                      ofpbuf_size(&upcall->packet));
+                                      ofpbuf_size(&upcall->packet),
+                                      ofpbuf_is_layer3_packet(&upcall->packet));
 
         ds_init(&flow);
         odp_flow_key_format(upcall->key, upcall->key_len, &flow);
@@ -1582,7 +1583,8 @@ log_execute_message(struct dpif *dpif, const struct dpif_execute *execute,
         char *packet;
 
         packet = ofp_packet_to_string(ofpbuf_data(execute->packet),
-                                      ofpbuf_size(execute->packet));
+                                      ofpbuf_size(execute->packet),
+                                      ofpbuf_is_layer3_packet(execute->packet));
         ds_put_format(&ds, "%s: %sexecute ",
                       dpif_name(dpif),
                       (subexecute ? "sub-"
diff --git a/lib/flow.c b/lib/flow.c
index 5ee5b3f..85cd037 100644
--- a/lib/flow.c
+++ b/lib/flow.c
@@ -121,7 +121,7 @@ struct mf_ctx {
  * away.  Some GCC versions gave warnigns on ALWAYS_INLINE, so these are
  * defined as macros. */
 
-#if (FLOW_WC_SEQ != 26)
+#if (FLOW_WC_SEQ != 28)
 #define MINIFLOW_ASSERT(X) ovs_assert(X)
 #else
 #define MINIFLOW_ASSERT(X)
@@ -323,18 +323,35 @@ invalid:
     return false;
 }
 
-/* Initializes 'flow' members from 'packet' and 'md'
+/* Determines IP version if a layer 3 packet */
+static ovs_be16
+get_l3_eth_type(struct ofpbuf *packet)
+{
+    struct ip_header *ip = ofpbuf_l3(packet);
+    int ip_ver = IP_VER(ip->ip_ihl_ver);
+    switch (ip_ver) {
+    case 4:
+        return htons(ETH_TYPE_IP);
+    case 6:
+        return htons(ETH_TYPE_IPV6);
+    default:
+        return 0;
+    }
+}
+
+/* Initializes 'flow' members from 'packet' and 'md'.  Expects packet->frame
+ * pointer to be equal to ofpbuf_data(packet), and packet->l3_ofs to be set to
+ * 0 for layer 3 packets.
  *
- * Initializes 'packet' header l2 pointer to the start of the Ethernet
- * header, and the layer offsets as follows:
+ * Initializes the layer offsets as follows:
  *
  *    - packet->l2_5_ofs to the start of the MPLS shim header, or UINT16_MAX
- *      when there is no MPLS shim header.
+ *      when there is no MPLS shim header or no Ethernet header.
  *
- *    - packet->l3_ofs to just past the Ethernet header, or just past the
- *      vlan_header if one is present, to the first byte of the payload of the
- *      Ethernet frame.  UINT16_MAX if the frame is too short to contain an
- *      Ethernet header.
+ *    - packet->l3_ofs (if not 0) to just past the Ethernet header, or just
+ *      past the vlan_header if one is present, to the first byte of the
+ *      payload of the Ethernet frame.  UINT16_MAX if the frame is too short to
+ *      contain an Ethernet header.
  *
  *    - packet->l4_ofs to just past the IPv4 header, if one is present and
  *      has at least the content used for the fields of interest for the flow,
@@ -351,6 +368,8 @@ flow_extract(struct ofpbuf *packet, const struct pkt_metadata *md,
 
     COVERAGE_INC(flow_extract);
 
+    ovs_assert(packet->frame == ofpbuf_data(packet));
+
     miniflow_initialize(&m.mf, m.buf);
     miniflow_extract(packet, md, &m.mf);
     miniflow_expand(&m.mf, flow);
@@ -366,7 +385,7 @@ miniflow_extract(struct ofpbuf *packet, const struct pkt_metadata *md,
     size_t size = ofpbuf_size(packet);
     uint32_t *values = miniflow_values(dst);
     struct mf_ctx mf = { 0, values, values + FLOW_U32S };
-    char *l2;
+    char *frame = NULL;
     ovs_be16 dl_type;
     uint8_t nw_frag, nw_tos, nw_ttl, nw_proto;
 
@@ -382,40 +401,48 @@ miniflow_extract(struct ofpbuf *packet, const struct pkt_metadata *md,
         miniflow_push_uint32(mf, in_port, odp_to_u32(md->in_port.odp_port));
     }
 
-    /* Initialize packet's layer pointer and offsets. */
-    l2 = data;
-    ofpbuf_set_frame(packet, data);
-
-    /* Must have full Ethernet header to proceed. */
-    if (OVS_UNLIKELY(size < sizeof(struct eth_header))) {
-        goto out;
+    if (packet->l3_ofs) {
+	frame = data;
+	miniflow_push_uint32(mf, base_layer, LAYER_2);
+
+	/* Must have full Ethernet header to proceed. */
+	if (OVS_UNLIKELY(size < sizeof(struct eth_header))) {
+	    goto out;
+	} else {
+	    ovs_be16 vlan_tci;
+
+	    /* Link layer. */
+	    BUILD_ASSERT(offsetof(struct flow, dl_dst) + 6
+			 == offsetof(struct flow, dl_src));
+	    miniflow_push_words(mf, dl_dst, data, ETH_ADDR_LEN * 2 / 4);
+	    /* dl_type, vlan_tci. */
+	    vlan_tci = parse_vlan(&data, &size);
+	    dl_type = parse_ethertype(&data, &size);
+	    miniflow_push_be16(mf, dl_type, dl_type);
+	    miniflow_push_be16(mf, vlan_tci, vlan_tci);
+	}
+
+	/* Parse mpls. */
+	if (OVS_UNLIKELY(eth_type_mpls(dl_type))) {
+	    int count;
+	    const void *mpls = data;
+
+	    packet->l2_5_ofs = (char *)data - frame;
+	    count = parse_mpls(&data, &size);
+	    miniflow_push_words(mf, mpls_lse, mpls, count);
+	}
+
+	/* Network layer. */
+	packet->l3_ofs = (char *)data - frame;
     } else {
-        ovs_be16 vlan_tci;
+	miniflow_push_uint32(mf, base_layer, LAYER_3);
 
-        /* Link layer. */
-        BUILD_ASSERT(offsetof(struct flow, dl_dst) + 6
-                     == offsetof(struct flow, dl_src));
-        miniflow_push_words(mf, dl_dst, data, ETH_ADDR_LEN * 2 / 4);
-        /* dl_type, vlan_tci. */
-        vlan_tci = parse_vlan(&data, &size);
-        dl_type = parse_ethertype(&data, &size);
-        miniflow_push_be16(mf, dl_type, dl_type);
-        miniflow_push_be16(mf, vlan_tci, vlan_tci);
+	/* We assume L3 packets are either IPv4 or IPv6 */
+	dl_type = get_l3_eth_type(packet);
+	miniflow_push_be16(mf, dl_type, dl_type);
+	miniflow_push_be16(mf, vlan_tci, 0);
     }
 
-    /* Parse mpls. */
-    if (OVS_UNLIKELY(eth_type_mpls(dl_type))) {
-        int count;
-        const void *mpls = data;
-
-        packet->l2_5_ofs = (char *)data - l2;
-        count = parse_mpls(&data, &size);
-        miniflow_push_words(mf, mpls_lse, mpls, count);
-    }
-
-    /* Network layer. */
-    packet->l3_ofs = (char *)data - l2;
-
     nw_frag = 0;
     if (OVS_LIKELY(dl_type == htons(ETH_TYPE_IP))) {
         const struct ip_header *nh = data;
@@ -566,7 +593,7 @@ miniflow_extract(struct ofpbuf *packet, const struct pkt_metadata *md,
         goto out;
     }
 
-    packet->l4_ofs = (char *)data - l2;
+    packet->l4_ofs = (char *)data - frame;
     miniflow_push_be32(mf, nw_frag,
                        BYTES_TO_BE32(nw_frag, nw_tos, nw_ttl, nw_proto));
 
@@ -665,7 +692,7 @@ flow_unwildcard_tp_ports(const struct flow *flow, struct flow_wildcards *wc)
 void
 flow_get_metadata(const struct flow *flow, struct flow_metadata *fmd)
 {
-    BUILD_ASSERT_DECL(FLOW_WC_SEQ == 27);
+    BUILD_ASSERT_DECL(FLOW_WC_SEQ == 28);
 
     fmd->dp_hash = flow->dp_hash;
     fmd->recirc_id = flow->recirc_id;
@@ -1327,7 +1354,7 @@ flow_push_mpls(struct flow *flow, int n, ovs_be16 mpls_eth_type,
         flow->mpls_lse[0] = set_mpls_lse_values(ttl, tc, 1, htonl(label));
 
         /* Clear all L3 and L4 fields. */
-        BUILD_ASSERT(FLOW_WC_SEQ == 27);
+        BUILD_ASSERT(FLOW_WC_SEQ == 28);
         memset((char *) flow + FLOW_SEGMENT_2_ENDS_AT, 0,
                sizeof(struct flow) - FLOW_SEGMENT_2_ENDS_AT);
     }
diff --git a/lib/flow.h b/lib/flow.h
index 1bce46c..89ee577 100644
--- a/lib/flow.h
+++ b/lib/flow.h
@@ -38,7 +38,7 @@ struct pkt_metadata;
 /* This sequence number should be incremented whenever anything involving flows
  * or the wildcarding of flows changes.  This will cause build assertion
  * failures in places which likely need to be updated. */
-#define FLOW_WC_SEQ 27
+#define FLOW_WC_SEQ 28
 
 #define FLOW_N_REGS 8
 BUILD_ASSERT_DECL(FLOW_N_REGS <= NXM_NX_MAX_REGS);
@@ -65,6 +65,11 @@ const char *flow_tun_flag_to_string(uint32_t flags);
 /* Maximum number of supported MPLS labels. */
 #define FLOW_MAX_MPLS_LABELS 3
 
+enum base_layer {
+    LAYER_2 = 0,
+    LAYER_3 = 1
+};
+
 /*
  * A flow in the network.
  *
@@ -81,6 +86,10 @@ const char *flow_tun_flag_to_string(uint32_t flags);
  * lower layer fields are first used to determine if the later fields need to
  * be looked at.  This enables better wildcarding for datapath flows.
  *
+ * The starting layer is specified by 'base_layer'.  When 'base_layer' is
+ * LAYER_3, dl_src, dl_dst, and vlan_tci are not used for matching. The
+ * dl_type field is still used to specify the layer 3 protocol.
+ *
  * NOTE: Order of the fields is significant, any change in the order must be
  * reflected in miniflow_extract()!
  */
@@ -93,6 +102,7 @@ struct flow {
     uint32_t pkt_mark;          /* Packet mark. */
     uint32_t recirc_id;         /* Must be exact match. */
     union flow_in_port in_port; /* Input port.*/
+    uint32_t base_layer;        /* Fields start at this layer */
 
     /* L2, Order the same as in the Ethernet header! */
     uint8_t dl_dst[6];          /* Ethernet destination address. */
@@ -131,8 +141,8 @@ BUILD_ASSERT_DECL(sizeof(struct flow) % 4 == 0);
 
 /* Remember to update FLOW_WC_SEQ when changing 'struct flow'. */
 BUILD_ASSERT_DECL(offsetof(struct flow, dp_hash) + sizeof(uint32_t)
-                  == sizeof(struct flow_tnl) + 176
-                  && FLOW_WC_SEQ == 27);
+                  == sizeof(struct flow_tnl) + 180
+                  && FLOW_WC_SEQ == 28);
 
 /* Incremental points at which flow classification may be performed in
  * segments.
diff --git a/lib/match.c b/lib/match.c
index 58fa0e4..72fed73 100644
--- a/lib/match.c
+++ b/lib/match.c
@@ -46,6 +46,7 @@ match_wc_init(struct match *match, const struct flow *flow)
     wc = &match->wc;
     memset(&wc->masks, 0x0, sizeof wc->masks);
 
+    memset(&wc->masks.base_layer, 0xff, sizeof wc->masks.base_layer);
     memset(&wc->masks.dl_type, 0xff, sizeof wc->masks.dl_type);
 
     if (flow->nw_proto) {
@@ -81,9 +82,12 @@ match_wc_init(struct match *match, const struct flow *flow)
 
     memset(&wc->masks.metadata, 0xff, sizeof wc->masks.metadata);
     memset(&wc->masks.in_port, 0xff, sizeof wc->masks.in_port);
-    memset(&wc->masks.vlan_tci, 0xff, sizeof wc->masks.vlan_tci);
-    memset(&wc->masks.dl_src, 0xff, sizeof wc->masks.dl_src);
-    memset(&wc->masks.dl_dst, 0xff, sizeof wc->masks.dl_dst);
+
+    if (flow->base_layer == LAYER_2) {
+        memset(&wc->masks.vlan_tci, 0xff, sizeof wc->masks.vlan_tci);
+        memset(&wc->masks.dl_src, 0xff, sizeof wc->masks.dl_src);
+        memset(&wc->masks.dl_dst, 0xff, sizeof wc->masks.dl_dst);
+    }
 
     if (flow->dl_type == htons(ETH_TYPE_IPV6)) {
         memset(&wc->masks.ipv6_src, 0xff, sizeof wc->masks.ipv6_src);
@@ -944,7 +948,7 @@ match_format(const struct match *match, struct ds *s, unsigned int priority)
 
     int i;
 
-    BUILD_ASSERT_DECL(FLOW_WC_SEQ == 27);
+    BUILD_ASSERT_DECL(FLOW_WC_SEQ == 28);
 
     if (priority != OFP_DEFAULT_PRIORITY) {
         ds_put_format(s, "priority=%u,", priority);
diff --git a/lib/meta-flow.c b/lib/meta-flow.c
index 44fc2a9..0f30dee 100644
--- a/lib/meta-flow.c
+++ b/lib/meta-flow.c
@@ -260,7 +260,7 @@ const struct mf_field mf_fields[MFF_N_IDS] = {
         MF_FIELD_SIZES(mac),
         MFM_FULLY,
         MFS_ETHERNET,
-        MFP_NONE,
+        MFP_ETHERNET,
         true,
         NXM_OF_ETH_SRC, "NXM_OF_ETH_SRC",
         OXM_OF_ETH_SRC, "OXM_OF_ETH_SRC", OFP12_VERSION,
@@ -272,7 +272,7 @@ const struct mf_field mf_fields[MFF_N_IDS] = {
         MF_FIELD_SIZES(mac),
         MFM_FULLY,
         MFS_ETHERNET,
-        MFP_NONE,
+        MFP_ETHERNET,
         true,
         NXM_OF_ETH_DST, "NXM_OF_ETH_DST",
         OXM_OF_ETH_DST, "OXM_OF_ETH_DST", OFP12_VERSION,
@@ -298,7 +298,7 @@ const struct mf_field mf_fields[MFF_N_IDS] = {
         MF_FIELD_SIZES(be16),
         MFM_FULLY,
         MFS_HEXADECIMAL,
-        MFP_NONE,
+        MFP_ETHERNET,
         true,
         NXM_OF_VLAN_TCI, "NXM_OF_VLAN_TCI",
         NXM_OF_VLAN_TCI, "NXM_OF_VLAN_TCI", 0,
@@ -310,7 +310,7 @@ const struct mf_field mf_fields[MFF_N_IDS] = {
         sizeof(ovs_be16), 12,
         MFM_NONE,
         MFS_DECIMAL,
-        MFP_NONE,
+        MFP_ETHERNET,
         true,
         0, NULL,
         0, NULL, 0,
@@ -322,7 +322,7 @@ const struct mf_field mf_fields[MFF_N_IDS] = {
         sizeof(ovs_be16), 12,
         MFM_FULLY,
         MFS_DECIMAL,
-        MFP_NONE,
+        MFP_ETHERNET,
         true,
         OXM_OF_VLAN_VID, "OXM_OF_VLAN_VID",
         OXM_OF_VLAN_VID, "OXM_OF_VLAN_VID", OFP12_VERSION,
@@ -334,7 +334,7 @@ const struct mf_field mf_fields[MFF_N_IDS] = {
         1, 3,
         MFM_NONE,
         MFS_DECIMAL,
-        MFP_NONE,
+        MFP_ETHERNET,
         true,
         0, NULL,
         0, NULL, 0,
@@ -1065,6 +1065,8 @@ mf_are_prereqs_ok(const struct mf_field *mf, const struct flow *flow)
     case MFP_NONE:
         return true;
 
+    case MFP_ETHERNET:
+        return flow->base_layer == LAYER_2;
     case MFP_ARP:
       return (flow->dl_type == htons(ETH_TYPE_ARP) ||
               flow->dl_type == htons(ETH_TYPE_RARP));
@@ -1142,6 +1144,9 @@ mf_mask_field_and_prereqs(const struct mf_field *mf, struct flow *mask)
     case MFP_VLAN_VID:
         mask->vlan_tci |= htons(VLAN_CFI);
         break;
+    case MFP_ETHERNET:
+        mask->base_layer = OVS_BE32_MAX;
+        break;
     case MFP_NONE:
         break;
     }
diff --git a/lib/meta-flow.h b/lib/meta-flow.h
index 7a4b8dc..4b560f0 100644
--- a/lib/meta-flow.h
+++ b/lib/meta-flow.h
@@ -189,6 +189,7 @@ enum OVS_PACKED_ENUM mf_prereqs {
     MFP_NONE,
 
     /* L2 requirements. */
+    MFP_ETHERNET,
     MFP_ARP,
     MFP_VLAN_VID,
     MFP_IPV4,
diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c
index 8d1c298..07f31bf 100644
--- a/lib/netdev-dummy.c
+++ b/lib/netdev-dummy.c
@@ -804,6 +804,7 @@ netdev_dummy_rxq_recv(struct netdev_rxq *rxq_, struct dpif_packet **arr,
     netdev->stats.rx_bytes += ofpbuf_size(packet);
     ovs_mutex_unlock(&netdev->mutex);
 
+    ofpbuf_set_frame(packet, ofpbuf_data(packet));
     dp_packet_pad(packet);
 
     /* This performs a (sometimes unnecessary) copy */
diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
index 1780639..7a46170 100644
--- a/lib/netdev-linux.c
+++ b/lib/netdev-linux.c
@@ -1014,6 +1014,7 @@ netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dpif_packet **packets,
         }
         dpif_packet_delete(packet);
     } else {
+	ofpbuf_set_frame(buffer, ofpbuf_data(buffer));
         dp_packet_pad(buffer);
         packets[0] = packet;
         *c = 1;
diff --git a/lib/nx-match.c b/lib/nx-match.c
index 678e6f3..390464a 100644
--- a/lib/nx-match.c
+++ b/lib/nx-match.c
@@ -616,7 +616,7 @@ nx_put_raw(struct ofpbuf *b, enum ofp_version oxm, const struct match *match,
     int match_len;
     int i;
 
-    BUILD_ASSERT_DECL(FLOW_WC_SEQ == 27);
+    BUILD_ASSERT_DECL(FLOW_WC_SEQ == 28);
 
     /* Metadata. */
     if (match->wc.masks.dp_hash) {
diff --git a/lib/odp-util.c b/lib/odp-util.c
index bf64830..d88df25 100644
--- a/lib/odp-util.c
+++ b/lib/odp-util.c
@@ -2576,7 +2576,7 @@ odp_flow_key_from_flow__(struct ofpbuf *buf, const struct flow *flow,
                          size_t max_mpls_depth, bool recirc, bool export_mask)
 {
     struct ovs_key_ethernet *eth_key;
-    size_t encap;
+    size_t encap = 0;
     const struct flow *data = export_mask ? mask : flow;
 
     nl_msg_put_u32(buf, OVS_KEY_ATTR_PRIORITY, data->skb_priority);
@@ -2598,6 +2598,10 @@ odp_flow_key_from_flow__(struct ofpbuf *buf, const struct flow *flow,
         nl_msg_put_odp_port(buf, OVS_KEY_ATTR_IN_PORT, odp_in_port);
     }
 
+    if (flow->base_layer == LAYER_3) {
+        goto noethernet;
+    }
+
     eth_key = nl_msg_put_unspec_uninit(buf, OVS_KEY_ATTR_ETHERNET,
                                        sizeof *eth_key);
     memcpy(eth_key->eth_src, data->dl_src, ETH_ADDR_LEN);
@@ -2614,8 +2618,6 @@ odp_flow_key_from_flow__(struct ofpbuf *buf, const struct flow *flow,
         if (flow->vlan_tci == htons(0)) {
             goto unencap;
         }
-    } else {
-        encap = 0;
     }
 
     if (ntohs(flow->dl_type) < ETH_TYPE_MIN) {
@@ -2638,6 +2640,7 @@ odp_flow_key_from_flow__(struct ofpbuf *buf, const struct flow *flow,
 
     nl_msg_put_be16(buf, OVS_KEY_ATTR_ETHERTYPE, data->dl_type);
 
+noethernet:
     if (flow->dl_type == htons(ETH_TYPE_IP)) {
         struct ovs_key_ipv4 *ipv4_key;
 
@@ -3038,7 +3041,13 @@ parse_ethertype(const struct nlattr *attrs[OVS_KEY_ATTR_MAX + 1],
         *expected_attrs |= UINT64_C(1) << OVS_KEY_ATTR_ETHERTYPE;
     } else {
         if (!is_mask) {
-            flow->dl_type = htons(FLOW_DL_TYPE_NONE);
+            if (present_attrs & (UINT64_C(1) << OVS_KEY_ATTR_IPV4)) {
+                flow->dl_type = htons(ETH_TYPE_IP);
+            } else if (present_attrs & (UINT64_C(1) << OVS_KEY_ATTR_IPV6)) {
+                flow->dl_type = htons(ETH_TYPE_IPV6);
+            } else {
+                flow->dl_type = htons(FLOW_DL_TYPE_NONE);
+            }
         } else if (ntohs(src_flow->dl_type) < ETH_TYPE_MIN) {
             /* See comments in odp_flow_key_from_flow__(). */
             VLOG_ERR_RL(&rl, "mask expected for non-Ethernet II frame");
@@ -3452,12 +3461,10 @@ odp_flow_key_to_flow__(const struct nlattr *key, size_t key_len,
         eth_key = nl_attr_get(attrs[OVS_KEY_ATTR_ETHERNET]);
         memcpy(flow->dl_src, eth_key->eth_src, ETH_ADDR_LEN);
         memcpy(flow->dl_dst, eth_key->eth_dst, ETH_ADDR_LEN);
-        if (is_mask) {
-            expected_attrs |= UINT64_C(1) << OVS_KEY_ATTR_ETHERNET;
-        }
-    }
-    if (!is_mask) {
+        flow->base_layer = LAYER_2;
         expected_attrs |= UINT64_C(1) << OVS_KEY_ATTR_ETHERNET;
+    } else {
+        flow->base_layer = LAYER_3;
     }
 
     /* Get Ethertype or 802.1Q TPID or FLOW_DL_TYPE_NONE. */
@@ -3474,6 +3481,7 @@ odp_flow_key_to_flow__(const struct nlattr *key, size_t key_len,
     }
     if (is_mask) {
         flow->vlan_tci = htons(0xffff);
+	flow->base_layer = htonl(0xffffffff);
         if (present_attrs & (UINT64_C(1) << OVS_KEY_ATTR_VLAN)) {
             flow->vlan_tci = nl_attr_get_be16(attrs[OVS_KEY_ATTR_VLAN]);
             expected_attrs |= (UINT64_C(1) << OVS_KEY_ATTR_VLAN);
@@ -3655,6 +3663,14 @@ commit_set_ether_addr_action(const struct flow *flow, struct flow *base,
         return;
     }
 
+    /* If we have a L3 --> L2 flow, the push_eth action takes care of setting
+     * the appropriate MAC source and destination addresses, no need to add a
+     * set action
+     */
+    if (base->base_layer == LAYER_3 && flow->base_layer == LAYER_2) {
+        return;
+    }
+
     memset(&wc->masks.dl_src, 0xff, sizeof wc->masks.dl_src);
     memset(&wc->masks.dl_dst, 0xff, sizeof wc->masks.dl_dst);
 
diff --git a/lib/odp-util.h b/lib/odp-util.h
index d4169db..b061f0c 100644
--- a/lib/odp-util.h
+++ b/lib/odp-util.h
@@ -126,7 +126,7 @@ void odp_portno_names_destroy(struct hmap *portno_names);
  * add another field and forget to adjust this value.
  */
 #define ODPUTIL_FLOW_KEY_BYTES 512
-BUILD_ASSERT_DECL(FLOW_WC_SEQ == 27);
+BUILD_ASSERT_DECL(FLOW_WC_SEQ == 28);
 
 /* A buffer with sufficient size and alignment to hold an nlattr-formatted flow
  * key.  An array of "struct nlattr" might not, in theory, be sufficiently
diff --git a/lib/ofp-print.c b/lib/ofp-print.c
index a2c2434..877bc76 100644
--- a/lib/ofp-print.c
+++ b/lib/ofp-print.c
@@ -53,10 +53,11 @@ static void ofp_print_queue_name(struct ds *string, uint32_t port);
 static void ofp_print_error(struct ds *, enum ofperr);
 
 
-/* Returns a string that represents the contents of the Ethernet frame in the
- * 'len' bytes starting at 'data'.  The caller must free the returned string.*/
+/* Returns a string that represents the contents of the Ethernet frame
+ * ('is_layer3' false) or IP packet ('is_layer3' true) in the 'len' bytes
+ * starting at 'data'.  The caller must free the returned string. */
 char *
-ofp_packet_to_string(const void *data, size_t len)
+ofp_packet_to_string(const void *data, size_t len, bool is_layer3)
 {
     struct ds ds = DS_EMPTY_INITIALIZER;
     const struct pkt_metadata md = PKT_METADATA_INITIALIZER(0);
@@ -65,6 +66,12 @@ ofp_packet_to_string(const void *data, size_t len)
     size_t l4_size;
 
     ofpbuf_use_const(&buf, data, len);
+    ofpbuf_set_frame(&buf, ofpbuf_data(&buf));
+
+    if (is_layer3) {
+        buf.l3_ofs = 0;
+    }
+
     flow_extract(&buf, &md, &flow);
     flow_format(&ds, &flow);
 
@@ -158,7 +165,7 @@ ofp_print_packet_in(struct ds *string, const struct ofp_header *oh,
     ds_put_char(string, '\n');
 
     if (verbosity > 0) {
-        char *packet = ofp_packet_to_string(pin.packet, pin.packet_len);
+        char *packet = ofp_packet_to_string(pin.packet, pin.packet_len, false);
         ds_put_cstr(string, packet);
         free(packet);
     }
@@ -192,7 +199,7 @@ ofp_print_packet_out(struct ds *string, const struct ofp_header *oh,
     if (po.buffer_id == UINT32_MAX) {
         ds_put_format(string, " data_len=%"PRIuSIZE, po.packet_len);
         if (verbosity > 0 && po.packet_len > 0) {
-            char *packet = ofp_packet_to_string(po.packet, po.packet_len);
+            char *packet = ofp_packet_to_string(po.packet, po.packet_len, false);
             ds_put_char(string, '\n');
             ds_put_cstr(string, packet);
             free(packet);
@@ -3118,5 +3125,5 @@ ofp_print(FILE *stream, const void *oh, size_t len, int verbosity)
 void
 ofp_print_packet(FILE *stream, const void *data, size_t len)
 {
-    print_and_free(stream, ofp_packet_to_string(data, len));
+    print_and_free(stream, ofp_packet_to_string(data, len, false));
 }
diff --git a/lib/ofp-print.h b/lib/ofp-print.h
index 825e139..15aa196 100644
--- a/lib/ofp-print.h
+++ b/lib/ofp-print.h
@@ -21,6 +21,7 @@
 
 #include <stdint.h>
 #include <stdio.h>
+#include <stdbool.h>
 
 struct ds;
 struct ofp10_match;
@@ -39,7 +40,7 @@ void ofp10_match_print(struct ds *, const struct ofp10_match *, int verbosity);
 
 char *ofp_to_string(const void *, size_t, int verbosity);
 char *ofp10_match_to_string(const struct ofp10_match *, int verbosity);
-char *ofp_packet_to_string(const void *data, size_t len);
+char *ofp_packet_to_string(const void *data, size_t len, bool is_layer3);
 
 void ofp_print_flow_stats(struct ds *, struct ofputil_flow_stats *);
 void ofp_print_version(const struct ofp_header *, struct ds *);
diff --git a/lib/ofp-util.c b/lib/ofp-util.c
index 6d551ad..6b1be85 100644
--- a/lib/ofp-util.c
+++ b/lib/ofp-util.c
@@ -132,7 +132,7 @@ ofputil_netmask_to_wcbits(ovs_be32 netmask)
 void
 ofputil_wildcard_from_ofpfw10(uint32_t ofpfw, struct flow_wildcards *wc)
 {
-    BUILD_ASSERT_DECL(FLOW_WC_SEQ == 27);
+    BUILD_ASSERT_DECL(FLOW_WC_SEQ == 28);
 
     /* Initialize most of wc. */
     flow_wildcards_init_catchall(wc);
diff --git a/lib/ofpbuf.h b/lib/ofpbuf.h
index eed5ca8..c7211e3 100644
--- a/lib/ofpbuf.h
+++ b/lib/ofpbuf.h
@@ -270,11 +270,12 @@ static inline bool ofpbuf_equal(const struct ofpbuf *a, const struct ofpbuf *b)
            memcmp(ofpbuf_data(a), ofpbuf_data(b), ofpbuf_size(a)) == 0;
 }
 
-/* Get the start if the Ethernet frame.  'l3_ofs' marks the end of the l2
- * headers, so return NULL if it is not set. */
+/* Get the start of the Ethernet frame.  'l3_ofs' marks the end of the l2
+ * headers, so return NULL if it is not set.  A 'l3_ofs' of 0 marks a layer 3
+ * packet, so return NULL in that case too. */
 static inline void * ofpbuf_l2(const struct ofpbuf *b)
 {
-    return (b->l3_ofs != UINT16_MAX) ? b->frame : NULL;
+    return (b->l3_ofs != UINT16_MAX && b->l3_ofs != 0) ? b->frame : NULL;
 }
 
 /* Sets the packet frame start pointer and resets all layer offsets.
@@ -356,6 +357,11 @@ static inline const void *ofpbuf_get_icmp_payload(const struct ofpbuf *b)
         ? (const char *)ofpbuf_l4(b) + ICMP_HEADER_LEN : NULL;
 }
 
+static inline bool ofpbuf_is_layer3_packet(const struct ofpbuf *b)
+{
+    return (b->frame == b->data_) && (b->l3_ofs == 0);
+}
+
 #ifdef DPDK_NETDEV
 BUILD_ASSERT_DECL(offsetof(struct ofpbuf, mbuf) == 0);
 
diff --git a/lib/packets.c b/lib/packets.c
index c1f7ade..a19fc65 100644
--- a/lib/packets.c
+++ b/lib/packets.c
@@ -390,6 +390,8 @@ eth_from_hex(const char *hex, struct ofpbuf **packetp)
         return "Packet data too short for Ethernet";
     }
 
+    ofpbuf_set_frame(packet, ofpbuf_data(packet));
+
     return NULL;
 }
 
diff --git a/lib/pcap-file.c b/lib/pcap-file.c
index 191e690..682503d 100644
--- a/lib/pcap-file.c
+++ b/lib/pcap-file.c
@@ -185,6 +185,7 @@ ovs_pcap_read(FILE *file, struct ofpbuf **bufp, long long int *when)
         ofpbuf_delete(buf);
         return error;
     }
+    ofpbuf_set_frame(buf, ofpbuf_data(buf));
     *bufp = buf;
     return 0;
 }
diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c
index e930521..dddda8e 100644
--- a/ofproto/ofproto-dpif-xlate.c
+++ b/ofproto/ofproto-dpif-xlate.c
@@ -154,6 +154,7 @@ struct xport {
 
     bool may_enable;                 /* May be enabled in bonds. */
     bool is_tunnel;                  /* Is a tunnel port. */
+    bool is_layer3;                  /* Is a layer 3 port. */
 
     struct cfm *cfm;                 /* CFM handle or null. */
     struct bfd *bfd;                 /* BFD handle or null. */
@@ -378,7 +379,7 @@ static void xlate_xport_set(struct xport *xport, odp_port_t odp_port,
                             const struct bfd *bfd, int stp_port_no,
                             enum ofputil_port_config config,
                             enum ofputil_port_state state, bool is_tunnel,
-                            bool may_enable);
+                            bool may_enable, bool is_layer3);
 static void xlate_xbridge_remove(struct xlate_cfg *, struct xbridge *);
 static void xlate_xbundle_remove(struct xlate_cfg *, struct xbundle *);
 static void xlate_xport_remove(struct xlate_cfg *, struct xport *);
@@ -510,13 +511,14 @@ xlate_xport_set(struct xport *xport, odp_port_t odp_port,
                 const struct netdev *netdev, const struct cfm *cfm,
                 const struct bfd *bfd, int stp_port_no,
                 enum ofputil_port_config config, enum ofputil_port_state state,
-                bool is_tunnel, bool may_enable)
+                bool is_tunnel, bool may_enable, bool is_layer3)
 {
     xport->config = config;
     xport->state = state;
     xport->stp_port_no = stp_port_no;
     xport->is_tunnel = is_tunnel;
     xport->may_enable = may_enable;
+    xport->is_layer3 = is_layer3;
     xport->odp_port = odp_port;
 
     if (xport->cfm != cfm) {
@@ -597,7 +599,7 @@ xlate_xport_copy(struct xbridge *xbridge, struct xbundle *xbundle,
 
     xlate_xport_set(new_xport, xport->odp_port, xport->netdev, xport->cfm,
                     xport->bfd, xport->stp_port_no, xport->config, xport->state,
-                    xport->is_tunnel, xport->may_enable);
+                    xport->is_tunnel, xport->may_enable, xport->is_layer3);
 
     if (xport->peer) {
         struct xport *peer = xport_lookup(new_xcfg, xport->peer->ofport);
@@ -839,7 +841,7 @@ xlate_ofport_set(struct ofproto_dpif *ofproto, struct ofbundle *ofbundle,
                  const struct ofproto_port_queue *qdscp_list, size_t n_qdscp,
                  enum ofputil_port_config config,
                  enum ofputil_port_state state, bool is_tunnel,
-                 bool may_enable)
+                 bool may_enable, bool is_layer3)
 {
     size_t i;
     struct xport *xport;
@@ -859,7 +861,7 @@ xlate_ofport_set(struct ofproto_dpif *ofproto, struct ofbundle *ofbundle,
     ovs_assert(xport->ofp_port == ofp_port);
 
     xlate_xport_set(xport, odp_port, netdev, cfm, bfd, stp_port_no, config,
-                    state, is_tunnel, may_enable);
+                    state, is_tunnel, may_enable, is_layer3);
 
     if (xport->peer) {
         xport->peer->peer = NULL;
@@ -2052,7 +2054,7 @@ xlate_normal(struct xlate_ctx *ctx)
     }
 
     /* Learn source MAC. */
-    if (ctx->xin->may_learn) {
+    if (ctx->xin->may_learn && !(in_port->is_layer3)) {
         update_learning_table(ctx->xbridge, flow, wc, vlan, in_xbundle);
     }
     if (ctx->xin->xcache) {
@@ -2358,6 +2360,7 @@ compose_output_action__(struct xlate_ctx *ctx, ofp_port_t ofp_port,
     const struct xport *xport = get_ofp_port(ctx->xbridge, ofp_port);
     struct flow_wildcards *wc = &ctx->xout->wc;
     struct flow *flow = &ctx->xin->flow;
+    const struct xport *in_xport = get_ofp_port(ctx->xbridge, flow->in_port.ofp_port);
     ovs_be16 flow_vlan_tci;
     uint32_t flow_pkt_mark;
     uint8_t flow_nw_tos;
@@ -2366,7 +2369,7 @@ compose_output_action__(struct xlate_ctx *ctx, ofp_port_t ofp_port,
 
     /* If 'struct flow' gets additional metadata, we'll need to zero it out
      * before traversing a patch port. */
-    BUILD_ASSERT_DECL(FLOW_WC_SEQ == 27);
+    BUILD_ASSERT_DECL(FLOW_WC_SEQ == 28);
 
     if (!xport) {
         xlate_report(ctx, "Nonexistent output port");
@@ -2393,6 +2396,16 @@ compose_output_action__(struct xlate_ctx *ctx, ofp_port_t ofp_port,
                                                  xport->xbundle);
     }
 
+    if (in_xport && !in_xport->is_layer3 && xport->is_layer3) {
+        odp_put_pop_eth_action(&ctx->xout->odp_actions);
+    }
+
+    if (flow->base_layer == LAYER_3 && !xport->is_layer3) {
+        flow->base_layer = LAYER_2;
+        odp_put_push_eth_action(&ctx->xout->odp_actions, flow->dl_src,
+                                flow->dl_dst, flow->dl_type);
+    }
+
     if (xport->peer) {
         const struct xport *peer = xport->peer;
         struct flow old_flow = ctx->xin->flow;
@@ -3968,6 +3981,7 @@ xlate_actions(struct xlate_in *xin, struct xlate_out *xout)
     flow_wildcards_init_catchall(wc);
     memset(&wc->masks.in_port, 0xff, sizeof wc->masks.in_port);
     memset(&wc->masks.skb_priority, 0xff, sizeof wc->masks.skb_priority);
+    memset(&wc->masks.base_layer, 0xff, sizeof wc->masks.base_layer);
     memset(&wc->masks.dl_type, 0xff, sizeof wc->masks.dl_type);
     if (is_ip_any(flow)) {
         wc->masks.nw_frag |= FLOW_NW_FRAG_MASK;
diff --git a/ofproto/ofproto-dpif-xlate.h b/ofproto/ofproto-dpif-xlate.h
index 4bdf2d3..752796b 100644
--- a/ofproto/ofproto-dpif-xlate.h
+++ b/ofproto/ofproto-dpif-xlate.h
@@ -164,7 +164,7 @@ void xlate_ofport_set(struct ofproto_dpif *, struct ofbundle *,
                       int stp_port_no, const struct ofproto_port_queue *qdscp,
                       size_t n_qdscp, enum ofputil_port_config,
                       enum ofputil_port_state, bool is_tunnel,
-                      bool may_enable);
+                      bool may_enable, bool is_layer3);
 void xlate_ofport_remove(struct ofport_dpif *);
 
 int xlate_receive(const struct dpif_backer *, struct ofpbuf *packet,
diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c
index 8c130e1..81ad0fd 100644
--- a/ofproto/ofproto-dpif.c
+++ b/ofproto/ofproto-dpif.c
@@ -621,7 +621,8 @@ type_run(const char *type)
                                  ofport->bfd, ofport->peer, stp_port,
                                  ofport->qdscp, ofport->n_qdscp,
                                  ofport->up.pp.config, ofport->up.pp.state,
-                                 ofport->is_tunnel, ofport->may_enable);
+                                 ofport->is_tunnel, ofport->may_enable,
+                                 ofport->is_layer3);
             }
             xlate_txn_commit();
         }
@@ -1029,6 +1030,7 @@ check_variable_length_userdata(struct dpif_backer *backer)
     ofpbuf_init(&packet, ETH_HEADER_LEN);
     eth = ofpbuf_put_zeros(&packet, ETH_HEADER_LEN);
     eth->eth_type = htons(0x1234);
+    ofpbuf_set_frame(&packet, ofpbuf_data(&packet));
 
     /* Execute the actions.  On older datapaths this fails with ERANGE, on
      * newer datapaths it succeeds. */
diff --git a/ofproto/ofproto.c b/ofproto/ofproto.c
index 5399c9f..a781979 100644
--- a/ofproto/ofproto.c
+++ b/ofproto/ofproto.c
@@ -2976,6 +2976,7 @@ handle_packet_out(struct ofconn *ofconn, const struct ofp_header *oh)
     } else {
         /* Ensure that the L3 header is 32-bit aligned. */
         payload = ofpbuf_clone_data_with_headroom(po.packet, po.packet_len, 2);
+        ofpbuf_set_frame(payload, ofpbuf_data(payload));
     }
 
     /* Verify actions against packet, then send packet if successful. */
diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at
index d8e43a1..9fe4992 100644
--- a/tests/ofproto-dpif.at
+++ b/tests/ofproto-dpif.at
@@ -3489,15 +3489,15 @@ in_port=2 actions=output:1
 ])
 AT_CHECK([ovs-ofctl add-flows br0 flows.txt])
 
-odp_flow="in_port(1)"
-br_flow="in_port=1"
+odp_flow="in_port(1),eth(src=00:00:00:00:00:00,dst=00:00:00:00:00:00)"
+br_flow="in_port=1,dl_dst=00:00:00:00:00:00"
 # Test command: ofproto/trace odp_flow with in_port as a name.
 AT_CHECK([ovs-appctl ofproto/trace "$odp_flow"], [0], [stdout])
 AT_CHECK([tail -1 stdout], [0], [dnl
 Datapath actions: 2
 ])
 
-odp_flow="in_port(1)"
+odp_flow="in_port(1),eth(src=00:00:00:00:00:00,dst=00:00:00:00:00:00)"
 # Test command: ofproto/trace odp_flow
 AT_CHECK([ovs-appctl ofproto/trace "$odp_flow"], [0], [stdout])
 AT_CHECK([tail -1 stdout], [0], [dnl
diff --git a/tests/vlan-splinters.at b/tests/vlan-splinters.at
index b38ab52..70570aa 100644
--- a/tests/vlan-splinters.at
+++ b/tests/vlan-splinters.at
@@ -28,7 +28,7 @@ for args in '9 p2' '11 p3' '15 p4'; do
 
     # Check that when a packet is received on $splinter_port, it is
     # treated as if it had been received on p1 in the correct VLAN.
-    AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port($splinter_port)"],
+    AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port($splinter_port),eth(src=00:00:00:00:00:00,dst=00:00:00:00:00:00)"],
              [0], [stdout])
     AT_CHECK_UNQUOTED([sed -n '/^Flow/p; /^Datapath/p' stdout], [0], [dnl
 Flow: metadata=0,in_port=$p1,dl_vlan=$vlan,dl_vlan_pcp=0,dl_src=00:00:00:00:00:00,dl_dst=00:00:00:00:00:00,dl_type=0x05ff
@@ -37,7 +37,7 @@ Datapath actions: $access_port
 
     # Check that when an OpenFlow action sends a packet to p1 on
     # splintered VLAN $vlan, it is actually output to $splinter_port.
-    AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port($access_port)"],
+    AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port($access_port),eth(src=00:00:00:00:00:00,dst=00:00:00:00:00:00)"],
              [0], [stdout])
     AT_CHECK_UNQUOTED([tail -1 stdout], [0], [Datapath actions: $splinter_port
 ])
-- 
1.8.5.2 (Apple Git-48)




More information about the dev mailing list