[ovs-dev] [PATCH 08/11] dpif: Prepare for additional kernel provided packet metadata.

Jarno Rajahalme jarno.rajahalme at nsn.com
Mon Feb 11 14:46:24 UTC 2013


Use kernel-provided packet layer pointers if available, and deal with ODP
keys that have a kernel-provided key hash attribute.  The kernel key hash
is known to correspond to the key, but the hashing algorithm is not disclosed.
The hash is expected to be the first attribute so that its presence can be
easily tested.

Signed-off-by: Jarno Rajahalme <jarno.rajahalme at nsn.com>
---
 include/linux/openvswitch.h |   10 ++++++++++
 lib/dpif-linux.c            |   25 +++++++++++++++++++++++++
 lib/dpif-netdev.c           |    3 +++
 lib/dpif-provider.h         |    4 ++++
 lib/dpif.c                  |    4 ++++
 lib/odp-util.c              |   16 ++++++++++++++++
 lib/odp-util.h              |    3 ++-
 lib/packets.h               |   15 ++++++++++++++-
 ofproto/in-band.c           |    3 ++-
 ofproto/ofproto-dpif.c      |   26 +++++++++++---------------
 10 files changed, 91 insertions(+), 18 deletions(-)

diff --git a/include/linux/openvswitch.h b/include/linux/openvswitch.h
index 7c6e3ab..0c1647d 100644
--- a/include/linux/openvswitch.h
+++ b/include/linux/openvswitch.h
@@ -149,6 +149,12 @@ enum ovs_packet_cmd {
  * @OVS_PACKET_ATTR_USERDATA: Present for an %OVS_PACKET_CMD_ACTION
  * notification if the %OVS_ACTION_ATTR_USERSPACE action specified an
  * %OVS_USERSPACE_ATTR_USERDATA attribute.
+ * @OVS_PACKET_ATTR_L2_SIZE: Optional 32-bit size of the Ethernet header and
+ * all VLAN headers for the packet.
+ * @OVS_PACKET_ATTR_L3_OFFSET: Optional 32-bit offset from the beginning of
+ * the packet to just past all L2 and L2.5 headers.
+ * @OVS_PACKET_ATTR_L4_OFFSET: Optional 32-bit offset to the start of the
+ * header after the IP or IPv6 header. Present only if the IP header is valid.
  *
  * These attributes follow the &struct ovs_header within the Generic Netlink
  * payload for %OVS_PACKET_* commands.
@@ -159,6 +165,9 @@ enum ovs_packet_attr {
 	OVS_PACKET_ATTR_KEY,         /* Nested OVS_KEY_ATTR_* attributes. */
 	OVS_PACKET_ATTR_ACTIONS,     /* Nested OVS_ACTION_ATTR_* attributes. */
 	OVS_PACKET_ATTR_USERDATA,    /* u64 OVS_ACTION_ATTR_USERSPACE arg. */
+	OVS_PACKET_ATTR_L2_SIZE,     /* u32 optional packet's L2 size */
+	OVS_PACKET_ATTR_L3_OFFSET,   /* u32 optional packet's L3 offset */
+	OVS_PACKET_ATTR_L4_OFFSET,   /* u32 optional packet's L4 offset */
 	__OVS_PACKET_ATTR_MAX
 };
 
@@ -284,6 +293,7 @@ enum ovs_key_attr {
 	OVS_KEY_ATTR_IPV4_TUNNEL,  /* struct ovs_key_ipv4_tunnel */
 #endif
 
+	OVS_KEY_ATTR_HASH = 61,	/* u32 kernel provided key hash */
 	OVS_KEY_ATTR_MPLS = 62, /* struct ovs_key_mpls */
 	OVS_KEY_ATTR_TUN_ID = 63,  /* be64 tunnel ID */
 	__OVS_KEY_ATTR_MAX
diff --git a/lib/dpif-linux.c b/lib/dpif-linux.c
index a44e4a4..a3d9d26 100644
--- a/lib/dpif-linux.c
+++ b/lib/dpif-linux.c
@@ -1223,6 +1223,9 @@ parse_odp_packet(struct ofpbuf *buf, struct dpif_upcall *upcall,
         [OVS_PACKET_ATTR_PACKET] = { .type = NL_A_UNSPEC,
                                      .min_len = ETH_HEADER_LEN },
         [OVS_PACKET_ATTR_KEY] = { .type = NL_A_NESTED },
+        [OVS_PACKET_ATTR_L2_SIZE] = { .type = NL_A_U32, .optional = true },
+        [OVS_PACKET_ATTR_L3_OFFSET] = { .type = NL_A_U32, .optional = true },
+        [OVS_PACKET_ATTR_L4_OFFSET] = { .type = NL_A_U32, .optional = true },
 
         /* OVS_PACKET_CMD_ACTION only. */
         [OVS_PACKET_ATTR_USERDATA] = { .type = NL_A_U64, .optional = true },
@@ -1260,6 +1263,28 @@ parse_odp_packet(struct ofpbuf *buf, struct dpif_upcall *upcall,
     upcall->packet->data = CONST_CAST(struct nlattr *,
                                       nl_attr_get(a[OVS_PACKET_ATTR_PACKET]));
     upcall->packet->size = nl_attr_get_size(a[OVS_PACKET_ATTR_PACKET]);
+    /* Packet starts with an Ethernet frame */
+    buf->l2 = buf->data;
+    /* Rest of the layer pointers may be set via optional attributes.
+     * Layer pointers are initialized to NULL before we are called.
+     * NULL layer pointers will be set later as needed. */
+    if (a[OVS_PACKET_ATTR_L2_SIZE]
+        && a[OVS_PACKET_ATTR_L3_OFFSET]) {
+        uint32_t l2_size = nl_attr_get_u32(a[OVS_PACKET_ATTR_L2_SIZE]);
+        uint32_t l3_offset = nl_attr_get_u32(a[OVS_PACKET_ATTR_L3_OFFSET]);
+
+        if (l2_size < l3_offset) {
+            buf->l2_5 = (char *)buf->data + l2_size;
+        }
+        buf->l3 = (char *)buf->data + l3_offset;
+        if (a[OVS_PACKET_ATTR_L4_OFFSET]) {
+            uint32_t l4_offset = nl_attr_get_u32(a[OVS_PACKET_ATTR_L4_OFFSET]);
+            if (l4_offset > l3_offset) {
+                buf->l4 = (char *)buf->data + l4_offset;
+            }
+        }
+        /* We leave L7 for deferred setting as needed */
+    }
     upcall->key = CONST_CAST(struct nlattr *,
                              nl_attr_get(a[OVS_PACKET_ATTR_KEY]));
     upcall->key_len = nl_attr_get_size(a[OVS_PACKET_ATTR_KEY]);
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 3fa76d8..bd3d4d6 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -992,6 +992,8 @@ dpif_netdev_recv(struct dpif *dpif, struct dpif_upcall upcalls[],
         upcalls[n] = u->upcall;
         upcalls[n].packet = &bufs[n];
         bufs[n] = u->buf;
+        /* Packet starts with an Ethernet frame */
+        upcalls[n].packet->l2 = upcalls[n].packet->data;
     }
     *n_bufs = n;
     return (n > 0) ? 0 : EAGAIN;
@@ -1246,6 +1248,7 @@ execute_set_action(struct ofpbuf *packet, const struct nlattr *a)
      case OVS_KEY_ATTR_ICMPV6:
      case OVS_KEY_ATTR_ARP:
      case OVS_KEY_ATTR_ND:
+     case OVS_KEY_ATTR_HASH:
      case __OVS_KEY_ATTR_MAX:
      default:
         NOT_REACHED();
diff --git a/lib/dpif-provider.h b/lib/dpif-provider.h
index c41827e..5516281 100644
--- a/lib/dpif-provider.h
+++ b/lib/dpif-provider.h
@@ -336,6 +336,10 @@ struct dpif_class {
      * using 'buf_space' of size 'buf_space_size', and point
      * 'upcalls[].packet' and 'upcalls[].key' into 'data' in the 'bufs[]'.
      *
+     * As part of the 'bufs[]' initialization the implementation MUST set the
+     * l2 layer pointer to the start of the packet, and MAY set other layer
+     * pointers, if available.
+     *
      * Caller gives the size of the 'upcalls' and 'bufs' arrays in '*n_bufs',
      * which also returns the number of received upcalls.
      *
diff --git a/lib/dpif.c b/lib/dpif.c
index fda7924..8c55adf 100644
--- a/lib/dpif.c
+++ b/lib/dpif.c
@@ -1129,6 +1129,10 @@ dpif_recv_set(struct dpif *dpif, bool enable)
  * freed with ofpbuf_uninit() by the caller after the 'upcalls' have been
  * processed.
  *
+ * As part of the 'bufs[]' initialization the implementation MUST set the l2
+ * layer pointer to the start of the packet, and MAY set other layer pointers,
+ * if available.
+ *
  * Returns 0 if successful, otherwise a positive errno value.  Returns EAGAIN
  * if no upcall is immediately available. */
 int
diff --git a/lib/odp-util.c b/lib/odp-util.c
index f3f66b7..9ebccc8 100644
--- a/lib/odp-util.c
+++ b/lib/odp-util.c
@@ -93,6 +93,7 @@ ovs_key_attr_to_string(enum ovs_key_attr attr)
 
     switch (attr) {
     case OVS_KEY_ATTR_UNSPEC: return "unspec";
+    case OVS_KEY_ATTR_HASH: return "kernel_hash";
     case OVS_KEY_ATTR_ENCAP: return "encap";
     case OVS_KEY_ATTR_PRIORITY: return "skb_priority";
     case OVS_KEY_ATTR_SKB_MARK: return "skb_mark";
@@ -639,6 +640,7 @@ odp_flow_key_attr_len(uint16_t type)
 
     switch ((enum ovs_key_attr) type) {
     case OVS_KEY_ATTR_ENCAP: return -2;
+    case OVS_KEY_ATTR_HASH: return 4;
     case OVS_KEY_ATTR_PRIORITY: return 4;
     case OVS_KEY_ATTR_SKB_MARK: return 4;
     case OVS_KEY_ATTR_TUN_ID: return 8;
@@ -978,6 +980,10 @@ format_odp_key_attr(const struct nlattr *a, struct ds *ds)
         break;
     }
 
+    case OVS_KEY_ATTR_HASH:
+        /* We hide the kernel hash from higher layers */
+        break;
+
     case OVS_KEY_ATTR_UNSPEC:
     case __OVS_KEY_ATTR_MAX:
     default:
@@ -1649,6 +1655,11 @@ uint32_t
 odp_flow_key_hash(const struct nlattr *key, size_t key_len)
 {
     BUILD_ASSERT_DECL(!(NLA_ALIGNTO % sizeof(uint32_t)));
+    /* Skip the hash inside, if any */
+    if (nl_attr_type(key) == OVS_KEY_ATTR_HASH) {
+        key_len -= key->nla_len;
+        key = (const struct nlattr *)((const char *)key + key->nla_len);
+    }
     return hash_words((const uint32_t *) key, key_len / sizeof(uint32_t), 0);
 }
 
@@ -2037,6 +2048,11 @@ odp_flow_key_to_flow(const struct nlattr *key, size_t key_len,
     }
     expected_attrs = 0;
 
+    /* Skip kernel hash if any */
+    if (present_attrs & (UINT64_C(1) << OVS_KEY_ATTR_HASH)) {
+        expected_attrs |= UINT64_C(1) << OVS_KEY_ATTR_HASH;
+    }
+
     /* Metadata. */
     if (present_attrs & (UINT64_C(1) << OVS_KEY_ATTR_PRIORITY)) {
         flow->skb_priority = nl_attr_get_u32(attrs[OVS_KEY_ATTR_PRIORITY]);
diff --git a/lib/odp-util.h b/lib/odp-util.h
index ccf6c2a..40731ac 100644
--- a/lib/odp-util.h
+++ b/lib/odp-util.h
@@ -73,8 +73,9 @@ int odp_actions_from_string(const char *, const struct simap *port_names,
  *  OVS_KEY_ATTR_IPV6                   40    --     4     44
  *  OVS_KEY_ATTR_ICMPV6                  2     2     4      8
  *  OVS_KEY_ATTR_ND                     28    --     4     32
+ *  OVS_KEY_ATTR_HASH                    4    --     4      8
  *  ----------------------------------------------------------
- *  total                                                 220
+ *  total                                                 228
  *
  * We include some slack space in case the calculation isn't quite right or we
  * add another field and forget to adjust this value.
diff --git a/lib/packets.h b/lib/packets.h
index 428d702..acb44a1 100644
--- a/lib/packets.h
+++ b/lib/packets.h
@@ -586,11 +586,24 @@ void packet_set_udp_port(struct ofpbuf *, ovs_be16 src, ovs_be16 dst);
 uint8_t packet_get_tcp_flags(const struct ofpbuf *, const struct flow *);
 void packet_format_tcp_flags(struct ds *, uint8_t);
 
-static inline void packet_check_pointers(struct ofpbuf *packet) {
+/* Set nw_proto to 0 if no need for L7 pointer */
+static inline void packet_check_pointers(struct ofpbuf *packet,
+                                         uint8_t nw_proto)
+{
     /* Need to set packet pointers l2_5/l3/l4/l7 ? */
     if (packet->l3 == NULL) {
         struct flow flow;
         flow_extract_l2_onwards(packet, &flow);
+    } else if (packet->l4 != NULL && packet->l7 == NULL
+               && (nw_proto == IPPROTO_UDP || nw_proto == IPPROTO_TCP)) {
+        /* Check if L7 pointer could be set */
+        size_t l4_size =
+            (nw_proto == IPPROTO_UDP)
+            ? UDP_HEADER_LEN
+            : TCP_OFFSET(((struct tcp_header *)packet->l4)->tcp_ctl) * 4;
+        packet->l7 = (((char *)packet->l4 - (char *)packet->data) + l4_size
+                      <= packet->size)
+            ? (char *)packet->l4 + l4_size : NULL;
     }
 }
 
diff --git a/ofproto/in-band.c b/ofproto/in-band.c
index ae43e12..92d2851 100644
--- a/ofproto/in-band.c
+++ b/ofproto/in-band.c
@@ -235,7 +235,8 @@ in_band_msg_in_hook(struct in_band *in_band, const struct flow *flow,
             && flow->tp_src == htons(DHCP_SERVER_PORT)
             && flow->tp_dst == htons(DHCP_CLIENT_PORT)) {
         /* Deferred setting of packet layer pointers? */
-        packet_check_pointers(CONST_CAST(struct ofpbuf *, packet));
+        packet_check_pointers(CONST_CAST(struct ofpbuf *, packet),
+                              IPPROTO_UDP);
         if (packet->l7) {
             struct dhcp_header *dhcp;
 
diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c
index d0459e5..d4c94f1 100644
--- a/ofproto/ofproto-dpif.c
+++ b/ofproto/ofproto-dpif.c
@@ -3322,7 +3322,7 @@ process_special(struct ofproto_dpif *ofproto, const struct flow *flow,
     if (ofport->cfm && cfm_should_process_flow(ofport->cfm, flow)) {
         if (packet) {
             /* Deferred setting of packet layer pointers? */
-            packet_check_pointers(CONST_CAST(struct ofpbuf *, packet));
+            packet_check_pointers(CONST_CAST(struct ofpbuf *, packet), 0);
             cfm_process_heartbeat(ofport->cfm, packet);
         }
         return SLOW_CFM;
@@ -3330,14 +3330,14 @@ process_special(struct ofproto_dpif *ofproto, const struct flow *flow,
                && flow->dl_type == htons(ETH_TYPE_LACP)) {
         if (packet) {
             /* Deferred setting of packet layer pointers? */
-            packet_check_pointers(CONST_CAST(struct ofpbuf *, packet));
+            packet_check_pointers(CONST_CAST(struct ofpbuf *, packet), 0);
             lacp_process_packet(ofport->bundle->lacp, ofport, packet);
         }
         return SLOW_LACP;
     } else if (ofproto->stp && stp_should_process_flow(flow)) {
         if (packet) {
             /* Deferred setting of packet layer pointers? */
-            packet_check_pointers(CONST_CAST(struct ofpbuf *, packet));
+            packet_check_pointers(CONST_CAST(struct ofpbuf *, packet), 0);
             stp_process_packet(ofport, packet);
         }
         return SLOW_STP;
@@ -3526,9 +3526,11 @@ handle_flow_miss_with_facet(struct flow_miss *miss, struct facet *facet,
         }
 
         packet_stats_extract(packet, now, &stats);
-        if (facet->has_fin_timeout || ofproto->netflow) {
-            packet_check_pointers(packet); /* Deferred setting of packet layer
-                                            * pointers? */
+        if ((facet->has_fin_timeout || ofproto->netflow)
+            && facet->flow.nw_proto == IPPROTO_TCP
+            && is_ip_any(&facet->flow)) {
+            /* Deferred setting of packet layer pointers? */
+            packet_check_pointers(packet, IPPROTO_TCP);
             stats.tcp_flags = packet_get_tcp_flags(packet, &facet->flow);
         }
         subfacet_update_stats(subfacet, &stats);
@@ -3847,14 +3849,7 @@ do_handle_upcalls(struct dpif_backer *backer, struct dpif_upcall *upcalls,
             continue;
         }
 
-        if (miss->key_fitness == ODP_FIT_PERFECT) {
-            /* We have a perfect key fitness, keep the flow.
-             * This is safe because MISS_UPCALLs never have the
-             * packet modified by any actions before being passed to us. */
-
-            /* Init l2 layer pointer and leave the rest for later */
-            upcall->packet->l2 = upcall->packet->data;
-        } else {
+        if (miss->key_fitness != ODP_FIT_PERFECT) {
             /* The packet may have been modified since the key extraction,
              * or the kernel provided key may otherwise be insufficient.
              * Do full flow key extraction, but keep the metadata.
@@ -5955,7 +5950,8 @@ execute_controller_action(struct action_xlate_ctx *ctx, int len,
     }
 
     /* Deferred setting of packet layer pointers? */
-    packet_check_pointers(CONST_CAST(struct ofpbuf *, ctx->packet));
+    packet_check_pointers(CONST_CAST(struct ofpbuf *, ctx->packet),
+                          is_ip_any(&ctx->flow) ? ctx->flow.nw_proto : 0);
 
     packet = ofpbuf_clone(ctx->packet);
 
-- 
1.7.10.4




More information about the dev mailing list