[ovs-dev] [PATCH 2/2] ofproto: Avoid buffer copy in OFPT_PACKET_IN path.

Ben Pfaff blp at nicira.com
Fri Apr 9 19:38:46 UTC 2010


When a dpif passes an odp_msg down to ofproto, and ofproto transforms it
into an ofp_packet_in to send to the controller, until now this always
involved a full copy of the packet inside ofproto.  This commit eliminates
this copy by ensuring that there is always enough headroom in the ofpbuf
that holds the odp_msg to replace it by an ofp_packet_in in-place.

>From Jean Tourrilhes <jt at hpl.hp.com>, with some revisions.
---
 lib/dpif-linux.c    |    3 +-
 lib/dpif-netdev.c   |    3 +-
 lib/dpif-provider.h |   16 ++++++-
 lib/dpif.c          |    7 ++-
 ofproto/ofproto.c   |  113 +++++++++++++++++++++++++++++++++++++++++----------
 5 files changed, 114 insertions(+), 28 deletions(-)

diff --git a/lib/dpif-linux.c b/lib/dpif-linux.c
index 1eaba74..63a0551 100644
--- a/lib/dpif-linux.c
+++ b/lib/dpif-linux.c
@@ -417,7 +417,8 @@ dpif_linux_recv(struct dpif *dpif_, struct ofpbuf **bufp)
     int retval;
     int error;
 
-    buf = ofpbuf_new(65536);
+    buf = ofpbuf_new(65536 + DPIF_RECV_MSG_PADDING);
+    ofpbuf_reserve(buf, DPIF_RECV_MSG_PADDING);
     retval = read(dpif->fd, ofpbuf_tail(buf), ofpbuf_tailroom(buf));
     if (retval < 0) {
         error = errno;
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 1cc4ed4..e8645f3 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -1280,7 +1280,8 @@ dp_netdev_output_control(struct dp_netdev *dp, const struct ofpbuf *packet,
     }
 
     msg_size = sizeof *header + packet->size;
-    msg = ofpbuf_new(msg_size);
+    msg = ofpbuf_new(msg_size + DPIF_RECV_MSG_PADDING);
+    ofpbuf_reserve(msg, DPIF_RECV_MSG_PADDING);
     header = ofpbuf_put_uninit(msg, sizeof *header);
     header->type = queue_no;
     header->length = msg_size;
diff --git a/lib/dpif-provider.h b/lib/dpif-provider.h
index fddc8ea..33663ff 100644
--- a/lib/dpif-provider.h
+++ b/lib/dpif-provider.h
@@ -21,7 +21,9 @@
  * datapath. */
 
 #include <assert.h>
+#include "openflow/openflow.h"
 #include "dpif.h"
+#include "util.h"
 
 #ifdef  __cplusplus
 extern "C" {
@@ -300,8 +302,10 @@ struct dpif_class {
 
     /* Attempts to receive a message from 'dpif'.  If successful, stores the
      * message into '*packetp'.  The message, if one is received, must begin
-     * with 'struct odp_msg' as a header.  Only messages of the types selected
-     * with the set_listen_mask member function should be received.
+     * with 'struct odp_msg' as a header, and must have at least
+     * DPIF_RECV_MSG_PADDING bytes of headroom (allocated using
+     * e.g. ofpbuf_reserve()).  Only messages of the types selected with the
+     * set_listen_mask member function should be received.
      *
      * This function must not block.  If no message is ready to be received
      * when it is called, it should return EAGAIN without blocking. */
@@ -312,6 +316,14 @@ struct dpif_class {
     void (*recv_wait)(struct dpif *dpif);
 };
 
+/* Minimum number of bytes of headroom for a packet returned by the 'recv'
+ * member function (see above).  This headroom allows "struct odp_msg" to be
+ * replaced by "struct ofp_packet_in" without copying the buffer. */
+#define DPIF_RECV_MSG_PADDING (sizeof(struct ofp_packet_in) \
+                               - sizeof(struct odp_msg))
+BUILD_ASSERT_DECL(sizeof(struct ofp_packet_in) > sizeof(struct odp_msg));
+BUILD_ASSERT_DECL(DPIF_RECV_MSG_PADDING % 4 == 0);
+
 extern const struct dpif_class dpif_linux_class;
 extern const struct dpif_class dpif_netdev_class;
 
diff --git a/lib/dpif.c b/lib/dpif.c
index 8e5cf9f..3a6367d 100644
--- a/lib/dpif.c
+++ b/lib/dpif.c
@@ -1010,7 +1010,8 @@ dpif_set_sflow_probability(struct dpif *dpif, uint32_t probability)
 
 /* Attempts to receive a message from 'dpif'.  If successful, stores the
  * message into '*packetp'.  The message, if one is received, will begin with
- * 'struct odp_msg' as a header.  Only messages of the types selected with
+ * 'struct odp_msg' as a header, and will have at least DPIF_RECV_MSG_PADDING
+ * bytes of headroom.  Only messages of the types selected with
  * dpif_set_listen_mask() will ordinarily be received (but if a message type is
  * enabled and then later disabled, some stragglers might pop up).
  *
@@ -1021,8 +1022,10 @@ dpif_recv(struct dpif *dpif, struct ofpbuf **packetp)
 {
     int error = dpif->dpif_class->recv(dpif, packetp);
     if (!error) {
+        struct ofpbuf *buf = *packetp;
+
+        assert(ofpbuf_headroom(buf) >= DPIF_RECV_MSG_PADDING);
         if (VLOG_IS_DBG_ENABLED()) {
-            struct ofpbuf *buf = *packetp;
             struct odp_msg *msg = buf->data;
             void *payload = msg + 1;
             size_t payload_len = buf->size - sizeof *msg;
diff --git a/ofproto/ofproto.c b/ofproto/ofproto.c
index fd1256f..0adeda0 100644
--- a/ofproto/ofproto.c
+++ b/ofproto/ofproto.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2009, 2010 Nicira Networks.
+ * Copyright (c) 2010 Jean Tourrilhes - HP-Labs.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -3637,26 +3638,63 @@ update_used(struct ofproto *p)
     free(flows);
 }
 
+/* Replace struct odp_msg header in 'packet' by equivalent struct
+ * ofp_packet_in. */
 static void
-do_send_packet_in(struct ofconn *ofconn, uint32_t buffer_id,
-                  const struct ofpbuf *packet, int send_len)
+do_convert_to_packet_in(struct ofpbuf *packet)
 {
     struct odp_msg *msg = packet->data;
-    struct ofpbuf payload;
-    struct ofpbuf *opi;
+    struct ofp_packet_in *opi;
     uint8_t reason;
+    uint16_t total_len;
+    uint16_t in_port;
 
-    /* Extract packet payload from 'msg'. */
-    payload.data = msg + 1;
-    payload.size = msg->length - sizeof *msg;
+    /* Extract relevant header fields */
+    reason = (msg->type == _ODPL_ACTION_NR ? OFPR_ACTION : OFPR_NO_MATCH);
+    total_len = msg->length - sizeof *msg;
+    in_port = odp_port_to_ofp_port(msg->port);
+
+    /* Repurpose packet buffer, only override header */
+    ofpbuf_pull(packet, sizeof(struct odp_msg));
+    opi = ofpbuf_push_zeros(packet, offsetof(struct ofp_packet_in, data));
+    opi->header.version = OFP_VERSION;
+    opi->header.type = OFPT_PACKET_IN;
+    opi->total_len = htons(total_len);
+    opi->in_port = htons(in_port);
+    opi->reason = reason;
+
+    /* We are not done, we need to fix up the buffer length, set the
+     * length in the OpenFlow header and set buffer_id.
+     * That depends on the controller settings. */
+}
 
-    /* Construct ofp_packet_in message. */
-    reason = msg->type == _ODPL_ACTION_NR ? OFPR_ACTION : OFPR_NO_MATCH;
-    opi = make_packet_in(buffer_id, odp_port_to_ofp_port(msg->port), reason,
-                         &payload, send_len);
+static void
+do_send_converted_packet_in(struct ofconn *ofconn, uint32_t buffer_id,
+                            struct ofpbuf *packet, bool clone,
+                            int max_send_len)
+{
+    struct ofp_packet_in *opi = packet->data;
+    uint16_t total_len = ntohs(opi->total_len);
+    int send_len, trim_size;
+
+    /* Set buffer ID */
+    opi->buffer_id = htonl(buffer_id);
+
+    /* Calculate maximum length to send */
+    send_len = MIN(max_send_len, total_len);
+
+    /* Adjust packet length. */
+    trim_size = offsetof(struct ofp_packet_in, data) + send_len;
+    if (clone) {
+        packet = ofpbuf_clone_data(packet->data, trim_size);
+    } else {
+        packet->size = trim_size;
+    }
+    update_openflow_length(packet);
 
     /* Send. */
-    rconn_send_with_limit(ofconn->rconn, opi, ofconn->packet_in_counter, 100);
+    rconn_send_with_limit(ofconn->rconn, packet,
+                          ofconn->packet_in_counter, 100);
 }
 
 static void
@@ -3664,15 +3702,30 @@ send_packet_in_action(struct ofpbuf *packet, void *p_)
 {
     struct ofproto *p = p_;
     struct ofconn *ofconn;
+    struct ofconn *prev_ofconn = NULL;
     struct odp_msg *msg;
+    uint32_t arg = 0;
 
     msg = packet->data;
+    arg = msg->arg;
+
+    do_convert_to_packet_in(packet);
+
     LIST_FOR_EACH (ofconn, struct ofconn, node, &p->all_conns) {
         if (ofconn == p->controller || ofconn->miss_send_len) {
-            do_send_packet_in(ofconn, UINT32_MAX, packet, msg->arg);
+            if (prev_ofconn) {
+                do_send_converted_packet_in(prev_ofconn, INT32_MAX,
+                                            packet, true, arg);
+            }
+            prev_ofconn = ofconn;
         }
     }
-    ofpbuf_delete(packet);
+    if (prev_ofconn) {
+        do_send_converted_packet_in(prev_ofconn, INT32_MAX, packet, false,
+                                    arg);
+    } else {
+        ofpbuf_delete(packet);
+    }
 }
 
 static void
@@ -3681,24 +3734,40 @@ send_packet_in_miss(struct ofpbuf *packet, void *p_)
     struct ofproto *p = p_;
     bool in_fail_open = p->fail_open && fail_open_is_active(p->fail_open);
     struct ofconn *ofconn;
+    struct ofconn *prev_ofconn = NULL;
     struct ofpbuf payload;
     struct odp_msg *msg;
+    uint16_t port;
+    uint32_t buffer_id = 0;
+    int send_len = 0;
 
     msg = packet->data;
     payload.data = msg + 1;
     payload.size = msg->length - sizeof *msg;
+    port = msg->port;
+
+    do_convert_to_packet_in(packet);
+
     LIST_FOR_EACH (ofconn, struct ofconn, node, &p->all_conns) {
         if (ofconn->miss_send_len) {
-            struct pktbuf *pb = ofconn->pktbuf;
-            uint32_t buffer_id = (in_fail_open
-                                  ? pktbuf_get_null()
-                                  : pktbuf_save(pb, &payload, msg->port));
-            int send_len = (buffer_id != UINT32_MAX ? ofconn->miss_send_len
-                            : UINT32_MAX);
-            do_send_packet_in(ofconn, buffer_id, packet, send_len);
+            if (prev_ofconn) {
+                do_send_converted_packet_in(prev_ofconn, buffer_id,
+                                            packet, true, send_len);
+            }
+            buffer_id = (in_fail_open
+                         ? pktbuf_get_null()
+                         : pktbuf_save(ofconn->pktbuf, &payload, port));
+            send_len = (buffer_id != UINT32_MAX ? ofconn->miss_send_len
+                        : INT32_MAX);
+            prev_ofconn = ofconn;
         }
     }
-    ofpbuf_delete(packet);
+    if (prev_ofconn) {
+        do_send_converted_packet_in(prev_ofconn, buffer_id, packet, false,
+                                    send_len);
+    } else {
+        ofpbuf_delete(packet);
+    }
 }
 
 static uint64_t
-- 
1.6.6.1





More information about the dev mailing list