[ovs-dev] [PATCH] ofproto: Avoid buffer copy in OFPT_PACKET_IN path.

Ben Pfaff blp at nicira.com
Mon Apr 26 23:18:57 UTC 2010


When a dpif passes an odp_msg down to ofproto, and ofproto transforms it
into an ofp_packet_in to send to the controller, until now this always
involved a full copy of the packet inside ofproto.  This commit eliminates
this copy by ensuring that there is always enough headroom in the ofpbuf
that holds the odp_msg to replace it by an ofp_packet_in in-place.

>From Jean Tourrilhes <jt at hpl.hp.com>, with some revisions.
---
 lib/dpif-linux.c    |    3 +-
 lib/dpif-netdev.c   |    3 +-
 lib/dpif-provider.h |   16 ++++++-
 lib/dpif.c          |    7 ++-
 ofproto/ofproto.c   |  123 ++++++++++++++++++++++++++++++++++++++------------
 5 files changed, 116 insertions(+), 36 deletions(-)

diff --git a/lib/dpif-linux.c b/lib/dpif-linux.c
index 87c9f32..15283b2 100644
--- a/lib/dpif-linux.c
+++ b/lib/dpif-linux.c
@@ -460,7 +460,8 @@ dpif_linux_recv(struct dpif *dpif_, struct ofpbuf **bufp)
     int retval;
     int error;
 
-    buf = ofpbuf_new(65536);
+    buf = ofpbuf_new(65536 + DPIF_RECV_MSG_PADDING);
+    ofpbuf_reserve(buf, DPIF_RECV_MSG_PADDING);
     retval = read(dpif->fd, ofpbuf_tail(buf), ofpbuf_tailroom(buf));
     if (retval < 0) {
         error = errno;
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 7d31a69..2f4463e 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -1280,7 +1280,8 @@ dp_netdev_output_control(struct dp_netdev *dp, const struct ofpbuf *packet,
     }
 
     msg_size = sizeof *header + packet->size;
-    msg = ofpbuf_new(msg_size);
+    msg = ofpbuf_new(msg_size + DPIF_RECV_MSG_PADDING);
+    ofpbuf_reserve(msg, DPIF_RECV_MSG_PADDING);
     header = ofpbuf_put_uninit(msg, sizeof *header);
     header->type = queue_no;
     header->length = msg_size;
diff --git a/lib/dpif-provider.h b/lib/dpif-provider.h
index fddc8ea..33663ff 100644
--- a/lib/dpif-provider.h
+++ b/lib/dpif-provider.h
@@ -21,7 +21,9 @@
  * datapath. */
 
 #include <assert.h>
+#include "openflow/openflow.h"
 #include "dpif.h"
+#include "util.h"
 
 #ifdef  __cplusplus
 extern "C" {
@@ -300,8 +302,10 @@ struct dpif_class {
 
     /* Attempts to receive a message from 'dpif'.  If successful, stores the
      * message into '*packetp'.  The message, if one is received, must begin
-     * with 'struct odp_msg' as a header.  Only messages of the types selected
-     * with the set_listen_mask member function should be received.
+     * with 'struct odp_msg' as a header, and must have at least
+     * DPIF_RECV_MSG_PADDING bytes of headroom (allocated using
+     * e.g. ofpbuf_reserve()).  Only messages of the types selected with the
+     * set_listen_mask member function should be received.
      *
      * This function must not block.  If no message is ready to be received
      * when it is called, it should return EAGAIN without blocking. */
@@ -312,6 +316,14 @@ struct dpif_class {
     void (*recv_wait)(struct dpif *dpif);
 };
 
+/* Minimum number of bytes of headroom for a packet returned by the 'recv'
+ * member function (see above).  This headroom allows "struct odp_msg" to be
+ * replaced by "struct ofp_packet_in" without copying the buffer. */
+#define DPIF_RECV_MSG_PADDING (sizeof(struct ofp_packet_in) \
+                               - sizeof(struct odp_msg))
+BUILD_ASSERT_DECL(sizeof(struct ofp_packet_in) > sizeof(struct odp_msg));
+BUILD_ASSERT_DECL(DPIF_RECV_MSG_PADDING % 4 == 0);
+
 extern const struct dpif_class dpif_linux_class;
 extern const struct dpif_class dpif_netdev_class;
 
diff --git a/lib/dpif.c b/lib/dpif.c
index 186f165..097b38d 100644
--- a/lib/dpif.c
+++ b/lib/dpif.c
@@ -1015,7 +1015,8 @@ dpif_set_sflow_probability(struct dpif *dpif, uint32_t probability)
 
 /* Attempts to receive a message from 'dpif'.  If successful, stores the
  * message into '*packetp'.  The message, if one is received, will begin with
- * 'struct odp_msg' as a header.  Only messages of the types selected with
+ * 'struct odp_msg' as a header, and will have at least DPIF_RECV_MSG_PADDING
+ * bytes of headroom.  Only messages of the types selected with
  * dpif_set_listen_mask() will ordinarily be received (but if a message type is
  * enabled and then later disabled, some stragglers might pop up).
  *
@@ -1026,8 +1027,10 @@ dpif_recv(struct dpif *dpif, struct ofpbuf **packetp)
 {
     int error = dpif->dpif_class->recv(dpif, packetp);
     if (!error) {
+        struct ofpbuf *buf = *packetp;
+
+        assert(ofpbuf_headroom(buf) >= DPIF_RECV_MSG_PADDING);
         if (VLOG_IS_DBG_ENABLED()) {
-            struct ofpbuf *buf = *packetp;
             struct odp_msg *msg = buf->data;
             void *payload = msg + 1;
             size_t payload_len = buf->size - sizeof *msg;
diff --git a/ofproto/ofproto.c b/ofproto/ofproto.c
index 3f9cc90..1dbd200 100644
--- a/ofproto/ofproto.c
+++ b/ofproto/ofproto.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2009, 2010 Nicira Networks.
+ * Copyright (c) 2010 Jean Tourrilhes - HP-Labs.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -3993,65 +3994,127 @@ update_used(struct ofproto *p)
     free(flows);
 }
 
+/* pinsched callback for sending 'packet' on 'ofconn'. */
 static void
 do_send_packet_in(struct ofpbuf *packet, void *ofconn_)
 {
     struct ofconn *ofconn = ofconn_;
+
+    rconn_send_with_limit(ofconn->rconn, packet,
+                          ofconn->packet_in_counter, 100);
+}
+
+/* Takes 'packet', which has been converted with do_convert_to_packet_in(), and
+ * finalizes its content for sending on 'ofconn', and passes it to 'ofconn''s
+ * packet scheduler it for sending.
+ *
+ * If 'clone' is true, the caller retains ownership of 'packet'.  Otherwise,
+ * ownership is transferred to this function. */
+static void
+schedule_packet_in(struct ofconn *ofconn, struct ofpbuf *packet, bool clone)
+{
     struct ofproto *ofproto = ofconn->ofproto;
-    struct odp_msg *msg = packet->data;
-    struct ofpbuf payload;
-    struct ofpbuf *opi;
+    struct ofp_packet_in *opi = packet->data;
+    uint16_t in_port = ofp_port_to_odp_port(ntohs(opi->in_port));
+    int send_len, trim_size;
     uint32_t buffer_id;
-    int send_len;
 
-    /* Extract packet payload from 'msg'. */
-    payload.data = msg + 1;
-    payload.size = msg->length - sizeof *msg;
-
-    /* Construct packet-in message. */
-    send_len = INT_MAX;
-    if (msg->type == _ODPL_ACTION_NR) {
+    /* Get buffer. */
+    if (opi->reason == OFPR_ACTION) {
         buffer_id = UINT32_MAX;
+    } else if (ofproto->fail_open && fail_open_is_active(ofproto->fail_open)) {
+        buffer_id = pktbuf_get_null();
     } else {
-        if (ofproto->fail_open && fail_open_is_active(ofproto->fail_open)) {
-            buffer_id = pktbuf_get_null();
-        } else {
-            buffer_id = pktbuf_save(ofconn->pktbuf, &payload, msg->port);
-        }
-        if (buffer_id != UINT32_MAX) {
-            send_len = ofconn->miss_send_len;
-        }
+        struct ofpbuf payload;
+        payload.data = opi->data;
+        payload.size = packet->size - offsetof(struct ofp_packet_in, data);
+        buffer_id = pktbuf_save(ofconn->pktbuf, &payload, in_port);
     }
-    opi = make_packet_in(buffer_id, odp_port_to_ofp_port(msg->port),
-                         msg->type, &payload, send_len);
 
-    /* Send. */
-    rconn_send_with_limit(ofconn->rconn, opi, ofconn->packet_in_counter, 100);
+    /* Figure out how many of the packet to send. */
+    send_len = ntohs(opi->total_len);
+    if (buffer_id != UINT32_MAX) {
+        send_len = MIN(send_len, ofconn->miss_send_len);
+    }
 
-    ofpbuf_delete(packet);
+    /* Adjust packet length and clone if necessary. */
+    trim_size = offsetof(struct ofp_packet_in, data) + send_len;
+    if (clone) {
+        packet = ofpbuf_clone_data(packet->data, trim_size);
+        opi = packet->data;
+    } else {
+        packet->size = trim_size;
+    }
+
+    /* Update packet headers. */
+    opi->buffer_id = htonl(buffer_id);
+    update_openflow_length(packet);
+
+    /* Hand over to packet scheduler.  It might immediately call into
+     * do_send_packet_in() or it might buffer it for a while (until a later
+     * call to pinsched_run()). */
+    pinsched_send(ofconn->schedulers[opi->reason], in_port,
+                  packet, do_send_packet_in, ofconn);
 }
 
+/* Replace struct odp_msg header in 'packet' by equivalent struct
+ * ofp_packet_in.  The odp_msg must have sufficient headroom to do so (e.g. as
+ * returned by dpif_recv()).
+ *
+ * The conversion is not complete: the caller still needs to trim any unneeded
+ * payload off the end of the buffer, set the length in the OpenFlow header,
+ * and set buffer_id.  Those require us to know the controller settings and so
+ * must be done on a per-controller basis. */
 static void
-send_packet_in(struct ofproto *ofproto, struct ofpbuf *packet)
+do_convert_to_packet_in(struct ofpbuf *packet)
 {
     struct odp_msg *msg = packet->data;
+    struct ofp_packet_in *opi;
+    uint8_t reason;
+    uint16_t total_len;
+    uint16_t in_port;
+
+    /* Extract relevant header fields */
+    reason = (msg->type == _ODPL_ACTION_NR ? OFPR_ACTION : OFPR_NO_MATCH);
+    total_len = msg->length - sizeof *msg;
+    in_port = odp_port_to_ofp_port(msg->port);
+
+    /* Repurpose packet buffer, only override header */
+    ofpbuf_pull(packet, sizeof(struct odp_msg));
+    opi = ofpbuf_push_zeros(packet, offsetof(struct ofp_packet_in, data));
+    opi->header.version = OFP_VERSION;
+    opi->header.type = OFPT_PACKET_IN;
+    opi->total_len = htons(total_len);
+    opi->in_port = htons(in_port);
+    opi->reason = reason;
+}
+
+/* Given 'packet' containing an odp_msg of type _ODPL_ACTION_NR or
+ * _ODPL_MISS_NR, sends an OFPT_PACKET_IN message to each OpenFlow controller
+ * as necessary according to their individual configurations.
+ *
+ * 'packet' must have sufficient headroom to convert it into a struct
+ * ofp_packet_in (e.g. as returned by dpif_recv()).
+ *
+ * Takes ownership of 'packet'. */
+static void
+send_packet_in(struct ofproto *ofproto, struct ofpbuf *packet)
+{
     struct ofconn *ofconn, *prev;
 
-    assert(msg->type == _ODPL_MISS_NR || msg->type == _ODPL_ACTION_NR);
+    do_convert_to_packet_in(packet);
 
     prev = NULL;
     LIST_FOR_EACH (ofconn, struct ofconn, node, &ofproto->all_conns) {
         if (ofconn->role != NX_ROLE_SLAVE) {
             if (prev) {
-                pinsched_send(prev->schedulers[msg->type], msg->port,
-                              ofpbuf_clone(packet), do_send_packet_in, prev);
+                schedule_packet_in(prev, packet, true);
             }
             prev = ofconn;
         }
     }
     if (prev) {
-        pinsched_send(prev->schedulers[msg->type], msg->port,
-                      packet, do_send_packet_in, prev);
+        schedule_packet_in(prev, packet, false);
     } else {
         ofpbuf_delete(packet);
     }
-- 
1.6.6.1





More information about the dev mailing list