[ovs-dev] [PATCH v3 1/2] tunneling: Avoid recirculation on datapath by computing the recirculate actions at translate time.

Zoltán Balogh zoltan.balogh at ericsson.com
Thu Jan 26 15:22:24 UTC 2017


From: Sugesh Chandran <sugesh.chandran at intel.com>

Openvswitch datapath recirculates packets for tunneling, i.e.
the incoming packets are encapsulated at first pass. Further actions are
applied on encapsulated packets on the second pass after recirculating.
The proposed patch compute and append the post tunnel actions at the time of
translation itself instead of recirculating at datapath. These actions are solely
depends on tunnel attributes so there is no need of datapath recirculation.
By avoiding the recirculation at datapath, the patch offers upto 30%
performance improvement for VxLAN tunneling in our testing.
The action execution logic is also extended with new CLONE action to define
the packet cloning when the actions are combined. The lenght in the CLONE
action specifies the size of nested action set.

Rebased onto commit: 535e3acfa70b1c1a44daf91de3a63b80d673dc33
                     dpif-netdev: Add clone action

Signed-off-by: Sugesh Chandran <sugesh.chandran at intel.com>
Signed-off-by: Zoltán Balogh <zoltan.balogh at ericsson.com>
Co-authored-by: Zoltán Balogh <zoltan.balogh at ericsson.com>
---
 lib/dpif-netdev.c            |  18 +--
 lib/odp-util.c               |  51 +++++++--
 ofproto/ofproto-dpif-xlate.c | 266 ++++++++++++++++++++++---------------------
 3 files changed, 174 insertions(+), 161 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 42631bc..a2ff3bc 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -4554,24 +4554,8 @@ dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
 
     case OVS_ACTION_ATTR_TUNNEL_PUSH:
         if (*depth < MAX_RECIRC_DEPTH) {
-            struct dp_packet_batch tnl_pkt;
-            struct dp_packet_batch *orig_packets_ = packets_;
-            int err;
-
-            if (!may_steal) {
-                dp_packet_batch_clone(&tnl_pkt, packets_);
-                packets_ = &tnl_pkt;
-                dp_packet_batch_reset_cutlen(orig_packets_);
-            }
-
             dp_packet_batch_apply_cutlen(packets_);
-
-            err = push_tnl_action(pmd, a, packets_);
-            if (!err) {
-                (*depth)++;
-                dp_netdev_recirculate(pmd, packets_);
-                (*depth)--;
-            }
+            push_tnl_action(pmd, a, packets_);
             return;
         }
         break;
diff --git a/lib/odp-util.c b/lib/odp-util.c
index 430793b..d534e06 100644
--- a/lib/odp-util.c
+++ b/lib/odp-util.c
@@ -122,7 +122,6 @@ odp_action_len(uint16_t type)
     case OVS_ACTION_ATTR_SAMPLE: return ATTR_LEN_VARIABLE;
     case OVS_ACTION_ATTR_CT: return ATTR_LEN_VARIABLE;
     case OVS_ACTION_ATTR_CLONE: return ATTR_LEN_VARIABLE;
-
     case OVS_ACTION_ATTR_UNSPEC:
     case __OVS_ACTION_ATTR_MAX:
         return ATTR_LEN_INVALID;
@@ -230,9 +229,9 @@ format_odp_clone_action(struct ds *ds, const struct nlattr *attr)
     int len = nl_attr_get_size(attr);
 
     ds_put_cstr(ds, "clone");
-    ds_put_format(ds, "(");
+    ds_put_format(ds, "{");
     format_odp_actions(ds, nla_acts, len);
-    ds_put_format(ds, ")");
+    ds_put_format(ds, "}");
 }
 
 static const char *
@@ -522,7 +521,7 @@ format_odp_tnl_push_header(struct ds *ds, struct ovs_action_push_tnl *data)
                       gnh->oam ? "oam," : "",
                       gnh->critical ? "crit," : "",
                       ntohl(get_16aligned_be32(&gnh->vni)) >> 8);
- 
+
         if (gnh->opt_len) {
             ds_put_cstr(ds, ",options(");
             format_geneve_opts(gnh->options, NULL, gnh->opt_len * 4,
@@ -889,27 +888,55 @@ format_odp_action(struct ds *ds, const struct nlattr *a)
     }
 }
 
+static void
+print_odp_actions(struct ds *ds, bool is_first_action,
+                                const struct nlattr **action_,
+                                size_t *action_len)
+
+{
+    const struct nlattr *action = *action_;
+    size_t curr_action_len;
+
+    if (!nl_attr_is_valid(action, *action_len)) {
+        return;
+    }
+    curr_action_len = nl_attr_len_pad(action, *action_len);
+    if(!is_first_action) {
+        ds_put_char(ds, ',');
+    } else {
+        /* Next iteration onwards, its not first action anymore. Reset flag
+         * accordingly.
+         */
+        is_first_action = 0;
+    }
+
+    format_odp_action(ds, action);
+
+    action = nl_attr_next(action);
+    *action_ = action;
+    *action_len -= curr_action_len;
+
+    print_odp_actions(ds, is_first_action, action_, action_len);
+}
+
 void
 format_odp_actions(struct ds *ds, const struct nlattr *actions,
                    size_t actions_len)
 {
     if (actions_len) {
         const struct nlattr *a;
-        unsigned int left;
+        size_t left;
 
-        NL_ATTR_FOR_EACH (a, left, actions, actions_len) {
-            if (a != actions) {
-                ds_put_char(ds, ',');
-            }
-            format_odp_action(ds, a);
-        }
+        a = actions;
+        left = actions_len;
+        print_odp_actions(ds, 1, &a, &left);
         if (left) {
             int i;
 
             if (left == actions_len) {
                 ds_put_cstr(ds, "<empty>");
             }
-            ds_put_format(ds, ",***%u leftover bytes*** (", left);
+            ds_put_format(ds, ",***%"PRIuSIZE" leftover bytes*** (", left);
             for (i = 0; i < left; i++) {
                 ds_put_format(ds, "%02x", ((const uint8_t *) a)[i]);
             }
diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c
index 0513394..a14b4ba 100644
--- a/ofproto/ofproto-dpif-xlate.c
+++ b/ofproto/ofproto-dpif-xlate.c
@@ -408,6 +408,10 @@ static void xlate_action_set(struct xlate_ctx *ctx);
 static void xlate_commit_actions(struct xlate_ctx *ctx);
 
 static void
+group_actions(struct xlate_ctx *ctx, const struct xport *in_dev,
+              struct xport *out_dev);
+
+static void
 ctx_trigger_freeze(struct xlate_ctx *ctx)
 {
     ctx->exit = true;
@@ -3029,7 +3033,22 @@ build_tunnel_send(struct xlate_ctx *ctx, const struct xport *xport,
     }
     tnl_push_data.tnl_port = odp_to_u32(tunnel_odp_port);
     tnl_push_data.out_port = odp_to_u32(out_dev->odp_port);
+
+    struct nlattr *start_combine;
+    uint32_t action_size = 0;
+    uint32_t start_action_size = 0;
+    uint32_t push_action_size = 0;
+    start_action_size = ctx->odp_actions->size;
+    nl_msg_put_flag(ctx->odp_actions, OVS_ACTION_ATTR_CLONE);
+    action_size = ctx->odp_actions->size;
     odp_put_tnl_push_action(ctx->odp_actions, &tnl_push_data);
+    push_action_size = ctx->odp_actions->size;
+    group_actions(ctx, xport, out_dev);
+    start_combine = ofpbuf_at(ctx->odp_actions, start_action_size, (NLA_HDRLEN));
+    if (ctx->odp_actions->size > push_action_size) {
+        /* Update the combine action only when combined */
+        start_combine->nla_len += ctx->odp_actions->size - action_size;
+    }
     return 0;
 }
 
@@ -3071,6 +3090,119 @@ xlate_flow_is_protected(const struct xlate_ctx *ctx, const struct flow *flow, co
 }
 
 static void
+group_actions(struct xlate_ctx *ctx, const struct xport *in_dev,
+              struct xport *out_dev)
+{
+    struct flow *flow = &ctx->xin->flow;
+    struct flow old_flow = ctx->xin->flow;
+    struct flow_tnl old_flow_tnl_wc = ctx->wc->masks.tunnel;
+    bool old_conntrack = ctx->conntracked;
+    bool old_was_mpls = ctx->was_mpls;
+    ovs_version_t old_version = ctx->xin->tables_version;
+    struct ofpbuf old_stack = ctx->stack;
+    union mf_subvalue new_stack[1024 / sizeof(union mf_subvalue)];
+    struct ofpbuf old_action_set = ctx->action_set;
+    uint64_t actset_stub[1024 / 8];
+
+    ofpbuf_use_stub(&ctx->stack, new_stack, sizeof new_stack);
+    ofpbuf_use_stub(&ctx->action_set, actset_stub, sizeof actset_stub);
+    flow->in_port.ofp_port = out_dev->ofp_port;
+    flow->metadata = htonll(0);
+    memset(&flow->tunnel, 0, sizeof flow->tunnel);
+    memset(&ctx->wc->masks.tunnel, 0, sizeof ctx->wc->masks.tunnel);
+    flow->tunnel.metadata.tab = ofproto_get_tun_tab(&out_dev->xbridge->ofproto->up);
+    ctx->wc->masks.tunnel.metadata.tab = flow->tunnel.metadata.tab;
+    memset(flow->regs, 0, sizeof flow->regs);
+    flow->actset_output = OFPP_UNSET;
+    ctx->conntracked = false;
+    clear_conntrack(ctx);
+    mirror_mask_t old_mirrors = ctx->mirrors;
+    bool independent_mirrors = out_dev->xbridge != ctx->xbridge;
+    if (independent_mirrors) {
+        ctx->mirrors = 0;
+    }
+    ctx->xbridge = out_dev->xbridge;
+
+    /* The bridge is now known so obtain its table version. */
+    ctx->xin->tables_version
+              = ofproto_dpif_get_tables_version(ctx->xbridge->ofproto);
+
+    if (!process_special(ctx, out_dev) && may_receive(out_dev, ctx)) {
+        if (xport_stp_forward_state(out_dev) && xport_rstp_forward_state(out_dev)) {
+                xlate_table_action(ctx, flow->in_port.ofp_port, 0, true, true);
+            if (!ctx->freezing) {
+                xlate_action_set(ctx);
+            }
+            if (ctx->freezing) {
+                finish_freezing(ctx);
+            }
+        } else {
+            /* Forwarding is disabled by STP and RSTP.  Let OFPP_NORMAL and
+             * the learning action look at the packet, then drop it. */
+            struct flow old_base_flow = ctx->base_flow;
+            size_t old_size = ctx->odp_actions->size;
+            mirror_mask_t old_mirrors2 = ctx->mirrors;
+
+            xlate_table_action(ctx, flow->in_port.ofp_port, 0, true, true);
+            ctx->mirrors = old_mirrors2;
+            ctx->base_flow = old_base_flow;
+            ctx->odp_actions->size = old_size;
+
+            /* Undo changes that may have been done for freezing. */
+            ctx_cancel_freeze(ctx);
+        }
+    }
+
+    if (independent_mirrors) {
+        ctx->mirrors = old_mirrors;
+    }
+    ctx->xin->flow = old_flow;
+    ctx->xbridge = in_dev->xbridge;
+    ofpbuf_uninit(&ctx->action_set);
+    ctx->action_set = old_action_set;
+    ofpbuf_uninit(&ctx->stack);
+    ctx->stack = old_stack;
+
+    /* Restore calling bridge's lookup version. */
+    ctx->xin->tables_version = old_version;
+
+    /* Restore to calling bridge tunneling information */
+    ctx->wc->masks.tunnel = old_flow_tnl_wc;
+
+    /* The out bridge popping MPLS should have no effect on the original
+     * bridge. */
+    ctx->was_mpls = old_was_mpls;
+
+    /* The out bridge's conntrack execution should have no effect on the
+     * original bridge. */
+    ctx->conntracked = old_conntrack;
+
+    /* The fact that the out bridge exits (for any reason) does not mean
+     * that the original bridge should exit.  Specifically, if the out
+     * bridge freezes translation, the original bridge must continue
+     * processing with the original, not the frozen packet! */
+    ctx->exit = false;
+
+    /* Out bridge errors do not propagate back. */
+    ctx->error = XLATE_OK;
+
+    if (ctx->xin->resubmit_stats) {
+        netdev_vport_inc_tx(in_dev->netdev, ctx->xin->resubmit_stats);
+        netdev_vport_inc_rx(out_dev->netdev, ctx->xin->resubmit_stats);
+        if (out_dev->bfd) {
+            bfd_account_rx(out_dev->bfd, ctx->xin->resubmit_stats);
+        }
+    }
+    if (ctx->xin->xcache) {
+        struct xc_entry *entry;
+        entry = xlate_cache_add_entry(ctx->xin->xcache, XC_NETDEV);
+        entry->dev.tx = netdev_ref(in_dev->netdev);
+        entry->dev.rx = netdev_ref(out_dev->netdev);
+        entry->dev.bfd = bfd_ref(out_dev->bfd);
+    }
+}
+
+static void
 compose_output_action__(struct xlate_ctx *ctx, ofp_port_t ofp_port,
                         const struct xlate_bond_recirc *xr, bool check_stp)
 {
@@ -3133,138 +3265,8 @@ compose_output_action__(struct xlate_ctx *ctx, ofp_port_t ofp_port,
     }
 
     if (xport->peer) {
-        const struct xport *peer = xport->peer;
-        struct flow old_flow = ctx->xin->flow;
-        struct flow_tnl old_flow_tnl_wc = ctx->wc->masks.tunnel;
-        bool old_conntrack = ctx->conntracked;
-        bool old_was_mpls = ctx->was_mpls;
-        ovs_version_t old_version = ctx->xin->tables_version;
-        struct ofpbuf old_stack = ctx->stack;
-        uint8_t new_stack[1024];
-        struct ofpbuf old_action_set = ctx->action_set;
-        struct ovs_list *old_trace = ctx->xin->trace;
-        uint64_t actset_stub[1024 / 8];
-
-        ofpbuf_use_stub(&ctx->stack, new_stack, sizeof new_stack);
-        ofpbuf_use_stub(&ctx->action_set, actset_stub, sizeof actset_stub);
-        flow->in_port.ofp_port = peer->ofp_port;
-        flow->metadata = htonll(0);
-        memset(&flow->tunnel, 0, sizeof flow->tunnel);
-        flow->tunnel.metadata.tab = ofproto_get_tun_tab(
-            &peer->xbridge->ofproto->up);
-        ctx->wc->masks.tunnel.metadata.tab = flow->tunnel.metadata.tab;
-        memset(flow->regs, 0, sizeof flow->regs);
-        flow->actset_output = OFPP_UNSET;
-        clear_conntrack(ctx);
-        ctx->xin->trace = xlate_report(ctx, OFT_BRIDGE,
-                                       "bridge(\"%s\")", peer->xbridge->name);
-
-        /* When the patch port points to a different bridge, then the mirrors
-         * for that bridge clearly apply independently to the packet, so we
-         * reset the mirror bitmap to zero and then restore it after the packet
-         * returns.
-         *
-         * When the patch port points to the same bridge, this is more of a
-         * design decision: can mirrors be re-applied to the packet after it
-         * re-enters the bridge, or should we treat that as doubly mirroring a
-         * single packet?  The former may be cleaner, since it respects the
-         * model in which a patch port is like a physical cable plugged from
-         * one switch port to another, but the latter may be less surprising to
-         * users.  We take the latter choice, for now at least.  (To use the
-         * former choice, hard-code 'independent_mirrors' to "true".) */
-        mirror_mask_t old_mirrors = ctx->mirrors;
-        bool independent_mirrors = peer->xbridge != ctx->xbridge;
-        if (independent_mirrors) {
-            ctx->mirrors = 0;
-        }
-        ctx->xbridge = peer->xbridge;
-
-        /* The bridge is now known so obtain its table version. */
-        ctx->xin->tables_version
-            = ofproto_dpif_get_tables_version(ctx->xbridge->ofproto);
-
-        if (!process_special(ctx, peer) && may_receive(peer, ctx)) {
-            if (xport_stp_forward_state(peer) && xport_rstp_forward_state(peer)) {
-                xlate_table_action(ctx, flow->in_port.ofp_port, 0, true, true);
-                if (!ctx->freezing) {
-                    xlate_action_set(ctx);
-                }
-                if (ctx->freezing) {
-                    finish_freezing(ctx);
-                }
-            } else {
-                /* Forwarding is disabled by STP and RSTP.  Let OFPP_NORMAL and
-                 * the learning action look at the packet, then drop it. */
-                struct flow old_base_flow = ctx->base_flow;
-                size_t old_size = ctx->odp_actions->size;
-                mirror_mask_t old_mirrors2 = ctx->mirrors;
-
-                xlate_table_action(ctx, flow->in_port.ofp_port, 0, true, true);
-                ctx->mirrors = old_mirrors2;
-                ctx->base_flow = old_base_flow;
-                ctx->odp_actions->size = old_size;
-
-                /* Undo changes that may have been done for freezing. */
-                ctx_cancel_freeze(ctx);
-            }
-        }
-
-        ctx->xin->trace = old_trace;
-        if (independent_mirrors) {
-            ctx->mirrors = old_mirrors;
-        }
-        ctx->xin->flow = old_flow;
-        ctx->xbridge = xport->xbridge;
-        ofpbuf_uninit(&ctx->action_set);
-        ctx->action_set = old_action_set;
-        ofpbuf_uninit(&ctx->stack);
-        ctx->stack = old_stack;
-
-        /* Restore calling bridge's lookup version. */
-        ctx->xin->tables_version = old_version;
-
-        /* Since this packet came in on a patch port (from the perspective of
-         * the peer bridge), it cannot have useful tunnel information. As a
-         * result, any wildcards generated on that tunnel also cannot be valid.
-         * The tunnel wildcards must be restored to their original version since
-         * the peer bridge uses a separate tunnel metadata table and therefore
-         * any generated wildcards will be garbage in the context of our
-         * metadata table. */
-        ctx->wc->masks.tunnel = old_flow_tnl_wc;
-
-        /* The peer bridge popping MPLS should have no effect on the original
-         * bridge. */
-        ctx->was_mpls = old_was_mpls;
-
-        /* The peer bridge's conntrack execution should have no effect on the
-         * original bridge. */
-        ctx->conntracked = old_conntrack;
-
-        /* The fact that the peer bridge exits (for any reason) does not mean
-         * that the original bridge should exit.  Specifically, if the peer
-         * bridge freezes translation, the original bridge must continue
-         * processing with the original, not the frozen packet! */
-        ctx->exit = false;
-
-        /* Peer bridge errors do not propagate back. */
-        ctx->error = XLATE_OK;
-
-        if (ctx->xin->resubmit_stats) {
-            netdev_vport_inc_tx(xport->netdev, ctx->xin->resubmit_stats);
-            netdev_vport_inc_rx(peer->netdev, ctx->xin->resubmit_stats);
-            if (peer->bfd) {
-                bfd_account_rx(peer->bfd, ctx->xin->resubmit_stats);
-            }
-        }
-        if (ctx->xin->xcache) {
-            struct xc_entry *entry;
-
-            entry = xlate_cache_add_entry(ctx->xin->xcache, XC_NETDEV);
-            entry->dev.tx = netdev_ref(xport->netdev);
-            entry->dev.rx = netdev_ref(peer->netdev);
-            entry->dev.bfd = bfd_ref(peer->bfd);
-        }
-        return;
+       group_actions(ctx, xport, xport->peer);
+       return;
     }
 
     flow_vlan_tci = flow->vlan_tci;
-- 
2.7.4



More information about the dev mailing list