[ovs-dev] [PATCH v2] xlate: Use dp_hash for select groups.

Jarno Rajahalme jarno at ovn.org
Mon Sep 12 23:16:08 UTC 2016


Add a new select group selection method "dp_hash", which uses minimal
number of bits from the datapath calculated packet hash to inform the
select group bucket selection.  This makes the datapath flows more
generic resulting in less upcalls to userspace, but adds recirculation
prior to group selection.

Signed-off-by: Jarno Rajahalme <jarno at ovn.org>
---
v2: Rebase and documentation.

 NEWS                         |  8 +++++
 lib/ofp-parse.c              |  2 ++
 lib/ofp-util.c               | 16 ++++------
 ofproto/ofproto-dpif-xlate.c | 71 ++++++++++++++++++++++++++++++++++++++++++--
 ofproto/ofproto-dpif.c       | 11 ++++---
 ofproto/ofproto-dpif.h       |  4 +--
 tests/ofproto-dpif.at        | 38 ++++++++++++++++++++++++
 tests/ofproto.at             |  3 ++
 utilities/ovs-ofctl.8.in     | 16 ++++++++--
 9 files changed, 147 insertions(+), 22 deletions(-)

diff --git a/NEWS b/NEWS
index 8c78b36..88fd90a 100644
--- a/NEWS
+++ b/NEWS
@@ -37,6 +37,14 @@ v2.6.0 - xx xxx xxxx
        already expected to work properly in cases where the switch can
        not buffer packets, so this change should not affect existing
        users.
+     * A new "selection_method=dp_hash" type for OpenFlow select group
+       bucket selection that uses the datapath computed 5-tuple hash
+       without making datapath flows match the 5-tuple fields, which
+       is useful for more efficient load balancing, for example.  This
+       uses the Netronome extension to OpenFlow 1.5+ that allows
+       control over the OpenFlow select groups selection method.  See
+       "selection_method" and related options in ovs-ofctl(8) for
+       details.
    - Improved OpenFlow version compatibility for actions:
      * New OpenFlow extension to support the "group" action in OpenFlow 1.0.
      * OpenFlow 1.0 "enqueue" action now properly translated to OpenFlow 1.1+.
diff --git a/lib/ofp-parse.c b/lib/ofp-parse.c
index d3ef140..4c1f07a 100644
--- a/lib/ofp-parse.c
+++ b/lib/ofp-parse.c
@@ -1566,6 +1566,8 @@ parse_ofp_group_mod_str__(struct ofputil_group_mod *gm, int command,
         goto out;
     }
 
+    /* XXX Exclude fields for non "hash" selection method. */
+    
     if (fields & F_COMMAND_BUCKET_ID) {
         if (!(fields & F_COMMAND_BUCKET_ID_ALL || had_command_bucket_id)) {
             error = xstrdup("must specify a command bucket id");
diff --git a/lib/ofp-util.c b/lib/ofp-util.c
index 1fa4998..f0fdbe6 100644
--- a/lib/ofp-util.c
+++ b/lib/ofp-util.c
@@ -8901,28 +8901,24 @@ parse_group_prop_ntr_selection_method(struct ofpbuf *payload,
         return OFPERR_OFPBPC_BAD_VALUE;
     }
 
-    if (strcmp("hash", prop->selection_method)) {
+    if (strcmp("hash", prop->selection_method)
+        && strcmp("dp_hash", prop->selection_method)) {
         OFPPROP_LOG(&bad_ofmsg_rl, false,
                     "ntr selection method '%s' is not supported",
                     prop->selection_method);
         return OFPERR_OFPBPC_BAD_VALUE;
     }
+    /* 'method_len' is now non-zero. */
 
     strcpy(gp->selection_method, prop->selection_method);
     gp->selection_method_param = ntohll(prop->selection_method_param);
 
-    if (!method_len && gp->selection_method_param) {
-        OFPPROP_LOG(&bad_ofmsg_rl, false, "ntr selection method parameter is "
-                    "non-zero but selection method is empty");
-        return OFPERR_OFPBPC_BAD_VALUE;
-    }
-
     ofpbuf_pull(payload, sizeof *prop);
 
     fields_len = ntohs(prop->length) - sizeof *prop;
-    if (!method_len && fields_len) {
-        OFPPROP_LOG(&bad_ofmsg_rl, false, "ntr selection method parameter is "
-                    "zero but fields are provided");
+    if (fields_len && strcmp("hash", gp->selection_method)) {
+        OFPPROP_LOG(&bad_ofmsg_rl, false, "ntr selection method %s "
+                    "does not support fields", gp->selection_method);
         return OFPERR_OFPBPC_BAD_VALUE;
     }
 
diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c
index 74e3387..e00a5f8 100644
--- a/ofproto/ofproto-dpif-xlate.c
+++ b/ofproto/ofproto-dpif-xlate.c
@@ -348,6 +348,11 @@ struct xlate_ctx {
     * case at that point.
     */
     bool freezing;
+    bool recirc_update_dp_hash;    /* Generated recirculation will be preceded
+                                    * by datapath HASH action to get an updated
+                                    * dp_hash after recirculation. */
+    uint32_t dp_hash_alg;
+    uint32_t dp_hash_basis;
     struct ofpbuf frozen_actions;
     const struct ofpact_controller *pause;
 
@@ -410,6 +415,17 @@ ctx_trigger_freeze(struct xlate_ctx *ctx)
     ctx->freezing = true;
 }
 
+static void
+ctx_trigger_recirculate_with_hash(struct xlate_ctx *ctx, uint32_t type,
+                                  uint32_t basis)
+{
+    ctx->exit = true;
+    ctx->freezing = true;
+    ctx->recirc_update_dp_hash = true;
+    ctx->dp_hash_alg = type;
+    ctx->dp_hash_basis = basis;
+}
+
 static bool
 ctx_first_frozen_action(const struct xlate_ctx *ctx)
 {
@@ -421,6 +437,7 @@ ctx_cancel_freeze(struct xlate_ctx *ctx)
 {
     if (ctx->freezing) {
         ctx->freezing = false;
+        ctx->recirc_update_dp_hash = false;
         ofpbuf_clear(&ctx->frozen_actions);
         ctx->frozen_actions.header = NULL;
     }
@@ -1546,7 +1563,7 @@ group_first_live_bucket(const struct xlate_ctx *ctx,
     struct ofputil_bucket *bucket;
     const struct ovs_list *buckets;
 
-    buckets = group_dpif_get_buckets(group);
+    buckets = group_dpif_get_buckets(group, NULL);
     LIST_FOR_EACH (bucket, list_node, buckets) {
         if (bucket_is_alive(ctx, bucket, depth)) {
             return bucket;
@@ -1567,7 +1584,7 @@ group_best_live_bucket(const struct xlate_ctx *ctx,
     struct ofputil_bucket *bucket;
     const struct ovs_list *buckets;
 
-    buckets = group_dpif_get_buckets(group);
+    buckets = group_dpif_get_buckets(group, NULL);
     LIST_FOR_EACH (bucket, list_node, buckets) {
         if (bucket_is_alive(ctx, bucket, 0)) {
             uint32_t score =
@@ -3459,7 +3476,7 @@ xlate_all_group(struct xlate_ctx *ctx, struct group_dpif *group)
     struct ofputil_bucket *bucket;
     const struct ovs_list *buckets;
 
-    buckets = group_dpif_get_buckets(group);
+    buckets = group_dpif_get_buckets(group, NULL);
     LIST_FOR_EACH (bucket, list_node, buckets) {
         xlate_group_bucket(ctx, bucket);
     }
@@ -3550,6 +3567,41 @@ xlate_hash_fields_select_group(struct xlate_ctx *ctx, struct group_dpif *group)
 }
 
 static void
+xlate_dp_hash_select_group(struct xlate_ctx *ctx, struct group_dpif *group)
+{
+    struct ofputil_bucket *bucket;
+
+    /* dp_hash value 0 is special since it means that the dp_hash has not been
+     * computed, as all computed dp_hash values are non-zero.  Therefore
+     * compare to zero can be used to decide if the dp_hash value is valid
+     * without masking the dp_hash field. */
+    if (!ctx->xin->flow.dp_hash) {
+        uint64_t param = group_dpif_get_selection_method_param(group);
+
+        ctx_trigger_recirculate_with_hash(ctx, param >> 32, (uint32_t)param);
+    } else {
+        const struct ovs_list *buckets;
+        uint32_t n_buckets;
+
+        buckets = group_dpif_get_buckets(group, &n_buckets);
+        if (n_buckets) {
+            /* Minimal mask to cover the number of buckets. */
+            uint32_t mask = (1 << log_2_ceil(n_buckets)) - 1;
+            /* Multiplier chosen to make the trivial 1 bit case to
+             * actually distribute amongst two equal weight buckets. */
+            uint32_t basis = 0xc2b73583 * (ctx->xin->flow.dp_hash & mask);
+
+            ctx->wc->masks.dp_hash |= mask;
+            bucket = group_best_live_bucket(ctx, group, basis);
+            if (bucket) {
+                xlate_group_bucket(ctx, bucket);
+                xlate_group_stats(ctx, group, bucket);
+            }
+        }
+    }
+}
+
+static void
 xlate_select_group(struct xlate_ctx *ctx, struct group_dpif *group)
 {
     const char *selection_method = group_dpif_get_selection_method(group);
@@ -3565,6 +3617,8 @@ xlate_select_group(struct xlate_ctx *ctx, struct group_dpif *group)
         xlate_default_select_group(ctx, group);
     } else if (!strcasecmp("hash", selection_method)) {
         xlate_hash_fields_select_group(ctx, group);
+    } else if (!strcasecmp("dp_hash", selection_method)) {
+        xlate_dp_hash_select_group(ctx, group);
     } else {
         /* Parsing of groups should ensure this never happens */
         OVS_NOT_REACHED();
@@ -3795,6 +3849,16 @@ finish_freezing__(struct xlate_ctx *ctx, uint8_t table)
         }
         recirc_refs_add(&ctx->xout->recircs, id);
 
+        if (ctx->recirc_update_dp_hash) {
+            struct ovs_action_hash *act_hash;
+
+            /* Hash action. */
+            act_hash = nl_msg_put_unspec_uninit(ctx->odp_actions,
+                                                OVS_ACTION_ATTR_HASH,
+                                                sizeof *act_hash);
+            act_hash->hash_alg = OVS_HASH_ALG_L4;  /* Make configurable. */
+            act_hash->hash_basis = 0;              /* Make configurable. */
+        }
         nl_msg_put_u32(ctx->odp_actions, OVS_ACTION_ATTR_RECIRC, id);
     }
 
@@ -5394,6 +5458,7 @@ xlate_actions(struct xlate_in *xin, struct xlate_out *xout)
         .mirrors = 0,
 
         .freezing = false,
+        .recirc_update_dp_hash = false,
         .frozen_actions = OFPBUF_STUB_INITIALIZER(frozen_actions_stub),
         .pause = NULL,
 
diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c
index 289e7d6..97995a1 100644
--- a/ofproto/ofproto-dpif.c
+++ b/ofproto/ofproto-dpif.c
@@ -4276,7 +4276,7 @@ group_construct_stats(struct group_dpif *group)
     group->packet_count = 0;
     group->byte_count = 0;
 
-    buckets = group_dpif_get_buckets(group);
+    buckets = group_dpif_get_buckets(group, NULL);
     LIST_FOR_EACH (bucket, list_node, buckets) {
         bucket->stats.packet_count = 0;
         bucket->stats.byte_count = 0;
@@ -4297,7 +4297,7 @@ group_dpif_credit_stats(struct group_dpif *group,
     } else { /* Credit to all buckets */
         const struct ovs_list *buckets;
 
-        buckets = group_dpif_get_buckets(group);
+        buckets = group_dpif_get_buckets(group, NULL);
         LIST_FOR_EACH (bucket, list_node, buckets) {
             bucket->stats.packet_count += stats->n_packets;
             bucket->stats.byte_count += stats->n_bytes;
@@ -4345,7 +4345,7 @@ group_get_stats(const struct ofgroup *group_, struct ofputil_group_stats *ogs)
     ogs->packet_count = group->packet_count;
     ogs->byte_count = group->byte_count;
 
-    buckets = group_dpif_get_buckets(group);
+    buckets = group_dpif_get_buckets(group, NULL);
     bucket_stats = ogs->bucket_stats;
     LIST_FOR_EACH (bucket, list_node, buckets) {
         bucket_stats->packet_count = bucket->stats.packet_count;
@@ -4371,8 +4371,11 @@ group_dpif_lookup(struct ofproto_dpif *ofproto, uint32_t group_id,
 }
 
 const struct ovs_list *
-group_dpif_get_buckets(const struct group_dpif *group)
+group_dpif_get_buckets(const struct group_dpif *group, uint32_t *n_buckets)
 {
+    if (n_buckets) {
+        *n_buckets = group->up.n_buckets;
+    }
     return &group->up.buckets;
 }
 
diff --git a/ofproto/ofproto-dpif.h b/ofproto/ofproto-dpif.h
index a3b1d6b..a75caa4 100644
--- a/ofproto/ofproto-dpif.h
+++ b/ofproto/ofproto-dpif.h
@@ -139,8 +139,8 @@ void group_dpif_credit_stats(struct group_dpif *,
 struct group_dpif *group_dpif_lookup(struct ofproto_dpif *ofproto,
                                      uint32_t group_id, ovs_version_t version,
                                      bool take_ref);
-const struct ovs_list *group_dpif_get_buckets(const struct group_dpif *group);
-
+const struct ovs_list *group_dpif_get_buckets(const struct group_dpif *group,
+                                              uint32_t *n_buckets);
 enum ofp11_group_type group_dpif_get_type(const struct group_dpif *group);
 const char *group_dpif_get_selection_method(const struct group_dpif *group);
 uint64_t group_dpif_get_selection_method_param(const struct group_dpif *group);
diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at
index 557c8be..bdc9168 100644
--- a/tests/ofproto-dpif.at
+++ b/tests/ofproto-dpif.at
@@ -448,6 +448,44 @@ AT_CHECK([sort results | uniq | sed 's/1[[01]]/1?/'], [0],
 OVS_VSWITCHD_STOP
 AT_CLEANUP
 
+AT_SETUP([ofproto-dpif - select group with dp_hash selection method])
+OVS_VSWITCHD_START
+add_of_ports br0 1 10 11
+AT_CHECK([ovs-ofctl -O OpenFlow15 add-group br0 'group_id=1234,type=select,selection_method=dp_hash,bucket=output:10,bucket=output:11'])
+AT_CHECK([ovs-ofctl -O OpenFlow15 add-flow br0 'ip,nw_src=192.168.0.1 actions=group:1234'])
+
+# Try a bunch of different flows and make sure that they get distributed
+# at least somewhat.
+for d in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15; do
+    pkt="in_port(1),eth(src=50:54:00:00:00:07,dst=50:54:00:00:00:01),eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.1.$d,proto=1,tos=0,ttl=128,frag=no),icmp(type=8,code=0)"
+    AT_CHECK([ovs-appctl netdev-dummy/receive p1 $pkt])
+done
+
+AT_CHECK([ovs-appctl dpctl/dump-flows | sed 's/dp_hash(.*\/0x1)/dp_hash(0xXXXX\/0x1)/' | sed 's/packets.*actions:1/actions:1/' | strip_ufid | strip_used | sort], [0], [dnl
+flow-dump from non-dpdk interfaces:
+recirc_id(0),in_port(1),eth_type(0x0800),ipv4(src=192.168.0.1,frag=no), packets:15, bytes:630, used:0.0s, actions:hash(hash_l4(0)),recirc(0x1)
+recirc_id(0x1),dp_hash(0xXXXX/0x1),in_port(1),eth_type(0x0800),ipv4(frag=no), actions:10
+recirc_id(0x1),dp_hash(0xXXXX/0x1),in_port(1),eth_type(0x0800),ipv4(frag=no), actions:11
+])
+
+AT_CHECK([ovs-appctl revalidator/purge], [0])
+
+# Try a bunch of different flows and make sure that they are not distributed
+# as they only vary a field that is not hashed
+for d in 0 1 2 3 4 5 6 7 8 9 a b c d e f; do
+    pkt="in_port(1),eth(src=50:54:00:00:00:$d,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=1,tos=0,ttl=128,frag=no),icmp(type=8,code=0)"
+    AT_CHECK([ovs-appctl netdev-dummy/receive p1 $pkt])
+done
+
+AT_CHECK([ovs-appctl dpctl/dump-flows | sed 's/dp_hash(.*\/0x1)/dp_hash(0xXXXX\/0x1)/' | strip_ufid | strip_used | sort], [0], [dnl
+flow-dump from non-dpdk interfaces:
+recirc_id(0),in_port(1),eth_type(0x0800),ipv4(src=192.168.0.1,frag=no), packets:15, bytes:630, used:0.0s, actions:hash(hash_l4(0)),recirc(0x2)
+recirc_id(0x2),dp_hash(0xXXXX/0x1),in_port(1),eth_type(0x0800),ipv4(frag=no), packets:15, bytes:630, used:0.0s, actions:11
+])
+
+OVS_VSWITCHD_STOP
+AT_CLEANUP
+
 AT_SETUP([ofproto-dpif - fast failover group])
 OVS_VSWITCHD_START
 add_of_ports br0 1 10 11
diff --git a/tests/ofproto.at b/tests/ofproto.at
index a849c43..e6685b6 100644
--- a/tests/ofproto.at
+++ b/tests/ofproto.at
@@ -428,6 +428,7 @@ AT_DATA([groups.txt], [dnl
 group_id=1233,type=select,selection_method=hash,bucket=output:10,bucket=output:11
 group_id=1234,type=select,selection_method=hash,fields(eth_dst,ip_dst,tcp_dst),bucket=output:10,bucket=output:11
 group_id=1235,type=all,bucket=actions=output:12,bucket=bucket_id:0,actions=output:10,bucket=bucket_id:1,actions=output:11
+group_id=1236,type=select,selection_method=dp_hash,bucket=output:10,bucket=output:11
 ])
 AT_CHECK([ovs-ofctl -O OpenFlow15 -vwarn add-groups br0 groups.txt])
 AT_CHECK([ovs-ofctl -F OXM-OpenFlow15 -O OpenFlow15 -vwarn dump-groups br0 1234], [0], [stdout])
@@ -440,6 +441,7 @@ AT_CHECK([ovs-ofctl -O OpenFlow15 -vwarn dump-groups br0], [0], [stdout])
 AT_CHECK([strip_xids < stdout | sort], [0], [dnl
  group_id=1233,type=select,selection_method=hash,bucket=bucket_id:0,actions=output:10,bucket=bucket_id:1,actions=output:11
  group_id=1235,type=all,bucket=bucket_id:2,actions=output:12,bucket=bucket_id:0,actions=output:10,bucket=bucket_id:1,actions=output:11
+ group_id=1236,type=select,selection_method=dp_hash,bucket=bucket_id:0,actions=output:10,bucket=bucket_id:1,actions=output:11
 OFPST_GROUP_DESC reply (OF1.5):
 ])
 AT_CHECK([ovs-ofctl -O OpenFlow15 -vwarn del-groups br0 group_id=1234])
@@ -447,6 +449,7 @@ AT_CHECK([ovs-ofctl -O OpenFlow15 -vwarn dump-groups br0], [0], [stdout])
 AT_CHECK([strip_xids < stdout | sort], [0], [dnl
  group_id=1233,type=select,selection_method=hash,bucket=bucket_id:0,actions=output:10,bucket=bucket_id:1,actions=output:11
  group_id=1235,type=all,bucket=bucket_id:2,actions=output:12,bucket=bucket_id:0,actions=output:10,bucket=bucket_id:1,actions=output:11
+ group_id=1236,type=select,selection_method=dp_hash,bucket=bucket_id:0,actions=output:10,bucket=bucket_id:1,actions=output:11
 OFPST_GROUP_DESC reply (OF1.5):
 ])
 AT_CHECK([ovs-ofctl -O OpenFlow15 -vwarn del-groups br0], [0])
diff --git a/utilities/ovs-ofctl.8.in b/utilities/ovs-ofctl.8.in
index b56e5b3..52069d1 100644
--- a/utilities/ovs-ofctl.8.in
+++ b/utilities/ovs-ofctl.8.in
@@ -2839,9 +2839,19 @@ This is a string of 1 to 15 bytes in length known to lower layers.
 This field is optional for \fBadd\-group\fR, \fBadd\-groups\fR and
 \fBmod\-group\fR commands on groups of type \fBselect\fR. Prohibited
 otherwise. The default value is the empty string.
-.IP
-Other than the empty string, \fBhash\fR is currently the only defined
-selection method.
+.RS
+.IP \fBhash\fR
+Use a hash computed over the fields specified with the \fBfields\fR
+option, see below.  \fBhash\fR uses the \fBselection_method_param\fR
+as the hash basis.
+.IP \fBdp_hash\fR
+Use a datapath computed hash value.  The hash algorithm varies accross
+different datapath implementations.  \fBdp_hash\fR uses the upper 32
+bits of the \fBselection_method_param\fR as the datapath hash
+algorithm selector, which currently must always be 0, corresponding to
+hash computation over the IP 5-tuple.  The lower 32 bits are used as
+the hash basis.
+.RE
 .IP
 This option will use a Netronome OpenFlow extension which is only supported
 when using Open vSwitch 2.4 and later with OpenFlow 1.5 and later.
-- 
2.1.4




More information about the dev mailing list