[ovs-dev] [PATCH v2 2/3] ofproto-dpif: Improve dp_hash selection method for select groups

ychen ychen103103 at 163.com
Tue Apr 17 11:21:52 UTC 2018


Hi, Jan:
    I think the following code should also be modified
 + for (int hash = 0; hash < n_hash; hash++) {
+ double max_val = 0.0;
+ struct webster *winner;
+        for (i = 0; i < n_buckets; i++) {
+            if (webster[i].value > max_val) {  =======================> if bucket->weight=0, and there is only one bucket with weight equal to 0, then winner will be null
+                max_val = webster[i].value;
+                winner = &webster[i];
+            }

+        }


   Test like this command:
   ovs-ofctl add-group br-int -O openflow15 "group_id=2,type=select,selection_method=dp_hash,bucket=bucket_id=1,weight=0,actions=output:10"
  vswitchd crashed after command put.



At 2018-04-16 22:26:27, "Jan Scheurich" <jan.scheurich at ericsson.com> wrote:
>The current implementation of the "dp_hash" selection method suffers
>from two deficiences: 1. The hash mask and hence the number of dp_hash
>values is just large enough to cover the number of group buckets, but
>does not consider the case that buckets have different weights. 2. The
>xlate-time selection of best bucket from the masked dp_hash value often
>results in bucket load distributions that are quite different from the
>bucket weights because the number of available masked dp_hash values
>is too small (2-6 bits compared to 32 bits of a full hash in the default
>hash selection method).
>
>This commit provides a more accurate implementation of the dp_hash
>select group by applying the well known Webster method for distributing
>a small number of "seats" fairly over the weighted "parties"
>(see https://en.wikipedia.org/wiki/Webster/Sainte-Lagu%C3%AB_method).
>The dp_hash mask is autmatically chosen large enough to provide good
>enough accuracy even with widely differing weights.
>
>This distribution happens at group modification time and the resulting
>table is stored with the group-dpif struct. At xlation time, we use the
>masked dp_hash values as index to look up the assigned bucket.
>
>If the bucket should not be live, we do a circular search over the
>mapping table until we find the first live bucket. As the buckets in
>the table are by construction in pseudo-random order with a frequency
>according to their weight, this method maintains correct distribution
>even if one or more buckets are non-live.
>
>Xlation is further simplified by storing some derived select group state
>at group construction in struct group-dpif in a form better suited for
>xlation purposes.
>
>Adapted the unit test case for dp_hash select group accordingly.
>
>Signed-off-by: Jan Scheurich <jan.scheurich at ericsson.com>
>Signed-off-by: Nitin Katiyar <nitin.katiyar at ericsson.com>
>Co-authored-by: Nitin Katiyar <nitin.katiyar at ericsson.com>
>---
> include/openvswitch/ofp-group.h |   1 +
> ofproto/ofproto-dpif-xlate.c    |  74 +++++++++++++-------
> ofproto/ofproto-dpif.c          | 146 ++++++++++++++++++++++++++++++++++++++++
> ofproto/ofproto-dpif.h          |  13 ++++
> tests/ofproto-dpif.at           |  18 +++--
> 5 files changed, 221 insertions(+), 31 deletions(-)
>
>diff --git a/include/openvswitch/ofp-group.h b/include/openvswitch/ofp-group.h
>index 8d893a5..af4033d 100644
>--- a/include/openvswitch/ofp-group.h
>+++ b/include/openvswitch/ofp-group.h
>@@ -47,6 +47,7 @@ struct bucket_counter {
> /* Bucket for use in groups. */
> struct ofputil_bucket {
>     struct ovs_list list_node;
>+    uint16_t aux;               /* Padding. Also used for temporary data. */
>     uint16_t weight;            /* Relative weight, for "select" groups. */
>     ofp_port_t watch_port;      /* Port whose state affects whether this bucket
>                                  * is live. Only required for fast failover
>diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c
>index c8baba1..df245c5 100644
>--- a/ofproto/ofproto-dpif-xlate.c
>+++ b/ofproto/ofproto-dpif-xlate.c
>@@ -4235,35 +4235,55 @@ xlate_hash_fields_select_group(struct xlate_ctx *ctx, struct group_dpif *group,
>     }
> }
> 
>+static struct ofputil_bucket *
>+group_dp_hash_best_bucket(struct xlate_ctx *ctx,
>+                          const struct group_dpif *group,
>+                          uint32_t dp_hash)
>+{
>+    struct ofputil_bucket *bucket, *best_bucket = NULL;
>+    uint32_t n_hash = group->hash_mask + 1;
>+
>+    uint32_t hash = dp_hash &= group->hash_mask;
>+    ctx->wc->masks.dp_hash |= group->hash_mask;
>+
>+    /* Starting from the original masked dp_hash value iterate over the
>+     * hash mapping table to find the first live bucket. As the buckets
>+     * are quasi-randomly spread over the hash values, this maintains
>+     * a distribution according to bucket weights even when some buckets
>+     * are non-live. */
>+    for (int i = 0; i < n_hash; i++) {
>+        bucket = group->hash_map[(hash + i) % n_hash];
>+        if (bucket_is_alive(ctx, bucket, 0)) {
>+            best_bucket = bucket;
>+            break;
>+        }
>+    }
>+
>+    return best_bucket;
>+}
>+
> static void
> xlate_dp_hash_select_group(struct xlate_ctx *ctx, struct group_dpif *group,
>                            bool is_last_action)
> {
>-    struct ofputil_bucket *bucket;
>-
>     /* dp_hash value 0 is special since it means that the dp_hash has not been
>      * computed, as all computed dp_hash values are non-zero.  Therefore
>      * compare to zero can be used to decide if the dp_hash value is valid
>      * without masking the dp_hash field. */
>     if (!ctx->xin->flow.dp_hash) {
>-        uint64_t param = group->up.props.selection_method_param;
>-
>-        ctx_trigger_recirculate_with_hash(ctx, param >> 32, (uint32_t)param);
>+        ctx_trigger_recirculate_with_hash(ctx, group->hash_alg,
>+                                          group->hash_basis);
>+        if (ctx->xin->xcache) {
>+            ofproto_group_unref(&group->up);
>+        }
>     } else {
>-        uint32_t n_buckets = group->up.n_buckets;
>-        if (n_buckets) {
>-            /* Minimal mask to cover the number of buckets. */
>-            uint32_t mask = (1 << log_2_ceil(n_buckets)) - 1;
>-            /* Multiplier chosen to make the trivial 1 bit case to
>-             * actually distribute amongst two equal weight buckets. */
>-            uint32_t basis = 0xc2b73583 * (ctx->xin->flow.dp_hash & mask);
>-
>-            ctx->wc->masks.dp_hash |= mask;
>-            bucket = group_best_live_bucket(ctx, group, basis);
>-            if (bucket) {
>-                xlate_group_bucket(ctx, bucket, is_last_action);
>-                xlate_group_stats(ctx, group, bucket);
>-            }
>+        struct ofputil_bucket *bucket =
>+                group_dp_hash_best_bucket(ctx, group, ctx->xin->flow.dp_hash);
>+        if (bucket) {
>+            xlate_group_bucket(ctx, bucket, is_last_action);
>+            xlate_group_stats(ctx, group, bucket);
>+        } else if (ctx->xin->xcache) {
>+            ofproto_group_unref(&group->up);
>         }
>     }
> }
>@@ -4272,8 +4292,6 @@ static void
> xlate_select_group(struct xlate_ctx *ctx, struct group_dpif *group,
>                    bool is_last_action)
> {
>-    const char *selection_method = group->up.props.selection_method;
>-
>     /* Select groups may access flow keys beyond L2 in order to
>      * select a bucket. Recirculate as appropriate to make this possible.
>      */
>@@ -4281,15 +4299,19 @@ xlate_select_group(struct xlate_ctx *ctx, struct group_dpif *group,
>         ctx_trigger_freeze(ctx);
>     }
> 
>-    if (selection_method[0] == '\0') {
>+    switch (group->selection_method) {
>+    case SEL_METHOD_DEFAULT:
>         xlate_default_select_group(ctx, group, is_last_action);
>-    } else if (!strcasecmp("hash", selection_method)) {
>+        break;
>+    case SEL_METHOD_HASH:
>         xlate_hash_fields_select_group(ctx, group, is_last_action);
>-    } else if (!strcasecmp("dp_hash", selection_method)) {
>+        break;
>+    case SEL_METHOD_DP_HASH:
>         xlate_dp_hash_select_group(ctx, group, is_last_action);
>-    } else {
>-        /* Parsing of groups should ensure this never happens */
>+        break;
>+    default:
>         OVS_NOT_REACHED();
>+        break;
>     }
> }
> 
>diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c
>index 1ed82d0..e1a5c09 100644
>--- a/ofproto/ofproto-dpif.c
>+++ b/ofproto/ofproto-dpif.c
>@@ -32,6 +32,7 @@
> #include "lacp.h"
> #include "learn.h"
> #include "mac-learning.h"
>+#include "math.h"
> #include "mcast-snooping.h"
> #include "multipath.h"
> #include "netdev-vport.h"
>@@ -4717,6 +4718,143 @@ group_dpif_credit_stats(struct group_dpif *group,
>     ovs_mutex_unlock(&group->stats_mutex);
> }
> 
>+/* Calculate the dp_hash mask needed to provide the least weighted bucket
>+ * with at least one hash value and construct a mapping table from masked
>+ * dp_hash value to group bucket using the Webster method.
>+ * If the caller specifies a non-zero max_hash value, abort and return false
>+ * if more hash values would be required. The absolute maximum number of
>+ * hash values supported is 256. */
>+
>+#define MAX_SELECT_GROUP_HASH_VALUES 256
>+
>+static bool
>+group_setup_dp_hash_table(struct group_dpif *group, size_t max_hash)
>+{
>+    struct ofputil_bucket *bucket;
>+    uint32_t n_buckets = group->up.n_buckets;
>+    double total_weight = 0.0;
>+    uint16_t min_weight = UINT16_MAX;
>+    uint32_t n_hash;
>+    struct webster {
>+        struct ofputil_bucket *bucket;
>+        uint32_t divisor;
>+        double value;
>+    } webster[group->up.n_buckets];
>+
>+    if (n_buckets == 0) {
>+        VLOG_DBG("  Don't apply dp_hash method without buckets");
>+        return false;
>+    }
>+
>+    int i = 0;
>+    LIST_FOR_EACH (bucket, list_node, &group->up.buckets) {
>+        bucket->aux = 0;
>+        if (bucket->weight > 0 && bucket->weight < min_weight) {
>+            min_weight = bucket->weight;
>+        }
>+        total_weight += bucket->weight;
>+        webster[i].bucket = bucket;
>+        webster[i].divisor = 1;
>+        webster[i].value = bucket->weight;
>+        i++;
>+    }
>+
>+    VLOG_DBG("  Minimum weight: %d, total weight: %.0f",
>+             min_weight, total_weight);
>+
>+    uint32_t min_slots = ceil(total_weight / min_weight);
>+    n_hash = MAX(16, 1L << log_2_ceil(min_slots));
>+
>+    if (n_hash > MAX_SELECT_GROUP_HASH_VALUES ||
>+        (max_hash != 0 && n_hash > max_hash)) {
>+        VLOG_DBG("  Too many hash values required: %d", n_hash);
>+        return false;
>+    }
>+
>+    VLOG_DBG("  Using %d hash values:", n_hash);
>+    group->hash_mask = n_hash - 1;
>+    if (group->hash_map) {
>+        free(group->hash_map);
>+    }
>+    group->hash_map = xcalloc(n_hash, sizeof(struct ofputil_bucket *));
>+
>+    /* Use Webster method to distribute hash values over buckets. */
>+    for (int hash = 0; hash < n_hash; hash++) {
>+        double max_val = 0.0;
>+        struct webster *winner;
>+        for (i = 0; i < n_buckets; i++) {
>+            if (webster[i].value > max_val) {
>+                max_val = webster[i].value;
>+                winner = &webster[i];
>+            }
>+        }
>+#pragma GCC diagnostic push
>+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
>+        /* winner is a reference to a webster[] element initialized above. */
>+        winner->divisor += 2;
>+        winner->value = (double) winner->bucket->weight / winner->divisor;
>+        group->hash_map[hash] = winner->bucket;
>+        winner->bucket->aux++;
>+#pragma GCC diagnostic pop
>+    }
>+
>+    LIST_FOR_EACH (bucket, list_node, &group->up.buckets) {
>+        double target = (n_hash * bucket->weight) / total_weight;
>+        VLOG_DBG("  Bucket %d: weight=%d, target=%.2f hits=%d",
>+                 bucket->bucket_id, bucket->weight,
>+                 target, bucket->aux);
>+    }
>+
>+    return true;
>+}
>+
>+static void
>+group_set_selection_method(struct group_dpif *group)
>+{
>+    struct ofputil_group_props *props = &group->up.props;
>+    char *selection_method = props->selection_method;
>+
>+    if (selection_method[0] == '\0') {
>+        VLOG_INFO("No selection method specified.");
>+        group->selection_method = SEL_METHOD_DEFAULT;
>+
>+    } else if (!strcmp(selection_method, "dp_hash")) {
>+        VLOG_INFO("Selection method specified: dp_hash.");
>+        /* Try to use dp_hash if possible at all. */
>+        if (group_setup_dp_hash_table(group, 0)) {
>+            group->selection_method = SEL_METHOD_DP_HASH;
>+            group->hash_alg = props->selection_method_param >> 32;
>+            if (group->hash_alg >= __OVS_HASH_MAX) {
>+                VLOG_INFO("  Invalid dp_hash algorithm %d. "
>+                          "Defaulting to OVS_HASH_ALG_L4", group->hash_alg);
>+                group->hash_alg = OVS_HASH_ALG_L4;
>+            }
>+            group->hash_basis = (uint32_t) props->selection_method_param;
>+        } else {
>+            /* Fall back to original default hashing in slow path. */
>+            VLOG_INFO("  Falling back to default hash method.");
>+            group->selection_method = SEL_METHOD_DEFAULT;
>+        }
>+    } else if (!strcmp(selection_method, "hash")) {
>+        VLOG_INFO("Selection method specified: hash.");
>+        if (props->fields.values_size > 0) {
>+            /* Controller has specified hash fields. */
>+            struct ds s = DS_EMPTY_INITIALIZER;
>+            oxm_format_field_array(&s, &props->fields);
>+            VLOG_INFO("  Hash fields: %s", ds_cstr(&s));
>+            ds_destroy(&s);
>+            group->selection_method = SEL_METHOD_HASH;
>+        } else {
>+            /* No hash fields. Fall back to original default hashing. */
>+            VLOG_INFO("  No hash fields. Falling back to default hash method.");
>+            group->selection_method = SEL_METHOD_DEFAULT;
>+        }
>+    } else {
>+        /* Parsing of groups should ensure this never happens */
>+        OVS_NOT_REACHED();
>+    }
>+}
>+
> static enum ofperr
> group_construct(struct ofgroup *group_)
> {
>@@ -4725,6 +4863,10 @@ group_construct(struct ofgroup *group_)
>     ovs_mutex_init_adaptive(&group->stats_mutex);
>     ovs_mutex_lock(&group->stats_mutex);
>     group_construct_stats(group);
>+    group->hash_map = NULL;
>+    if (group->up.type == OFPGT11_SELECT) {
>+        group_set_selection_method(group);
>+    }
>     ovs_mutex_unlock(&group->stats_mutex);
>     return 0;
> }
>@@ -4734,6 +4876,10 @@ group_destruct(struct ofgroup *group_)
> {
>     struct group_dpif *group = group_dpif_cast(group_);
>     ovs_mutex_destroy(&group->stats_mutex);
>+    if (group->hash_map) {
>+        free(group->hash_map);
>+        group->hash_map = NULL;
>+    }
> }
> 
> static enum ofperr
>diff --git a/ofproto/ofproto-dpif.h b/ofproto/ofproto-dpif.h
>index 47bf7f9..880dd3d 100644
>--- a/ofproto/ofproto-dpif.h
>+++ b/ofproto/ofproto-dpif.h
>@@ -119,6 +119,12 @@ rule_dpif_is_internal(const struct rule_dpif *rule)
> 
> /* Groups. */
> 
>+enum group_selection_method {
>+    SEL_METHOD_DEFAULT,
>+    SEL_METHOD_DP_HASH,
>+    SEL_METHOD_HASH,
>+};
>+
> struct group_dpif {
>     struct ofgroup up;
> 
>@@ -129,6 +135,12 @@ struct group_dpif {
>     struct ovs_mutex stats_mutex;
>     uint64_t packet_count OVS_GUARDED;  /* Number of packets received. */
>     uint64_t byte_count OVS_GUARDED;    /* Number of bytes received. */
>+
>+    enum group_selection_method selection_method;
>+    enum ovs_hash_alg hash_alg;         /* dp_hash algorithm to be applied. */
>+    uint32_t hash_basis;                /* Basis for dp_hash. */
>+    uint32_t hash_mask;                 /* Used to mask dp_hash (2^N - 1).*/
>+    struct ofputil_bucket **hash_map;   /* Map hash values to buckets. */
> };
> 
> void group_dpif_credit_stats(struct group_dpif *,
>@@ -137,6 +149,7 @@ void group_dpif_credit_stats(struct group_dpif *,
> struct group_dpif *group_dpif_lookup(struct ofproto_dpif *,
>                                      uint32_t group_id, ovs_version_t version,
>                                      bool take_ref);
>+
> 
> /* Backers.
>  *
>diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at
>index 60f28e2..6c20208 100644
>--- a/tests/ofproto-dpif.at
>+++ b/tests/ofproto-dpif.at
>@@ -500,11 +500,19 @@ for d in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15; do
>     AT_CHECK([ovs-appctl netdev-dummy/receive p1 $pkt])
> done
> 
>-AT_CHECK([ovs-appctl dpctl/dump-flows | sed 's/dp_hash(.*\/0x1)/dp_hash(0xXXXX\/0x1)/' | sed 's/packets.*actions:1/actions:1/' | strip_ufid | strip_used | sort], [0], [dnl
>+AT_CHECK([ovs-appctl dpctl/dump-flows | sed 's/dp_hash(.*\/0xf)/dp_hash(0xXXXX\/0xf)/' | sed 's/packets.*actions:1/actions:1/' | strip_ufid | strip_used | sort], [0], [dnl
> flow-dump from non-dpdk interfaces:
> recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(src=192.168.0.1,frag=no), packets:15, bytes:1590, used:0.0s, actions:hash(hash_l4(0)),recirc(0x1)
>-recirc_id(0x1),dp_hash(0xXXXX/0x1),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:10
>-recirc_id(0x1),dp_hash(0xXXXX/0x1),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:11
>+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:10
>+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:10
>+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:10
>+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:10
>+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:10
>+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:10
>+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:11
>+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:11
>+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:11
>+recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:11
> ])
> 
> AT_CHECK([ovs-appctl revalidator/purge], [0])
>@@ -516,10 +524,10 @@ for d in 0 1 2 3 4 5 6 7 8 9 a b c d e f; do
>     AT_CHECK([ovs-appctl netdev-dummy/receive p1 $pkt])
> done
> 
>-AT_CHECK([ovs-appctl dpctl/dump-flows | sed 's/dp_hash(.*\/0x1)/dp_hash(0xXXXX\/0x1)/' | sed 's/\(actions:1\)[[01]]/\1X/' | strip_ufid | strip_used | sort], [0], [dnl
>+AT_CHECK([ovs-appctl dpctl/dump-flows | sed 's/dp_hash(.*\/0xf)/dp_hash(0xXXXX\/0xf)/' | sed 's/\(actions:1\)[[01]]/\1X/' | strip_ufid | strip_used | sort], [0], [dnl
> flow-dump from non-dpdk interfaces:
> recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(src=192.168.0.1,frag=no), packets:15, bytes:1590, used:0.0s, actions:hash(hash_l4(0)),recirc(0x2)
>-recirc_id(0x2),dp_hash(0xXXXX/0x1),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), packets:15, bytes:1590, used:0.0s, actions:1X
>+recirc_id(0x2),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), packets:15, bytes:1590, used:0.0s, actions:1X
> ])
> 
> OVS_VSWITCHD_STOP
>-- 
>1.9.1


More information about the dev mailing list