[ovs-dev] [classifier-opt] hash: Introduce an implementation of murmurhash.

Tue Aug 21 02:15:33 UTC 2012

I verified that the code you've written matches up with:

http://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp

That code is, of course, total magic, so I'm glad there are unit tests for it.

Because you're only hashing words you leave out the following section of code:

------------------------------------------
  //----------
  // tail

  const uint8_t * tail = (const uint8_t*)(data + nblocks*4);

  uint32_t k1 = 0;

  switch(len & 3)
  {
  case 3: k1 ^= tail[2] << 16;
  case 2: k1 ^= tail[1] << 8;
  case 1: k1 ^= tail[0];
          k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
  };

-----------------------------------

I would like to see an XXX comment noting it's absence so that we
remember to fold it in when/if we replace hash_bytes() with murmur.

I want to replace the implementation of flow_hash_symmetric_l4() with
this because it's in the fast path for bundle actions, and it's
current implementation is particularly inefficient.  Any reason I
shouldn't?

Also, I'm going to start doing Acked-bys

Acked-by: Ethan Jackson <ethan at nicira.com>

On Mon, Aug 20, 2012 at 2:27 PM, Ben Pfaff <blp at nicira.com> wrote:
> Murmurhash is generally superior to the Jenkins lookup3 hash according to
> the available figures.  Perhaps we should generally replace our current
> hashes by murmurhash.
>
> For now, I'm introducing a parallel implementation to allow it to be used
> in cases where an incremental one-word-at-a-time hash is desirable.  The
> first user will be added in an upcoming commit.
>
> Signed-off-by: Ben Pfaff <blp at nicira.com>
> ---
> This is meant for the "classifier-opt" series, just before the
> final patch.  (If you look at that final patch as I sent it out,
> you can see a couple of comments
>  * XXX murmurhash would be much better for this than hash_int(). */
> but I forgot to do that before I sent them out.
>
>  lib/hash.c        |   15 ++++++++++++
>  lib/hash.h        |   42 +++++++++++++++++++++++++++++++++++
>  tests/test-hash.c |   62 ++++++++++++++++++++++++++++++++--------------------
>  3 files changed, 95 insertions(+), 24 deletions(-)
>
> diff --git a/lib/hash.c b/lib/hash.c
> index f7aa916..41ad1bf 100644
> --- a/lib/hash.c
> +++ b/lib/hash.c
> @@ -102,3 +102,18 @@ hash_bytes(const void *p_, size_t n, uint32_t basis)
>
>      return c;
>  }
> +
> +/* Returns the hash of the 'n' 32-bit words at 'p', starting from 'basis'.
> + * 'p' must be properly aligned. */
> +uint32_t
> +mhash_words(const uint32_t p[], size_t n_words, uint32_t basis)
> +{
> +    uint32_t hash;
> +    size_t i;
> +
> +    hash = basis;
> +    for (i = 0; i < n_words; i++) {
> +        hash = mhash_add(hash, p[i]);
> +    }
> +    return mhash_finish(hash, n_words);
> +}
> diff --git a/lib/hash.h b/lib/hash.h
> index ac6a63c..701e686 100644
> --- a/lib/hash.h
> +++ b/lib/hash.h
> @@ -118,6 +118,48 @@ static inline uint32_t hash_pointer(const void *p, uint32_t basis)
>      return hash_int((uint32_t) (uintptr_t) p, basis);
>  }
>
> +/* Murmurhash by Austin Appleby,
> + * from http://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp.
> + *
> + * The upstream license there says:
> + *
> + * // MurmurHash3 was written by Austin Appleby, and is placed in the public
> + * // domain. The author hereby disclaims copyright to this source code.
> + *
> + * Murmurhash is faster and higher-quality than the Jenkins lookup3 hash.  When
> + * we have a little more familiarity with it, it's probably a good idea to
> + * switch all of OVS to it.
> + *
> + * For now, we have this implementation here for use by code that needs a hash
> + * that is convenient for use one word at a time, since the Jenkins lookup3
> + * hash works three words at a time.
> + *
> + * See mhash_words() for sample usage. */
> +
> +uint32_t mhash_words(const uint32_t data[], size_t n_words, uint32_t basis);
> +
> +static inline uint32_t mhash_add(uint32_t hash, uint32_t data)
> +{
> +    data *= 0xcc9e2d51;
> +    data = hash_rot(data, 15);
> +    data *= 0x1b873593;
> +
> +    hash ^= data;
> +    hash = hash_rot(hash, 13);
> +    return hash * 5 + 0xe6546b64;
> +}
> +
> +static inline uint32_t mhash_finish(uint32_t hash, size_t n)
> +{
> +    hash ^= n * 4;
> +    hash ^= hash_rot(hash, 16);
> +    hash *= 0x85ebca6b;
> +    hash ^= hash_rot(hash, 13);
> +    hash *= 0xc2b2ae35;
> +    hash ^= hash_rot(hash, 16);
> +    return hash;
> +}
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/tests/test-hash.c b/tests/test-hash.c
> index bdf1435..d53ba2e 100644
> --- a/tests/test-hash.c
> +++ b/tests/test-hash.c
> @@ -1,5 +1,5 @@
>  /*
> - * Copyright (c) 2009 Nicira, Inc.
> + * Copyright (c) 2009, 2012 Nicira, Inc.
>   *
>   * Licensed under the Apache License, Version 2.0 (the "License");
>   * you may not use this file except in compliance with the License.
> @@ -41,6 +41,12 @@ hash_words_cb(uint32_t input)
>  }
>
>  static uint32_t
> +mhash_words_cb(uint32_t input)
> +{
> +    return mhash_words(&input, 1, 0);
> +}
> +
> +static uint32_t
>  hash_int_cb(uint32_t input)
>  {
>      return hash_int(input, 0);
> @@ -76,11 +82,37 @@ check_word_hash(uint32_t (*hash)(uint32_t), const char *name,
>      }
>  }
>
> -int
> -main(void)
> +static void
> +check_3word_hash(uint32_t (*hash)(const uint32_t[], size_t, uint32_t),
> +                 const char *name)
>  {
>      int i, j;
>
> +    for (i = 0; i <= 96; i++) {
> +        for (j = i + 1; j <= 96; j++) {
> +            uint32_t in1[3], in2[3];
> +            uint32_t out1, out2;
> +            const int min_unique = 12;
> +            const uint32_t unique_mask = (UINT32_C(1) << min_unique) - 1;
> +
> +            set_bit(in1, i);
> +            set_bit(in2, j);
> +            out1 = hash(in1, 3, 0);
> +            out2 = hash(in2, 3, 0);
> +            if ((out1 & unique_mask) == (out2 & unique_mask)) {
> +                printf("%s has a partial collision:\n", name);
> +                printf("hash(1 << %d) == %08"PRIx32"\n", i, out1);
> +                printf("hash(1 << %d) == %08"PRIx32"\n", j, out2);
> +                printf("The low-order %d bits of output are both "
> +                       "0x%"PRIx32"\n", min_unique, out1 & unique_mask);
> +            }
> +        }
> +    }
> +}
> +
> +int
> +main(void)
> +{
>      /* Check that all hashes computed with hash_words with one 1-bit (or no
>       * 1-bits) set within a single 32-bit word have different values in all
>       * 11-bit consecutive runs.
> @@ -98,6 +130,7 @@ main(void)
>       * independence must be a bad assumption :-)
>       */
>      check_word_hash(hash_words_cb, "hash_words", 11);
> +    check_word_hash(mhash_words_cb, "mhash_words", 11);
>
>      /* Check that all hash functions of with one 1-bit (or no 1-bits) set
>       * within three 32-bit words have different values in their lowest 12
> @@ -112,27 +145,8 @@ main(void)
>       *
>       * so we are doing pretty well to not have any collisions in 12 bits.
>       */
> -    for (i = 0; i <= 96; i++) {
> -        for (j = i + 1; j <= 96; j++) {
> -            uint32_t in1[3], in2[3];
> -            uint32_t out1, out2;
> -            const int min_unique = 12;
> -            const uint32_t unique_mask = (UINT32_C(1) << min_unique) - 1;
> -
> -            set_bit(in1, i);
> -            set_bit(in2, j);
> -            out1 = hash_words(in1, 3, 0);
> -            out2 = hash_words(in2, 3, 0);
> -            if ((out1 & unique_mask) == (out2 & unique_mask)) {
> -                printf("Partial collision:\n");
> -                printf("hash(1 << %d) == %08"PRIx32"\n", i, out1);
> -                printf("hash(1 << %d) == %08"PRIx32"\n", j, out2);
> -                printf("The low-order %d bits of output are both "
> -                       "0x%"PRIx32"\n", min_unique, out1 & unique_mask);
> -                exit(1);
> -            }
> -        }
> -    }
> +    check_3word_hash(hash_words, "hash_words");
> +    check_3word_hash(mhash_words, "mhash_words");
>
>      /* Check that all hashes computed with hash_int with one 1-bit (or no
>       * 1-bits) set within a single 32-bit word have different values in all
> --
> 1.7.2.5
>
> _______________________________________________
> dev mailing list
> dev at openvswitch.org
> http://openvswitch.org/mailman/listinfo/dev