[ovs-dev] [classifier-opt 23/28] util: New function popcount().

Ethan Jackson ethan at nicira.com
Tue Jul 31 17:40:13 UTC 2012


Oh one more thing:

This seems like the kind of thing that's both easy to get wrong, and
easy to unit test.  Does it make sense to add one?

Ethan

On Tue, Jul 31, 2012 at 10:38 AM, Ethan Jackson <ethan at nicira.com> wrote:
> Looks good to me.
>
> The following questions are to satisfy my curiosity, I think the patch
> should be merged as is:
>
> How performance critical is this popcount implementation going to be?
> I assume you've put all this work into testing it because the
> classifier will be relying on it heavily?
>
> Why do you think the gcc builtin is slow? That's surprising to me.  Is
> it possible that in newer versions of gcc (i.e. 4.7 and later) would
> simply generate the assembly instruction?
>
> If it's so performance critical, could we simply check for the
> assembly instruction in the configure script, and if it exists use it.
>  Of course, if it doesn't exist we would fall back to what you
> currently have.
>
> Ethan
>
>
>
> On Fri, Jul 20, 2012 at 4:25 PM, Ben Pfaff <blp at nicira.com> wrote:
>> This is the fastest portable implementation among the ones below, as
>> measured with GCC 4.4 on a Xeon X3430.  The measeured times were, in
>> seconds:
>>
>> popcount1    25.6
>> popcount2     6.9 (but is not portable)
>> popcount3    31.4
>> popcount4    25.6
>> popcount5    61.6 (and is buggy)
>> popcount6    64.6
>> popcount7    32.3
>> popcount8    11.2
>>
>> #include <limits.h>
>> #include <stdio.h>
>> #include <stdint.h>
>>
>> int
>> popcount1(unsigned int x)
>> {
>>     return __builtin_popcount(x);
>> }
>>
>> int
>> popcount2(unsigned int x)
>> {
>>     unsigned int y;
>>     asm("popcnt %1, %0" : "=r" (y) : "g" (x));
>>     return y;
>> }
>>
>> int
>> popcount3(unsigned int x)
>> {
>>     unsigned int n;
>>
>>     n = (x >> 1) & 033333333333;
>>     x -= n;
>>     n = (n >> 1) & 033333333333;
>>     x -= n;
>>     x = (x + (x >> 3)) & 030707070707;
>>     return x % 63;
>> }
>>
>> int
>> popcount4(unsigned int x)
>> {
>>     x -= (x >> 1) & 0x55555555;
>>     x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
>>     x = (x + (x >> 4)) & 0x0f0f0f0f;
>>     x += x >> 8;
>>     x += x >> 16;
>>     return x & 0x3f;
>> }
>>
>> int
>> popcount5(unsigned int x)
>> {
>>     int n;
>>
>>     n = 0;
>>     while (x) {
>>         if (x & 0xf) {
>>             n += ((0xe9949440 >> (x & 0xf)) & 3) + 1;
>>         }
>>         x >>= 4;
>>     }
>>     return n;
>> }
>>
>> int
>> popcount6(unsigned int x)
>> {
>>     int n;
>>
>>     n = 0;
>>     while (x) {
>>         n += (0xe994 >> (x & 7)) & 3;
>>         x >>= 3;
>>     }
>>     return n;
>> }
>>
>> int
>> popcount7(unsigned int x)
>> {
>>     static const int table[16] = {
>>         0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4
>>     };
>>
>>     return (table[x & 0xf]
>>             + table[(x >> 4) & 0xf]
>>             + table[(x >> 8) & 0xf]
>>             + table[(x >> 12) & 0xf]
>>             + table[(x >> 16) & 0xf]
>>             + table[(x >> 20) & 0xf]
>>             + table[(x >> 24) & 0xf]
>>             + table[x >> 28]);
>> }
>>
>> static int
>> popcount8(unsigned int x)
>> {
>> #define INIT1(X)                                \
>>     ((((X) & (1 << 0)) != 0) +                  \
>>      (((X) & (1 << 1)) != 0) +                  \
>>      (((X) & (1 << 2)) != 0) +                  \
>>      (((X) & (1 << 3)) != 0) +                  \
>>      (((X) & (1 << 4)) != 0) +                  \
>>      (((X) & (1 << 5)) != 0) +                  \
>>      (((X) & (1 << 6)) != 0) +                  \
>>      (((X) & (1 << 7)) != 0))
>> #define INIT2(X)   INIT1(X),  INIT1((X) +  1)
>> #define INIT4(X)   INIT2(X),  INIT2((X) +  2)
>> #define INIT8(X)   INIT4(X),  INIT4((X) +  4)
>> #define INIT16(X)  INIT8(X),  INIT8((X) +  8)
>> #define INIT32(X) INIT16(X), INIT16((X) + 16)
>> #define INIT64(X) INIT32(X), INIT32((X) + 32)
>>
>>     static const uint8_t popcount8[256] = {
>>         INIT64(0), INIT64(64), INIT64(128), INIT64(192)
>>     };
>>
>>     return (popcount8[x & 0xff] +
>>             popcount8[(x >> 8) & 0xff] +
>>             popcount8[(x >> 16) & 0xff] +
>>             popcount8[x >> 24]);
>> }
>>
>> int
>> main(void)
>> {
>>     unsigned long long int x;
>>     int n;
>>
>>     n = 0;
>>     for (x = 0; x <= UINT32_MAX; x++) {
>>         n += popcount8(x);
>>     }
>>     printf("%d\n", n);
>>
>>     return 0;
>> }
>>
>> Signed-off-by: Ben Pfaff <blp at nicira.com>
>> ---
>>  lib/util.c |   34 ++++++++++++++++++++++++++++++++++
>>  lib/util.h |    2 ++
>>  2 files changed, 36 insertions(+), 0 deletions(-)
>>
>> diff --git a/lib/util.c b/lib/util.c
>> index cbcf693..322fc1c 100644
>> --- a/lib/util.c
>> +++ b/lib/util.c
>> @@ -738,6 +738,40 @@ ctz(uint32_t n)
>>      }
>>  }
>>
>> +/* Returns the number of 1-bits in 'x', between 0 and 32 inclusive. */
>> +int
>> +popcount(uint32_t x)
>> +{
>> +    /* In my testing, this implementation is over twice as fast as any other
>> +     * portable implementation that I tried, including GCC 4.4
>> +     * __builtin_popcount(), although nonportable asm("popcnt") was over 50%
>> +     * faster. */
>> +#define INIT1(X)                                \
>> +    ((((X) & (1 << 0)) != 0) +                  \
>> +     (((X) & (1 << 1)) != 0) +                  \
>> +     (((X) & (1 << 2)) != 0) +                  \
>> +     (((X) & (1 << 3)) != 0) +                  \
>> +     (((X) & (1 << 4)) != 0) +                  \
>> +     (((X) & (1 << 5)) != 0) +                  \
>> +     (((X) & (1 << 6)) != 0) +                  \
>> +     (((X) & (1 << 7)) != 0))
>> +#define INIT2(X)   INIT1(X),  INIT1((X) +  1)
>> +#define INIT4(X)   INIT2(X),  INIT2((X) +  2)
>> +#define INIT8(X)   INIT4(X),  INIT4((X) +  4)
>> +#define INIT16(X)  INIT8(X),  INIT8((X) +  8)
>> +#define INIT32(X) INIT16(X), INIT16((X) + 16)
>> +#define INIT64(X) INIT32(X), INIT32((X) + 32)
>> +
>> +    static const uint8_t popcount8[256] = {
>> +        INIT64(0), INIT64(64), INIT64(128), INIT64(192)
>> +    };
>> +
>> +    return (popcount8[x & 0xff] +
>> +            popcount8[(x >> 8) & 0xff] +
>> +            popcount8[(x >> 16) & 0xff] +
>> +            popcount8[x >> 24]);
>> +}
>> +
>>  /* Returns true if the 'n' bytes starting at 'p' are zeros. */
>>  bool
>>  is_all_zeros(const uint8_t *p, size_t n)
>> diff --git a/lib/util.h b/lib/util.h
>> index ca1d566..7cded28 100644
>> --- a/lib/util.h
>> +++ b/lib/util.h
>> @@ -17,6 +17,7 @@
>>  #ifndef UTIL_H
>>  #define UTIL_H 1
>>
>> +#include <limits.h>
>>  #include <stdarg.h>
>>  #include <stdbool.h>
>>  #include <stddef.h>
>> @@ -241,6 +242,7 @@ void ignore(bool x OVS_UNUSED);
>>  int log_2_floor(uint32_t);
>>  int log_2_ceil(uint32_t);
>>  int ctz(uint32_t);
>> +int popcount(uint32_t);
>>
>>  bool is_all_zeros(const uint8_t *, size_t);
>>  bool is_all_ones(const uint8_t *, size_t);
>> --
>> 1.7.2.5
>>



More information about the dev mailing list