[ovs-dev] [PATCH] ovs-atomics: Add atomic support for Windows.

Jarno Rajahalme jrajahalme at nicira.com
Thu Aug 28 23:39:05 UTC 2014


On Aug 28, 2014, at 9:57 AM, Gurucharan Shetty <shettyg at nicira.com> wrote:
> diff --git a/lib/ovs-atomic-msvc.h b/lib/ovs-atomic-msvc.h
> new file mode 100644
> index 0000000..f357545
> --- /dev/null
> +++ b/lib/ovs-atomic-msvc.h
> @@ -0,0 +1,370 @@
> +/*
> + * Copyright (c) 2014 Nicira, Inc.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +/* This header implements atomic operation primitives on MSVC. */
> +#ifndef IN_OVS_ATOMIC_H
> +#error "This header should only be included indirectly via ovs-atomic.h."
> +#endif
> +
> +/* From MSDN documentation: With Visual Studio 2003, volatile to volatile
> + * references are ordered; the compiler will not re-order volatile variable
> + * access. With Visual Studio 2005, the compiler also uses acquire semantics
> + * for read operations on volatile variables and release semantics for write
> + * operations on volatile variables (when supported by the CPU). */

Is this still the same for MSVC 2012 or 2013?

> +#define ATOMIC(TYPE) TYPE volatile
> +
> +typedef enum {
> +    memory_order_relaxed,
> +    memory_order_consume,
> +    memory_order_acquire,
> +    memory_order_release,
> +    memory_order_acq_rel,
> +    memory_order_seq_cst
> +} memory_order;
> +
> +#define ATOMIC_BOOL_LOCK_FREE 2
> +#define ATOMIC_CHAR_LOCK_FREE 2
> +#define ATOMIC_SHORT_LOCK_FREE 2
> +#define ATOMIC_INT_LOCK_FREE 2
> +#define ATOMIC_LONG_LOCK_FREE 2
> +#define ATOMIC_LLONG_LOCK_FREE 2
> +#define ATOMIC_POINTER_LOCK_FREE 2
> +
> +#define IS_LOCKLESS_ATOMIC(OBJECT)                      \
> +    (sizeof(OBJECT) <= 8 && IS_POW2(sizeof(OBJECT)))
> +
> +#define ATOMIC_VAR_INIT(VALUE) (VALUE)
> +#define atomic_init(OBJECT, VALUE) (*(OBJECT) = (VALUE), (void) 0)
> +
> +static inline void
> +atomic_compiler_barrier(memory_order order OVS_UNUSED)
> +{
> +    _ReadWriteBarrier();
> +}

You could do "if (order > memory_order_consume) {" to avoid unnecessary barriers.
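
Something like this (untested sketch):

    static inline void
    atomic_compiler_barrier(memory_order order)
    {
        /* Orders at or below 'consume' need no compiler barrier here. */
        if (order > memory_order_consume) {
            _ReadWriteBarrier();
        }
    }

(The OVS_UNUSED on 'order' would then go away, since it is used.)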

> +
> +static inline void
> +atomic_thread_fence(memory_order order)
> +{
> +    if (order == memory_order_seq_cst) {
> +        MemoryBarrier();
> +    }
> +}

This needs an "else { atomic_compiler_barrier(order); }".
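
I.e., something like (untested):

    static inline void
    atomic_thread_fence(memory_order order)
    {
        if (order == memory_order_seq_cst) {
            MemoryBarrier();
        } else {
            atomic_compiler_barrier(order);
        }
    }

On x86 the hardware ordering is strong enough for anything below seq_cst,
but the compiler still has to be kept from reordering across the fence.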

> +
> +static inline void
> +atomic_signal_fence(memory_order order)
> +{
> +    atomic_compiler_barrier(order);
> +}
> +
> +/* 1, 2 and 4 byte loads and stores are atomic on aligned memory. In
> + * addition, since the compiler automatically takes acquire and release
> + * semantics on volatile variables, for any order weaker than
> + * 'memory_order_seq_cst' we can directly assign or read values. */
> +
> +#define atomic_store32(DST, SRC, ORDER)                                 \
> +    if (ORDER == memory_order_seq_cst) {                                \
> +        InterlockedExchange((int32_t volatile *) DST,                   \
> +                               (int32_t ) SRC);                         \
> +    } else {                                                            \
> +        *DST = SRC;                                                     \
> +    }
> +
> +/* 64 bit reads and writes are not atomic on x86.

Just noticed this:

“8.1.1 Guaranteed Atomic Operations

The Intel486 processor (and newer processors since) guarantees that the following
basic memory operations will always be carried out atomically:
- Reading or writing a byte
- Reading or writing a word aligned on a 16-bit boundary
- Reading or writing a doubleword aligned on a 32-bit boundary

The Pentium processor (and newer processors since) guarantees that the following
additional memory operations will always be carried out atomically:
- Reading or writing a quadword aligned on a 64-bit boundary
- 16-bit accesses to uncached memory locations that fit within a 32-bit data bus

The P6 family processors (and newer processors since) guarantee that the following
additional memory operation will always be carried out atomically:
- Unaligned 16-, 32-, and 64-bit accesses to cached memory that fit within a cache line”

So, it might be worth it to limit the support to i586 and newer, and check alignment at run time to avoid the lock on 64-bit loads and stores. Or it may be that InterlockedExchange64() already does that?
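
For the store side, the run-time check might look something like this
(untested sketch; it assumes the plain assignment compiles to a single
8-byte store, which would need verifying for MSVC; otherwise an x87 or
SSE sequence is needed to keep the access atomic, cf. ovs-atomic-i586.h
on the GCC side):

    #define atomic_store64(DST, SRC, ORDER)                              \
        if ((size_t) (DST) % 8 == 0 && ORDER == memory_order_relaxed) {  \
            *(DST) = (SRC);                                              \
        } else {                                                         \
            InterlockedExchange64((int64_t volatile *) DST,              \
                                  (int64_t) SRC);                        \
        }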

> + *
> + * For atomic stores, 'consume' and 'acquire' semantics are not valid. But we
> + * are using 'Exchange' to get atomic stores here and we only have
> + * InterlockedExchange64(), InterlockedExchangeNoFence64() and
> + * InterlockedExchangeAcquire64() available. So we are forced to use
> + * InterlockedExchange64(), which uses a full memory barrier, for everything
> + * stronger than 'memory_order_relaxed'. */
> +#define atomic_store64(DST, SRC, ORDER)                                 \
> +    if (ORDER == memory_order_relaxed) {                                \
> +        InterlockedExchangeNoFence64((int64_t volatile *) DST,          \
> +                                     (int64_t ) SRC);                   \
> +    } else {                                                            \
> +        InterlockedExchange64((int64_t volatile *) DST, (int64_t ) SRC);\
> +    }
> +
> +/* Used for 8 and 16 bit variations. */
> +#define atomic_storeX(X, DST, SRC, ORDER)                               \
> +    if (ORDER == memory_order_seq_cst) {                                \
> +        InterlockedExchange##X((int##X##_t volatile *) DST,             \
> +                               (int##X##_t ) SRC);                      \
> +    } else {                                                            \
> +        *DST = SRC;                                                     \
> +    }
> +
> +#define atomic_store(DST, SRC)                               \
> +        atomic_store_explicit(DST, SRC, memory_order_seq_cst)
> +
> +#define atomic_store_explicit(DST, SRC, ORDER)                           \
> +    if (sizeof (*DST) == 1) {                                            \
> +        atomic_storeX(8, DST, SRC, ORDER)                                \
> +    } else if (sizeof (*DST) == 2) {                                     \
> +        atomic_storeX(16, DST, SRC, ORDER)                               \
> +    } else if (sizeof (*DST) == 4) {                                     \
> +        atomic_store32(DST, SRC, ORDER)                                  \
> +    } else if (sizeof (*DST) == 8) {                                     \
> +        atomic_store64(DST, SRC, ORDER)                                  \
> +    }
> +
> +#define atomic_read32(SRC, DST, ORDER)                        \
> +    if (ORDER == memory_order_seq_cst) {                      \
> +        *DST = InterlockedOr((int32_t volatile *) SRC, 0);    \
> +    } else {                                                  \
> +        *DST = *SRC;                                          \
> +    }
> +

On x86, since seq_cst stores are locked, the corresponding reads don’t need to be:

“Locked operations are atomic with respect to all other memory operations and all externally visible events. Only instruction fetch and page table accesses can pass locked instructions. Locked instructions can be used to synchronize data written by one processor and read by another processor. For the P6 family processors, locked operations serialize all outstanding load and store operations (that is, wait for them to complete). This rule is also true for the Pentium 4 and Intel Xeon processors, with one exception. Load operations that reference weakly ordered memory types (such as the WC memory type) may not be serialized."
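
So atomic_read32() (and the 8/16-bit variants) could presumably be a plain
volatile read for every order, roughly:

    #define atomic_read32(SRC, DST, ORDER) \
        *DST = *SRC;

since the locked seq_cst stores alone are enough to keep the total order.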

> +/* 64 bit reads and writes are not atomic on x86. */
> +#define atomic_read64(SRC, DST, ORDER)                        \
> +    *DST = InterlockedOr64((int64_t volatile *) SRC, 0);
> +

On 586 this could be just an assignment, if SRC is quad-word (64-bit) aligned.
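
That relies on the compiler emitting a single 8-byte load, though, so it
may need inline asm. A rough, untested sketch using the x87 unit (fild
and fistp round-trip any 64-bit integer exactly, with one 8-byte load and
one 8-byte store, each atomic on i586 and newer when the operand is
aligned):

    static inline void
    atomic_read64_aligned(int64_t volatile *src, int64_t *dst)
    {
        __asm {
            mov eax, src
            fild qword ptr [eax]
            mov eax, dst
            fistp qword ptr [eax]
        }
    }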

> +/* For 8 and 16 bit variations. */
> +#define atomic_readX(X, SRC, DST, ORDER)                                   \
> +    if (ORDER == memory_order_seq_cst) {                                   \
> +        *DST = InterlockedOr##X((int##X##_t volatile *) SRC, 0);           \
> +    } else {                                                               \
> +        *DST = *SRC;                                                       \
> +    }
> +

Same here.

> +#define atomic_read(SRC, DST)                               \
> +        atomic_read_explicit(SRC, DST, memory_order_seq_cst)
> +
> +#define atomic_read_explicit(SRC, DST, ORDER)                            \
> +    if (sizeof (*DST) == 1) {                                            \
> +        atomic_readX(8, SRC, DST, ORDER)                                 \
> +    } else if (sizeof (*DST) == 2) {                                     \
> +        atomic_readX(16, SRC, DST, ORDER)                                \
> +    } else if (sizeof (*DST) == 4) {                                     \
> +        atomic_read32(SRC, DST, ORDER)                                   \
> +    } else if (sizeof (*DST) == 8) {                                     \
> +        atomic_read64(SRC, DST, ORDER)                                   \
> +    }
> +
> +/* For add, sub, and logical operations on 8, 16 and 64 bit data types,
> + * functions for all the different memory orders do not exist
> + * (though documentation exists for some of them).  The MSVC C++ library,
> + * which implements the C11 atomics, simply calls the full memory barrier
> + * function for everything on x86 (see xatomic.h). So do the same here. */
> +
> +/* For 8, 16 and 64 bit variations. */
> +#define atomic_op(OP, X, RMW, ARG, ORIG, ORDER)                         \
> +    atomic_##OP##_generic(X, RMW, ARG, ORIG, ORDER)
> +
> +/* Arithmetic addition calls. */
> +
> +#define atomic_add32(RMW, ARG, ORIG, ORDER)                        \
> +    *ORIG = InterlockedExchangeAdd((int32_t volatile *) RMW,       \
> +                                      (int32_t ) ARG);
> +
> +/* For 8, 16 and 64 bit variations. */
> +#define atomic_add_generic(X, RMW, ARG, ORIG, ORDER)                        \
> +    *ORIG = _InterlockedExchangeAdd##X((int##X##_t volatile *) RMW,          \
> +                                      (int##X##_t ) ARG);
> +
> +#define atomic_add(RMW, ARG, ORIG)                               \
> +        atomic_add_explicit(RMW, ARG, ORIG, memory_order_seq_cst)
> +
> +#define atomic_add_explicit(RMW, ARG, ORIG, ORDER)             \
> +    if (sizeof (*RMW) == 1) {                                  \
> +        atomic_op(add, 8, RMW, ARG, ORIG, ORDER)               \
> +    } else if (sizeof (*RMW) == 2) {                           \
> +        atomic_op(add, 16, RMW, ARG, ORIG, ORDER)              \
> +    } else if (sizeof (*RMW) == 4) {                           \
> +        atomic_add32(RMW, ARG, ORIG, ORDER)                    \
> +    } else if (sizeof (*RMW) == 8) {                           \
> +        atomic_op(add, 64, RMW, ARG, ORIG, ORDER)              \
> +    }
> +
> +/* Arithmetic subtraction calls. */
> +
> +#define atomic_sub(RMW, ARG, ORIG)                             \
> +        atomic_add_explicit(RMW, (0 - ARG), ORIG, memory_order_seq_cst)
> +
> +#define atomic_sub_explicit(RMW, ARG, ORIG, ORDER)           \
> +        atomic_add_explicit(RMW, (0 - ARG), ORIG, ORDER)
> +
> +/* Logical 'and' calls. */
> +
> +#define atomic_and32(RMW, ARG, ORIG, ORDER)                        \
> +    *ORIG = InterlockedAnd((int32_t volatile *) RMW, (int32_t ) ARG);
> +
> +/* For 8, 16 and 64 bit variations. */
> +#define atomic_and_generic(X, RMW, ARG, ORIG, ORDER)                        \
> +    *ORIG = InterlockedAnd##X((int##X##_t volatile *) RMW, (int##X##_t ) ARG);
> +
> +#define atomic_and(RMW, ARG, ORIG)                               \
> +        atomic_and_explicit(RMW, ARG, ORIG, memory_order_seq_cst)
> +
> +#define atomic_and_explicit(RMW, ARG, ORIG, ORDER)             \
> +    if (sizeof (*RMW) == 1) {                                  \
> +        atomic_op(and, 8, RMW, ARG, ORIG, ORDER)               \
> +    } else if (sizeof (*RMW) == 2) {                           \
> +        atomic_op(and, 16, RMW, ARG, ORIG, ORDER)              \
> +    } else if (sizeof (*RMW) == 4) {                           \
> +        atomic_and32(RMW, ARG, ORIG, ORDER)                    \
> +    } else if (sizeof (*RMW) == 8) {                           \
> +        atomic_op(and, 64, RMW, ARG, ORIG, ORDER)              \
> +    }
> +
> +/* Logical 'Or' calls. */
> +
> +#define atomic_or32(RMW, ARG, ORIG, ORDER)                        \
> +    *ORIG = InterlockedOr((int32_t volatile *) RMW, (int32_t ) ARG);
> +
> +/* For 8, 16 and 64 bit variations. */
> +#define atomic_or_generic(X, RMW, ARG, ORIG, ORDER)                        \
> +    *ORIG = InterlockedOr##X((int##X##_t volatile *) RMW, (int##X##_t ) ARG);
> +
> +#define atomic_or(RMW, ARG, ORIG)                               \
> +        atomic_or_explicit(RMW, ARG, ORIG, memory_order_seq_cst)
> +
> +#define atomic_or_explicit(RMW, ARG, ORIG, ORDER)              \
> +    if (sizeof (*RMW) == 1) {                                  \
> +        atomic_op(or, 8, RMW, ARG, ORIG, ORDER)                \
> +    } else if (sizeof (*RMW) == 2) {                           \
> +        atomic_op(or, 16, RMW, ARG, ORIG, ORDER)               \
> +    } else if (sizeof (*RMW) == 4) {                           \
> +        atomic_or32(RMW, ARG, ORIG, ORDER)                     \
> +    } else if (sizeof (*RMW) == 8) {                           \
> +        atomic_op(or, 64, RMW, ARG, ORIG, ORDER)               \
> +    }
> +
> +/* Logical Xor calls. */
> +
> +#define atomic_xor32(RMW, ARG, ORIG, ORDER)                        \
> +    *ORIG = InterlockedXor((int32_t volatile *) RMW, (int32_t ) ARG);
> +
> +/* For 8, 16 and 64 bit variations. */
> +#define atomic_xor_generic(X, RMW, ARG, ORIG, ORDER)                        \
> +    *ORIG = InterlockedXor##X((int##X##_t volatile *) RMW, (int##X##_t ) ARG);
> +
> +#define atomic_xor(RMW, ARG, ORIG)                               \
> +        atomic_xor_explicit(RMW, ARG, ORIG, memory_order_seq_cst)
> +
> +#define atomic_xor_explicit(RMW, ARG, ORIG, ORDER)             \
> +    if (sizeof (*RMW) == 1) {                                  \
> +        atomic_op(xor, 8, RMW, ARG, ORIG, ORDER)               \
> +    } else if (sizeof (*RMW) == 2) {                           \
> +        atomic_op(xor, 16, RMW, ARG, ORIG, ORDER)              \
> +    } else if (sizeof (*RMW) == 4) {                           \
> +        atomic_xor32(RMW, ARG, ORIG, ORDER);                   \
> +    } else if (sizeof (*RMW) == 8) {                           \
> +        atomic_op(xor, 64, RMW, ARG, ORIG, ORDER)              \
> +    }
> +
> +#define atomic_compare_exchange_strong(DST, EXP, SRC)   \
> +    atomic_compare_exchange_strong_explicit(DST, EXP, SRC, \
> +                                            memory_order_seq_cst, \
> +                                            memory_order_seq_cst)
> +
> +#define atomic_compare_exchange_weak atomic_compare_exchange_strong
> +#define atomic_compare_exchange_weak_explicit \
> +        atomic_compare_exchange_strong_explicit
> +
> +/* MSVC's C++ compiler implements C11 atomics, and looking through its
> + * implementation (in xatomic.h), orders are ignored on the x86 platform.
> + * Do the same here. */
> +static inline bool
> +atomic_compare_exchange8(int8_t volatile *dst, int8_t *expected, int8_t src)
> +{
> +    int8_t previous = _InterlockedCompareExchange8(dst, src, *expected);
> +    if (previous == *expected) {
> +        return true;
> +    } else {
> +        *expected = previous;
> +        return false;
> +    }
> +}
> +
> +static inline bool
> +atomic_compare_exchange16(int16_t volatile *dst, int16_t *expected,
> +                          int16_t src)
> +{
> +    int16_t previous = InterlockedCompareExchange16(dst, src, *expected);
> +    if (previous == *expected) {
> +        return true;
> +    } else {
> +        *expected = previous;
> +        return false;
> +    }
> +}
> +
> +static inline bool
> +atomic_compare_exchange32(int32_t volatile *dst, int32_t *expected,
> +                          int32_t src)
> +{
> +    int32_t previous = InterlockedCompareExchange(dst, src, *expected);
> +    if (previous == *expected) {
> +        return true;
> +    } else {
> +        *expected = previous;
> +        return false;
> +    }
> +}
> +
> +static inline bool
> +atomic_compare_exchange64(int64_t volatile *dst, int64_t *expected,
> +                          int64_t src)
> +{
> +    int64_t previous = InterlockedCompareExchange64(dst, src, *expected);
> +    if (previous == *expected) {
> +        return true;
> +    } else {
> +        *expected = previous;
> +        return false;
> +    }
> +}
> +
> +static inline bool
> +atomic_compare_unreachable(void)
> +{
> +    return true;
> +}
> +
> +#define atomic_compare_exchange_strong_explicit(DST, EXP, SRC, ORD1, ORD2)    \
> +    (sizeof (*DST) == 1                                                       \
> +     ? atomic_compare_exchange8((int8_t volatile *) DST, (int8_t *) EXP,      \
> +                                (int8_t ) SRC)                                \
> +     : (sizeof (*DST) == 2                                                    \
> +        ? atomic_compare_exchange16((int16_t volatile *) DST, (int16_t *) EXP,\
> +                                     (int16_t ) SRC)                          \
> +        : (sizeof (*DST) == 4                                                 \
> +           ? atomic_compare_exchange32((int32_t volatile *) DST,              \
> +                                       (int32_t *) EXP, (int32_t ) SRC)       \
> +           : (sizeof (*DST) == 8                                              \
> +              ? atomic_compare_exchange64((int64_t volatile *) DST,           \
> +                                           (int64_t *) EXP, (int64_t ) SRC)   \
> +              : ovs_fatal(0,                                                  \
> +                          "atomic operation with size greater than 8 bytes"), \
> +                atomic_compare_unreachable()))))
> +
> +
> +/* atomic_flag */
> +
> +typedef ATOMIC(int32_t) atomic_flag;
> +#define ATOMIC_FLAG_INIT 0
> +
> +#define atomic_flag_test_and_set(FLAG)                 \
> +    (bool) InterlockedBitTestAndSet(FLAG, 0)
> +
> +#define atomic_flag_test_and_set_explicit(FLAG, ORDER) \
> +        atomic_flag_test_and_set(FLAG)
> +
> +#define atomic_flag_clear_explicit(FLAG, ORDER) \
> +        atomic_flag_clear(FLAG)
> +#define atomic_flag_clear(FLAG)                 \
> +    InterlockedBitTestAndReset(FLAG, 0)
> diff --git a/lib/ovs-atomic.h b/lib/ovs-atomic.h
> index 8e83411..88259f6 100644
> --- a/lib/ovs-atomic.h
> +++ b/lib/ovs-atomic.h
> @@ -333,6 +333,8 @@
>         #include "ovs-atomic-i586.h"
>     #elif HAVE_GCC4_ATOMICS
>         #include "ovs-atomic-gcc4+.h"
> +    #elif _MSC_VER && defined(_M_IX86)
> +        #include "ovs-atomic-msvc.h"
>     #else
>         /* ovs-atomic-pthreads implementation is provided for portability.
>          * It might be too slow for real use because Open vSwitch is
> -- 
> 1.7.9.5
> 
> _______________________________________________
> dev mailing list
> dev at openvswitch.org
> http://openvswitch.org/mailman/listinfo/dev
