| 1 | /* |
| 2 | * Copyright (c) 1991-1994 by Xerox Corporation. All rights reserved. |
| 3 | * Copyright (c) 1996-1999 by Silicon Graphics. All rights reserved. |
| 4 | * Copyright (c) 1999-2003 by Hewlett-Packard Company. All rights reserved. |
| 5 | * Copyright (c) 2008-2018 Ivan Maidanski |
| 6 | * |
| 7 | * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED |
| 8 | * OR IMPLIED. ANY USE IS AT YOUR OWN RISK. |
| 9 | * |
| 10 | * Permission is hereby granted to use or copy this program |
| 11 | * for any purpose, provided the above notices are retained on all copies. |
| 12 | * Permission to modify the code and to distribute modified code is granted, |
| 13 | * provided the above notices are retained, and a notice that the code was |
| 14 | * modified is included with the above copyright notice. |
| 15 | * |
| 16 | * Some of the machine specific code was borrowed from our GC distribution. |
| 17 | */ |
| 18 | |
| 19 | #if (AO_GNUC_PREREQ(4, 8) || AO_CLANG_PREREQ(3, 4)) \ |
| 20 | && !defined(__INTEL_COMPILER) /* TODO: test and enable icc */ \ |
| 21 | && !defined(AO_DISABLE_GCC_ATOMICS) |
/* Select the compiler-intrinsics based implementation: when this      */
/* macro is defined, generic.h is included at the end of this file     */
/* and the inline assembly primitives of the #else branch are not      */
/* compiled.                                                            */
# define AO_GCC_ATOMIC_TEST_AND_SET

/* The conditionals below only decide which double-word primitives     */
/* should be skipped for the current compiler/OS combination; they do  */
/* so by defining AO_SKIPATOMIC_double_compare_and_swap_ANY and/or     */
/* AO_SKIPATOMIC_DOUBLE_LOAD_STORE_ANY.                                 */
/* NOTE(review): these macros are presumably consumed by generic.h -   */
/* confirm against that header.                                         */
# if defined(__APPLE_CC__)
    /* OS X 10.7 clang-425 lacks __GCC_HAVE_SYNC_COMPARE_AND_SWAP_n     */
    /* predefined macro (unlike e.g. OS X 10.11 clang-703).             */
#   define AO_GCC_FORCE_HAVE_CAS

#   ifdef __x86_64__
#     if !AO_CLANG_PREREQ(9, 0) /* < Apple clang-900 */
        /* Older Apple clang (e.g., clang-600 based on LLVM 3.5svn) had */
        /* some bug in the double word CAS implementation for x64.      */
#       define AO_SKIPATOMIC_double_compare_and_swap_ANY
#     endif

#   elif defined(__MACH__)
      /* OS X 10.8 lacks __atomic_load/store symbols for arch i386      */
      /* (even with a non-Apple clang).                                 */
#     ifndef MAC_OS_X_VERSION_MIN_REQUIRED
        /* Include this header just to import the version macro.        */
#       include <AvailabilityMacros.h>
#     endif
#     if MAC_OS_X_VERSION_MIN_REQUIRED < 1090 /* MAC_OS_X_VERSION_10_9 */
#       define AO_SKIPATOMIC_DOUBLE_LOAD_STORE_ANY
#     endif
#   endif /* __i386__ */

# elif defined(__clang__)
#   if !defined(__x86_64__)
#     if !defined(AO_PREFER_BUILTIN_ATOMICS) && !defined(__CYGWIN__) \
         && !AO_CLANG_PREREQ(5, 0)
        /* At least clang-3.8/i686 (from NDK r11c) required to specify  */
        /* -latomic in case of a double-word atomic operation use.      */
#       define AO_SKIPATOMIC_double_compare_and_swap_ANY
#       define AO_SKIPATOMIC_DOUBLE_LOAD_STORE_ANY
#     endif /* !AO_PREFER_BUILTIN_ATOMICS */

#   elif !defined(__ILP32__)
#     if (!AO_CLANG_PREREQ(3, 5) && !defined(AO_PREFER_BUILTIN_ATOMICS)) \
         || (!AO_CLANG_PREREQ(4, 0) && defined(AO_ADDRESS_SANITIZER)) \
         || defined(AO_THREAD_SANITIZER)
        /* clang-3.4/x64 required -latomic.  clang-3.9/x64 seems to     */
        /* pass double-wide arguments to atomic operations incorrectly  */
        /* in case of ASan/TSan.                                        */
        /* TODO: As of clang-4.0, lock-free test_stack fails if TSan.   */
#       define AO_SKIPATOMIC_double_compare_and_swap_ANY
#       define AO_SKIPATOMIC_DOUBLE_LOAD_STORE_ANY
#     endif
#   endif /* __x86_64__ */

# elif defined(__x86_64__) && !defined(AO_PREFER_BUILTIN_ATOMICS) \
       && !defined(AO_THREAD_SANITIZER)
    /* gcc/x64 (as of gcc-12.2) requires -latomic flag in case          */
    /* of double-word atomic operations use (but not in case of TSan).  */
    /* TODO: Revise it for the future gcc releases.                     */
#   define AO_SKIPATOMIC_double_compare_and_swap_ANY
#   define AO_SKIPATOMIC_DOUBLE_LOAD_STORE_ANY
# endif /* __x86_64__ && !__clang__ */

/* Expand the umbrella macro into one skip macro per double-word       */
/* load/store variant, then drop the umbrella macro itself.            */
# ifdef AO_SKIPATOMIC_DOUBLE_LOAD_STORE_ANY
#   define AO_SKIPATOMIC_double_load
#   define AO_SKIPATOMIC_double_load_acquire
#   define AO_SKIPATOMIC_double_store
#   define AO_SKIPATOMIC_double_store_release
#   undef AO_SKIPATOMIC_DOUBLE_LOAD_STORE_ANY
# endif
| 87 | |
| 88 | #else /* AO_DISABLE_GCC_ATOMICS */ |
| 89 | |
| 90 | /* The following really assume we have a 486 or better. Unfortunately */ |
| 91 | /* gcc doesn't define a suitable feature test macro based on command */ |
| 92 | /* line options. */ |
| 93 | /* We should perhaps test dynamically. */ |
| 94 | |
| 95 | #include "../all_aligned_atomic_load_store.h" |
| 96 | |
| 97 | #include "../test_and_set_t_is_char.h" |
| 98 | |
| 99 | #if defined(__SSE2__) && !defined(AO_USE_PENTIUM4_INSTRS) |
| 100 | /* "mfence" is a part of SSE2 set (introduced on Intel Pentium 4). */ |
| 101 | # define AO_USE_PENTIUM4_INSTRS |
| 102 | #endif |
| 103 | |
| 104 | #if defined(AO_USE_PENTIUM4_INSTRS) |
| 105 | AO_INLINE void |
| 106 | AO_nop_full(void) |
| 107 | { |
| 108 | __asm__ __volatile__("mfence" : : : "memory"); |
| 109 | } |
| 110 | # define AO_HAVE_nop_full |
| 111 | |
| 112 | #else |
| 113 | /* We could use the cpuid instruction. But that seems to be slower */ |
| 114 | /* than the default implementation based on test_and_set_full. Thus */ |
| 115 | /* we omit that bit of misinformation here. */ |
| 116 | #endif /* !AO_USE_PENTIUM4_INSTRS */ |
| 117 | |
| 118 | /* As far as we can tell, the lfence and sfence instructions are not */ |
| 119 | /* currently needed or useful for cached memory accesses. */ |
| 120 | |
| 121 | /* Really only works for 486 and later */ |
| 122 | #ifndef AO_PREFER_GENERALIZED |
| 123 | AO_INLINE AO_t |
| 124 | AO_fetch_and_add_full (volatile AO_t *p, AO_t incr) |
| 125 | { |
| 126 | AO_t result; |
| 127 | |
| 128 | __asm__ __volatile__ ("lock; xadd %0, %1" |
| 129 | : "=r" (result), "+m" (*p) |
| 130 | : "0" (incr) |
| 131 | : "memory"); |
| 132 | return result; |
| 133 | } |
| 134 | # define AO_HAVE_fetch_and_add_full |
| 135 | #endif /* !AO_PREFER_GENERALIZED */ |
| 136 | |
| 137 | AO_INLINE unsigned char |
| 138 | AO_char_fetch_and_add_full (volatile unsigned char *p, unsigned char incr) |
| 139 | { |
| 140 | unsigned char result; |
| 141 | |
| 142 | __asm__ __volatile__ ("lock; xaddb %0, %1" |
| 143 | : "=q" (result), "+m" (*p) |
| 144 | : "0" (incr) |
| 145 | : "memory"); |
| 146 | return result; |
| 147 | } |
| 148 | #define AO_HAVE_char_fetch_and_add_full |
| 149 | |
| 150 | AO_INLINE unsigned short |
| 151 | AO_short_fetch_and_add_full (volatile unsigned short *p, unsigned short incr) |
| 152 | { |
| 153 | unsigned short result; |
| 154 | |
| 155 | __asm__ __volatile__ ("lock; xaddw %0, %1" |
| 156 | : "=r" (result), "+m" (*p) |
| 157 | : "0" (incr) |
| 158 | : "memory"); |
| 159 | return result; |
| 160 | } |
| 161 | #define AO_HAVE_short_fetch_and_add_full |
| 162 | |
| 163 | #ifndef AO_PREFER_GENERALIZED |
| 164 | AO_INLINE void |
| 165 | AO_and_full (volatile AO_t *p, AO_t value) |
| 166 | { |
| 167 | __asm__ __volatile__ ("lock; and %1, %0" |
| 168 | : "+m" (*p) |
| 169 | : "r" (value) |
| 170 | : "memory"); |
| 171 | } |
| 172 | # define AO_HAVE_and_full |
| 173 | |
| 174 | AO_INLINE void |
| 175 | AO_or_full (volatile AO_t *p, AO_t value) |
| 176 | { |
| 177 | __asm__ __volatile__ ("lock; or %1, %0" |
| 178 | : "+m" (*p) |
| 179 | : "r" (value) |
| 180 | : "memory"); |
| 181 | } |
| 182 | # define AO_HAVE_or_full |
| 183 | |
| 184 | AO_INLINE void |
| 185 | AO_xor_full (volatile AO_t *p, AO_t value) |
| 186 | { |
| 187 | __asm__ __volatile__ ("lock; xor %1, %0" |
| 188 | : "+m" (*p) |
| 189 | : "r" (value) |
| 190 | : "memory"); |
| 191 | } |
| 192 | # define AO_HAVE_xor_full |
| 193 | |
/* AO_store_full could be implemented directly using "xchg" but it    */
/* could be generalized efficiently as an ordinary store followed by  */
/* AO_nop_full ("mfence" instruction).                                 */
| 197 | |
| 198 | AO_INLINE void |
| 199 | AO_char_and_full (volatile unsigned char *p, unsigned char value) |
| 200 | { |
| 201 | __asm__ __volatile__ ("lock; andb %1, %0" |
| 202 | : "+m" (*p) |
| 203 | : "r" (value) |
| 204 | : "memory"); |
| 205 | } |
| 206 | #define AO_HAVE_char_and_full |
| 207 | |
| 208 | AO_INLINE void |
| 209 | AO_char_or_full (volatile unsigned char *p, unsigned char value) |
| 210 | { |
| 211 | __asm__ __volatile__ ("lock; orb %1, %0" |
| 212 | : "+m" (*p) |
| 213 | : "r" (value) |
| 214 | : "memory"); |
| 215 | } |
| 216 | #define AO_HAVE_char_or_full |
| 217 | |
| 218 | AO_INLINE void |
| 219 | AO_char_xor_full (volatile unsigned char *p, unsigned char value) |
| 220 | { |
| 221 | __asm__ __volatile__ ("lock; xorb %1, %0" |
| 222 | : "+m" (*p) |
| 223 | : "r" (value) |
| 224 | : "memory"); |
| 225 | } |
| 226 | #define AO_HAVE_char_xor_full |
| 227 | |
| 228 | AO_INLINE void |
| 229 | AO_short_and_full (volatile unsigned short *p, unsigned short value) |
| 230 | { |
| 231 | __asm__ __volatile__ ("lock; andw %1, %0" |
| 232 | : "+m" (*p) |
| 233 | : "r" (value) |
| 234 | : "memory"); |
| 235 | } |
| 236 | #define AO_HAVE_short_and_full |
| 237 | |
| 238 | AO_INLINE void |
| 239 | AO_short_or_full (volatile unsigned short *p, unsigned short value) |
| 240 | { |
| 241 | __asm__ __volatile__ ("lock; orw %1, %0" |
| 242 | : "+m" (*p) |
| 243 | : "r" (value) |
| 244 | : "memory"); |
| 245 | } |
| 246 | #define AO_HAVE_short_or_full |
| 247 | |
| 248 | AO_INLINE void |
| 249 | AO_short_xor_full (volatile unsigned short *p, unsigned short value) |
| 250 | { |
| 251 | __asm__ __volatile__ ("lock; xorw %1, %0" |
| 252 | : "+m" (*p) |
| 253 | : "r" (value) |
| 254 | : "memory"); |
| 255 | } |
| 256 | #define AO_HAVE_short_xor_full |
| 257 | #endif /* !AO_PREFER_GENERALIZED */ |
| 258 | |
| 259 | AO_INLINE AO_TS_VAL_t |
| 260 | AO_test_and_set_full(volatile AO_TS_t *addr) |
| 261 | { |
| 262 | unsigned char oldval; |
| 263 | /* Note: the "xchg" instruction does not need a "lock" prefix */ |
| 264 | __asm__ __volatile__ ("xchgb %0, %1" |
| 265 | : "=q" (oldval), "+m" (*addr) |
| 266 | : "0" ((unsigned char)0xff) |
| 267 | : "memory"); |
| 268 | return (AO_TS_VAL_t)oldval; |
| 269 | } |
| 270 | #define AO_HAVE_test_and_set_full |
| 271 | |
| 272 | #ifndef AO_GENERALIZE_ASM_BOOL_CAS |
| 273 | /* Returns nonzero if the comparison succeeded. */ |
| 274 | AO_INLINE int |
| 275 | AO_compare_and_swap_full(volatile AO_t *addr, AO_t old, AO_t new_val) |
| 276 | { |
| 277 | # ifdef AO_USE_SYNC_CAS_BUILTIN |
| 278 | return (int)__sync_bool_compare_and_swap(addr, old, new_val |
| 279 | /* empty protection list */); |
| 280 | /* Note: an empty list of variables protected by the */ |
| 281 | /* memory barrier should mean all globally accessible */ |
| 282 | /* variables are protected. */ |
| 283 | # else |
| 284 | char result; |
| 285 | # if defined(__GCC_ASM_FLAG_OUTPUTS__) |
| 286 | AO_t dummy; |
| 287 | |
| 288 | __asm__ __volatile__ ("lock; cmpxchg %3, %0" |
| 289 | : "+m" (*addr), "=@ccz" (result), "=a" (dummy) |
| 290 | : "r" (new_val), "a" (old) |
| 291 | : "memory"); |
| 292 | # else |
| 293 | __asm__ __volatile__ ("lock; cmpxchg %2, %0; setz %1" |
| 294 | : "+m" (*addr), "=a" (result) |
| 295 | : "r" (new_val), "a" (old) |
| 296 | : "memory"); |
| 297 | # endif |
| 298 | return (int)result; |
| 299 | # endif |
| 300 | } |
| 301 | # define AO_HAVE_compare_and_swap_full |
| 302 | #endif /* !AO_GENERALIZE_ASM_BOOL_CAS */ |
| 303 | |
| 304 | AO_INLINE AO_t |
| 305 | AO_fetch_compare_and_swap_full(volatile AO_t *addr, AO_t old_val, |
| 306 | AO_t new_val) |
| 307 | { |
| 308 | # ifdef AO_USE_SYNC_CAS_BUILTIN |
| 309 | return __sync_val_compare_and_swap(addr, old_val, new_val |
| 310 | /* empty protection list */); |
| 311 | # else |
| 312 | AO_t fetched_val; |
| 313 | __asm__ __volatile__ ("lock; cmpxchg %3, %1" |
| 314 | : "=a" (fetched_val), "+m" (*addr) |
| 315 | : "a" (old_val), "r" (new_val) |
| 316 | : "memory"); |
| 317 | return fetched_val; |
| 318 | # endif |
| 319 | } |
| 320 | #define AO_HAVE_fetch_compare_and_swap_full |
| 321 | |
| 322 | AO_INLINE unsigned char |
| 323 | AO_char_fetch_compare_and_swap_full(volatile unsigned char *addr, |
| 324 | unsigned char old_val, |
| 325 | unsigned char new_val) |
| 326 | { |
| 327 | # ifdef AO_USE_SYNC_CAS_BUILTIN |
| 328 | return __sync_val_compare_and_swap(addr, old_val, new_val |
| 329 | /* empty protection list */); |
| 330 | # else |
| 331 | unsigned char fetched_val; |
| 332 | |
| 333 | __asm__ __volatile__ ("lock; cmpxchgb %3, %1" |
| 334 | : "=a" (fetched_val), "+m" (*addr) |
| 335 | : "a" (old_val), "q" (new_val) |
| 336 | : "memory"); |
| 337 | return fetched_val; |
| 338 | # endif |
| 339 | } |
| 340 | # define AO_HAVE_char_fetch_compare_and_swap_full |
| 341 | |
| 342 | AO_INLINE unsigned short |
| 343 | AO_short_fetch_compare_and_swap_full(volatile unsigned short *addr, |
| 344 | unsigned short old_val, |
| 345 | unsigned short new_val) |
| 346 | { |
| 347 | # ifdef AO_USE_SYNC_CAS_BUILTIN |
| 348 | return __sync_val_compare_and_swap(addr, old_val, new_val |
| 349 | /* empty protection list */); |
| 350 | # else |
| 351 | unsigned short fetched_val; |
| 352 | |
| 353 | __asm__ __volatile__ ("lock; cmpxchgw %3, %1" |
| 354 | : "=a" (fetched_val), "+m" (*addr) |
| 355 | : "a" (old_val), "r" (new_val) |
| 356 | : "memory"); |
| 357 | return fetched_val; |
| 358 | # endif |
| 359 | } |
| 360 | # define AO_HAVE_short_fetch_compare_and_swap_full |
| 361 | |
| 362 | # if defined(__x86_64__) && !defined(__ILP32__) |
| 363 | AO_INLINE unsigned int |
| 364 | AO_int_fetch_compare_and_swap_full(volatile unsigned int *addr, |
| 365 | unsigned int old_val, |
| 366 | unsigned int new_val) |
| 367 | { |
| 368 | # ifdef AO_USE_SYNC_CAS_BUILTIN |
| 369 | return __sync_val_compare_and_swap(addr, old_val, new_val |
| 370 | /* empty protection list */); |
| 371 | # else |
| 372 | unsigned int fetched_val; |
| 373 | |
| 374 | __asm__ __volatile__ ("lock; cmpxchgl %3, %1" |
| 375 | : "=a" (fetched_val), "+m" (*addr) |
| 376 | : "a" (old_val), "r" (new_val) |
| 377 | : "memory"); |
| 378 | return fetched_val; |
| 379 | # endif |
| 380 | } |
| 381 | # define AO_HAVE_int_fetch_compare_and_swap_full |
| 382 | |
| 383 | # ifndef AO_PREFER_GENERALIZED |
| 384 | AO_INLINE unsigned int |
| 385 | AO_int_fetch_and_add_full (volatile unsigned int *p, unsigned int incr) |
| 386 | { |
| 387 | unsigned int result; |
| 388 | |
| 389 | __asm__ __volatile__ ("lock; xaddl %0, %1" |
| 390 | : "=r" (result), "+m" (*p) |
| 391 | : "0" (incr) |
| 392 | : "memory"); |
| 393 | return result; |
| 394 | } |
| 395 | # define AO_HAVE_int_fetch_and_add_full |
| 396 | |
| 397 | AO_INLINE void |
| 398 | AO_int_and_full (volatile unsigned int *p, unsigned int value) |
| 399 | { |
| 400 | __asm__ __volatile__ ("lock; andl %1, %0" |
| 401 | : "+m" (*p) |
| 402 | : "r" (value) |
| 403 | : "memory"); |
| 404 | } |
| 405 | # define AO_HAVE_int_and_full |
| 406 | |
| 407 | AO_INLINE void |
| 408 | AO_int_or_full (volatile unsigned int *p, unsigned int value) |
| 409 | { |
| 410 | __asm__ __volatile__ ("lock; orl %1, %0" |
| 411 | : "+m" (*p) |
| 412 | : "r" (value) |
| 413 | : "memory"); |
| 414 | } |
| 415 | # define AO_HAVE_int_or_full |
| 416 | |
| 417 | AO_INLINE void |
| 418 | AO_int_xor_full (volatile unsigned int *p, unsigned int value) |
| 419 | { |
| 420 | __asm__ __volatile__ ("lock; xorl %1, %0" |
| 421 | : "+m" (*p) |
| 422 | : "r" (value) |
| 423 | : "memory"); |
| 424 | } |
| 425 | # define AO_HAVE_int_xor_full |
| 426 | # endif /* !AO_PREFER_GENERALIZED */ |
| 427 | |
| 428 | # else |
| 429 | # define AO_T_IS_INT |
| 430 | # endif /* !x86_64 || ILP32 */ |
| 431 | |
| 432 | /* Real X86 implementations, except for some old 32-bit WinChips, */ |
| 433 | /* appear to enforce ordering between memory operations, EXCEPT that */ |
| 434 | /* a later read can pass earlier writes, presumably due to the */ |
| 435 | /* visible presence of store buffers. */ |
| 436 | /* We ignore both the WinChips and the fact that the official specs */ |
| 437 | /* seem to be much weaker (and arguably too weak to be usable). */ |
| 438 | # include "../ordered_except_wr.h" |
| 439 | |
| 440 | #endif /* AO_DISABLE_GCC_ATOMICS */ |
| 441 | |
| 442 | #if defined(AO_GCC_ATOMIC_TEST_AND_SET) \ |
| 443 | && !defined(AO_SKIPATOMIC_double_compare_and_swap_ANY) |
| 444 | |
| 445 | # if defined(__ILP32__) || !defined(__x86_64__) /* 32-bit AO_t */ \ |
| 446 | || defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_16) /* 64-bit AO_t */ |
| 447 | # include "../standard_ao_double_t.h" |
| 448 | # endif |
| 449 | |
| 450 | #elif !defined(__x86_64__) && (!defined(AO_USE_SYNC_CAS_BUILTIN) \ |
| 451 | || defined(AO_GCC_ATOMIC_TEST_AND_SET)) |
| 452 | # include "../standard_ao_double_t.h" |
| 453 | |
| 454 | /* Reading or writing a quadword aligned on a 64-bit boundary is */ |
| 455 | /* always carried out atomically on at least a Pentium according to */ |
| 456 | /* Chapter 8.1.1 of Volume 3A Part 1 of Intel processor manuals. */ |
| 457 | # ifndef AO_PREFER_GENERALIZED |
| 458 | # define AO_ACCESS_double_CHECK_ALIGNED |
| 459 | # include "../loadstore/double_atomic_load_store.h" |
| 460 | # endif |
| 461 | |
/* Double-word compare-and-swap for 32-bit x86 built on a locked       */
/* "cmpxchg8b".  Register assignment, as fixed by the asm constraints  */
/* below: old_val1 in eax, old_val2 in edx, new_val1 in ebx and        */
/* new_val2 in ecx.  The zero flag left by the instruction is turned   */
/* into the return value.                                               */
/* Returns nonzero if the comparison succeeded.                         */
/* Really requires at least a Pentium.                                  */
AO_INLINE int
AO_compare_double_and_swap_double_full(volatile AO_double_t *addr,
                                       AO_t old_val1, AO_t old_val2,
                                       AO_t new_val1, AO_t new_val2)
{
  char result;
# if defined(__PIC__) && !(AO_GNUC_PREREQ(5, 1) || AO_CLANG_PREREQ(4, 0))
    AO_t saved_ebx; /* memory slot used to preserve ebx around the asm */
    AO_t dummy;     /* an output for clobbered edx */

    /* The following applies to an ancient GCC (and, probably, it was   */
    /* never needed for Clang):                                         */
    /* If PIC is turned on, we cannot use ebx as it is reserved for the */
    /* GOT pointer.  We should save and restore ebx.  The proposed      */
    /* solution is not so efficient as the older alternatives using     */
    /* push ebx or edi as new_val1 (w/o clobbering edi and temporary    */
    /* local variable usage) but it is more portable (it works even if  */
    /* ebx is not used as GOT pointer, and it works for the buggy GCC   */
    /* releases that incorrectly evaluate memory operands offset in the */
    /* inline assembly after push).                                     */
#   ifdef __OPTIMIZE__
      /* Operand map: %0 = *addr, %1 = result, %2 = saved_ebx,          */
      /* %3 = dummy, %4..%6 = old_val2, old_val1, new_val2,             */
      /* %7 = new_val1 (memory operand, loaded into ebx by hand).       */
      __asm__ __volatile__("mov %%ebx, %2\n\t" /* save ebx */
                           "lea %0, %%edi\n\t" /* in case addr is in ebx */
                           "mov %7, %%ebx\n\t" /* load new_val1 */
                           "lock; cmpxchg8b (%%edi)\n\t"
                           "mov %2, %%ebx\n\t" /* restore ebx */
                           "setz %1"
                        : "+m" (*addr), "=a" (result),
                          "=m" (saved_ebx), "=d" (dummy)
                        : "d" (old_val2), "a" (old_val1),
                          "c" (new_val2), "m" (new_val1)
                        : "%edi", "memory");
#   else
      /* A less-efficient code manually preserving edi if GCC invoked   */
      /* with -O0 option (otherwise it fails while finding a register   */
      /* in class 'GENERAL_REGS').                                      */
      /* Operand map: as above, except that %3 = saved_edi, %4 = dummy  */
      /* and new_val1 is %8; edi is not listed as clobbered because it  */
      /* is saved and restored inside the template.                     */
      AO_t saved_edi;
      __asm__ __volatile__("mov %%edi, %3\n\t" /* save edi */
                           "mov %%ebx, %2\n\t" /* save ebx */
                           "lea %0, %%edi\n\t" /* in case addr is in ebx */
                           "mov %8, %%ebx\n\t" /* load new_val1 */
                           "lock; cmpxchg8b (%%edi)\n\t"
                           "mov %2, %%ebx\n\t" /* restore ebx */
                           "mov %3, %%edi\n\t" /* restore edi */
                           "setz %1"
                        : "+m" (*addr), "=a" (result),
                          "=m" (saved_ebx), "=m" (saved_edi), "=d" (dummy)
                        : "d" (old_val2), "a" (old_val1),
                          "c" (new_val2), "m" (new_val1)
                        : "memory");
#   endif
# else
    /* For non-PIC mode, this operation could be simplified (and be     */
    /* faster) by using ebx as new_val1.  Reuse of the PIC hard         */
    /* register, instead of using a fixed register, is implemented      */
    /* in Clang and GCC 5.1+, at least.  (Older GCC refused to compile  */
    /* such code for PIC mode).                                         */
#   if defined(__GCC_ASM_FLAG_OUTPUTS__)
      /* The zero flag is read directly as an asm flag output; edx and  */
      /* eax are declared read-write since the instruction may          */
      /* overwrite them.                                                 */
      __asm__ __volatile__ ("lock; cmpxchg8b %0"
                        : "+m" (*addr), "=@ccz" (result),
                          "+d" (old_val2), "+a" (old_val1)
                        : "c" (new_val2), "b" (new_val1)
                        : "memory");
#   else
      AO_t dummy; /* an output for clobbered edx */

      /* No flag outputs: "setz" writes the zero flag into al.          */
      __asm__ __volatile__ ("lock; cmpxchg8b %0; setz %1"
                        : "+m" (*addr), "=a" (result), "=d" (dummy)
                        : "d" (old_val2), "a" (old_val1),
                          "c" (new_val2), "b" (new_val1)
                        : "memory");
#   endif
# endif
  return (int) result;
}
| 539 | # define AO_HAVE_compare_double_and_swap_double_full |
| 540 | |
| 541 | #elif defined(__ILP32__) || !defined(__x86_64__) |
| 542 | # include "../standard_ao_double_t.h" |
| 543 | |
| 544 | /* Reading or writing a quadword aligned on a 64-bit boundary is */ |
| 545 | /* always carried out atomically (requires at least a Pentium). */ |
| 546 | # ifndef AO_PREFER_GENERALIZED |
| 547 | # define AO_ACCESS_double_CHECK_ALIGNED |
| 548 | # include "../loadstore/double_atomic_load_store.h" |
| 549 | # endif |
| 550 | |
| 551 | /* X32 has native support for 64-bit integer operations (AO_double_t */ |
| 552 | /* is a 64-bit integer and we could use 64-bit cmpxchg). */ |
| 553 | /* This primitive is used by compare_double_and_swap_double_full. */ |
| 554 | AO_INLINE int |
| 555 | AO_double_compare_and_swap_full(volatile AO_double_t *addr, |
| 556 | AO_double_t old_val, AO_double_t new_val) |
| 557 | { |
| 558 | /* It is safe to use __sync CAS built-in here. */ |
| 559 | return __sync_bool_compare_and_swap(&addr->AO_whole, |
| 560 | old_val.AO_whole, new_val.AO_whole |
| 561 | /* empty protection list */); |
| 562 | } |
| 563 | # define AO_HAVE_double_compare_and_swap_full |
| 564 | |
| 565 | #elif defined(AO_CMPXCHG16B_AVAILABLE) \ |
| 566 | || (defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_16) \ |
| 567 | && !defined(AO_THREAD_SANITIZER)) |
| 568 | # include "../standard_ao_double_t.h" |
| 569 | |
| 570 | /* The Intel and AMD Architecture Programmer Manuals state roughly */ |
| 571 | /* the following: */ |
| 572 | /* - CMPXCHG16B (with a LOCK prefix) can be used to perform 16-byte */ |
| 573 | /* atomic accesses in 64-bit mode (with certain alignment */ |
| 574 | /* restrictions); */ |
| 575 | /* - SSE instructions that access data larger than a quadword (like */ |
| 576 | /* MOVDQA) may be implemented using multiple memory accesses; */ |
| 577 | /* - LOCK prefix causes an invalid-opcode exception when used with */ |
| 578 | /* 128-bit media (SSE) instructions. */ |
| 579 | /* Thus, currently, the only way to implement lock-free double_load */ |
| 580 | /* and double_store on x86_64 is to use CMPXCHG16B (if available). */ |
| 581 | |
| 582 | /* NEC LE-IT: older AMD Opterons are missing this instruction. */ |
| 583 | /* On these machines SIGILL will be thrown. */ |
| 584 | /* Define AO_WEAK_DOUBLE_CAS_EMULATION to have an emulated (lock */ |
| 585 | /* based) version available. */ |
| 586 | /* HB: Changed this to not define either by default. There are */ |
| 587 | /* enough machines and tool chains around on which cmpxchg16b */ |
| 588 | /* doesn't work. And the emulation is unsafe by our usual rules. */ |
| 589 | /* However both are clearly useful in certain cases. */ |
| 590 | |
| 591 | AO_INLINE int |
| 592 | AO_compare_double_and_swap_double_full(volatile AO_double_t *addr, |
| 593 | AO_t old_val1, AO_t old_val2, |
| 594 | AO_t new_val1, AO_t new_val2) |
| 595 | { |
| 596 | char result; |
| 597 | |
| 598 | # if defined(__GCC_ASM_FLAG_OUTPUTS__) |
| 599 | __asm__ __volatile__("lock; cmpxchg16b %0" |
| 600 | : "+m" (*addr), "=@ccz" (result), |
| 601 | "+d" (old_val2), "+a" (old_val1) |
| 602 | : "c" (new_val2), "b" (new_val1) |
| 603 | : "memory"); |
| 604 | # else |
| 605 | AO_t dummy; /* an output for clobbered rdx */ |
| 606 | |
| 607 | __asm__ __volatile__("lock; cmpxchg16b %0; setz %1" |
| 608 | : "+m" (*addr), "=a" (result), "=d" (dummy) |
| 609 | : "d" (old_val2), "a" (old_val1), |
| 610 | "c" (new_val2), "b" (new_val1) |
| 611 | : "memory"); |
| 612 | # endif |
| 613 | return (int) result; |
| 614 | } |
| 615 | # define AO_HAVE_compare_double_and_swap_double_full |
| 616 | |
| 617 | #elif defined(AO_WEAK_DOUBLE_CAS_EMULATION) |
| 618 | # include "../standard_ao_double_t.h" |
| 619 | |
| 620 | # ifdef __cplusplus |
| 621 | extern "C" { |
| 622 | # endif |
| 623 | |
| 624 | /* This one provides spinlock based emulation of CAS implemented in */ |
| 625 | /* atomic_ops.c. We probably do not want to do this here, since it */ |
| 626 | /* is not atomic with respect to other kinds of updates of *addr. */ |
| 627 | /* On the other hand, this may be a useful facility on occasion. */ |
| 628 | int AO_compare_double_and_swap_double_emulation( |
| 629 | volatile AO_double_t *addr, |
| 630 | AO_t old_val1, AO_t old_val2, |
| 631 | AO_t new_val1, AO_t new_val2); |
| 632 | |
| 633 | # ifdef __cplusplus |
| 634 | } /* extern "C" */ |
| 635 | # endif |
| 636 | |
| 637 | AO_INLINE int |
| 638 | AO_compare_double_and_swap_double_full(volatile AO_double_t *addr, |
| 639 | AO_t old_val1, AO_t old_val2, |
| 640 | AO_t new_val1, AO_t new_val2) |
| 641 | { |
| 642 | return AO_compare_double_and_swap_double_emulation(addr, |
| 643 | old_val1, old_val2, new_val1, new_val2); |
| 644 | } |
| 645 | # define AO_HAVE_compare_double_and_swap_double_full |
| 646 | #endif /* x86_64 && !ILP32 && CAS_EMULATION && !AO_CMPXCHG16B_AVAILABLE */ |
| 647 | |
| 648 | #ifdef AO_GCC_ATOMIC_TEST_AND_SET |
| 649 | # include "generic.h" |
| 650 | #endif |
| 651 | |
| 652 | #undef AO_GCC_FORCE_HAVE_CAS |
| 653 | #undef AO_SKIPATOMIC_double_compare_and_swap_ANY |
| 654 | #undef AO_SKIPATOMIC_double_load |
| 655 | #undef AO_SKIPATOMIC_double_load_acquire |
| 656 | #undef AO_SKIPATOMIC_double_store |
| 657 | #undef AO_SKIPATOMIC_double_store_release |
| 658 | |