From edd4a7129fa45c0bb5d5923b5b1a64cb5053ce46 Mon Sep 17 00:00:00 2001
From: oswyndel <67714233+RbPyer@users.noreply.github.com>
Date: Thu, 12 Feb 2026 10:31:05 +0300
Subject: [PATCH] vlib: add x.atomics - implement native_x86-64_atomics in V
 and assembly, without depending on an external C library (impl #26474)
 (#26529)

---
 vlib/x/atomics/README.md                     | 100 +++
 vlib/x/atomics/atomics.amd64.v               | 524 ++++++++++++++++
 vlib/x/atomics/atomics.i386.v                | 606 +++++++++++++++++++
 vlib/x/atomics/benchmarks/README.md          | 123 ++++
 vlib/x/atomics/benchmarks/atomic_benchmark.v | 376 ++++++++++++
 vlib/x/atomics/examples/basic.v              |  23 +
 vlib/x/atomics/examples/counter.v            |  40 ++
 vlib/x/atomics/examples/spinlock.v           |  59 ++
 vlib/x/atomics/i32_test.v                    | 189 ++++++
 vlib/x/atomics/i64_test.v                    | 161 +++++
 vlib/x/atomics/panic_unaligned.v             |  10 +
 vlib/x/atomics/u32_test.v                    | 178 ++++++
 vlib/x/atomics/u64_test.v                    | 155 +++++
 13 files changed, 2544 insertions(+)
 create mode 100644 vlib/x/atomics/README.md
 create mode 100644 vlib/x/atomics/atomics.amd64.v
 create mode 100644 vlib/x/atomics/atomics.i386.v
 create mode 100644 vlib/x/atomics/benchmarks/README.md
 create mode 100644 vlib/x/atomics/benchmarks/atomic_benchmark.v
 create mode 100644 vlib/x/atomics/examples/basic.v
 create mode 100644 vlib/x/atomics/examples/counter.v
 create mode 100644 vlib/x/atomics/examples/spinlock.v
 create mode 100644 vlib/x/atomics/i32_test.v
 create mode 100644 vlib/x/atomics/i64_test.v
 create mode 100644 vlib/x/atomics/panic_unaligned.v
 create mode 100644 vlib/x/atomics/u32_test.v
 create mode 100644 vlib/x/atomics/u64_test.v
diff --git a/vlib/x/atomics/README.md b/vlib/x/atomics/README.md
new file mode 100644
index 000000000..f445fe1f2
--- /dev/null
+++ b/vlib/x/atomics/README.md
@@ -0,0 +1,100 @@
+# v-atomics
+
+Low-level atomic operations for V on amd64 and i386 (MMX required on i386).
+
+Native atomic primitives for V implemented with inline assembly, without relying on C FFI.
+
+This module is an experiment in providing low-level atomic operations directly in V,
+using V's inline assembly support.
+
+At the moment, all operations provide sequentially consistent semantics.
+
+## Motivation
+
+In the current V ecosystem, atomic operations are implemented via calls into C.
+
+While this approach works, it introduces an additional dependency on the C toolchain
+and headers and limits control over the exact machine instructions being emitted.
+
+x.atomics explores an alternative: **native atomic operations implemented directly in V**,
+using architecture-specific inline assembly and explicit semantics.
+
+The current focus of this project is:
+
+- correctness of basic atomic primitives;
+- predictable and inspectable code generation;
+- sequentially consistent behavior for all operations.
+
+In future versions, the set of supported atomic operations will be expanded,
+and additional memory orderings will be introduced.
+
+---
+
+## Scope and Guarantees
+
+- atomic operations on integer types implemented in V with inline assembly;
+- architecture-specific implementations (per-platform `atomics.<arch>.v` files);
+- **sequential consistency** for all exposed operations.
+
+---
+
+## Memory Model
+
+All operations in this library are intended to be **sequentially consistent**:
+
+- operations appear to be globally ordered;
+- no weaker semantics (relaxed, acquire, release) are currently implemented;
+- when weaker variants are added in the future, they will be explicitly named and documented.
+
+---
+
+## Examples
+
+See the [examples](examples/) directory for complete runnable examples.
+
+### Basic Usage
+
+```v
+import x.atomics
+
+fn main() {
+	mut value := i32(0)
+
+	// Atomically store a value
+	atomics.store_i32(&value, 42)
+
+	// Atomically load the value
+	loaded := atomics.load_i32(&value)
+
+	// Atomic add: returns the new value after addition
+	new_value := atomics.add_i32(&value, 10)
+
+	// Atomic swap: returns the old value
+	old := atomics.swap_i32(&value, 100)
+}
+```
+
+### Compare-and-Swap (CAS)
+
+```v
+import x.atomics
+
+fn main() {
+	mut flag := u32(0)
+
+	// CAS: if flag == 0, set it to 1; returns true on success
+	if atomics.cas_u32(&flag, 0, 1) {
+		println('Successfully changed flag from 0 to 1')
+	}
+}
+```
+
+### Available Operations
+
+| Operation | i32 | i64 | u32 | u64 |
+|-----------|-----|-----|-----|-----|
+| `load_*`  | yes | yes | yes | yes |
+| `store_*` | yes | yes | yes | yes |
+| `add_*`   | yes | yes | yes | yes |
+| `swap_*`  | yes | yes | yes | yes |
+| `cas_*`   | yes | yes | yes | yes |
diff --git a/vlib/x/atomics/atomics.amd64.v b/vlib/x/atomics/atomics.amd64.v
new file mode 100644
index 000000000..f70165133
--- /dev/null
+++ b/vlib/x/atomics/atomics.amd64.v
@@ -0,0 +1,524 @@
+module atomics
+
+// add_i32 atomically adds delta to the i32 at dest and returns the new (post-add) value.
+// Sequentially consistent: `lock xadd` leaves the old value in eax, so delta is added once more.
+// Panics (via panicUnaligned) if dest is not 4-byte aligned.
+pub fn add_i32(dest &i32, delta i32) i32 {
+	mut result := i32(0)
+	asm volatile amd64 {
+		mov rdx, dest
+		test rdx, 3
+		jz '1f'
+		call panicUnaligned
+		jmp '2f'
+		1:
+		mov eax, delta
+		lock xadd [rdx], eax
+		add eax, delta
+		mov result, eax
+		2:
+		; =r (result)
+		; r (dest)
+		  r (delta)
+		; eax
+		  rdx
+		  memory
+	}
+	return result
+}
+
+// swap_i32 atomically replaces the i32 at dest with new and returns the previous value.
+// Sequentially consistent: implemented with `xchg` against the memory operand.
+// Panics (via panicUnaligned) if dest is not 4-byte aligned.
+pub fn swap_i32(dest &i32, new i32) i32 {
+	mut old := i32(0)
+	asm volatile amd64 {
+		mov rdx, dest
+		test rdx, 3
+		jz '1f'
+		call panicUnaligned
+		jmp '2f'
+		1:
+		mov eax, new
+		xchg [rdx], eax
+		mov old, eax
+		2:
+		; =r (old)
+		; r (dest)
+		  r (new)
+		; eax
+		  rdx
+		  memory
+	}
+	return old
+}
+
+// store_i32 atomically writes value to the i32 at dest; nothing is returned.
+// Sequentially consistent: the store is an `xchg`, and the old value left in eax is discarded.
+// Panics (via panicUnaligned) if dest is not 4-byte aligned.
+pub fn store_i32(dest &i32, value i32) {
+	asm volatile amd64 {
+		mov rdx, dest
+		test rdx, 3
+		jz '1f'
+		call panicUnaligned
+		jmp '2f'
+		1:
+		mov eax, value
+		xchg eax, [rdx]
+		2:
+		; ; r (dest)
+		  r (value)
+		; eax
+		  rdx
+		  memory
+	}
+}
+
+// load_i32 atomically reads and returns the i32 at num using a single aligned `mov`.
+// The operation is performed with sequential consistency (`memory` clobber bars reordering).
+// Panics (via panicUnaligned) if num is not 4-byte aligned.
+pub fn load_i32(num &i32) i32 {
+	mut out := i32(0)
+	asm volatile amd64 {
+		mov rdx, num
+		test rdx, 3
+		jz '1f'
+		call panicUnaligned
+		jmp '2f'
+		1:
+		mov eax, [rdx]
+		mov out, eax
+		2:
+		; =r (out)
+		; r (num)
+		; eax
+		  rdx
+		  memory
+	}
+	return out
+}
+
+// cas_i32 performs a compare-and-swap on the i32 at addr.
+// If the current value at addr equals old, it atomically stores new (`lock cmpxchg`).
+// Returns true if the swap was performed (ZF captured with `sete`), false otherwise.
+// The operation is performed with sequential consistency.
+// Panics (via panicUnaligned) if addr is not 4-byte aligned.
+pub fn cas_i32(addr &i32, old i32, new i32) bool {
+	mut swapped := false
+	asm volatile amd64 {
+		mov rdx, addr
+		test rdx, 3
+		jz '1f'
+		call panicUnaligned
+		jmp '2f'
+		1:
+		mov eax, old
+		mov ecx, new
+		lock cmpxchg [rdx], ecx
+		sete al
+		mov swapped, al
+		2:
+		; =r (swapped)
+		; r (addr)
+		  r (old)
+		  r (new)
+		; eax
+		  ecx
+		  rdx
+		  memory
+	}
+	return swapped
+}
+
+// store_i64 atomically writes value to the i64 at dest; nothing is returned.
+// Sequentially consistent: the store is an `xchg`, and the old value left in rax is discarded.
+// Panics (via panicUnaligned) if dest is not 8-byte aligned.
+pub fn store_i64(dest &i64, value i64) {
+	asm volatile amd64 {
+		mov rdx, dest
+		test rdx, 7
+		jz '1f'
+		call panicUnaligned
+		jmp '2f'
+		1:
+		mov rax, value
+		xchg rax, [rdx]
+		2:
+		; ; r (dest)
+		  r (value)
+		; rax
+		  rdx
+		  memory
+	}
+}
+
+// load_i64 atomically reads and returns the i64 at num using a single aligned `mov`.
+// The operation is performed with sequential consistency (`memory` clobber bars reordering).
+// Panics (via panicUnaligned) if num is not 8-byte aligned.
+pub fn load_i64(num &i64) i64 {
+	mut out := i64(0)
+	asm volatile amd64 {
+		mov rdx, num
+		test rdx, 7
+		jz '1f'
+		call panicUnaligned
+		jmp '2f'
+		1:
+		mov rax, [rdx]
+		mov out, rax
+		2:
+		; =r (out)
+		; r (num)
+		; rax
+		  rdx
+		  memory
+	}
+	return out
+}
+
+// add_i64 atomically adds delta to the i64 at dest and returns the new (post-add) value.
+// Sequentially consistent: `lock xadd` leaves the old value in rax, so delta is added once more.
+// Panics (via panicUnaligned) if dest is not 8-byte aligned.
+pub fn add_i64(dest &i64, delta i64) i64 {
+	mut result := i64(0)
+	asm volatile amd64 {
+		mov rdx, dest
+		test rdx, 7
+		jz '1f'
+		call panicUnaligned
+		jmp '2f'
+		1:
+		mov rax, delta
+		lock xadd [rdx], rax
+		add rax, delta
+		mov result, rax
+		2:
+		; =r (result)
+		; r (delta)
+		  r (dest)
+		; rax
+		  rdx
+		  memory
+	}
+	return result
+}
+
+// swap_i64 atomically replaces the i64 at dest with value and returns the previous value.
+// Sequentially consistent: implemented with `xchg` against the memory operand.
+// Panics (via panicUnaligned) if dest is not 8-byte aligned.
+pub fn swap_i64(dest &i64, value i64) i64 {
+	mut old := i64(0)
+	asm volatile amd64 {
+		mov rdx, dest
+		test rdx, 7
+		jz '1f'
+		call panicUnaligned
+		jmp '2f'
+		1:
+		mov rax, value
+		xchg rax, [rdx]
+		mov old, rax
+		2:
+		; =r (old)
+		; r (dest)
+		  r (value)
+		; rax
+		  rdx
+		  memory
+	}
+	return old
+}
+
+// cas_i64 performs a compare-and-swap on the i64 at addr.
+// If the current value at addr equals old, it atomically stores new (`lock cmpxchgq`).
+// Returns true if the swap was performed (ZF captured with `sete`), false otherwise.
+// The operation is performed with sequential consistency.
+// Panics (via panicUnaligned) if addr is not 8-byte aligned.
+pub fn cas_i64(addr &i64, old i64, new i64) bool {
+	mut swapped := false
+	asm volatile amd64 {
+		mov rdx, addr
+		test rdx, 7
+		jz '1f'
+		call panicUnaligned
+		jmp '2f'
+		1:
+		mov rax, old
+		mov rcx, new
+		lock cmpxchgq [rdx], rcx
+		sete al
+		mov swapped, al
+		2:
+		; =r (swapped)
+		; r (addr)
+		  r (old)
+		  r (new)
+		; rax
+		  rcx
+		  rdx
+		  memory
+	}
+	return swapped
+}
+
+// add_u32 atomically adds delta to the u32 at dest and returns the new (post-add) value.
+// Sequentially consistent: `lock xadd` leaves the old value in eax, so delta is added once more.
+// Panics (via panicUnaligned) if dest is not 4-byte aligned.
+pub fn add_u32(dest &u32, delta u32) u32 {
+	mut result := u32(0)
+	asm volatile amd64 {
+		mov rdx, dest
+		test rdx, 3
+		jz '1f'
+		call panicUnaligned
+		jmp '2f'
+		1:
+		mov eax, delta
+		lock xadd [rdx], eax
+		add eax, delta
+		mov result, eax
+		2:
+		; =r (result)
+		; r (dest)
+		  r (delta)
+		; rax
+		  rdx
+		  memory
+	}
+	return result
+}
+
+// swap_u32 atomically replaces the u32 at dest with new and returns the previous value.
+// Sequentially consistent: implemented with `xchg` against the memory operand.
+// Panics (via panicUnaligned) if dest is not 4-byte aligned.
+pub fn swap_u32(dest &u32, new u32) u32 {
+	mut old := u32(0)
+	asm volatile amd64 {
+		mov rdx, dest
+		test rdx, 3
+		jz '1f'
+		call panicUnaligned
+		jmp '2f'
+		1:
+		mov eax, new
+		xchg [rdx], eax
+		mov old, eax
+		2:
+		; =r (old)
+		; r (dest)
+		  r (new)
+		; eax
+		  rdx
+		  memory
+	}
+	return old
+}
+
+// store_u32 atomically writes value to the u32 at dest; nothing is returned.
+// Sequentially consistent: the store is an `xchg`, and the old value left in eax is discarded.
+// Panics (via panicUnaligned) if dest is not 4-byte aligned.
+pub fn store_u32(dest &u32, value u32) {
+	asm volatile amd64 {
+		mov rdx, dest
+		test rdx, 3
+		jz '1f'
+		call panicUnaligned
+		jmp '2f'
+		1:
+		mov eax, value
+		xchg eax, [rdx]
+		2:
+		; ; r (dest)
+		  r (value)
+		; eax
+		  rdx
+		  memory
+	}
+}
+
+// load_u32 atomically loads and returns the u32 at num with sequential consistency.
+// Panics if num is not 4-byte aligned. The `memory` clobber stops compiler caching/reordering.
+pub fn load_u32(num &u32) u32 {
+	mut out := u32(0)
+	asm volatile amd64 {
+		mov rdx, num
+		test rdx, 3
+		jz '1f'
+		call panicUnaligned
+		jmp '2f'
+		1:
+		mov eax, [rdx]
+		mov out, eax
+		2:
+		; =r (out)
+		; r (num)
+		; eax
+		  rdx
+		  memory
+	}
+	return out
+}
+
+// cas_u32 performs a compare-and-swap on the u32 at addr.
+// If the current value at addr equals old, it atomically stores new (`lock cmpxchg`).
+// Returns true if the swap was performed (ZF captured with `sete`), false otherwise.
+// The operation is performed with sequential consistency.
+// Panics (via panicUnaligned) if addr is not 4-byte aligned.
+pub fn cas_u32(addr &u32, old u32, new u32) bool {
+	mut swapped := false
+	asm volatile amd64 {
+		mov rdx, addr
+		test rdx, 3
+		jz '1f'
+		call panicUnaligned
+		jmp '2f'
+		1:
+		mov eax, old
+		mov ecx, new
+		lock cmpxchg [rdx], ecx
+		sete al
+		mov swapped, al
+		2:
+		; =r (swapped)
+		; r (addr)
+		  r (old)
+		  r (new)
+		; eax
+		  ecx
+		  rdx
+		  memory
+	}
+	return swapped
+}
+
+// load_u64 atomically reads and returns the u64 at num using a single aligned `mov`.
+// The operation is performed with sequential consistency (`memory` clobber bars reordering).
+// Panics (via panicUnaligned) if num is not 8-byte aligned.
+pub fn load_u64(num &u64) u64 {
+	mut out := u64(0)
+	asm volatile amd64 {
+		mov rdx, num
+		test rdx, 7
+		jz '1f'
+		call panicUnaligned
+		jmp '2f'
+		1:
+		mov rax, [rdx]
+		mov out, rax
+		2:
+		; =r (out)
+		; r (num)
+		; rax
+		  rdx
+		  memory
+	}
+	return out
+}
+
+// store_u64 atomically writes value to the u64 at dest; nothing is returned.
+// Sequentially consistent: the store is an `xchg`, and the old value left in rax is discarded.
+// Panics (via panicUnaligned) if dest is not 8-byte aligned.
+pub fn store_u64(dest &u64, value u64) {
+	asm volatile amd64 {
+		mov rdx, dest
+		test rdx, 7
+		jz '1f'
+		call panicUnaligned
+		jmp '2f'
+		1:
+		mov rax, value
+		xchg rax, [rdx]
+		2:
+		; ; r (dest)
+		  r (value)
+		; rax
+		  rdx
+		  memory
+	}
+}
+
+// add_u64 atomically adds delta to the u64 at dest and returns the new (post-add) value.
+// Sequentially consistent: `lock xadd` leaves the old value in rax, so delta is added once more.
+// Panics (via panicUnaligned) if dest is not 8-byte aligned.
+pub fn add_u64(dest &u64, delta u64) u64 {
+	mut result := u64(0)
+	asm volatile amd64 {
+		mov rdx, dest
+		test rdx, 7
+		jz '1f'
+		call panicUnaligned
+		jmp '2f'
+		1:
+		mov rax, delta
+		lock xadd [rdx], rax
+		add rax, delta
+		mov result, rax
+		2:
+		; =r (result)
+		; r (dest)
+		  r (delta)
+		; rax
+		  rdx
+		  memory
+	}
+	return result
+}
+
+// swap_u64 atomically replaces the u64 at dest with value and returns the previous value.
+// Sequentially consistent: implemented with `xchg` against the memory operand.
+// Panics (via panicUnaligned) if dest is not 8-byte aligned.
+pub fn swap_u64(dest &u64, value u64) u64 {
+	mut old := u64(0)
+	asm volatile amd64 {
+		mov rdx, dest
+		test rdx, 7
+		jz '1f'
+		call panicUnaligned
+		jmp '2f'
+		1:
+		mov rax, value
+		xchg [rdx], rax
+		mov old, rax
+		2:
+		; =r (old)
+		; r (dest)
+		  r (value)
+		; rax
+		  rdx
+		  memory
+	}
+	return old
+}
+
+// cas_u64 performs a compare-and-swap on the u64 at addr.
+// If the current value at addr equals old, it atomically stores new (`lock cmpxchgq`).
+// Returns true if the swap was performed (ZF captured with `sete`), false otherwise.
+// The operation is performed with sequential consistency.
+// Panics (via panicUnaligned) if addr is not 8-byte aligned.
+pub fn cas_u64(addr &u64, old u64, new u64) bool {
+	mut swapped := false
+	asm volatile amd64 {
+		mov rdx, addr
+		test rdx, 7
+		jz '1f'
+		call panicUnaligned
+		jmp '2f'
+		1:
+		mov rax, old
+		mov rcx, new
+		lock cmpxchgq [rdx], rcx
+		sete al
+		mov swapped, al
+		2:
+		; =r (swapped)
+		; r (addr)
+		  r (old)
+		  r (new)
+		; rax
+		  rcx
+		  rdx
+		  memory
+	}
+	return swapped
+}
diff --git a/vlib/x/atomics/atomics.i386.v b/vlib/x/atomics/atomics.i386.v
new file mode 100644
index 000000000..55655254b
--- /dev/null
+++ b/vlib/x/atomics/atomics.i386.v
@@ -0,0 +1,606 @@
+module atomics
+
+// add_i32 atomically adds delta to the i32 at dest and returns the new (post-add) value.
+// Sequentially consistent: `lock xadd` leaves the old value in eax, so delta is added once more.
+// Panics (via panicUnaligned) if dest is not 4-byte aligned.
+pub fn add_i32(dest &i32, delta i32) i32 {
+	mut result := i32(0)
+	asm volatile i386 {
+		mov edx, dest
+		test edx, 3
+		jz '1f'
+		call panicUnaligned
+		jmp '2f'
+		1:
+		mov eax, delta
+		lock xadd [edx], eax
+		add eax, delta
+		mov result, eax
+		2:
+		; =r (result)
+		; r (dest)
+		  r (delta)
+		; eax
+		  edx
+		  memory
+	}
+	return result
+}
+
+// swap_i32 atomically replaces the i32 at dest with new and returns the previous value.
+// Sequentially consistent: implemented with `xchg`; the result is written to old in memory.
+// Panics (via panicUnaligned) if dest is not 4-byte aligned.
+pub fn swap_i32(dest &i32, new i32) i32 {
+	mut old := i32(0)
+	asm volatile i386 {
+		mov edx, dest
+		test edx, 3
+		jz '1f'
+		call panicUnaligned
+		jmp '2f'
+		1:
+		mov eax, new
+		xchg [edx], eax
+		mov old, eax
+		2:
+		; =m (old)
+		; r (dest)
+		  r (new)
+		; eax
+		  edx
+		  memory
+	}
+	return old
+}
+
+// store_i32 atomically writes value to the i32 at dest; nothing is returned.
+// Sequentially consistent: the store is an `xchg`, and the old value left in eax is discarded.
+// Panics (via panicUnaligned) if dest is not 4-byte aligned.
+pub fn store_i32(dest &i32, value i32) {
+	asm volatile i386 {
+		mov edx, dest
+		test edx, 3
+		jz '1f'
+		call panicUnaligned
+		jmp '2f'
+		1:
+		mov eax, value
+		xchg eax, [edx]
+		2:
+		; ; r (dest)
+		  r (value)
+		; eax
+		  edx
+		  memory
+	}
+}
+
+// load_i32 atomically reads and returns the i32 at num using a single aligned `mov`.
+// The operation is performed with sequential consistency (`memory` clobber bars reordering).
+// Panics (via panicUnaligned) if num is not 4-byte aligned.
+pub fn load_i32(num &i32) i32 {
+	mut out := i32(0)
+	asm volatile i386 {
+		mov edx, num
+		test edx, 3
+		jz '1f'
+		call panicUnaligned
+		jmp '2f'
+		1:
+		mov eax, [edx]
+		mov out, eax
+		2:
+		; =r (out)
+		; r (num)
+		; eax
+		  edx
+		  memory
+	}
+	return out
+}
+
+// cas_i32 performs a compare-and-swap on the i32 at addr.
+// If the current value at addr equals old, it atomically stores new (`lock cmpxchg`).
+// Returns true if the swap was performed (ZF captured with `sete`), false otherwise.
+// The operation is performed with sequential consistency.
+// Panics (via panicUnaligned) if addr is not 4-byte aligned.
+pub fn cas_i32(addr &i32, old i32, new i32) bool {
+	mut swapped := false
+	asm volatile i386 {
+		mov edx, addr
+		test edx, 3
+		jz '1f'
+		call panicUnaligned
+		jmp '2f'
+		1:
+		mov eax, old
+		mov ecx, new
+		lock cmpxchg [edx], ecx
+		sete al
+		mov swapped, al
+		2:
+		; =m (swapped)
+		; r (addr)
+		  r (old)
+		  r (new)
+		; eax
+		  ecx
+		  edx
+		  memory
+	}
+	return swapped
+}
+
+// store_i64 atomically stores value at dest with a single 8-byte MMX `movq`.
+// Sequentially consistent: after `emms`, a `lock xadd` of 0 on [esp] serves as the fence.
+// Requires MMX support. Panics (via panicUnaligned) if dest is not 8-byte aligned.
+pub fn store_i64(dest &i64, value i64) {
+	asm volatile i386 {
+		mov esi, dest
+		test esi, 7
+		jz '1f'
+		call panicUnaligned
+		jmp '2f'
+		1:
+		movq mm0, value
+		movq [esi], mm0
+		emms
+		xor eax, eax
+		lock xaddl [esp], eax
+		2:
+		; ; r (dest)
+		  m (value)
+		; esi
+		  eax
+		  mm0
+		  memory
+	}
+}
+
+// load_i64 atomically loads the i64 at num with a single 8-byte MMX `movq` and returns it.
+// The operation is performed with sequential consistency; `emms` follows the MMX moves.
+// Requires MMX support. Panics (via panicUnaligned) if num is not 8-byte aligned.
+pub fn load_i64(num &i64) i64 {
+	mut out := i64(0)
+	asm volatile i386 {
+		mov esi, num
+		test esi, 7
+		jz '1f'
+		call panicUnaligned
+		jmp '2f'
+		1:
+		movq mm0, [esi]
+		movq out, mm0
+		emms
+		2:
+		; =m (out)
+		; r (num)
+		; esi
+		  mm0
+		  memory
+	}
+	return out
+}
+
+// add_i64 atomically adds delta to the value at dest and returns the new value.
+// Uses a `lock cmpxchg8b` retry loop. The operation is performed with sequential consistency.
+// Panics if dest is not 8-byte aligned.
+pub fn add_i64(dest &i64, delta i64) i64 {
+	mut delta_lo := u32(u64(delta) & 0xFFFF_FFFF)
+	mut delta_hi := u32(u64(delta) >> 32)
+	mut res_lo := u32(0) // written by the asm block, so bound as a `+m` output below
+	mut res_hi := u32(0) // written by the asm block, so bound as a `+m` output below
+	asm volatile i386 {
+		mov esi, dest
+		test esi, 7
+		jz '1f'
+		call panicUnaligned
+		jmp '2f'
+		1:
+		3:
+		mov eax, [esi]
+		mov edx, [esi + 4]
+		mov ebx, eax
+		mov ecx, edx
+		add ebx, delta_lo
+		adc ecx, delta_hi
+		lock cmpxchg8b [esi]
+		jnz '3b'
+		mov res_lo, ebx
+		mov res_hi, ecx
+		2:
+		; +m (res_lo)
+		  +m (res_hi)
+		; r (dest)
+		  m (delta_lo)
+		  m (delta_hi)
+		; eax
+		  ebx
+		  ecx
+		  edx
+		  esi
+		  memory
+	}
+	return i64(u64(res_lo) | (u64(res_hi) << 32))
+}
+
+// swap_i64 atomically stores value at dest and returns the old value.
+// Uses a `lock cmpxchg8b` retry loop. The operation is performed with sequential consistency.
+// Panics if dest is not 8-byte aligned.
+pub fn swap_i64(dest &i64, value i64) i64 {
+	mut value_lo := u32(u64(value) & 0xFFFF_FFFF) // in: new low half, out: old low half
+	mut value_hi := u32(u64(value) >> 32) // in: new high half, out: old high half
+	asm volatile i386 {
+		mov esi, dest
+		test esi, 7
+		jz '1f'
+		call panicUnaligned
+		jmp '2f'
+		1:
+		3:
+		mov eax, [esi]
+		mov edx, [esi + 4]
+		mov ebx, value_lo
+		mov ecx, value_hi
+		lock cmpxchg8b [esi]
+		jnz '3b'
+		mov value_lo, eax
+		mov value_hi, edx
+		2:
+		; +m (value_lo)
+		  +m (value_hi)
+		; r (dest)
+		; eax
+		  ebx
+		  ecx
+		  edx
+		  esi
+		  memory
+	}
+	return i64(u64(value_lo) | (u64(value_hi) << 32))
+}
+
+// cas_i64 performs a compare-and-swap on the i64 at addr using `lock cmpxchg8b`.
+// If the current value at addr equals old (edx:eax), it atomically stores new (ecx:ebx).
+// Returns true if the swap was performed (ZF captured with `sete`), false otherwise.
+// The operation is performed with sequential consistency.
+// Panics (via panicUnaligned) if addr is not 8-byte aligned.
+pub fn cas_i64(addr &i64, old i64, new i64) bool {
+	mut swapped := false
+	mut old_lo := u32(u64(old) & 0xFFFF_FFFF)
+	mut old_hi := u32(u64(old) >> 32)
+	mut new_lo := u32(u64(new) & 0xFFFF_FFFF)
+	mut new_hi := u32(u64(new) >> 32)
+	asm volatile i386 {
+		mov esi, addr
+		test esi, 7
+		jz '1f'
+		call panicUnaligned
+		jmp '2f'
+		1:
+		mov eax, old_lo
+		mov edx, old_hi
+		mov ebx, new_lo
+		mov ecx, new_hi
+		lock cmpxchg8b [esi]
+		sete al
+		mov swapped, al
+		2:
+		; =m (swapped)
+		; r (addr)
+		  m (old_lo)
+		  m (old_hi)
+		  m (new_lo)
+		  m (new_hi)
+		; eax
+		  ebx
+		  ecx
+		  edx
+		  esi
+		  memory
+	}
+	return swapped
+}
+
+// add_u32 atomically adds delta to the u32 at dest and returns the new (post-add) value.
+// Sequentially consistent: `lock xadd` leaves the old value in eax, so delta is added once more.
+// Panics (via panicUnaligned) if dest is not 4-byte aligned.
+pub fn add_u32(dest &u32, delta u32) u32 {
+	mut result := u32(0)
+	asm volatile i386 {
+		mov edx, dest
+		test edx, 3
+		jz '1f'
+		call panicUnaligned
+		jmp '2f'
+		1:
+		mov eax, delta
+		lock xadd [edx], eax
+		add eax, delta
+		mov result, eax
+		2:
+		; =r (result)
+		; r (dest)
+		  r (delta)
+		; eax
+		  edx
+		  memory
+	}
+	return result
+}
+
+// swap_u32 atomically replaces the u32 at dest with new and returns the previous value.
+// Sequentially consistent: implemented with `xchg`; the result is written to old in memory.
+// Panics (via panicUnaligned) if dest is not 4-byte aligned.
+pub fn swap_u32(dest &u32, new u32) u32 {
+	mut old := u32(0)
+	asm volatile i386 {
+		mov edx, dest
+		test edx, 3
+		jz '1f'
+		call panicUnaligned
+		jmp '2f'
+		1:
+		mov eax, new
+		xchg [edx], eax
+		mov old, eax
+		2:
+		; =m (old)
+		; r (dest)
+		  r (new)
+		; eax
+		  edx
+		  memory
+	}
+	return old
+}
+
+// store_u32 atomically writes value to the u32 at dest; nothing is returned.
+// Sequentially consistent: the store is an `xchg`, and the old value left in eax is discarded.
+// Panics (via panicUnaligned) if dest is not 4-byte aligned.
+pub fn store_u32(dest &u32, value u32) {
+	asm volatile i386 {
+		mov edx, dest
+		test edx, 3
+		jz '1f'
+		call panicUnaligned
+		jmp '2f'
+		1:
+		mov eax, value
+		xchg eax, [edx]
+		2:
+		; ; r (dest)
+		  r (value)
+		; eax
+		  edx
+		  memory
+	}
+}
+
+// load_u32 atomically reads and returns the u32 at num using a single aligned `mov`.
+// The operation is performed with sequential consistency (`memory` clobber bars reordering).
+// Panics (via panicUnaligned) if num is not 4-byte aligned.
+pub fn load_u32(num &u32) u32 {
+	mut out := u32(0)
+	asm volatile i386 {
+		mov edx, num
+		test edx, 3
+		jz '1f'
+		call panicUnaligned
+		jmp '2f'
+		1:
+		mov eax, [edx]
+		mov out, eax
+		2:
+		; =r (out)
+		; r (num)
+		; eax
+		  edx
+		  memory
+	}
+	return out
+}
+
+// cas_u32 performs a compare-and-swap on the u32 at addr.
+// If the current value at addr equals old, it atomically stores new (`lock cmpxchg`).
+// Returns true if the swap was performed (ZF captured with `sete`), false otherwise.
+// The operation is performed with sequential consistency.
+// Panics (via panicUnaligned) if addr is not 4-byte aligned.
+pub fn cas_u32(addr &u32, old u32, new u32) bool {
+	mut swapped := false
+	asm volatile i386 {
+		mov edx, addr
+		test edx, 3
+		jz '1f'
+		call panicUnaligned
+		jmp '2f'
+		1:
+		mov eax, old
+		mov ecx, new
+		lock cmpxchg [edx], ecx
+		sete al
+		mov swapped, al
+		2:
+		; =m (swapped)
+		; r (addr)
+		  r (old)
+		  r (new)
+		; eax
+		  ecx
+		  edx
+		  memory
+	}
+	return swapped
+}
+
+// load_u64 atomically loads the u64 at num with a single 8-byte MMX `movq` and returns it.
+// The operation is performed with sequential consistency; `emms` follows the MMX moves.
+// Requires MMX support. Panics (via panicUnaligned) if num is not 8-byte aligned.
+pub fn load_u64(num &u64) u64 {
+	mut out := u64(0)
+	asm volatile i386 {
+		mov esi, num
+		test esi, 7
+		jz '1f'
+		call panicUnaligned
+		jmp '2f'
+		1:
+		movq mm0, [esi]
+		movq out, mm0
+		emms
+		2:
+		; =m (out)
+		; r (num)
+		; esi
+		  mm0
+		  memory
+	}
+	return out
+}
+
+// store_u64 atomically stores value at dest (sequentially consistent) with an MMX `movq`,
+// then fences via `lock xadd`. Requires MMX support. Panics if dest is not 8-byte aligned.
+pub fn store_u64(dest &u64, value u64) {
+	asm volatile i386 {
+		mov esi, dest
+		test esi, 7
+		jz '1f'
+		call panicUnaligned
+		jmp '2f'
+		1:
+		movq mm0, value
+		movq [esi], mm0
+		emms
+		xor eax, eax
+		lock xaddl [esp], eax
+		2:
+		; ; r (dest)
+		  m (value)
+		; esi
+		  eax
+		  mm0
+		  memory
+	}
+}
+
+// add_u64 atomically adds delta to the value at dest and returns the new value.
+// Uses a `lock cmpxchg8b` retry loop. The operation is performed with sequential consistency.
+// Panics if dest is not 8-byte aligned.
+pub fn add_u64(dest &u64, delta u64) u64 {
+	mut delta_lo := u32(delta & 0xFFFF_FFFF)
+	mut delta_hi := u32(delta >> 32)
+	mut res_lo := u32(0) // written by the asm block, so bound as a `+m` output below
+	mut res_hi := u32(0) // written by the asm block, so bound as a `+m` output below
+	asm volatile i386 {
+		mov esi, dest
+		test esi, 7
+		jz '1f'
+		call panicUnaligned
+		jmp '2f'
+		1:
+		3:
+		mov eax, [esi]
+		mov edx, [esi + 4]
+		mov ebx, eax
+		mov ecx, edx
+		add ebx, delta_lo
+		adc ecx, delta_hi
+		lock cmpxchg8b [esi]
+		jnz '3b'
+		mov res_lo, ebx
+		mov res_hi, ecx
+		2:
+		; +m (res_lo)
+		  +m (res_hi)
+		; r (dest)
+		  m (delta_lo)
+		  m (delta_hi)
+		; eax
+		  ebx
+		  ecx
+		  edx
+		  esi
+		  memory
+	}
+	return u64(res_lo) | (u64(res_hi) << 32)
+}
+
+// swap_u64 atomically stores value at dest and returns the old value.
+// Uses a `lock cmpxchg8b` retry loop. The operation is performed with sequential consistency.
+// Panics if dest is not 8-byte aligned.
+pub fn swap_u64(dest &u64, value u64) u64 {
+	mut old := u64(0)
+	mut value_lo := u32(value & 0xFFFF_FFFF) // in: new low half, out: old low half
+	mut value_hi := u32(value >> 32) // in: new high half, out: old high half
+	asm volatile i386 {
+		mov esi, dest
+		test esi, 7
+		jz '1f'
+		call panicUnaligned
+		jmp '2f'
+		1:
+		3:
+		mov eax, [esi]
+		mov edx, [esi + 4]
+		mov ebx, value_lo
+		mov ecx, value_hi
+		lock cmpxchg8b [esi]
+		jnz '3b'
+		mov value_lo, eax
+		mov value_hi, edx
+		2:
+		; +m (value_lo)
+		  +m (value_hi)
+		; r (dest)
+		; eax
+		  ebx
+		  ecx
+		  edx
+		  esi
+		  memory
+	}
+	old = u64(value_lo) | (u64(value_hi) << 32)
+	return old
+}
+
+// cas_u64 performs a compare-and-swap on the u64 at addr using `lock cmpxchg8b`.
+// If the current value at addr equals old (edx:eax), it atomically stores new (ecx:ebx).
+// Returns true if the swap was performed (ZF captured with `sete`), false otherwise.
+// The operation is performed with sequential consistency.
+// Panics (via panicUnaligned) if addr is not 8-byte aligned.
+pub fn cas_u64(addr &u64, old u64, new u64) bool {
+	mut swapped := false
+	mut old_lo := u32(old & 0xFFFF_FFFF)
+	mut old_hi := u32(old >> 32)
+	mut new_lo := u32(new & 0xFFFF_FFFF)
+	mut new_hi := u32(new >> 32)
+	asm volatile i386 {
+		mov esi, addr
+		test esi, 7
+		jz '1f'
+		call panicUnaligned
+		jmp '2f'
+		1:
+		mov eax, old_lo
+		mov edx, old_hi
+		mov ebx, new_lo
+		mov ecx, new_hi
+		lock cmpxchg8b [esi]
+		sete al
+		mov swapped, al
+		2:
+		; =m (swapped)
+		; r (addr)
+		  m (old_lo)
+		  m (old_hi)
+		  m (new_lo)
+		  m (new_hi)
+		; eax
+		  ebx
+		  ecx
+		  edx
+		  esi
+		  memory
+	}
+	return swapped
+}
diff --git a/vlib/x/atomics/benchmarks/README.md b/vlib/x/atomics/benchmarks/README.md
new file mode 100644
index 000000000..8e14d8cda
--- /dev/null
+++ b/vlib/x/atomics/benchmarks/README.md
@@ -0,0 +1,123 @@
+### Environment
+
+- CPU: AMD Ryzen 9 9950X3D (16C / 32T)
+- RAM: 64 GiB
+- OS: EndeavourOS (Linux, kernel 6.18.6-arch1-1)
+- Compiler:
+  - amd64: v -prod -cc gcc -gc none
+  - i386: v -keepc -cc i686-linux-gnu-gcc -prod -m32 -arch i386 -cflags -mmmx -w -gc none
+
+### How to run
+
+```bash
+# amd64
+v -prod -cc gcc -gc none run benchmarks/atomic_benchmark.v
+
+# i386
+v -keepc -cc i686-linux-gnu-gcc -prod -m32 -arch i386 -cflags -mmmx -w -gc none run benchmarks/atomic_benchmark.v
+```
+
+### Results (ns/op, 100M iterations)
+
+```
+AMD64
+=====
+
+Command:
+v -prod -cc gcc -gc none run atomic_benchmark.v
+
+u64 store std: 3.788 ns/op (total: 378.783ms, iters: 100000000)
+u64 store custom: 3.773 ns/op (total: 377.301ms, iters: 100000000)
+u64 load std: 1.078 ns/op (total: 107.848ms, iters: 100000000)
+u64 load custom: 1.084 ns/op (total: 108.381ms, iters: 100000000)
+u64 add std: 3.601 ns/op (total: 360.067ms, iters: 100000000)
+u64 add custom: 3.782 ns/op (total: 378.213ms, iters: 100000000)
+u64 swap std (exchange): 3.805 ns/op (total: 380.520ms, iters: 100000000)
+u64 swap custom: 3.835 ns/op (total: 383.493ms, iters: 100000000)
+u64 cas std: 3.824 ns/op (total: 382.391ms, iters: 100000000)
+u64 cas custom: 3.783 ns/op (total: 378.264ms, iters: 100000000)
+
+u32 store std: 3.783 ns/op (total: 378.346ms, iters: 100000000)
+u32 store custom: 3.822 ns/op (total: 382.245ms, iters: 100000000)
+u32 load std: 1.084 ns/op (total: 108.427ms, iters: 100000000)
+u32 load custom: 1.085 ns/op (total: 108.536ms, iters: 100000000)
+u32 add std: 3.663 ns/op (total: 366.308ms, iters: 100000000)
+u32 add custom: 3.857 ns/op (total: 385.722ms, iters: 100000000)
+u32 swap std (exchange): 3.855 ns/op (total: 385.503ms, iters: 100000000)
+u32 swap custom: 3.859 ns/op (total: 385.892ms, iters: 100000000)
+u32 cas std: 3.87 ns/op (total: 387.025ms, iters: 100000000)
+u32 cas custom: 3.837 ns/op (total: 383.680ms, iters: 100000000)
+
+i64 store std (via u64): 3.784 ns/op (total: 378.377ms, iters: 100000000)
+i64 store custom: 3.79 ns/op (total: 378.993ms, iters: 100000000)
+i64 load std (via u64): 0.935 ns/op (total: 93.519ms, iters: 100000000)
+i64 load custom: 0.864 ns/op (total: 86.350ms, iters: 100000000)
+i64 add std (via u64): 3.608 ns/op (total: 360.752ms, iters: 100000000)
+i64 add custom: 3.843 ns/op (total: 384.319ms, iters: 100000000)
+i64 swap std (exchange u64): 3.826 ns/op (total: 382.621ms, iters: 100000000)
+i64 swap custom: 3.835 ns/op (total: 383.513ms, iters: 100000000)
+i64 cas std (via u64): 3.84 ns/op (total: 383.988ms, iters: 100000000)
+i64 cas custom: 3.847 ns/op (total: 384.733ms, iters: 100000000)
+
+i32 store std (via u32): 3.84 ns/op (total: 383.983ms, iters: 100000000)
+i32 store custom: 3.841 ns/op (total: 384.068ms, iters: 100000000)
+i32 load std (via u32): 1.1 ns/op (total: 109.956ms, iters: 100000000)
+i32 load custom: 1.1 ns/op (total: 109.978ms, iters: 100000000)
+i32 add std (via u32): 3.659 ns/op (total: 365.907ms, iters: 100000000)
+i32 add custom: 3.846 ns/op (total: 384.574ms, iters: 100000000)
+i32 swap std (exchange u32): 3.848 ns/op (total: 384.830ms, iters: 100000000)
+i32 swap custom: 3.836 ns/op (total: 383.562ms, iters: 100000000)
+i32 cas std (via u32): 3.837 ns/op (total: 383.690ms, iters: 100000000)
+i32 cas custom: 3.815 ns/op (total: 381.453ms, iters: 100000000)
+
+
+I386
+====
+
+Command:
+v -keepc -cc i686-linux-gnu-gcc -prod -m32 -arch i386 -cflags -mmmx -w -gc none run benchmarks/atomic_benchmark.v
+
+u64 store std: 9.575 ns/op (total: 957.485ms, iters: 100000000)
+u64 store custom: 7.703 ns/op (total: 770.251ms, iters: 100000000)
+u64 load std: 1.769 ns/op (total: 176.860ms, iters: 100000000)
+u64 load custom: 1.892 ns/op (total: 189.238ms, iters: 100000000)
+u64 add std: 5.544 ns/op (total: 554.431ms, iters: 100000000)
+u64 add custom: 5.31 ns/op (total: 530.964ms, iters: 100000000)
+u64 swap std: 5.32 ns/op (total: 531.988ms, iters: 100000000)
+u64 swap custom: 5.242 ns/op (total: 524.175ms, iters: 100000000)
+u64 cas std: 4.948 ns/op (total: 494.824ms, iters: 100000000)
+u64 cas custom: 5.268 ns/op (total: 526.833ms, iters: 100000000)
+
+u32 store std: 3.896 ns/op (total: 389.574ms, iters: 100000000)
+u32 store custom: 4.067 ns/op (total: 406.712ms, iters: 100000000)
+u32 load std: 1.132 ns/op (total: 113.242ms, iters: 100000000)
+u32 load custom: 1.135 ns/op (total: 113.523ms, iters: 100000000)
+u32 add std: 3.951 ns/op (total: 395.090ms, iters: 100000000)
+u32 add custom: 4.141 ns/op (total: 414.139ms, iters: 100000000)
+u32 swap std: 4.136 ns/op (total: 413.586ms, iters: 100000000)
+u32 swap custom: 4.138 ns/op (total: 413.812ms, iters: 100000000)
+u32 cas std: 4.136 ns/op (total: 413.644ms, iters: 100000000)
+u32 cas custom: 4.705 ns/op (total: 470.505ms, iters: 100000000)
+
+i64 store std: 9.643 ns/op (total: 964.327ms, iters: 100000000)
+i64 store custom: 7.373 ns/op (total: 737.327ms, iters: 100000000)
+i64 load std: 1.7 ns/op (total: 169.983ms, iters: 100000000)
+i64 load custom: 1.824 ns/op (total: 182.396ms, iters: 100000000)
+i64 add std: 5.27 ns/op (total: 526.950ms, iters: 100000000)
+i64 add custom: 5.268 ns/op (total: 526.833ms, iters: 100000000)
+i64 swap std: 4.917 ns/op (total: 491.690ms, iters: 100000000)
+i64 swap custom: 5.095 ns/op (total: 509.546ms, iters: 100000000)
+i64 cas std: 4.931 ns/op (total: 493.122ms, iters: 100000000)
+i64 cas custom: 5.268 ns/op (total: 526.774ms, iters: 100000000)
+
+i32 store std: 4.018 ns/op (total: 401.776ms, iters: 100000000)
+i32 store custom: 4.138 ns/op (total: 413.820ms, iters: 100000000)
+i32 load std: 1.132 ns/op (total: 113.185ms, iters: 100000000)
+i32 load custom: 1.134 ns/op (total: 113.359ms, iters: 100000000)
+i32 add std: 3.949 ns/op (total: 394.853ms, iters: 100000000)
+i32 add custom: 4.139 ns/op (total: 413.947ms, iters: 100000000)
+i32 swap std: 4.135 ns/op (total: 413.485ms, iters: 100000000)
+i32 swap custom: 4.136 ns/op (total: 413.623ms, iters: 100000000)
+i32 cas std: 4.137 ns/op (total: 413.702ms, iters: 100000000)
+i32 cas custom: 4.704 ns/op (total: 470.402ms, iters: 100000000)
+```
\ No newline at end of file
diff --git a/vlib/x/atomics/benchmarks/atomic_benchmark.v b/vlib/x/atomics/benchmarks/atomic_benchmark.v
new file mode 100644
index 000000000..5e2752983
--- /dev/null
+++ b/vlib/x/atomics/benchmarks/atomic_benchmark.v
@@ -0,0 +1,376 @@
+module main
+
+import x.atomics
+import time
+
+$if windows {
+	#include "@VEXEROOT/thirdparty/stdatomic/win/atomic.h"
+} $else {
+	#include "@VEXEROOT/thirdparty/stdatomic/nix/atomic.h"
+}
+
+fn C.atomic_store_u32(voidptr, u32)
+fn C.atomic_load_u32(voidptr) u32
+fn C.atomic_fetch_add_u32(voidptr, u32) u32
+fn C.atomic_compare_exchange_strong_u32(voidptr, voidptr, u32) bool
+fn C.atomic_exchange_u32(voidptr, u32) u32
+
+fn C.atomic_store_u64(voidptr, u64)
+fn C.atomic_load_u64(voidptr) u64
+fn C.atomic_fetch_add_u64(voidptr, u64) u64
+fn C.atomic_compare_exchange_strong_u64(voidptr, voidptr, u64) bool
+fn C.atomic_exchange_u64(voidptr, u64) u64
+
+const iterations = 100_000_000 // timed calls per benchmark; main passes it to every bench_* run
+
+fn keepalive_u64(x u64) { // hands x to the asm block below as an `r` input operand (presumably so it counts as used)
+	asm volatile amd64 {
+		nop
+		; ; r (x)
+	}
+}
+
+fn keepalive_u32(x u32) { // hands x to the asm block below as an `r` input operand (presumably so it counts as used)
+	asm volatile amd64 {
+		nop
+		; ; r (x)
+	}
+}
+
+fn keepalive_i64(x i64) { // hands x to the asm block below as an `r` input operand (presumably so it counts as used)
+	asm volatile amd64 {
+		nop
+		; ; r (x)
+	}
+}
+
+fn keepalive_i32(x i32) { // hands x to the asm block below as an `r` input operand (presumably so it counts as used)
+	asm volatile amd64 {
+		nop
+		; ; r (x)
+	}
+}
+
+fn bench_u64(name string, f fn (&u64, u64), iters int) {
+	// Single cell that every call of `f` receives by address.
+	mut cell := u64(0)
+	// Warm-up: 100_000 calls that are not timed.
+	for warm in 0 .. 100_000 {
+		f(&cell, u64(warm))
+	}
+	// Timed section: `iters` calls measured with one stopwatch.
+	mut timer := time.new_stopwatch()
+	for n in 0 .. iters {
+		f(&cell, u64(n))
+	}
+	elapsed := timer.elapsed()
+	calls := f64(iters)
+	ns_per_op := f64(elapsed.nanoseconds()) / calls
+	keepalive_u64(cell)
+	println('${name:-17s}: ${ns_per_op:6.3f} ns/op (total: ${elapsed:9}, iters: ${iters})')
+}
+
+fn bench_u32(name string, f fn (&u32, u32), iters int) {
+	// Single cell that every call of `f` receives by address.
+	mut cell := u32(0)
+	// Warm-up: 100_000 calls that are not timed.
+	for warm in 0 .. 100_000 {
+		f(&cell, u32(warm))
+	}
+	// Timed section: `iters` calls measured with one stopwatch.
+	mut timer := time.new_stopwatch()
+	for n in 0 .. iters {
+		f(&cell, u32(n))
+	}
+	elapsed := timer.elapsed()
+	calls := f64(iters)
+	ns_per_op := f64(elapsed.nanoseconds()) / calls
+	keepalive_u32(cell)
+	println('${name:-17s}: ${ns_per_op:6.3f} ns/op (total: ${elapsed:9}, iters: ${iters})')
+}
+
+fn bench_i64(name string, f fn (&i64, i64), iters int) {
+	// Single cell that every call of `f` receives by address.
+	mut cell := i64(0)
+	// Warm-up: 100_000 calls that are not timed.
+	for warm in 0 .. 100_000 {
+		f(&cell, i64(warm))
+	}
+	// Timed section: `iters` calls measured with one stopwatch.
+	mut timer := time.new_stopwatch()
+	for n in 0 .. iters {
+		f(&cell, i64(n))
+	}
+	elapsed := timer.elapsed()
+	calls := f64(iters)
+	ns_per_op := f64(elapsed.nanoseconds()) / calls
+	keepalive_i64(cell)
+	println('${name:-17s}: ${ns_per_op:6.3f} ns/op (total: ${elapsed:9}, iters: ${iters})')
+}
+
+fn bench_i32(name string, f fn (&i32, i32), iters int) {
+	// Single cell that every call of `f` receives by address.
+	mut cell := i32(0)
+	// Warm-up: 100_000 calls that are not timed.
+	for warm in 0 .. 100_000 {
+		f(&cell, i32(warm))
+	}
+	// Timed section: `iters` calls measured with one stopwatch.
+	mut timer := time.new_stopwatch()
+	for n in 0 .. iters {
+		f(&cell, i32(n))
+	}
+	elapsed := timer.elapsed()
+	calls := f64(iters)
+	ns_per_op := f64(elapsed.nanoseconds()) / calls
+	keepalive_i32(cell)
+	println('${name:-17s}: ${ns_per_op:6.3f} ns/op (total: ${elapsed:9}, iters: ${iters})')
+}
+
+fn std_store_u64(addr &u64, val u64) {
+	C.atomic_store_u64(voidptr(addr), val) // reference path: C stdatomic store
+}
+
+fn custom_store_u64(addr &u64, val u64) {
+	atomics.store_u64(addr, val) // path under test: x.atomics store
+}
+
+fn std_load_u64(addr &u64, _ u64) {
+	_ = C.atomic_load_u64(voidptr(addr)) // loaded value is discarded; second parameter is unused
+}
+
+fn custom_load_u64(addr &u64, _ u64) {
+	_ = atomics.load_u64(addr) // loaded value is discarded; second parameter is unused
+}
+
+fn std_add_u64(addr &u64, delta u64) {
+	_ = C.atomic_fetch_add_u64(voidptr(addr), delta) // reference path; result is discarded
+}
+
+fn custom_add_u64(addr &u64, delta u64) {
+	_ = atomics.add_u64(addr, delta) // path under test; result is discarded
+}
+
+fn std_swap_u64(addr &u64, val u64) {
+	_ = C.atomic_exchange_u64(voidptr(addr), val) // reference path; result is discarded
+}
+
+fn custom_swap_u64(addr &u64, val u64) {
+	_ = atomics.swap_u64(addr, val) // path under test; result is discarded
+}
+
+fn std_cas_u64(addr &u64, val u64) {
+	mut expected := u64(0) // the comparison value is always 0
+	_ = C.atomic_compare_exchange_strong_u64(voidptr(addr), voidptr(&expected), val) // success flag is discarded
+}
+
+fn custom_cas_u64(addr &u64, val u64) {
+	_ = atomics.cas_u64(addr, 0, val) // compares against 0; success flag is discarded
+}
+
+fn std_store_u32(addr &u32, val u32) {
+	C.atomic_store_u32(voidptr(addr), val) // reference path: C stdatomic store
+}
+
+fn custom_store_u32(addr &u32, val u32) {
+	atomics.store_u32(addr, val) // path under test: x.atomics store
+}
+
+fn std_load_u32(addr &u32, _ u32) {
+	_ = C.atomic_load_u32(voidptr(addr)) // loaded value is discarded; second parameter is unused
+}
+
+fn custom_load_u32(addr &u32, _ u32) {
+	_ = atomics.load_u32(addr) // loaded value is discarded; second parameter is unused
+}
+
+fn std_add_u32(addr &u32, delta u32) {
+	_ = C.atomic_fetch_add_u32(voidptr(addr), delta) // reference path; result is discarded
+}
+
+fn custom_add_u32(addr &u32, delta u32) {
+	_ = atomics.add_u32(addr, delta) // path under test; result is discarded
+}
+
+fn std_swap_u32(addr &u32, val u32) {
+	_ = C.atomic_exchange_u32(voidptr(addr), val) // reference path; result is discarded
+}
+
+fn custom_swap_u32(addr &u32, val u32) {
+	_ = atomics.swap_u32(addr, val) // path under test; result is discarded
+}
+
+fn std_cas_u32(addr &u32, val u32) {
+	mut expected := u32(0) // the comparison value is always 0
+	_ = C.atomic_compare_exchange_strong_u32(voidptr(addr), voidptr(&expected), val) // success flag is discarded
+}
+
+fn custom_cas_u32(addr &u32, val u32) {
+	_ = atomics.cas_u32(addr, 0, val) // compares against 0; success flag is discarded
+}
+
+fn std_store_i64(addr &i64, val i64) {
+	unsafe { C.atomic_store_u64(voidptr(addr), u64(val)) } // reference path: uses the u64 C entry point, val cast to u64
+}
+
+fn custom_store_i64(addr &i64, val i64) {
+	atomics.store_i64(addr, val) // path under test: x.atomics store
+}
+
+fn std_load_i64(addr &i64, _ i64) {
+	unsafe {
+		_ = C.atomic_load_u64(voidptr(addr)) // u64 C entry point; value discarded, second parameter unused
+	}
+}
+
+fn custom_load_i64(addr &i64, _ i64) {
+	_ = atomics.load_i64(addr) // loaded value is discarded; second parameter is unused
+}
+
+fn std_add_i64(addr &i64, delta i64) {
+	unsafe {
+		_ = C.atomic_fetch_add_u64(voidptr(addr), u64(delta)) // u64 C entry point, delta cast to u64; result discarded
+	}
+}
+
+fn custom_add_i64(addr &i64, delta i64) {
+	_ = atomics.add_i64(addr, delta) // path under test; result is discarded
+}
+
+fn std_swap_i64(addr &i64, val i64) {
+	unsafe {
+		_ = C.atomic_exchange_u64(voidptr(addr), u64(val)) // u64 C entry point, val cast to u64; result discarded
+	}
+}
+
+fn custom_swap_i64(addr &i64, val i64) {
+	_ = atomics.swap_i64(addr, val) // path under test; result is discarded
+}
+
+fn std_cas_i64(addr &i64, val i64) {
+	unsafe {
+		mut expected := u64(0) // the comparison value is always 0
+		_ = C.atomic_compare_exchange_strong_u64(voidptr(addr), voidptr(&expected), u64(val)) // success flag is discarded
+	}
+}
+
+fn custom_cas_i64(addr &i64, val i64) {
+	_ = atomics.cas_i64(addr, 0, val) // compares against 0; success flag is discarded
+}
+
+fn std_store_i32(addr &i32, val i32) {
+	unsafe { C.atomic_store_u32(voidptr(addr), u32(val)) } // reference path: uses the u32 C entry point, val cast to u32
+}
+
+fn custom_store_i32(addr &i32, val i32) {
+	atomics.store_i32(addr, val) // path under test: x.atomics store
+}
+
+fn std_load_i32(addr &i32, _ i32) {
+	unsafe {
+		_ = C.atomic_load_u32(voidptr(addr)) // u32 C entry point; value discarded, second parameter unused
+	}
+}
+
+fn custom_load_i32(addr &i32, _ i32) {
+	_ = atomics.load_i32(addr) // loaded value is discarded; second parameter is unused
+}
+
+fn std_add_i32(addr &i32, delta i32) {
+	unsafe {
+		_ = C.atomic_fetch_add_u32(voidptr(addr), u32(delta)) // u32 C entry point, delta cast to u32; result discarded
+	}
+}
+
+fn custom_add_i32(addr &i32, delta i32) {
+	_ = atomics.add_i32(addr, delta) // path under test; result is discarded
+}
+
+fn std_swap_i32(addr &i32, val i32) {
+	unsafe {
+		_ = C.atomic_exchange_u32(voidptr(addr), u32(val)) // u32 C entry point, val cast to u32; result discarded
+	}
+}
+
+fn custom_swap_i32(addr &i32, val i32) {
+	_ = atomics.swap_i32(addr, val) // path under test; result is discarded
+}
+
+fn std_cas_i32(addr &i32, val i32) {
+	unsafe {
+		mut expected := u32(0) // the comparison value is always 0
+		_ = C.atomic_compare_exchange_strong_u32(voidptr(addr), voidptr(&expected), u32(val)) // success flag is discarded
+	}
+}
+
+fn custom_cas_i32(addr &i32, val i32) {
+	_ = atomics.cas_i32(addr, 0, val) // compares against 0; success flag is discarded
+}
+
+fn main() {
+	bench_u64('u64 store std', std_store_u64, iterations) // "std" rows call the C stdatomic functions
+	bench_u64('u64 store custom', custom_store_u64, iterations) // "custom" rows call x.atomics
+
+	bench_u64('u64 load std', std_load_u64, iterations)
+	bench_u64('u64 load custom', custom_load_u64, iterations)
+
+	bench_u64('u64 add std', std_add_u64, iterations)
+	bench_u64('u64 add custom', custom_add_u64, iterations)
+
+	bench_u64('u64 swap std', std_swap_u64, iterations)
+	bench_u64('u64 swap custom', custom_swap_u64, iterations)
+
+	bench_u64('u64 cas std', std_cas_u64, iterations)
+	bench_u64('u64 cas custom', custom_cas_u64, iterations)
+
+	println('') // empty output line between the per-type groups
+
+	bench_u32('u32 store std', std_store_u32, iterations)
+	bench_u32('u32 store custom', custom_store_u32, iterations)
+
+	bench_u32('u32 load std', std_load_u32, iterations)
+	bench_u32('u32 load custom', custom_load_u32, iterations)
+
+	bench_u32('u32 add std', std_add_u32, iterations)
+	bench_u32('u32 add custom', custom_add_u32, iterations)
+
+	bench_u32('u32 swap std', std_swap_u32, iterations)
+	bench_u32('u32 swap custom', custom_swap_u32, iterations)
+
+	bench_u32('u32 cas std', std_cas_u32, iterations)
+	bench_u32('u32 cas custom', custom_cas_u32, iterations)
+
+	println('')
+
+	bench_i64('i64 store std', std_store_i64, iterations)
+	bench_i64('i64 store custom', custom_store_i64, iterations)
+
+	bench_i64('i64 load std', std_load_i64, iterations)
+	bench_i64('i64 load custom', custom_load_i64, iterations)
+
+	bench_i64('i64 add std', std_add_i64, iterations)
+	bench_i64('i64 add custom', custom_add_i64, iterations)
+
+	bench_i64('i64 swap std', std_swap_i64, iterations)
+	bench_i64('i64 swap custom', custom_swap_i64, iterations)
+
+	bench_i64('i64 cas std', std_cas_i64, iterations)
+	bench_i64('i64 cas custom', custom_cas_i64, iterations)
+
+	println('')
+
+	bench_i32('i32 store std', std_store_i32, iterations)
+	bench_i32('i32 store custom', custom_store_i32, iterations)
+
+	bench_i32('i32 load std', std_load_i32, iterations)
+	bench_i32('i32 load custom', custom_load_i32, iterations)
+
+	bench_i32('i32 add std', std_add_i32, iterations)
+	bench_i32('i32 add custom', custom_add_i32, iterations)
+
+	bench_i32('i32 swap std', std_swap_i32, iterations)
+	bench_i32('i32 swap custom', custom_swap_i32, iterations)
+
+	bench_i32('i32 cas std', std_cas_i32, iterations)
+	bench_i32('i32 cas custom', custom_cas_i32, iterations)
+}
diff --git a/vlib/x/atomics/examples/basic.v b/vlib/x/atomics/examples/basic.v
new file mode 100644
index 000000000..3cac83097
--- /dev/null
+++ b/vlib/x/atomics/examples/basic.v
@@ -0,0 +1,23 @@
+module main
+
+import x.atomics
+
+fn main() {
+	// Single i32 cell that every call below operates on
+	mut value := i32(0)
+
+	// Atomically store 42 into the cell
+	atomics.store_i32(&value, 42)
+
+	// Atomically read the cell back
+	loaded := atomics.load_i32(&value)
+	println('Loaded value: ${loaded}') // prints "Loaded value: 42"
+
+	// Atomic add: returns the new value after addition (42 + 10)
+	new_value := atomics.add_i32(&value, 10)
+	println('After add: ${new_value}') // prints "After add: 52"
+
+	// Atomic swap: stores 100 and returns the value that was there before
+	old := atomics.swap_i32(&value, 100)
+	println('Old value: ${old}, new value: ${atomics.load_i32(&value)}') // prints "Old value: 52, new value: 100"
+}
diff --git a/vlib/x/atomics/examples/counter.v b/vlib/x/atomics/examples/counter.v
new file mode 100644
index 000000000..8e7299e1d
--- /dev/null
+++ b/vlib/x/atomics/examples/counter.v
@@ -0,0 +1,40 @@
+module main
+
+import x.atomics
+
+fn increment(counter &i64) {
+	atomics.add_i64(counter, 1) // atomic +1; the returned value is not used
+}
+
+fn worker(counter &i64, iterations int) {
+	for done := 0; done < iterations; done++ {
+		increment(counter) // one atomic increment per pass
+	}
+}
+
+fn main() {
+	// One shared total, bumped by several threads at once.
+	mut total := i64(0)
+	thread_count := 4
+	per_thread := 10_000
+
+	// Launch the workers; each one receives the address of the same total.
+	mut handles := []thread{}
+	for _ in 0 .. thread_count {
+		handles << spawn worker(&total, per_thread)
+	}
+	handles.wait()
+
+	// Every increment must be accounted for once all workers have finished.
+	expected := i64(thread_count * per_thread)
+	actual := atomics.load_i64(&total)
+	println('Expected: ${expected}')
+	println('Actual:   ${actual}')
+
+	// Report the verdict, handling the failure branch first.
+	if actual != expected {
+		println('Counter mismatch - race condition detected')
+	} else {
+		println('Counter is correct')
+	}
+}
diff --git a/vlib/x/atomics/examples/spinlock.v b/vlib/x/atomics/examples/spinlock.v
new file mode 100644
index 000000000..3e4cf274a
--- /dev/null
+++ b/vlib/x/atomics/examples/spinlock.v
@@ -0,0 +1,59 @@
+module main
+
+import x.atomics
+
+struct SpinLock {
+mut:
+	state u32 // 0 = unlocked, 1 = locked; changed via atomics.cas_u32 (acquire) and atomics.store_u32 (release)
+}
+
+fn acquire(mut spinlock SpinLock) {
+	for atomics.cas_u32(&spinlock.state, 0, 1) == false {
+		// Spin: the 0 -> 1 transition failed, so the lock is still taken
+	}
+}
+
+fn release(mut spinlock SpinLock) {
+	atomics.store_u32(&spinlock.state, 0) // 0 = unlocked, so a waiting acquire() can succeed
+}
+
+struct SharedData {
+mut:
+	spinlock SpinLock // taken by worker() around every update of `value`
+	value    int      // plain (non-atomic) counter, incremented only while the spinlock is held
+}
+
+fn worker(mut data SharedData, iterations int) {
+	for round := 0; round < iterations; round++ {
+		acquire(mut data.spinlock) // enter the critical section
+		data.value += 1
+		release(mut data.spinlock) // leave it so another worker can proceed
+	}
+}
+
+fn main() {
+	// Reference to the state that all workers mutate.
+	mut data := &SharedData{}
+	thread_count := 4
+	rounds_each := 10_000
+
+	// Start the workers; they all contend for the same spinlock.
+	mut handles := []thread{}
+	for _ in 0 .. thread_count {
+		handles << spawn worker(mut data, rounds_each)
+	}
+	handles.wait()
+
+	// With correct locking no increment of `value` is lost.
+	expected := thread_count * rounds_each
+	actual := data.value
+	println('Expected: ${expected}')
+	println('Actual:   ${actual}')
+
+	// Report the verdict, handling the failure branch first.
+	if actual != expected {
+		println('Race condition detected')
+	} else {
+		println('Spinlock works correctly')
+	}
+}
diff --git a/vlib/x/atomics/i32_test.v b/vlib/x/atomics/i32_test.v
new file mode 100644
index 000000000..c6d3264b0
--- /dev/null
+++ b/vlib/x/atomics/i32_test.v
@@ -0,0 +1,189 @@
+// vtest build: !(macos || windows)
+
+module atomics
+
+fn test_cas_i32_basic() {
+	mut cell := i32(10) // equals the expected operand, so the swap must happen
+	swapped := cas_i32(&cell, 10, 20)
+	assert swapped
+	assert cell == 20
+}
+
+fn test_cas_fail() {
+	mut cell := i32(5) // 5 != 10, so the CAS must be rejected
+	assert cas_i32(&cell, 10, 42) == false
+	assert cell == 5
+}
+
+fn test_cas_fail_memory_unchanged() {
+	mut cell := i32(7)
+	_ = cas_i32(&cell, 1, 2) // outcome ignored; only the stored value is checked
+	assert cell == 7
+}
+
+fn test_cas_exact_match() {
+	mut cell := i32(-1) // -1 is not the expected 0, so nothing may be written
+	assert cas_i32(&cell, 0, 999) == false
+	assert cell == -1
+}
+
+fn test_cas_twice() {
+	mut cell := i32(1)
+	assert cas_i32(&cell, 1, 2) == true // first step: 1 -> 2
+	assert cas_i32(&cell, 2, 3) == true // second step sees the value the first one wrote
+	assert cell == 3
+}
+
+fn test_cas_with_negative() {
+	mut cell := i32(-123) // negative expected operand
+	assert cas_i32(&cell, -123, 8) == true
+	assert cell == 8
+}
+
+fn test_add_i32_basic() {
+	mut sum := i32(0)
+	for step := 0; step < 1000; step++ {
+		add_i32(&sum, 1) // 1000 single-threaded increments
+	}
+	assert sum == 1000
+}
+
+fn test_add_i32_negative() {
+	mut cell := i32(10)
+	_ = add_i32(&cell, -3) // a negative delta subtracts
+	assert cell == 7
+}
+
+fn test_add_i32_return_value() {
+	mut cell := i32(5)
+	result := add_i32(&cell, 7) // add must return the updated value, not the previous one
+	assert result == 12
+	assert cell == 12
+}
+
+fn test_add_i32_overflow_wraps() {
+	mut cell := i32(2147483647) // largest i32
+	_ = add_i32(&cell, 1)
+	assert cell == -2147483648
+}
+
+fn test_swap_i32_basic() {
+	mut cell := i32(5)
+	previous := swap_i32(&cell, 99) // swap must hand back what was stored before
+	assert previous == 5
+	assert cell == 99
+}
+
+fn test_swap_i32_twice() {
+	mut cell := i32(1)
+	assert swap_i32(&cell, 2) == 1 // first swap returns the initial value
+	assert swap_i32(&cell, 3) == 2 // second swap returns what the first one stored
+	assert cell == 3
+}
+
+fn test_swap_i32_with_cas() {
+	mut cell := i32(10)
+	assert cas_i32(&cell, 10, 20) == true
+	previous := swap_i32(&cell, 30) // must observe the value written by the CAS
+	assert previous == 20
+	assert cell == 30
+}
+
+fn test_load_i32_basic() {
+	mut cell := i32(123456)
+	assert load_i32(&cell) == 123456 // load must return the stored value
+}
+
+fn test_store_i32_basic() {
+	mut cell := i32(5)
+	store_i32(&cell, 777) // overwrite the initial 5
+	assert cell == 777
+}
+
+fn test_add_i32_concurrent() {
+	mut total := i32(0)
+	mut workers := []thread{}
+	// 9 threads, each adding 1 to the shared total 100_000 times
+	for _ in 0 .. 9 {
+		workers << spawn fn (cell &i32) {
+			for _ in 0 .. 100_000 {
+				add_i32(cell, 1)
+			}
+		}(&total)
+	}
+	// join all workers before reading the total
+	for w in workers {
+		w.wait()
+	}
+	// no increment may be lost: 9 * 100_000
+	assert total == 900_000
+}
+
+fn test_swap_i32_concurrent() {
+	mut slot := i32(0)
+	mut workers := []thread{}
+	// 8 threads, each swapping 123 into the shared slot 50_000 times
+	for _ in 0 .. 8 {
+		workers << spawn fn (cell &i32) {
+			for _ in 0 .. 50_000 {
+				swap_i32(cell, 123)
+			}
+		}(&slot)
+	}
+	// join all workers before reading the slot
+	for w in workers {
+		w.wait()
+	}
+	// every writer stored the same value, so that value must remain
+	assert slot == 123
+}
+
+fn test_cas_i32_concurrent() {
+	mut total := i32(0)
+	mut workers := []thread{}
+	// 8 threads, each performing 100_000 increments via a load + CAS retry loop
+	for _ in 0 .. 8 {
+		workers << spawn fn (cell &i32) {
+			for _ in 0 .. 100_000 {
+				for {
+					seen := load_i32(cell)
+					if cas_i32(cell, seen, seen + 1) {
+						break
+					}
+				}
+			}
+		}(&total)
+	}
+	// join all workers before reading the total
+	for w in workers {
+		w.wait()
+	}
+	// every successful CAS counts exactly once: 8 * 100_000
+	assert total == 800_000
+}
+
+fn test_load_store_i32_concurrent() {
+	mut total := i32(0)
+	mut workers := []thread{}
+	// 4 writer threads (atomic add) run alongside 4 reader threads (atomic load)
+	for _ in 0 .. 4 {
+		workers << spawn fn (cell &i32) {
+			for _ in 0 .. 50_000 {
+				add_i32(cell, 1) // writer: 50_000 increments
+			}
+		}(&total)
+	}
+	for _ in 0 .. 4 {
+		workers << spawn fn (cell &i32) {
+			for _ in 0 .. 50_000 {
+				_ = load_i32(cell) // reader: value is discarded
+			}
+		}(&total)
+	}
+	// join writers and readers before reading the total
+	for w in workers {
+		w.wait()
+	}
+	// only the writers change the value
+	assert total == 4 * 50_000
+}
diff --git a/vlib/x/atomics/i64_test.v b/vlib/x/atomics/i64_test.v
new file mode 100644
index 000000000..d0b476cd4
--- /dev/null
+++ b/vlib/x/atomics/i64_test.v
@@ -0,0 +1,161 @@
+// vtest build: !(macos || windows)
+
+module atomics
+
+fn test_load_i64_basic() {
+	mut cell := i64(1234567890123) // does not fit in 32 bits
+	assert load_i64(&cell) == 1234567890123
+}
+
+fn test_store_i64_basic() {
+	mut cell := i64(1)
+	store_i64(&cell, 9999999) // overwrite the initial 1
+	assert cell == 9999999
+}
+
+fn test_swap_i64_basic() {
+	mut cell := i64(50)
+	previous := swap_i64(&cell, 777) // swap must hand back what was stored before
+	assert previous == 50
+	assert cell == 777
+}
+
+fn test_swap_i64_same_value() {
+	mut cell := i64(-123)
+	previous := swap_i64(&cell, -123) // swapping in the value already stored
+	assert previous == -123
+	assert cell == -123
+}
+
+fn test_add_i64_basic() {
+	mut sum := i64(0)
+	for step := 0; step < 1000; step++ {
+		add_i64(&sum, 3) // 1000 single-threaded additions of 3
+	}
+	assert sum == 3000
+}
+
+fn test_add_i64_negative_delta() {
+	mut cell := i64(100)
+	_ = add_i64(&cell, -7) // a negative delta subtracts
+	assert cell == 93
+}
+
+fn test_add_i64_wraparound_behavior() {
+	mut cell := i64(0x7fffffffffffffff) // largest i64
+	_ = add_i64(&cell, 1)
+	assert cell == -0x8000000000000000
+}
+
+fn test_add_i64_return_value() {
+	mut cell := i64(10)
+	result := add_i64(&cell, 100) // add must return the updated value
+	assert result == 110
+	assert cell == 110
+}
+
+fn test_cas_i64_basic() {
+	mut cell := i64(111) // equals the expected operand, so the swap must happen
+	assert cas_i64(&cell, 111, 222) == true
+	assert cell == 222
+}
+
+fn test_cas_i64_fail() {
+	mut cell := i64(555) // 555 != 123, so the CAS must be rejected
+	assert cas_i64(&cell, 123, 999) == false
+	assert cell == 555
+}
+
+fn test_cas_i64_nochange_on_fail() {
+	mut cell := i64(-999)
+	_ = cas_i64(&cell, 1, 5) // outcome ignored; only the stored value is checked
+	assert cell == -999
+}
+
+fn test_cas_i64_negative_values() {
+	mut cell := i64(-123456) // negative expected operand
+	assert cas_i64(&cell, -123456, 777) == true
+	assert cell == 777
+}
+
+fn test_add_i64_concurrent() {
+	mut total := i64(0)
+	mut workers := []thread{}
+	// 8 threads, each adding 1 to the shared total 100_000 times
+	for _ in 0 .. 8 {
+		workers << spawn fn (cell &i64) {
+			for _ in 0 .. 100_000 {
+				add_i64(cell, 1)
+			}
+		}(&total)
+	}
+	// join all workers before reading the total
+	for w in workers {
+		w.wait()
+	}
+	// no increment may be lost: 8 * 100_000
+	assert total == 800_000
+}
+
+fn test_swap_i64_concurrent() {
+	mut slot := i64(0)
+	mut workers := []thread{}
+	// 8 threads, each swapping 12345 into the shared slot 50_000 times
+	for _ in 0 .. 8 {
+		workers << spawn fn (cell &i64) {
+			for _ in 0 .. 50_000 {
+				swap_i64(cell, 12345)
+			}
+		}(&slot)
+	}
+	// join all workers before reading the slot
+	for w in workers {
+		w.wait()
+	}
+	// every writer stored the same value, so that value must remain
+	assert slot == 12345
+}
+
+fn test_cas_i64_concurrent_inc() {
+	mut total := i64(0)
+	mut workers := []thread{}
+	// 8 threads, each performing 50_000 increments via a load + CAS retry loop
+	for _ in 0 .. 8 {
+		workers << spawn fn (cell &i64) {
+			for _ in 0 .. 50_000 {
+				for {
+					seen := load_i64(cell)
+					if cas_i64(cell, seen, seen + 1) {
+						break
+					}
+				}
+			}
+		}(&total)
+	}
+	// join all workers before reading the total
+	for w in workers {
+		w.wait()
+	}
+	// every successful CAS counts exactly once: 8 * 50_000
+	assert total == 400_000
+}
+
+fn test_cas_i64_contended_flip() {
+	mut flag := i64(0)
+	mut workers := []thread{}
+	// 4 threads race to flip the flag 0 -> 1 and 1 -> 0, 200_000 rounds each
+	for _ in 0 .. 4 {
+		workers << spawn fn (cell &i64) {
+			for _ in 0 .. 200_000 {
+				_ = cas_i64(cell, 0, 1)
+				_ = cas_i64(cell, 1, 0)
+			}
+		}(&flag)
+	}
+	// join all workers before reading the flag
+	for w in workers {
+		w.wait()
+	}
+	// only 0 and 1 are ever written, so nothing else may be observed
+	assert flag == 0 || flag == 1
+}
diff --git a/vlib/x/atomics/panic_unaligned.v b/vlib/x/atomics/panic_unaligned.v
new file mode 100644
index 000000000..4b3838899
--- /dev/null
+++ b/vlib/x/atomics/panic_unaligned.v
@@ -0,0 +1,10 @@
+module atomics
+
+$if prod && (gcc || clang) {
+	#flag -Wl,--undefined=panicUnaligned
+}
+
+@[export: 'panicUnaligned']
+fn panic_unaligned() {
+	panic('unaligned atomic operation') // exported under the C symbol name `panicUnaligned` by the attribute above
+}
diff --git a/vlib/x/atomics/u32_test.v b/vlib/x/atomics/u32_test.v
new file mode 100644
index 000000000..2823a400f
--- /dev/null
+++ b/vlib/x/atomics/u32_test.v
@@ -0,0 +1,178 @@
+// vtest build: !(macos || windows)
+
+module atomics
+
+fn test_add_u32_basic() {
+	mut sum := u32(0)
+	for step := 0; step < 1000; step++ {
+		add_u32(&sum, 1) // 1000 single-threaded increments
+	}
+	assert sum == 1000
+}
+
+fn test_add_u32_wraparound() {
+	mut cell := u32(0xffffffff) // largest u32
+	_ = add_u32(&cell, 1)
+	assert cell == 0
+}
+
+fn test_add_u32_large() {
+	mut cell := u32(10)
+	result := add_u32(&cell, 1000) // add must return the updated value
+	assert result == 1010
+	assert cell == 1010
+}
+
+fn test_swap_u32_basic() {
+	mut cell := u32(123)
+	previous := swap_u32(&cell, 999) // swap must hand back what was stored before
+	assert previous == 123
+	assert cell == 999
+}
+
+fn test_swap_u32_same() {
+	mut cell := u32(777)
+	previous := swap_u32(&cell, 777) // swapping in the value already stored
+	assert previous == 777
+	assert cell == 777
+}
+
+fn test_store_u32_basic() {
+	mut cell := u32(1)
+	store_u32(&cell, 555) // overwrite the initial 1
+	assert cell == 555
+}
+
+fn test_load_u32_basic() {
+	mut cell := u32(888)
+	assert load_u32(&cell) == 888 // load must return the stored value
+}
+
+fn test_cas_u32_basic() {
+	mut cell := u32(10) // equals the expected operand, so the swap must happen
+	assert cas_u32(&cell, 10, 50) == true
+	assert cell == 50
+}
+
+fn test_cas_u32_fail() {
+	mut cell := u32(10) // 10 != 5, so the CAS must be rejected
+	assert cas_u32(&cell, 5, 999) == false
+	assert cell == 10
+}
+
+fn test_cas_u32_nochange_on_fail() {
+	mut cell := u32(777)
+	_ = cas_u32(&cell, 5, 9) // outcome ignored; only the stored value is checked
+	assert cell == 777
+}
+
+fn test_cas_u32_boundary() {
+	mut cell := u32(0xffffffff) // largest u32 as the expected operand
+	assert cas_u32(&cell, 0xffffffff, 0) == true
+	assert cell == 0
+}
+
+fn test_add_u32_concurrent() {
+	mut total := u32(0)
+	mut workers := []thread{}
+	// 8 threads, each adding 1 to the shared total 100_000 times
+	for _ in 0 .. 8 {
+		workers << spawn fn (cell &u32) {
+			for _ in 0 .. 100_000 {
+				add_u32(cell, 1)
+			}
+		}(&total)
+	}
+	// join all workers before reading the total
+	for w in workers {
+		w.wait()
+	}
+	// no increment may be lost: 8 * 100_000
+	assert total == 800_000
+}
+
+fn test_swap_u32_concurrent() {
+	mut slot := u32(0)
+	mut workers := []thread{}
+	// 8 threads, each swapping 123 into the shared slot 50_000 times
+	for _ in 0 .. 8 {
+		workers << spawn fn (cell &u32) {
+			for _ in 0 .. 50_000 {
+				swap_u32(cell, 123)
+			}
+		}(&slot)
+	}
+	// join all workers before reading the slot
+	for w in workers {
+		w.wait()
+	}
+	// every writer stored the same value, so that value must remain
+	assert slot == 123
+}
+
+fn test_cas_u32_concurrent_inc() {
+	mut total := u32(0)
+	mut workers := []thread{}
+	// 8 threads, each performing 50_000 increments via a load + CAS retry loop
+	for _ in 0 .. 8 {
+		workers << spawn fn (cell &u32) {
+			for _ in 0 .. 50_000 {
+				for {
+					seen := load_u32(cell)
+					if cas_u32(cell, seen, seen + 1) {
+						break
+					}
+				}
+			}
+		}(&total)
+	}
+	// join all workers before reading the total
+	for w in workers {
+		w.wait()
+	}
+	// every successful CAS counts exactly once: 8 * 50_000
+	assert total == 400_000
+}
+
+fn test_cas_u32_contended_flip() {
+	mut flag := u32(0)
+	mut workers := []thread{}
+	// 4 threads race to flip the flag 0 -> 1 and 1 -> 0, 200_000 rounds each
+	for _ in 0 .. 4 {
+		workers << spawn fn (cell &u32) {
+			for _ in 0 .. 200_000 {
+				_ = cas_u32(cell, 0, 1)
+				_ = cas_u32(cell, 1, 0)
+			}
+		}(&flag)
+	}
+	// join all workers before reading the flag
+	for w in workers {
+		w.wait()
+	}
+	// only 0 and 1 are ever written, so nothing else may be observed
+	assert flag == 0 || flag == 1
+}
+
+fn test_load_store_u32_concurrent() {
+	mut slot := u32(0)
+	mut workers := []thread{}
+	// 8 threads: odd ids only load, even ids store 1, 50_000 rounds each
+	for idx in 0 .. 8 {
+		workers << spawn fn (cell &u32, id int) {
+			for _ in 0 .. 50_000 {
+				if id % 2 != 0 {
+					_ = load_u32(cell)
+				} else {
+					store_u32(cell, 1)
+				}
+			}
+		}(&slot, idx)
+	}
+	// join all workers before reading the slot
+	for w in workers {
+		w.wait()
+	}
+	// the only value ever stored is 1
+	assert slot == 1
+}
diff --git a/vlib/x/atomics/u64_test.v b/vlib/x/atomics/u64_test.v
new file mode 100644
index 000000000..ab704342e
--- /dev/null
+++ b/vlib/x/atomics/u64_test.v
@@ -0,0 +1,155 @@
+// vtest build: !(macos || windows)
+
+module atomics
+
+fn test_load_u64_basic() {
+	mut cell := u64(1234567890123) // does not fit in 32 bits
+	assert load_u64(&cell) == 1234567890123
+}
+
+fn test_store_u64_basic() {
+	mut cell := u64(1)
+	store_u64(&cell, 999999) // overwrite the initial 1
+	assert cell == 999999
+}
+
+fn test_swap_u64_basic() {
+	mut cell := u64(123)
+	previous := swap_u64(&cell, 999) // swap must hand back what was stored before
+	assert previous == 123
+	assert cell == 999
+}
+
+fn test_swap_u64_same() {
+	mut cell := u64(777777)
+	previous := swap_u64(&cell, 777777) // swapping in the value already stored
+	assert previous == 777777
+	assert cell == 777777
+}
+
+fn test_add_u64_basic() {
+	mut sum := u64(0)
+	for step := 0; step < 1000; step++ {
+		add_u64(&sum, 2) // 1000 single-threaded additions of 2
+	}
+	assert sum == 2000
+}
+
+fn test_add_u64_wraparound() {
+	mut cell := u64(0xffffffffffffffff) // largest u64
+	_ = add_u64(&cell, 1)
+	assert cell == 0
+}
+
+fn test_add_u64_return() {
+	mut cell := u64(10)
+	result := add_u64(&cell, 100) // add must return the updated value
+	assert result == 110
+	assert cell == 110
+}
+
+fn test_cas_u64_basic() {
+	mut cell := u64(111) // equals the expected operand, so the swap must happen
+	assert cas_u64(&cell, 111, 222) == true
+	assert cell == 222
+}
+
+fn test_cas_u64_fail() {
+	mut cell := u64(500) // 500 != 100, so the CAS must be rejected
+	assert cas_u64(&cell, 100, 200) == false
+	assert cell == 500
+}
+
+fn test_cas_u64_nochange_fail() {
+	mut cell := u64(999)
+	_ = cas_u64(&cell, 1, 2) // outcome ignored; only the stored value is checked
+	assert cell == 999
+}
+
+fn test_cas_u64_boundary() {
+	mut cell := u64(0xffffffffffffffff) // largest u64 as the expected operand
+	assert cas_u64(&cell, 0xffffffffffffffff, 0) == true
+	assert cell == 0
+}
+
+fn test_add_u64_concurrent() {
+	mut total := u64(0)
+	mut workers := []thread{}
+	// 8 threads, each adding 1 to the shared total 100_000 times
+	for _ in 0 .. 8 {
+		workers << spawn fn (cell &u64) {
+			for _ in 0 .. 100_000 {
+				add_u64(cell, 1)
+			}
+		}(&total)
+	}
+	// join all workers before reading the total
+	for w in workers {
+		w.wait()
+	}
+	// no increment may be lost: 8 * 100_000
+	assert total == 800_000
+}
+
+fn test_swap_u64_concurrent() {
+	mut slot := u64(0)
+	mut workers := []thread{}
+	// 8 threads, each swapping 123456 into the shared slot 50_000 times
+	for _ in 0 .. 8 {
+		workers << spawn fn (cell &u64) {
+			for _ in 0 .. 50_000 {
+				swap_u64(cell, 123456)
+			}
+		}(&slot)
+	}
+	// join all workers before reading the slot
+	for w in workers {
+		w.wait()
+	}
+	// every writer stored the same value, so that value must remain
+	assert slot == 123456
+}
+
+fn test_cas_u64_concurrent_inc() {
+	mut total := u64(0)
+	mut workers := []thread{}
+	// 8 threads, each performing 50_000 increments via a load + CAS retry loop
+	for _ in 0 .. 8 {
+		workers << spawn fn (cell &u64) {
+			for _ in 0 .. 50_000 {
+				for {
+					seen := load_u64(cell)
+					if cas_u64(cell, seen, seen + 1) {
+						break
+					}
+				}
+			}
+		}(&total)
+	}
+	// join all workers before reading the total
+	for w in workers {
+		w.wait()
+	}
+	// every successful CAS counts exactly once: 8 * 50_000
+	assert total == 400_000
+}
+
+fn test_cas_u64_contended_flip() {
+	mut flag := u64(0)
+	mut workers := []thread{}
+	// 4 threads race to flip the flag 0 -> 1 and 1 -> 0, 200_000 rounds each
+	for _ in 0 .. 4 {
+		workers << spawn fn (cell &u64) {
+			for _ in 0 .. 200_000 {
+				_ = cas_u64(cell, 0, 1)
+				_ = cas_u64(cell, 1, 0)
+			}
+		}(&flag)
+	}
+	// join all workers before reading the flag
+	for w in workers {
+		w.wait()
+	}
+	// only 0 and 1 are ever written, so nothing else may be observed
+	assert flag == 0 || flag == 1
+}
-- 
2.39.5