| 1 | // vgc_platform.h - Platform abstractions for V Garbage Collector |
| 2 | // Translated from Go's runtime GC (golang/go src/runtime/mgc*.go, malloc.go, mheap.go) |
| 3 | |
| 4 | #ifndef VGC_PLATFORM_H |
| 5 | #define VGC_PLATFORM_H |
| 6 | |
| 7 | #include <stdint.h> |
| 8 | #include <string.h> |
| 9 | #include <stdio.h> |
| 10 | |
| 11 | // ============================================================ |
| 12 | // Thread-local storage |
| 13 | // ============================================================ |
| 14 | #ifdef _WIN32 |
| 15 | #define VGC_TLS __declspec(thread) |
| 16 | #elif defined(__TINYC__) |
| 17 | #define VGC_TLS __thread |
| 18 | #else |
| 19 | #define VGC_TLS __thread |
| 20 | #endif |
| 21 | |
| 22 | static VGC_TLS int _vgc_cache_idx = -1; |
| 23 | |
| 24 | static inline int vgc_get_cache_idx(void) { return _vgc_cache_idx; } |
| 25 | static inline void vgc_set_cache_idx(int idx) { _vgc_cache_idx = idx; } |
| 26 | |
| 27 | // ============================================================ |
| 28 | // Atomic operations |
| 29 | // ============================================================ |
| 30 | #if defined(__TINYC__) |
| 31 | // TCC: use __sync builtins |
| 32 | #define vgc_atomic_load_u32(ptr) (*(volatile uint32_t*)(ptr)) |
| 33 | #define vgc_atomic_store_u32(ptr, val) do { *(volatile uint32_t*)(ptr) = (val); __sync_synchronize(); } while(0) |
| 34 | #define vgc_atomic_load_u64(ptr) (*(volatile uint64_t*)(ptr)) |
| 35 | #define vgc_atomic_store_u64(ptr, val) do { *(volatile uint64_t*)(ptr) = (val); __sync_synchronize(); } while(0) |
| 36 | #define vgc_atomic_add_u64(ptr, val) __sync_add_and_fetch((volatile uint64_t*)(ptr), (uint64_t)(val)) |
| 37 | #define vgc_atomic_sub_u64(ptr, val) __sync_sub_and_fetch((volatile uint64_t*)(ptr), (uint64_t)(val)) |
| 38 | #define vgc_atomic_add_u32(ptr, val) __sync_add_and_fetch((volatile uint32_t*)(ptr), (uint32_t)(val)) |
| 39 | #define vgc_atomic_cas_u32(ptr, expected, desired) __sync_bool_compare_and_swap((volatile uint32_t*)(ptr), *(expected), (desired)) |
| 40 | #define vgc_atomic_exchange_u32(ptr, val) __sync_lock_test_and_set((volatile uint32_t*)(ptr), (val)) |
| 41 | #define vgc_atomic_fence() __sync_synchronize() |
| 42 | #elif defined(_MSC_VER) |
| 43 | #include <intrin.h> |
| 44 | #pragma intrinsic(_InterlockedCompareExchange, _InterlockedExchange, _InterlockedExchangeAdd64) |
| 45 | static inline uint32_t vgc_atomic_load_u32_fn(volatile uint32_t* p) { uint32_t v = *p; _ReadBarrier(); return v; } |
| 46 | static inline void vgc_atomic_store_u32_fn(volatile uint32_t* p, uint32_t v) { _WriteBarrier(); *p = v; } |
| 47 | static inline uint64_t vgc_atomic_load_u64_fn(volatile uint64_t* p) { uint64_t v = *p; _ReadBarrier(); return v; } |
| 48 | static inline void vgc_atomic_store_u64_fn(volatile uint64_t* p, uint64_t v) { _WriteBarrier(); *p = v; } |
| 49 | #define vgc_atomic_load_u32(ptr) vgc_atomic_load_u32_fn((volatile uint32_t*)(ptr)) |
| 50 | #define vgc_atomic_store_u32(ptr, val) vgc_atomic_store_u32_fn((volatile uint32_t*)(ptr), (val)) |
| 51 | #define vgc_atomic_load_u64(ptr) vgc_atomic_load_u64_fn((volatile uint64_t*)(ptr)) |
| 52 | #define vgc_atomic_store_u64(ptr, val) vgc_atomic_store_u64_fn((volatile uint64_t*)(ptr), (val)) |
| 53 | #define vgc_atomic_add_u64(ptr, val) _InterlockedExchangeAdd64((volatile int64_t*)(ptr), (int64_t)(val)) |
| 54 | #define vgc_atomic_sub_u64(ptr, val) _InterlockedExchangeAdd64((volatile int64_t*)(ptr), -(int64_t)(val)) |
| 55 | #define vgc_atomic_add_u32(ptr, val) _InterlockedExchangeAdd((volatile long*)(ptr), (long)(val)) |
| 56 | #define vgc_atomic_cas_u32(ptr, expected, desired) (_InterlockedCompareExchange((volatile long*)(ptr), (long)(desired), (long)*(expected)) == (long)*(expected)) |
| 57 | #define vgc_atomic_exchange_u32(ptr, val) (uint32_t)_InterlockedExchange((volatile long*)(ptr), (long)(val)) |
| 58 | #define vgc_atomic_fence() _ReadWriteBarrier() |
| 59 | #else |
| 60 | // GCC/Clang |
| 61 | #define vgc_atomic_load_u32(ptr) __atomic_load_n((volatile uint32_t*)(ptr), __ATOMIC_ACQUIRE) |
| 62 | #define vgc_atomic_store_u32(ptr, val) __atomic_store_n((volatile uint32_t*)(ptr), (val), __ATOMIC_RELEASE) |
| 63 | #define vgc_atomic_load_u64(ptr) __atomic_load_n((volatile uint64_t*)(ptr), __ATOMIC_ACQUIRE) |
| 64 | #define vgc_atomic_store_u64(ptr, val) __atomic_store_n((volatile uint64_t*)(ptr), (val), __ATOMIC_RELEASE) |
| 65 | #define vgc_atomic_add_u64(ptr, val) __atomic_add_fetch((volatile uint64_t*)(ptr), (uint64_t)(val), __ATOMIC_ACQ_REL) |
| 66 | #define vgc_atomic_sub_u64(ptr, val) __atomic_sub_fetch((volatile uint64_t*)(ptr), (uint64_t)(val), __ATOMIC_ACQ_REL) |
| 67 | #define vgc_atomic_add_u32(ptr, val) __atomic_add_fetch((volatile uint32_t*)(ptr), (uint32_t)(val), __ATOMIC_ACQ_REL) |
| 68 | #define vgc_atomic_cas_u32(ptr, expected, desired) __atomic_compare_exchange_n((volatile uint32_t*)(ptr), (uint32_t*)(expected), (uint32_t)(desired), 0, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE) |
| 69 | #define vgc_atomic_exchange_u32(ptr, val) __atomic_exchange_n((volatile uint32_t*)(ptr), (uint32_t)(val), __ATOMIC_ACQ_REL) |
| 70 | #define vgc_atomic_fence() __atomic_thread_fence(__ATOMIC_SEQ_CST) |
| 71 | #endif |
| 72 | |
| 73 | // ============================================================ |
| 74 | // OS memory allocation |
| 75 | // ============================================================ |
| 76 | #ifdef _WIN32 |
| 77 | #ifndef WIN32_LEAN_AND_MEAN |
| 78 | #define WIN32_LEAN_AND_MEAN |
| 79 | #endif |
| 80 | #include <windows.h> |
| 81 | static inline void* vgc_os_alloc(size_t size) { |
| 82 | return VirtualAlloc(NULL, size, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE); |
| 83 | } |
| 84 | static inline void vgc_os_free(void* ptr, size_t size) { |
| 85 | (void)size; |
| 86 | VirtualFree(ptr, 0, MEM_RELEASE); |
| 87 | } |
| 88 | static inline void vgc_os_decommit(void* ptr, size_t size) { |
| 89 | VirtualFree(ptr, size, MEM_DECOMMIT); |
| 90 | } |
| 91 | static inline int vgc_num_cpus(void) { |
| 92 | DWORD count = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS); |
| 93 | return count > 0 ? (int)count : 1; |
| 94 | } |
| 95 | #else |
| 96 | #include <sys/mman.h> |
| 97 | #include <unistd.h> |
| 98 | static inline void* vgc_os_alloc(size_t size) { |
| 99 | void* p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); |
| 100 | return (p == MAP_FAILED) ? NULL : p; |
| 101 | } |
| 102 | static inline void vgc_os_free(void* ptr, size_t size) { |
| 103 | munmap(ptr, size); |
| 104 | } |
| 105 | static inline void vgc_os_decommit(void* ptr, size_t size) { |
| 106 | madvise(ptr, size, MADV_DONTNEED); |
| 107 | } |
| 108 | static inline int vgc_num_cpus(void) { |
| 109 | long count = sysconf(_SC_NPROCESSORS_ONLN); |
| 110 | return count > 0 ? (int)count : 1; |
| 111 | } |
| 112 | #endif |
| 113 | |
| 114 | // ============================================================ |
| 115 | // Stack pointer / frame address |
| 116 | // ============================================================ |
| 117 | #if defined(_MSC_VER) |
| 118 | #include <intrin.h> |
| 119 | static inline void* vgc_get_sp(void) { return _AddressOfReturnAddress(); } |
| 120 | #else |
| 121 | static inline void* vgc_get_sp(void) { return __builtin_frame_address(0); } |
| 122 | #endif |
| 123 | |
| 124 | // ============================================================ |
| 125 | // Stack bounds |
| 126 | // ============================================================ |
| 127 | #if defined(__APPLE__) |
| 128 | #include <pthread.h> |
| 129 | static inline int vgc_get_stack_bounds(uintptr_t* lo, uintptr_t* hi) { |
| 130 | void* stack_hi = pthread_get_stackaddr_np(pthread_self()); |
| 131 | size_t stack_size = pthread_get_stacksize_np(pthread_self()); |
| 132 | if (stack_hi == NULL || stack_size == 0) return 0; |
| 133 | *hi = (uintptr_t)stack_hi; |
| 134 | *lo = *hi - stack_size; |
| 135 | return 1; |
| 136 | } |
| 137 | #elif defined(__linux__) || defined(__ANDROID__) |
| 138 | #include <pthread.h> |
| 139 | static inline int vgc_get_stack_bounds(uintptr_t* lo, uintptr_t* hi) { |
| 140 | pthread_attr_t attr; |
| 141 | if (pthread_getattr_np(pthread_self(), &attr) != 0) return 0; |
| 142 | void* stack_lo = NULL; |
| 143 | size_t stack_size = 0; |
| 144 | int ok = pthread_attr_getstack(&attr, &stack_lo, &stack_size) == 0 && stack_lo != NULL && stack_size != 0; |
| 145 | pthread_attr_destroy(&attr); |
| 146 | if (!ok) return 0; |
| 147 | *lo = (uintptr_t)stack_lo; |
| 148 | *hi = *lo + stack_size; |
| 149 | return 1; |
| 150 | } |
| 151 | #elif defined(__FreeBSD__) || defined(__DragonFly__) || defined(__NetBSD__) || defined(__OpenBSD__) |
| 152 | #include <pthread.h> |
| 153 | static inline int vgc_get_stack_bounds(uintptr_t* lo, uintptr_t* hi) { |
| 154 | pthread_attr_t attr; |
| 155 | if (pthread_attr_init(&attr) != 0) return 0; |
| 156 | if (pthread_attr_get_np(pthread_self(), &attr) != 0) { |
| 157 | pthread_attr_destroy(&attr); |
| 158 | return 0; |
| 159 | } |
| 160 | void* stack_lo = NULL; |
| 161 | size_t stack_size = 0; |
| 162 | int ok = pthread_attr_getstack(&attr, &stack_lo, &stack_size) == 0 && stack_lo != NULL && stack_size != 0; |
| 163 | pthread_attr_destroy(&attr); |
| 164 | if (!ok) return 0; |
| 165 | *lo = (uintptr_t)stack_lo; |
| 166 | *hi = *lo + stack_size; |
| 167 | return 1; |
| 168 | } |
| 169 | #else |
| 170 | static inline int vgc_get_stack_bounds(uintptr_t* lo, uintptr_t* hi) { |
| 171 | (void)lo; |
| 172 | (void)hi; |
| 173 | return 0; |
| 174 | } |
| 175 | #endif |
| 176 | |
| 177 | // ============================================================ |
| 178 | // Bitmap operations (for mark/alloc bitmaps) |
| 179 | // ============================================================ |
| 180 | static inline int vgc_bitmap_get(const uint8_t* bits, uint32_t idx) { |
| 181 | return (bits[idx >> 3] >> (idx & 7)) & 1; |
| 182 | } |
| 183 | |
| 184 | static inline void vgc_bitmap_set(uint8_t* bits, uint32_t idx) { |
| 185 | bits[idx >> 3] |= (uint8_t)(1u << (idx & 7)); |
| 186 | } |
| 187 | |
| 188 | static inline void vgc_bitmap_clear(uint8_t* bits, uint32_t idx) { |
| 189 | bits[idx >> 3] &= (uint8_t)~(1u << (idx & 7)); |
| 190 | } |
| 191 | |
| 192 | // Returns 1 if bit was already set, 0 if newly set |
| 193 | static inline int vgc_bitmap_test_and_set(uint8_t* bits, uint32_t idx) { |
| 194 | uint8_t mask = (uint8_t)(1u << (idx & 7)); |
| 195 | uint8_t* byte_ptr = &bits[idx >> 3]; |
| 196 | if (*byte_ptr & mask) return 1; |
| 197 | *byte_ptr |= mask; |
| 198 | return 0; |
| 199 | } |
| 200 | |
| 201 | // Popcount on a byte - count number of set bits |
| 202 | static inline int vgc_popcount8(uint8_t x) { |
| 203 | x = x - ((x >> 1) & 0x55); |
| 204 | x = (x & 0x33) + ((x >> 2) & 0x33); |
| 205 | return (x + (x >> 4)) & 0x0f; |
| 206 | } |
| 207 | |
| 208 | // ============================================================ |
| 209 | // Size class tables (translated from Go's runtime/sizeclasses.go) |
| 210 | // 68 classes (0 = unused, 1-67 = actual size classes) |
| 211 | // ============================================================ |
| 212 | |
| 213 | // Object sizes per class |
| 214 | static const uint32_t vgc_class_sizes[68] = { |
| 215 | 0, 8, 16, 24, 32, 48, 64, 80, |
| 216 | 96, 112, 128, 144, 160, 176, 192, 208, |
| 217 | 224, 240, 256, 288, 320, 352, 384, 416, |
| 218 | 448, 480, 512, 576, 640, 704, 768, 896, |
| 219 | 1024, 1152, 1280, 1408, 1536, 1792, 2048, 2304, |
| 220 | 2688, 3072, 3200, 3456, 4096, 4864, 5376, 6144, |
| 221 | 6528, 6784, 6912, 8192, 9472, 9728, 10240, 10880, |
| 222 | 12288, 13568, 14336, 16384, 18432, 19072, 20480, 21760, |
| 223 | 24576, 27264, 28672, 32768 |
| 224 | }; |
| 225 | |
| 226 | // Pages per span for each class |
| 227 | static const uint32_t vgc_class_npages[68] = { |
| 228 | 0, 1, 1, 1, 1, 1, 1, 1, |
| 229 | 1, 1, 1, 1, 1, 1, 1, 1, |
| 230 | 1, 1, 1, 1, 1, 1, 1, 1, |
| 231 | 1, 1, 1, 1, 1, 1, 1, 1, |
| 232 | 1, 1, 1, 2, 1, 2, 1, 2, |
| 233 | 1, 3, 2, 3, 1, 3, 2, 3, |
| 234 | 4, 5, 6, 1, 7, 6, 5, 4, |
| 235 | 3, 5, 7, 2, 9, 7, 5, 8, |
| 236 | 3, 10, 7, 4 |
| 237 | }; |
| 238 | |
| 239 | // Objects per span for each class |
| 240 | static const uint32_t vgc_class_nobjs[68] = { |
| 241 | 0, 1024, 512, 341, 256, 170, 128, 102, |
| 242 | 85, 73, 64, 56, 51, 46, 42, 39, |
| 243 | 36, 34, 32, 28, 25, 23, 21, 19, |
| 244 | 18, 17, 16, 14, 12, 11, 10, 9, |
| 245 | 8, 7, 6, 11, 5, 9, 4, 7, |
| 246 | 3, 8, 5, 7, 2, 5, 3, 4, |
| 247 | 5, 6, 7, 1, 6, 5, 4, 3, |
| 248 | 2, 3, 4, 1, 4, 3, 2, 3, |
| 249 | 1, 3, 2, 1 |
| 250 | }; |
| 251 | |
| 252 | // Size-to-class lookup tables (computed at init) |
| 253 | static uint8_t vgc_s2c8[129]; // sizes 1..1024: index = (size+7)/8 |
| 254 | static uint8_t vgc_s2c128[249]; // sizes 1025..32768: index = (size-1024+127)/128 |
| 255 | |
| 256 | static inline uint32_t vgc_get_class_size(int cls) { return vgc_class_sizes[cls]; } |
| 257 | static inline uint32_t vgc_get_class_npages(int cls) { return vgc_class_npages[cls]; } |
| 258 | static inline uint32_t vgc_get_class_nobjs(int cls) { return vgc_class_nobjs[cls]; } |
| 259 | |
| 260 | static void vgc_init_size_tables(void) { |
| 261 | int cls = 1; |
| 262 | for (int i = 0; i < 129; i++) { |
| 263 | uint32_t size = (uint32_t)(i + 1) * 8; |
| 264 | while (cls < 67 && vgc_class_sizes[cls] < size) { |
| 265 | cls++; |
| 266 | } |
| 267 | vgc_s2c8[i] = (uint8_t)cls; |
| 268 | } |
| 269 | cls = 33; |
| 270 | for (int i = 0; i < 249; i++) { |
| 271 | uint32_t size = 1024 + (uint32_t)(i + 1) * 128; |
| 272 | while (cls < 67 && vgc_class_sizes[cls] < size) { |
| 273 | cls++; |
| 274 | } |
| 275 | vgc_s2c128[i] = (uint8_t)cls; |
| 276 | } |
| 277 | } |
| 278 | |
| 279 | // Get size class for a given allocation size |
| 280 | static inline uint8_t vgc_size_class(uint32_t size) { |
| 281 | if (size == 0) return 0; |
| 282 | if (size <= 1024) { |
| 283 | return vgc_s2c8[(size + 7) >> 3]; |
| 284 | } |
| 285 | if (size <= 32768) { |
| 286 | return vgc_s2c128[(size - 1024 + 127) >> 7]; |
| 287 | } |
| 288 | return 0; // large allocation |
| 289 | } |
| 290 | |
| 291 | // ============================================================ |
| 292 | // Pause / yield for spinlocks |
| 293 | // ============================================================ |
| 294 | #if defined(__x86_64__) || defined(__i386__) || defined(_M_X64) || defined(_M_IX86) |
| 295 | #define vgc_cpu_pause() __asm__ __volatile__("pause") |
| 296 | #elif defined(__aarch64__) || defined(_M_ARM64) |
| 297 | #define vgc_cpu_pause() __asm__ __volatile__("yield") |
| 298 | #else |
| 299 | #define vgc_cpu_pause() ((void)0) |
| 300 | #endif |
| 301 | |
| 302 | #ifdef _WIN32 |
| 303 | #define vgc_thread_yield() SwitchToThread() |
| 304 | #else |
| 305 | #include <sched.h> |
| 306 | #define vgc_thread_yield() sched_yield() |
| 307 | #endif |
| 308 | |
| 309 | // ============================================================ |
| 310 | // Simple mutex (adaptive spinlock with backoff) |
| 311 | // ============================================================ |
| 312 | static inline void vgc_mutex_lock(volatile uint32_t* lock) { |
| 313 | for (int spin = 0; ; spin++) { |
| 314 | uint32_t expected = 0; |
| 315 | if (vgc_atomic_cas_u32(lock, &expected, 1)) { |
| 316 | return; |
| 317 | } |
| 318 | if (spin < 16) { |
| 319 | vgc_cpu_pause(); |
| 320 | } else if (spin < 32) { |
| 321 | for (int i = 0; i < spin; i++) vgc_cpu_pause(); |
| 322 | } else { |
| 323 | vgc_thread_yield(); |
| 324 | } |
| 325 | } |
| 326 | } |
| 327 | |
| 328 | static inline void vgc_mutex_unlock(volatile uint32_t* lock) { |
| 329 | vgc_atomic_store_u32(lock, 0); |
| 330 | } |
| 331 | |
| 332 | // ============================================================ |
| 333 | // Arena address index for O(1) pointer-to-arena lookup |
| 334 | // Divides address space into 1GB chunks, maps to arena index. |
| 335 | // ============================================================ |
| 336 | #define VGC_ADDR_SHIFT 30 // 1GB chunks |
| 337 | #define VGC_ADDR_MAP_SIZE 4096 |
| 338 | |
| 339 | // Maps (address >> 30) -> arena_index+1 (0 = not mapped) |
| 340 | static uint16_t vgc_addr_map[VGC_ADDR_MAP_SIZE]; |
| 341 | |
| 342 | static inline void vgc_addr_map_register(uintptr_t base, size_t size, int arena_idx) { |
| 343 | uintptr_t lo = base >> VGC_ADDR_SHIFT; |
| 344 | uintptr_t hi = (base + size - 1) >> VGC_ADDR_SHIFT; |
| 345 | for (uintptr_t i = lo; i <= hi && i < VGC_ADDR_MAP_SIZE; i++) { |
| 346 | vgc_addr_map[i] = (uint16_t)(arena_idx + 1); |
| 347 | } |
| 348 | } |
| 349 | |
| 350 | // Returns arena index or -1 |
| 351 | static inline int vgc_addr_to_arena(uintptr_t addr) { |
| 352 | uintptr_t idx = addr >> VGC_ADDR_SHIFT; |
| 353 | if (idx >= VGC_ADDR_MAP_SIZE) return -1; |
| 354 | int v = vgc_addr_map[idx]; |
| 355 | return v ? v - 1 : -1; |
| 356 | } |
| 357 | |
| 358 | // ============================================================ |
| 359 | // Thread creation (for GC workers - avoids V's spawn which |
| 360 | // causes import cycles with builtin -> sync.threads -> builtin) |
| 361 | // ============================================================ |
| 362 | typedef void (*vgc_thread_fn)(void); |
| 363 | |
| 364 | #ifdef _WIN32 |
| 365 | static inline void vgc_start_thread(vgc_thread_fn fn) { |
| 366 | CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)fn, NULL, 0, NULL); |
| 367 | } |
| 368 | #else |
| 369 | #include <pthread.h> |
| 370 | static void* _vgc_thread_trampoline(void* arg) { |
| 371 | vgc_thread_fn fn = (vgc_thread_fn)arg; |
| 372 | fn(); |
| 373 | return NULL; |
| 374 | } |
| 375 | static inline void vgc_start_thread(vgc_thread_fn fn) { |
| 376 | pthread_t tid; |
| 377 | pthread_attr_t attr; |
| 378 | pthread_attr_init(&attr); |
| 379 | pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); |
| 380 | pthread_create(&tid, &attr, _vgc_thread_trampoline, (void*)fn); |
| 381 | pthread_attr_destroy(&attr); |
| 382 | } |
| 383 | #endif |
| 384 | |
| 385 | #endif // VGC_PLATFORM_H |
| 386 | |