From 9ec03736aea607c3df09fb8992aaab4d11ddc4dc Mon Sep 17 00:00:00 2001 From: Alexander Medvednikov Date: Fri, 3 Apr 2026 00:45:30 +0300 Subject: [PATCH] vgc: optimizations --- bench/README.md | 10 ++-- thirdparty/vgc/vgc_platform.h | 9 ++++ vlib/builtin/allocation.c.v | 32 ++++++++++++ vlib/builtin/array.v | 9 +++- vlib/builtin/array_d_gcboehm_opt.v | 9 +++- vlib/builtin/vgc_d_vgc.c.v | 30 ++++++++++- vlib/builtin/vgc_gc_d_vgc.c.v | 84 ++++++++++++++++++++++++------ vlib/builtin/vgc_notd_vgc.c.v | 16 ++++++ vlib/v/gen/c/struct.v | 18 +++++-- 9 files changed, 187 insertions(+), 30 deletions(-) diff --git a/bench/README.md b/bench/README.md index cf9829c97..e9547900c 100644 --- a/bench/README.md +++ b/bench/README.md @@ -16,18 +16,18 @@ v run bench/bench_gc.v ``` test boehm vgc ratio ———————————————————————————————————————————— ————————— ————————— ————————— - small allocs (1000000x string) 43 ms 52 ms 1.21x - tree build+walk (depth=18, 10x) 46 ms 125 ms 2.72x - array grow (100x 100000 pushes) 7 ms 30 ms 4.29x + small allocs (1000000x string) 39 ms 48 ms 1.23x + tree build+walk (depth=18, 10x) 48 ms 118 ms 2.46x + array grow (100x 100000 pushes) 9 ms 26 ms 2.89x map insert (20x 10k entries) 20 ms 27 ms 1.35x mixed workload (50 rounds) 10 ms 16 ms 1.60x heap usage: - boehm: 29856 KB allocated, 29020 KB free + boehm: 29856 KB allocated, 29296 KB free vgc: 131072 KB allocated, 0 KB free ``` -Boehm is still 1.2x-4.3x faster across these workloads and uses ~4x less heap. +Boehm is still 1.2x-2.9x faster across these workloads and uses ~4x less heap. ## Closures diff --git a/thirdparty/vgc/vgc_platform.h b/thirdparty/vgc/vgc_platform.h index 9231cf712..12b69a1f3 100644 --- a/thirdparty/vgc/vgc_platform.h +++ b/thirdparty/vgc/vgc_platform.h @@ -88,8 +88,13 @@ static inline void vgc_set_cache_idx(int idx) { _vgc_cache_idx = idx; } static inline void vgc_os_decommit(void* ptr, size_t size) { VirtualFree(ptr, size, MEM_DECOMMIT); } + static inline int vgc_num_cpus(void) { + DWORD count = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS); + return count > 0 ? (int)count : 1; + } #else #include + #include static inline void* vgc_os_alloc(size_t size) { void* p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); return (p == MAP_FAILED) ? NULL : p; @@ -100,6 +105,10 @@ static inline void vgc_set_cache_idx(int idx) { _vgc_cache_idx = idx; } static inline void vgc_os_decommit(void* ptr, size_t size) { madvise(ptr, size, MADV_DONTNEED); } + static inline int vgc_num_cpus(void) { + long count = sysconf(_SC_NPROCESSORS_ONLN); + return count > 0 ? (int)count : 1; + } #endif // ============================================================ diff --git a/vlib/builtin/allocation.c.v b/vlib/builtin/allocation.c.v index a8b2f5888..18d7bb6df 100644 --- a/vlib/builtin/allocation.c.v +++ b/vlib/builtin/allocation.c.v @@ -133,6 +133,32 @@ pub fn malloc_noscan(n isize) &u8 { return res } +@[unsafe] +fn malloc_uninit(n isize) &u8 { + if n < 0 { + _memory_panic(@FN, n) + } else if n == 0 { + return &u8(unsafe { nil }) + } + $if vgc ? { + return unsafe { &u8(vgc_malloc_typed_opts(usize(n), 0, 0, false)) } + } + return malloc(n) +} + +@[unsafe] +fn malloc_noscan_uninit(n isize) &u8 { + if n < 0 { + _memory_panic(@FN, n) + } else if n == 0 { + return &u8(unsafe { nil }) + } + $if vgc ? { + return unsafe { &u8(vgc_malloc_noscan_opts(usize(n), false)) } + } + return malloc_noscan(n) +} + @[inline] fn __at_least_one(how_many u64) u64 { // handle the case for allocating memory for empty structs, which have sizeof(EmptyStruct) == 0 @@ -409,6 +435,9 @@ pub fn memdup(src voidptr, sz isize) voidptr { if sz == 0 { return vcalloc(1) } + $if vgc ? { + return vgc_memdup(src, sz) + } unsafe { mem := malloc(sz) return C.memcpy(mem, src, sz) @@ -423,6 +452,9 @@ pub fn memdup_noscan(src voidptr, sz isize) voidptr { if sz == 0 { return vcalloc_noscan(1) } + $if vgc ? { + return vgc_memdup_noscan(src, sz) + } unsafe { mem := malloc_noscan(sz) return C.memcpy(mem, src, sz) diff --git a/vlib/builtin/array.v b/vlib/builtin/array.v index 4995a3e74..2a9db198a 100644 --- a/vlib/builtin/array.v +++ b/vlib/builtin/array.v @@ -33,9 +33,14 @@ fn __new_array(mylen int, cap int, elm_size int) array { panic_on_negative_len(mylen) panic_on_negative_cap(cap) cap_ := if cap < mylen { mylen } else { cap } + total_size := u64(cap_) * u64(elm_size) arr := array{ element_size: elm_size - data: vcalloc(u64(cap_) * u64(elm_size)) + data: if cap_ > 0 && mylen == 0 { + unsafe { malloc_uninit(__at_least_one(total_size)) } + } else { + vcalloc(total_size) + } len: mylen cap: cap_ } @@ -214,7 +219,7 @@ pub fn (mut a array) ensure_cap(required int) { } } new_size := u64(cap) * u64(a.element_size) - new_data := unsafe { malloc(__at_least_one(new_size)) } + new_data := unsafe { malloc_uninit(__at_least_one(new_size)) } if a.data != unsafe { nil } { unsafe { vmemcpy(new_data, a.data, u64(a.len) * u64(a.element_size)) } // TODO: the old data may be leaked when no GC is used (ref-counting?) diff --git a/vlib/builtin/array_d_gcboehm_opt.v b/vlib/builtin/array_d_gcboehm_opt.v index 52814876a..ae74ff1fd 100644 --- a/vlib/builtin/array_d_gcboehm_opt.v +++ b/vlib/builtin/array_d_gcboehm_opt.v @@ -9,9 +9,14 @@ fn __new_array_noscan(mylen int, cap int, elm_size int) array { panic_on_negative_len(mylen) panic_on_negative_cap(cap) cap_ := if cap < mylen { mylen } else { cap } + total_size := u64(cap_) * u64(elm_size) arr := array{ element_size: elm_size - data: vcalloc_noscan(u64(cap_) * u64(elm_size)) + data: if cap_ > 0 && mylen == 0 { + unsafe { malloc_noscan_uninit(__at_least_one(total_size)) } + } else { + vcalloc_noscan(total_size) + } len: mylen cap: cap_ } @@ -120,7 +125,7 @@ fn (mut a array) ensure_cap_noscan(required int) { } } new_size := u64(cap) * u64(a.element_size) - new_data := unsafe { malloc_noscan(__at_least_one(new_size)) } + new_data := unsafe { malloc_noscan_uninit(__at_least_one(new_size)) } if a.data != unsafe { nil } { unsafe { vmemcpy(new_data, a.data, u64(a.len) * u64(a.element_size)) } // TODO: the old data may be leaked when no GC is used (ref-counting?) diff --git a/vlib/builtin/vgc_d_vgc.c.v b/vlib/builtin/vgc_d_vgc.c.v index 14cd60ad5..2f2edea84 100644 --- a/vlib/builtin/vgc_d_vgc.c.v +++ b/vlib/builtin/vgc_d_vgc.c.v @@ -39,6 +39,7 @@ fn C.vgc_init_size_tables() fn C.vgc_mutex_lock(lk &u32) fn C.vgc_mutex_unlock(lk &u32) fn C.vgc_start_thread(f voidptr) +fn C.vgc_num_cpus() int fn C.vgc_addr_map_register(base usize, size usize, arena_idx int) fn C.vgc_addr_to_arena(addr usize) int @@ -210,7 +211,7 @@ pub fn vgc_init() { C.vgc_init_size_tables() vgc_heap.gc_enabled = 1 vgc_heap.gc_percent = 100 - vgc_heap.next_gc = 4 * 1024 * 1024 // initial trigger at 4MB (like Go) + vgc_heap.next_gc = 256 * 1024 * 1024 // favor throughput over early collections vgc_heap.gc_phase = vgc_phase_off // Register the main thread vgc_register_thread() @@ -899,7 +900,32 @@ fn vgc_memdup_typed(src voidptr, n isize, ptrmap u64, ptr_words u8) voidptr { if src == unsafe { nil } || n <= 0 { return unsafe { nil } } - mem := vgc_malloc_typed(usize(n), ptrmap, ptr_words) + mem := vgc_malloc_typed_opts(usize(n), ptrmap, ptr_words, false) + if mem != unsafe { nil } { + unsafe { C.memcpy(mem, src, n) } + } + return mem +} + +// Memdup variants that skip zero-fill when the destination will be overwritten. +@[markused] +fn vgc_memdup(src voidptr, n isize) voidptr { + if src == unsafe { nil } || n <= 0 { + return unsafe { nil } + } + mem := vgc_malloc_typed_opts(usize(n), 0, 0, false) + if mem != unsafe { nil } { + unsafe { C.memcpy(mem, src, n) } + } + return mem +} + +@[markused] +fn vgc_memdup_noscan(src voidptr, n isize) voidptr { + if src == unsafe { nil } || n <= 0 { + return unsafe { nil } + } + mem := vgc_malloc_noscan_opts(usize(n), false) if mem != unsafe { nil } { unsafe { C.memcpy(mem, src, n) } } diff --git a/vlib/builtin/vgc_gc_d_vgc.c.v b/vlib/builtin/vgc_gc_d_vgc.c.v index 494b1b72e..d2369d770 100644 --- a/vlib/builtin/vgc_gc_d_vgc.c.v +++ b/vlib/builtin/vgc_gc_d_vgc.c.v @@ -133,7 +133,7 @@ fn vgc_scan_range(lo usize, hi usize) { mut addr := start for addr + sizeof(usize) <= hi { val := unsafe { *(&usize(voidptr(addr))) } - if vgc_is_heap_ptr(val) { + if val != 0 { vgc_shade(val) } addr += sizeof(usize) @@ -143,6 +143,9 @@ fn vgc_scan_range(lo usize, hi usize) { // Shade marks an object grey (discovered but not yet scanned). // Translated from Go's shade() in mgcmark.go. fn vgc_shade(addr usize) { + if addr < vgc_arena_lo || addr >= vgc_arena_hi { + return + } span := vgc_find_span(voidptr(addr)) if span == unsafe { nil } || !span.in_use { return @@ -174,21 +177,26 @@ fn vgc_shade(addr usize) { // Parallel mark using OS threads. // Translated from Go's gcDrain() with multiple workers. fn vgc_parallel_mark() { - // Use up to 4 workers (like Go's dedicated mark workers) - nworkers := if vgc_heap.ncaches < 4 { 1 } else { 4 } + mut nworkers := C.vgc_num_cpus() + if nworkers < 1 { + nworkers = 1 + } else if nworkers > 4 { + nworkers = 4 + } vgc_heap.gc_nworkers = nworkers C.vgc_atomic_store_u32(&vgc_heap.gc_workers_done, 0) if nworkers <= 1 { - // Single-threaded mark vgc_drain_mark_work() return } - // Start mark workers as OS threads - for _ in 0 .. nworkers { + // Start helper workers and let the current GC thread participate as well. + for _ in 1 .. nworkers { C.vgc_start_thread(vgc_mark_worker) } + vgc_drain_mark_work() + C.vgc_atomic_add_u32(&vgc_heap.gc_workers_done, 1) // Wait for all workers to finish for C.vgc_atomic_load_u32(&vgc_heap.gc_workers_done) < u32(nworkers) { @@ -264,7 +272,7 @@ fn vgc_scan_precise(obj_addr usize, ptrmap u64, ptr_words u8) { // Read the pointer at this offset ptr_addr := obj_addr + usize(bit) * word_size val := unsafe { *(&usize(voidptr(ptr_addr))) } - if val != 0 && vgc_is_heap_ptr(val) { + if val != 0 { vgc_shade(val) } // Clear this bit and continue @@ -276,8 +284,41 @@ fn vgc_scan_precise(obj_addr usize, ptrmap u64, ptr_words u8) { // Work queue (translated from Go's mgcwork.go) // ============================================================ +@[inline] +fn vgc_can_use_work_fastpath() bool { + return vgc_heap.ncaches <= 1 && vgc_heap.gc_nworkers <= 1 +} + // Add a pointer to the mark work queue fn vgc_work_put(addr usize) { + if vgc_can_use_work_fastpath() { + mut buf := vgc_heap.work_full + if buf == unsafe { nil } || buf.nobj >= 256 { + mut new_buf := vgc_heap.work_empty + if new_buf != unsafe { nil } { + unsafe { + vgc_heap.work_empty = new_buf.next + } + } else { + new_buf = unsafe { &VGC_WorkBuf(C.vgc_os_alloc(usize(sizeof(VGC_WorkBuf)))) } + if new_buf == unsafe { nil } { + return + } + } + unsafe { + new_buf.nobj = 0 + new_buf.next = vgc_heap.work_full + vgc_heap.work_full = new_buf + } + buf = new_buf + } + unsafe { + buf.obj[buf.nobj] = addr + buf.nobj++ + } + return + } + C.vgc_mutex_lock(&vgc_heap.work_lock) // Get or create a work buffer @@ -313,6 +354,23 @@ fn vgc_work_put(addr usize) { // Get a pointer from the mark work queue fn vgc_work_get() usize { + if vgc_can_use_work_fastpath() { + mut buf := vgc_heap.work_full + if buf == unsafe { nil } || buf.nobj == 0 { + return 0 + } + unsafe { + buf.nobj-- + addr := buf.obj[buf.nobj] + if buf.nobj == 0 { + vgc_heap.work_full = buf.next + buf.next = vgc_heap.work_empty + vgc_heap.work_empty = buf + } + return addr + } + } + C.vgc_mutex_lock(&vgc_heap.work_lock) mut buf := vgc_heap.work_full @@ -350,12 +408,8 @@ fn vgc_write_barrier(new_val voidptr) { if new_val == unsafe { nil } { return } - addr := usize(new_val) - if !vgc_is_heap_ptr(addr) { - return - } // Shade the new pointer (mark it grey) - vgc_shade(addr) + vgc_shade(usize(new_val)) } // ============================================================ @@ -471,9 +525,9 @@ fn vgc_update_trigger() { gc_percent := u64(vgc_heap.gc_percent) mut goal := marked + marked * gc_percent / 100 - // Minimum 4MB trigger - if goal < 4 * 1024 * 1024 { - goal = 4 * 1024 * 1024 + // Avoid very small heap goals that force frequent full cycles on bursty workloads. + if goal < 256 * 1024 * 1024 { + goal = 256 * 1024 * 1024 } C.vgc_atomic_store_u64(&vgc_heap.next_gc, goal) } diff --git a/vlib/builtin/vgc_notd_vgc.c.v b/vlib/builtin/vgc_notd_vgc.c.v index fd031df39..5bd62ab70 100644 --- a/vlib/builtin/vgc_notd_vgc.c.v +++ b/vlib/builtin/vgc_notd_vgc.c.v @@ -12,6 +12,22 @@ fn vgc_malloc_noscan(n usize) voidptr { return unsafe { nil } } +fn vgc_malloc_typed_opts(n usize, ptrmap u64, ptr_words u8, zero_fill bool) voidptr { + return unsafe { nil } +} + +fn vgc_malloc_noscan_opts(n usize, zero_fill bool) voidptr { + return unsafe { nil } +} + +fn vgc_memdup(src voidptr, n isize) voidptr { + return unsafe { nil } +} + +fn vgc_memdup_noscan(src voidptr, n isize) voidptr { + return unsafe { nil } +} + fn vgc_realloc(old_ptr voidptr, new_size usize) voidptr { return unsafe { nil } } diff --git a/vlib/v/gen/c/struct.v b/vlib/v/gen/c/struct.v index 95f582530..989554467 100644 --- a/vlib/v/gen/c/struct.v +++ b/vlib/v/gen/c/struct.v @@ -103,13 +103,23 @@ fn (mut g Gen) struct_init(node ast.StructInit) { if aligned != 0 { g.write('(${basetyp}*)builtin__memdup_align(&(${basetyp}){') } else { - g.write('(${basetyp}*)builtin__memdup(&(${basetyp}){') + g.write_heap_alloc(basetyp, node.typ.clear_option_and_result()) + if is_multiline { + g.writeln('(${basetyp}){') + } else { + g.write('(${basetyp}){') + } } } else { if aligned != 0 { g.write('(${styp}*)builtin__memdup_align(&(${styp}){') } else { - g.write('(${styp}*)builtin__memdup(&(${styp}){') + g.write_heap_alloc(styp, unwrapped_typ) + if is_multiline { + g.writeln('(${styp}){') + } else { + g.write('(${styp}){') + } } } } else if node.typ.is_ptr() { @@ -421,13 +431,13 @@ fn (mut g Gen) struct_init(node ast.StructInit) { if aligned != 0 { g.write(', sizeof(${basetyp}), ${aligned})') } else { - g.write(', sizeof(${basetyp}))') + g.write_heap_alloc_close(node.typ.clear_option_and_result()) } } else { if aligned != 0 { g.write(', sizeof(${styp}), ${aligned})') } else { - g.write(', sizeof(${styp}))') + g.write_heap_alloc_close(unwrapped_typ) } } } -- 2.39.5