From 4a6645fd15543b5295ca56a658240f47341aa049 Mon Sep 17 00:00:00 2001 From: Alexander Medvednikov Date: Tue, 16 Jun 2026 08:39:39 +0300 Subject: [PATCH] builtin: speed up single element array push (`a << x`), fixes #27468 (#27470) --- vlib/builtin/array.v | 32 ++- vlib/builtin/array_d_gcboehm_opt.v | 11 +- vlib/builtin/array_push_element_sizes_test.v | 243 +++++++++++++++++++ 3 files changed, 278 insertions(+), 8 deletions(-) create mode 100644 vlib/builtin/array_push_element_sizes_test.v diff --git a/vlib/builtin/array.v b/vlib/builtin/array.v index df267ee39..39c42dd9d 100644 --- a/vlib/builtin/array.v +++ b/vlib/builtin/array.v @@ -1128,6 +1128,27 @@ fn (mut a array) set_ni(i int, val voidptr) { a.set(v_ni_index(i, a.len), val) } +// copy_element_to copies a single `element_size` byte element from `src` to `dest`. +// It dispatches the common small element sizes to `vmemcpy` with a *compile time constant* +// size: since `vmemcpy` is inlined and forwards to `C.memcpy`, a literal size lets the C +// backend inline the copy as plain loads/stores instead of a `memcpy` library call, which +// otherwise dominates the cost of single element pushes (`a << x`) in hot loops. +// The copy semantics stay exactly those of `memcpy`, so it is safe for arbitrary element +// types regardless of their alignment, padding or aliasing. +@[inline; unsafe] +fn copy_element_to(dest voidptr, src voidptr, element_size int) { + unsafe { + match element_size { + 1 { vmemcpy(dest, src, 1) } + 2 { vmemcpy(dest, src, 2) } + 4 { vmemcpy(dest, src, 4) } + 8 { vmemcpy(dest, src, 8) } + 16 { vmemcpy(dest, src, 16) } + else { vmemcpy(dest, src, element_size) } + } + } +} + fn (mut a array) push(val voidptr) { $if !no_bounds_checking { if a.len < 0 { @@ -1138,12 +1159,15 @@ fn (mut a array) push(val voidptr) { panic('array.push: len bigger than max_int') } required := a.len + 1 - if a.needs_unique_append(required) { - a.clone_shallow_to_cap(a.cap) - } else if required > a.cap { + if required > a.cap { a.ensure_cap(required) + } else if a.flags.has(.is_slice) { + // `required <= a.cap` here, so this is the `needs_unique_append` case + a.clone_shallow_to_cap(a.cap) + } + unsafe { + copy_element_to(&u8(a.data) + u64(a.element_size) * u64(a.len), val, a.element_size) } - unsafe { vmemcpy(&u8(a.data) + u64(a.element_size) * u64(a.len), val, a.element_size) } a.len++ } diff --git a/vlib/builtin/array_d_gcboehm_opt.v b/vlib/builtin/array_d_gcboehm_opt.v index 5db3aa958..7ea8775e6 100644 --- a/vlib/builtin/array_d_gcboehm_opt.v +++ b/vlib/builtin/array_d_gcboehm_opt.v @@ -369,12 +369,15 @@ fn (mut a array) push_noscan(val voidptr) { panic('array.push_noscan: len bigger than max_int') } required := a.len + 1 - if a.needs_unique_append(required) { - a.clone_shallow_to_cap_noscan(a.cap) - } else if required > a.cap { + if required > a.cap { a.ensure_cap_noscan(required) + } else if a.flags.has(.is_slice) { + // `required <= a.cap` here, so this is the `needs_unique_append` case + a.clone_shallow_to_cap_noscan(a.cap) + } + unsafe { + copy_element_to(&u8(a.data) + u64(a.element_size) * u64(a.len), val, a.element_size) } - unsafe { vmemcpy(&u8(a.data) + u64(a.element_size) * u64(a.len), val, a.element_size) } a.len++ } diff --git a/vlib/builtin/array_push_element_sizes_test.v b/vlib/builtin/array_push_element_sizes_test.v new file mode 100644 index 000000000..9bcce8dd0 --- /dev/null +++ b/vlib/builtin/array_push_element_sizes_test.v @@ -0,0 +1,243 @@ +// Guards the single element push fast path in `array.push`/`push_noscan`. +// Single element pushes dispatch the common small element sizes (1/2/4/8/16 bytes) to +// `vmemcpy` with a constant size, and fall back to a runtime sized `vmemcpy` otherwise. +// These tests make sure every size copies the right bytes for awkward element types too: +// low alignment size 8/16 structs, structs with padding, and floats - i.e. types where a +// typed integer copy would impose alignment it does not have or read padding through the +// wrong type. They also check that pushing to a slice view clones the backing buffer +// instead of corrupting the parent array. + +struct S16 { + a u64 + b u64 +} + +struct S3 { + a u8 + b u8 + c u8 +} + +struct S24 { + a u64 + b u64 + c u64 +} + +// Pair8 is 8 bytes with only 4 byte alignment (no field needs 8 byte alignment). +struct Pair8 { + a int + b int +} + +// Padded8 is 8 bytes and contains padding: a u8 followed by 3 padding bytes, then a u32. +struct Padded8 { + a u8 + b u32 +} + +// LowAlign16 is a 16 byte element with only 1 byte alignment (a fixed byte array field). +// This is the case a 16 byte typed (two u64) copy would mishandle on strict alignment targets. +struct LowAlign16 { +mut: + data [16]u8 +} + +// Padded16 is 16 bytes and contains padding: a u8 followed by 3 padding bytes, then three +// u32s. It avoids u64 fields so its size is exactly 16 on every ABI (u64 alignment can be 4 +// on some 32-bit targets, which would make a u8+u64 struct 12 bytes instead of 16). +struct Padded16 { + a u8 + b u32 + c u32 + d u32 +} + +fn test_push_u8() { + mut a := []u8{cap: 4} + for i in 0 .. 300 { + a << u8(i & 0xff) + } + assert a.len == 300 + for i in 0 .. 300 { + assert a[i] == u8(i & 0xff) + } +} + +fn test_push_u16() { + mut a := []u16{} + for i in 0 .. 300 { + a << u16(i * 7) + } + for i in 0 .. 300 { + assert a[i] == u16(i * 7) + } +} + +fn test_push_int() { + mut a := []int{} + for i in 0 .. 300 { + a << i * 12345 + } + for i in 0 .. 300 { + assert a[i] == i * 12345 + } +} + +fn test_push_i64() { + mut a := []i64{} + for i in 0 .. 300 { + a << i64(i) * 9876543210 + } + for i in 0 .. 300 { + assert a[i] == i64(i) * 9876543210 + } +} + +fn test_push_f64() { + mut a := []f64{} + for i in 0 .. 100 { + a << f64(i) + 0.5 + } + for i in 0 .. 100 { + assert a[i] == f64(i) + 0.5 + } +} + +fn test_push_string() { + // string is a 16 byte element + mut a := []string{} + for i in 0 .. 200 { + a << 'item_${i}' + } + for i in 0 .. 200 { + assert a[i] == 'item_${i}' + } +} + +fn test_push_struct_16() { + mut a := []S16{} + for i in 0 .. 200 { + a << S16{u64(i), u64(i) * 2} + } + for i in 0 .. 200 { + assert a[i].a == u64(i) + assert a[i].b == u64(i) * 2 + } +} + +fn test_push_struct_odd_size() { + // 3 byte element exercises the vmemcpy fallback path + mut a := []S3{} + for i in 0 .. 200 { + a << S3{u8(i), u8(i + 1), u8(i + 2)} + } + for i in 0 .. 200 { + assert a[i].a == u8(i) + assert a[i].b == u8(i + 1) + assert a[i].c == u8(i + 2) + } +} + +fn test_push_struct_large() { + // 24 byte element exercises the vmemcpy fallback path + mut a := []S24{} + for i in 0 .. 200 { + a << S24{u64(i), u64(i) * 2, u64(i) * 3} + } + for i in 0 .. 200 { + assert a[i].a == u64(i) + assert a[i].b == u64(i) * 2 + assert a[i].c == u64(i) * 3 + } +} + +fn test_push_to_slice_view_clones() { + mut a := []int{cap: 10} + for i in 0 .. 5 { + a << i + } + // a real slice view into a's buffer, with spare capacity after it + mut s := unsafe { a[1..3] } + // pushing to a slice view must clone first, not overwrite a[3] + s << 999 + assert a == [0, 1, 2, 3, 4] + assert s == [1, 2, 999] +} + +fn test_push_to_byte_slice_view_clones() { + mut a := []u8{cap: 10} + for i in 0 .. 5 { + a << u8(i) + } + mut s := unsafe { a[1..3] } + s << u8(200) + assert a == [u8(0), 1, 2, 3, 4] + assert s == [u8(1), 2, 200] +} + +fn test_push_f32() { + // f32 is a 4 byte element; the copy must not route it through an integer type + mut a := []f32{} + for i in 0 .. 100 { + a << f32(i) + 0.25 + } + for i in 0 .. 100 { + assert a[i] == f32(i) + 0.25 + } +} + +fn test_push_pair8_low_alignment() { + assert sizeof(Pair8) == 8 + mut a := []Pair8{} + for i in 0 .. 200 { + a << Pair8{i, i * 3} + } + for i in 0 .. 200 { + assert a[i].a == i + assert a[i].b == i * 3 + } +} + +fn test_push_padded8() { + assert sizeof(Padded8) == 8 + mut a := []Padded8{} + for i in 0 .. 200 { + a << Padded8{u8(i), u32(i) * 7} + } + for i in 0 .. 200 { + assert a[i].a == u8(i) + assert a[i].b == u32(i) * 7 + } +} + +fn test_push_low_alignment_16() { + assert sizeof(LowAlign16) == 16 + mut a := []LowAlign16{} + for i in 0 .. 200 { + mut e := LowAlign16{} + for j in 0 .. 16 { + e.data[j] = u8(i + j) + } + a << e + } + for i in 0 .. 200 { + for j in 0 .. 16 { + assert a[i].data[j] == u8(i + j) + } + } +} + +fn test_push_padded16() { + assert sizeof(Padded16) == 16 + mut a := []Padded16{} + for i in 0 .. 200 { + a << Padded16{u8(i), u32(i) * 99, u32(i) * 7, u32(i) * 13} + } + for i in 0 .. 200 { + assert a[i].a == u8(i) + assert a[i].b == u32(i) * 99 + assert a[i].c == u32(i) * 7 + assert a[i].d == u32(i) * 13 + } +} -- 2.39.5