From de365a1fc6ab9c8cecbfd38bb4333bd24f887344 Mon Sep 17 00:00:00 2001 From: Alexander Medvednikov Date: Wed, 3 Jun 2026 23:35:52 +0300 Subject: [PATCH] v2: speed up the transform stage ~15x (42s -> ~2.85s on self-compile) (#27333) --- vlib/v2/builder/builder.v | 13 +- vlib/v2/builder/mem_darwin.c.v | 27 ++++ vlib/v2/builder/transform_parallel.v | 153 +++++++++++++++++-- vlib/v2/transformer/fn.v | 63 +++++--- vlib/v2/transformer/mem.v | 22 +++ vlib/v2/transformer/mem_darwin.c.v | 25 +++ vlib/v2/transformer/monomorphize.v | 220 +++++++++++++++++++++++++-- vlib/v2/transformer/struct.v | 21 +++ vlib/v2/transformer/transformer.v | 76 ++++++++- vlib/v2/transformer/types.v | 15 +- 10 files changed, 575 insertions(+), 60 deletions(-) create mode 100644 vlib/v2/builder/mem_darwin.c.v create mode 100644 vlib/v2/transformer/mem.v create mode 100644 vlib/v2/transformer/mem_darwin.c.v diff --git a/vlib/v2/builder/builder.v b/vlib/v2/builder/builder.v index 75c40355e..061e3c186 100644 --- a/vlib/v2/builder/builder.v +++ b/vlib/v2/builder/builder.v @@ -160,8 +160,17 @@ fn print_rss(stage string) { if os.getenv('V2_MEM') == '' { return } - bytes := runtime.used_memory() or { 0 } - eprintln(' [mem] ${stage}: ${bytes / (1024 * 1024)} MB') + rss := runtime.used_memory() or { 0 } + $if macos { + // Under -gc none nothing is freed, so `live` is monotonic and its + // per-phase delta is the exact bytes that phase allocated. `peak` is + // the high-water mark. Both are stable run-to-run, unlike `rss`. + live, peak := darwin_live_malloc_bytes() + mb := u64(1024 * 1024) + eprintln(' [mem] ${stage}: live ${live / mb} MB peak ${peak / mb} MB (rss ${rss / mb} MB)') + return + } + eprintln(' [mem] ${stage}: ${rss / (1024 * 1024)} MB') } // print_heap reports retained heap size after a forced GC, in MB. 
Unlike diff --git a/vlib/v2/builder/mem_darwin.c.v b/vlib/v2/builder/mem_darwin.c.v new file mode 100644 index 000000000..b7f9f8799 --- /dev/null +++ b/vlib/v2/builder/mem_darwin.c.v @@ -0,0 +1,27 @@ +// Copyright (c) 2020-2024 Joe Conigliaro. All rights reserved. +// Use of this source code is governed by an MIT license +// that can be found in the LICENSE file. +module builder + +#include <malloc/malloc.h> + +struct C.malloc_statistics_t { + blocks_in_use u32 + size_in_use usize + max_size_in_use usize + size_allocated usize +} + +fn C.malloc_default_zone() voidptr +fn C.malloc_zone_statistics(zone voidptr, stats &C.malloc_statistics_t) + +// darwin_live_malloc_bytes returns (current live malloc bytes, peak live bytes) +// from the default malloc zone. Under `-gc none` (no frees) the current value +// is monotonic across phases, so per-phase deltas are the exact number of bytes +// each phase allocated and never released. This is the reliable counterpart to +// runtime.used_memory(), whose resident_size reading is distorted by OS paging. 
+fn darwin_live_malloc_bytes() (u64, u64) { + mut st := C.malloc_statistics_t{} + C.malloc_zone_statistics(C.malloc_default_zone(), &st) + return u64(st.size_in_use), u64(st.max_size_in_use) +} diff --git a/vlib/v2/builder/transform_parallel.v b/vlib/v2/builder/transform_parallel.v index ff11c7aa8..74cf3133c 100644 --- a/vlib/v2/builder/transform_parallel.v +++ b/vlib/v2/builder/transform_parallel.v @@ -6,6 +6,8 @@ module builder import v2.ast import v2.transformer import runtime +import os +import time $if !windows { struct TransformChunkArgs { @@ -28,6 +30,8 @@ $if !windows { fn transform_chunk_thread(arg voidptr) voidptr { a := unsafe { &TransformChunkArgs(arg) } t := unsafe { &transformer.Transformer(a.t) } + wprof := os.getenv('V2_TTIME') != '' + mut wsw := time.new_stopwatch() mut w := t.new_worker_clone(a.worker_idx) if unsafe { a.flat != nil } { // Streaming rehydration: rehydrate one file at a time, transform it, @@ -52,6 +56,9 @@ $if !windows { for i := 0; i < a.files.len; i++ { result << w.transform_file_pub(a.files[i]) } + if wprof { + eprintln(' [ttime] worker ${a.worker_idx}: ${a.files.len} files in ${wsw.elapsed().milliseconds()}ms') + } unsafe { *(&[]ast.File(a.result_ptr)) = result *(&voidptr(a.worker_ptr)) = voidptr(w) @@ -61,8 +68,17 @@ $if !windows { } fn (mut b Builder) transform_files_parallel(mut trans transformer.Transformer) []ast.File { + timing := os.getenv('V2_TTIME') != '' + mut sw := time.new_stopwatch() mut result := b.transform_files_parallel_no_post_pass(mut trans) + if timing { + eprintln(' [ttime] (parallel) prepare+fanout: ${sw.elapsed().milliseconds()}ms') + sw = time.new_stopwatch() + } trans.post_pass(mut result) + if timing { + eprintln(' [ttime] (parallel) post_pass: ${sw.elapsed().milliseconds()}ms') + } return result } @@ -93,6 +109,8 @@ fn (mut b Builder) transform_files_parallel_no_post_pass_impl(mut trans transfor } else { trans.pre_pass(b.files) } + timing_impl := os.getenv('V2_TTIME') != '' + mut sw_impl := 
time.new_stopwatch() mut stream_files_from_flat := stream_from_flat mut files_to_transform := []ast.File{} if trans.needs_full_files_for_transform() { @@ -102,6 +120,15 @@ fn (mut b Builder) transform_files_parallel_no_post_pass_impl(mut trans transfor } else if !stream_from_flat { files_to_transform = b.files.clone() } + if timing_impl { + eprintln(' [ttime] prepare_files_for_transform total: ${sw_impl.elapsed().milliseconds()}ms') + sw_impl = time.new_stopwatch() + } + defer { + if timing_impl { + eprintln(' [ttime] per-file fanout: ${sw_impl.elapsed().milliseconds()}ms') + } + } // In flat mode, workers stream the rehydration per file (one legacy // ast.File in flight per worker at a time). Otherwise b.files is the @@ -143,8 +170,32 @@ fn (mut b Builder) transform_files_parallel_no_post_pass_impl(mut trans transfor return result } - // Split files into chunks and spawn workers via pthreads - chunk_size := (n_files + n_jobs - 1) / n_jobs // ceiling division + // Assign files to workers. Contiguous chunks badly unbalance the load: + // the few huge files (transformer.v, monomorphize.v, the cleanc gen + // files, ...) cluster into adjacent chunks, so 2-3 workers run ~10s + // while the rest finish in <0.5s and idle. For the non-flat path we + // instead use longest-processing-time-first (LPT) bucketing keyed on a + // cheap size proxy, then scatter each worker's results back to their + // original file index after the join (no concurrent writes — workers + // each fill their own chunk_results slot, the merge happens serially). + mut bucket_indices := [][]int{len: n_jobs} + if stream_files_from_flat { + // Flat streaming still uses contiguous [start,end) ranges. + chunk_size := (n_files + n_jobs - 1) / n_jobs + mut i := 0 + mut w := 0 + for i < n_files { + end := if i + chunk_size < n_files { i + chunk_size } else { n_files } + for j in i .. 
end { + bucket_indices[w] << j + } + i = end + w++ + } + } else { + bucket_indices = lpt_buckets(files_to_transform, n_jobs) + } + mut chunk_results := [][]ast.File{len: n_jobs} mut worker_ptrs := []voidptr{len: n_jobs, init: unsafe { nil }} mut thread_ids := []C.pthread_t{len: n_jobs} @@ -159,21 +210,26 @@ fn (mut b Builder) transform_files_parallel_no_post_pass_impl(mut trans transfor C.pthread_attr_setstacksize(attr, 64 * 1024 * 1024) mut chunk_idx := 0 - mut i := 0 - for i < n_files { - end := if i + chunk_size < n_files { i + chunk_size } else { n_files } + for w in 0 .. n_jobs { + idxs := bucket_indices[w] + if idxs.len == 0 { + continue + } if stream_files_from_flat { args << TransformChunkArgs{ t: unsafe { voidptr(trans) } flat: unsafe { &b.flat } - flat_start: i - flat_end: end + flat_start: idxs[0] + flat_end: idxs[idxs.len - 1] + 1 result_ptr: unsafe { voidptr(&chunk_results[chunk_idx]) } worker_ptr: unsafe { voidptr(&worker_ptrs[chunk_idx]) } worker_idx: chunk_idx } } else { - chunk := files_to_transform[i..end].clone() + mut chunk := []ast.File{cap: idxs.len} + for fi in idxs { + chunk << files_to_transform[fi] + } args << TransformChunkArgs{ t: unsafe { voidptr(trans) } files: chunk @@ -184,7 +240,6 @@ fn (mut b Builder) transform_files_parallel_no_post_pass_impl(mut trans transfor } C.pthread_create(unsafe { &thread_ids[chunk_idx] }, attr, transform_chunk_thread, unsafe { voidptr(&args[chunk_idx]) }) - i = end chunk_idx++ } C.pthread_attr_destroy(attr) @@ -194,15 +249,25 @@ fn (mut b Builder) transform_files_parallel_no_post_pass_impl(mut trans transfor C.pthread_join(thread_ids[ci], unsafe { nil }) } - // Collect results in chunk order and merge worker accumulated state - mut result := []ast.File{cap: n_files} - for ci := 0; ci < chunk_idx; ci++ { + // Scatter each worker's results back to original file order and merge + // accumulated state. 
bucket_indices[w] lists the original indices the + // w-th spawned worker processed, in the same order it produced results. + mut result := []ast.File{len: n_files} + mut ci := 0 + for w in 0 .. n_jobs { + idxs := bucket_indices[w] + if idxs.len == 0 { + continue + } chunk_files := chunk_results[ci] - for k := 0; k < chunk_files.len; k++ { - result << chunk_files[k] + for k, fi in idxs { + if k < chunk_files.len { + result[fi] = chunk_files[k] + } } - w := unsafe { &transformer.Transformer(worker_ptrs[ci]) } - trans.merge_worker(w) + worker := unsafe { &transformer.Transformer(worker_ptrs[ci]) } + trans.merge_worker(worker) + ci++ } // Set synth_pos_counter past all worker ranges to avoid ID collisions in post_pass. trans.set_synth_pos_counter(-(chunk_idx * 100_000) - 1) @@ -210,6 +275,62 @@ fn (mut b Builder) transform_files_parallel_no_post_pass_impl(mut trans transfor } } +// lpt_buckets distributes file indices across n_jobs workers using the +// longest-processing-time-first heuristic: process files largest-first and +// always append to the currently least-loaded worker. This keeps the heaviest +// files on separate workers so the fan-out wall time approaches +// total_work / n_jobs instead of being pinned to one overloaded contiguous +// chunk. The cost proxy is the top-level statement count plus the statement +// count of each top-level function body (cheap to compute). Deterministic: files are +// ordered by (cost desc, index asc) and ties pick the lowest worker index. +fn lpt_buckets(files []ast.File, n_jobs int) [][]int { + n := files.len + mut cost := []int{len: n} + for i in 0 .. n { + // Cost proxy: count function bodies, not just top-level declarations, so + // a file of a few huge functions (transformer.v, the cleanc gen files) + // outranks one with many tiny ones. Deterministic; one level deep is + // enough to separate the heavyweight files that drove the imbalance. 
+ mut c := 1 + for stmt in files[i].stmts { + c++ + if stmt is ast.FnDecl { + c += stmt.stmts.len + } + } + cost[i] = c + } + // order = file indices by cost descending. Implemented as a plain insertion + // sort (n is small, a few hundred) rather than sort_with_compare: this file + // must self-host through every backend, and capturing closures / pointer + // comparators are not reliably codegen'd by the v2 cleanc and arm64 paths. + // Stable on index (only shifts on strictly-greater), so deterministic. + mut order := []int{len: n, init: index} + for i in 1 .. n { + key := order[i] + kc := cost[key] + mut j := i - 1 + for j >= 0 && cost[order[j]] < kc { + order[j + 1] = order[j] + j-- + } + order[j + 1] = key + } + mut buckets := [][]int{len: n_jobs} + mut load := []i64{len: n_jobs} + for fi in order { + mut mw := 0 + for w in 1 .. n_jobs { + if load[w] < load[mw] { + mw = w + } + } + buckets[mw] << fi + load[mw] += i64(cost[fi]) + } + return buckets +} + // transform_files_parallel_to_flat is the parallel counterpart of // Transformer.transform_files_to_flat. Today it composes the existing // parallel transform with a boundary flatten_files() — same total work diff --git a/vlib/v2/transformer/fn.v b/vlib/v2/transformer/fn.v index 35ce1d2d4..10ee6cfbf 100644 --- a/vlib/v2/transformer/fn.v +++ b/vlib/v2/transformer/fn.v @@ -1172,26 +1172,31 @@ fn (t &Transformer) method_key_matches_type_name(method_key string, type_name st || !transformer_string_has_valid_data(type_name) { return false } - normalized_key := method_key.replace('.', '__') - normalized_type := type_name.replace('.', '__') + // Avoid .replace/.contains here: replace always allocates and contains builds + // a KMP failure table per call. This runs inside O(method_keys) fallback loops + // per call site, so those per-call allocations were a large transform cost. + // Only normalize when a '.' is actually present (index_u8 does not allocate), + // and locate `__` with a hand-rolled scan. 
+ normalized_key := if method_key.index_u8(`.`) >= 0 { + method_key.replace('.', '__') + } else { + method_key + } + normalized_type := if type_name.index_u8(`.`) >= 0 { + type_name.replace('.', '__') + } else { + type_name + } if normalized_key == normalized_type { return true } - key_is_qualified := normalized_key.contains('__') - type_is_qualified := normalized_type.contains('__') - if key_is_qualified && type_is_qualified { + key_dunder := last_double_underscore(normalized_key) + type_dunder := last_double_underscore(normalized_type) + if key_dunder >= 0 && type_dunder >= 0 { return false } - short_type := if normalized_type.contains('__') { - normalized_type.all_after_last('__') - } else { - normalized_type - } - short_key := if normalized_key.contains('__') { - normalized_key.all_after_last('__') - } else { - normalized_key - } + short_type := if type_dunder >= 0 { normalized_type[type_dunder + 2..] } else { normalized_type } + short_key := if key_dunder >= 0 { normalized_key[key_dunder + 2..] } else { normalized_key } if short_key == short_type { return true } @@ -1210,6 +1215,28 @@ fn (t &Transformer) method_key_matches_type_name(method_key string, type_name st return false } +// candidate_method_keys returns the cached method keys that could fuzzy-match any +// of `names` — i.e. those sharing a receiver short name. A method_key_matches_type_name +// match always implies equal short names, so the fuzzy fallback loops can scan +// these candidates instead of every method key (O(all_keys) per call site). 
+fn (t &Transformer) candidate_method_keys(names []string) []string { + mut cand := []string{} + mut shorts_done := []string{} + for name in names { + if name == '' { + continue + } + sh := method_short_name(name) + if sh in shorts_done { + continue + } + shorts_done << sh + keys := t.cached_method_keys_by_short[sh] or { continue } + cand << keys + } + return cand +} + fn (t &Transformer) lookup_method_return_type(type_names []string, method_name string) ?types.Type { if method_name == '' { return none @@ -1229,7 +1256,7 @@ fn (t &Transformer) lookup_method_return_type(type_names []string, method_name s } } } - for key in t.cached_method_keys { + for key in t.candidate_method_keys(seen) { mut matches_receiver := false for type_name in seen { if t.method_key_matches_type_name(key, type_name) { @@ -1330,7 +1357,7 @@ fn (t &Transformer) lookup_method_exists(type_names []string, method_name string return true } } - for key in t.cached_method_keys { + for key in t.candidate_method_keys(seen) { mut matches_receiver := false for type_name in seen { if t.method_key_matches_type_name(key, type_name) { @@ -4454,7 +4481,7 @@ fn (t &Transformer) resolve_method_call_name(receiver ast.Expr, method_name stri } } // Fuzzy fallback: iterate method keys to find matching receiver types - for key in t.cached_method_keys { + for key in t.candidate_method_keys(lookup_names) { mut matches_receiver := false for name in lookup_names { if t.method_key_matches_type_name(key, name) { diff --git a/vlib/v2/transformer/mem.v b/vlib/v2/transformer/mem.v new file mode 100644 index 000000000..85020e576 --- /dev/null +++ b/vlib/v2/transformer/mem.v @@ -0,0 +1,22 @@ +// Copyright (c) 2020-2024 Joe Conigliaro. All rights reserved. +// Use of this source code is governed by an MIT license +// that can be found in the LICENSE file. +module transformer + +import os + +// t_print_mem reports live malloc bytes at a transformer sub-phase boundary. +// Gated on V2_MEM. 
Under -gc none the value is monotonic on macOS, so deltas +// between stages are the exact bytes each sub-phase allocated. Defined for all +// platforms (transform_files calls it everywhere); the malloc-statistics probe +// is macOS-only, so other platforms just print the stage marker. +fn t_print_mem(stage string) { + if os.getenv('V2_MEM') == '' { + return + } + $if macos { + eprintln(' [mem] transform/${stage}: live ${darwin_transform_live_mb()} MB') + } $else { + eprintln(' [mem] transform/${stage}') + } +} diff --git a/vlib/v2/transformer/mem_darwin.c.v b/vlib/v2/transformer/mem_darwin.c.v new file mode 100644 index 000000000..1735f17a5 --- /dev/null +++ b/vlib/v2/transformer/mem_darwin.c.v @@ -0,0 +1,25 @@ +// Copyright (c) 2020-2024 Joe Conigliaro. All rights reserved. +// Use of this source code is governed by an MIT license +// that can be found in the LICENSE file. +module transformer + +#include <malloc/malloc.h> + +struct C.malloc_statistics_t { + blocks_in_use u32 + size_in_use usize + max_size_in_use usize + size_allocated usize +} + +fn C.malloc_default_zone() voidptr +fn C.malloc_zone_statistics(zone voidptr, stats &C.malloc_statistics_t) + +// darwin_transform_live_mb returns the process's currently live malloc bytes +// (in MB) on macOS. Used by t_print_mem; only referenced from its `$if macos` +// branch, so the macOS-only C calls here never reach other platforms. 
+fn darwin_transform_live_mb() u64 { + mut st := C.malloc_statistics_t{} + C.malloc_zone_statistics(C.malloc_default_zone(), &st) + return u64(st.size_in_use) / (1024 * 1024) +} diff --git a/vlib/v2/transformer/monomorphize.v b/vlib/v2/transformer/monomorphize.v index cd1b54fd7..2426e2282 100644 --- a/vlib/v2/transformer/monomorphize.v +++ b/vlib/v2/transformer/monomorphize.v @@ -8,6 +8,8 @@ module transformer import v2.ast import v2.token import v2.types +import os +import time struct CloneComptimeFieldCtx { var_name string @@ -59,24 +61,105 @@ pub fn (mut t Transformer) prepare_files_for_transform(files []ast.File) []ast.F mut prepared := files.clone() t.collect_declared_method_fns(prepared) t.collect_struct_field_generic_decl_types(prepared) - for _ in 0 .. 64 { + timing := os.getenv('V2_TTIME') != '' + mut sw := time.new_stopwatch() + // prepared_dirty tracks whether `prepared` changed since it was last scanned + // by collect_generic_call_specs. collect is a pure (idempotent) function of + // the program: scanning an unchanged file set rediscovers the exact same + // specs. So the leading full-program scan is only needed when the program + // actually changed since the previous scan — i.e. when the previous + // iteration's inject step appended new struct specializations. The + // post-mono_pass scan below already covers the clones mono_pass adds, so the + // only unscanned source of change between iterations is inject. This removes + // the fixpoint's redundant confirmation scan (~6s / ~700MB for v2 self-host). + mut prepared_dirty := true + for iter in 0 .. 64 { spec_count := t.monomorphized_specs.len generic_count := t.generic_types_spec_count() struct_count := t.generic_struct_specs.len - t.collect_generic_call_specs(prepared) + ts_start := sw.elapsed().milliseconds() + // collect1: full-program scan, only when the program changed since the + // last scan (initial pass, or inject appended structs last iteration). 
+ if prepared_dirty { + t.collect_generic_call_specs(prepared) + } + ts_collect1 := sw.elapsed().milliseconds() + before_mono := t.monomorphized_specs.len prepared = t.monomorphize_pass(prepared) - t.collect_generic_call_specs(prepared) + ts_mono := sw.elapsed().milliseconds() + // collect2: only when monomorphize_pass materialized new clones. Those + // clones are the only statements the previous scan has not seen — every + // other statement is byte-for-byte unchanged and collect is idempotent — + // so rescan just the new clones instead of re-walking all files. This + // turns the dominant ~6s/~700MB full rescan into a walk of a few dozen + // functions. + if t.monomorphized_specs.len != before_mono { + t.collect_generic_call_specs_in_new_clones(prepared) + } + ts_collect2 := sw.elapsed().milliseconds() prepared = t.inject_generic_struct_specializations(prepared) + ts_inject := sw.elapsed().milliseconds() + // collect2 already rescanned mono_pass's clones, so `prepared` is clean + // for the next iteration unless inject just appended new specializations. + prepared_dirty = t.inject_changed_files + if timing { + eprintln(' [ttime] iter ${iter}: collect1=${ts_collect1 - ts_start}ms mono_pass=${ts_mono - ts_collect1}ms collect2=${ts_collect2 - ts_mono}ms inject=${ts_inject - ts_collect2}ms files=${prepared.len}') + } + t_print_mem('monomorphize iter ${iter}') if t.monomorphized_specs.len == spec_count && t.generic_types_spec_count() == generic_count && t.generic_struct_specs.len == struct_count { break } } + if dump_path := os.getenv_opt('V2_TDUMP') { + t.dump_monomorphize_specs(dump_path, prepared) + } t.collect_struct_default_decl_infos(prepared) t.collect_concrete_embedded_owner_names(prepared) return prepared } +// dump_monomorphize_specs writes a deterministic snapshot of the fixpoint's +// result (all monomorphized fn spec keys, generic struct spec keys, generic +// binding signatures, and per-file appended-stmt counts) to `path`. 
Used only +// for correctness validation: the file must be byte-identical before and after +// any optimization to the fixpoint loop. Gated on V2_TDUMP. +fn (t &Transformer) dump_monomorphize_specs(path string, prepared []ast.File) { + mut lines := []string{} + mut mspecs := t.monomorphized_specs.keys() + mspecs.sort() + lines << '# monomorphized_specs (${mspecs.len})' + for k in mspecs { + lines << 'M ${k}' + } + mut sspecs := t.generic_struct_specs.keys() + sspecs.sort() + lines << '# generic_struct_specs (${sspecs.len})' + for k in sspecs { + lines << 'S ${k}' + } + mut gkeys := t.env.generic_types.keys() + gkeys.sort() + lines << '# generic_types' + for k in gkeys { + blist := t.env.generic_types[k] or { continue } + mut sigs := []string{} + for b in blist { + sigs << generic_bindings_signature(b) + } + sigs.sort() + for sig in sigs { + lines << 'G ${k} :: ${sig}' + } + } + // Per-file fingerprint: file name + stmt count, in input order. + lines << '# files (${prepared.len})' + for f in prepared { + lines << 'F ${f.mod}/${f.name} stmts=${f.stmts.len}' + } + os.write_file(path, lines.join('\n')) or {} +} + fn (t &Transformer) generic_types_spec_count() int { mut count := 0 for _, bindings_list in t.env.generic_types { @@ -607,6 +690,7 @@ fn (mut t Transformer) clone_generic_struct_decl(decl ast.StructDecl, spec Gener } fn (mut t Transformer) inject_generic_struct_specializations(files []ast.File) []ast.File { + t.inject_changed_files = false if t.generic_struct_specs.len == 0 { return files } @@ -626,6 +710,22 @@ fn (mut t Transformer) inject_generic_struct_specializations(files []ast.File) [ spec_keys := t.generic_struct_specs.keys() mut sorted_keys := spec_keys.clone() sorted_keys.sort() + // Fast path: if every struct spec is already present in `files`, nothing + // will be injected and the loop below would only rebuild an identical file + // set. 
Return the input unchanged to avoid duplicating every file's stmt + // list (~1.5GB under -gc none on the fixpoint loop's confirmation pass). + // Mirrors monomorphize_pass's `per_file_clones.len == 0 { return files }`. + mut any_pending := false + for key in sorted_keys { + spec := t.generic_struct_specs[key] or { continue } + if spec.concrete_c_name !in existing { + any_pending = true + break + } + } + if !any_pending { + return files + } mut out := []ast.File{cap: files.len} for file in files { mut stmts := []ast.Stmt{cap: file.stmts.len} @@ -661,6 +761,7 @@ fn (mut t Transformer) inject_generic_struct_specializations(files []ast.File) [ selector_names: file.selector_names } } + t.inject_changed_files = true return out } @@ -901,6 +1002,9 @@ pub fn (mut t Transformer) monomorphize_pass(files []ast.File) []ast.File { deferred_specs := t.deferred_generic_call_specs.clone() t.deferred_generic_call_specs = old_deferred_specs.clone() t.flush_deferred_generic_call_specs(deferred_specs) + // Record the freshly-materialized clones so the fixpoint can rescan only + // them (collect_generic_call_specs_in_new_clones) instead of all files. + t.last_mono_clones = per_file_clones.clone() if per_file_clones.len == 0 { return files } @@ -1636,18 +1740,57 @@ fn monomorphized_clone_name(fn_key string, decl ast.FnDecl, spec_name string) st } fn generic_base_name_without_specialization(name string) string { - mut base_name := name - bracket_pos := base_name.index_u8(`[`) - if bracket_pos > 0 { - base_name = base_name[..bracket_pos] + // Hot path: called per-method per-call-site during generic collection. + // Hand-rolled byte scans replace string.contains/.index/.all_before, each of + // which builds a fresh KMP failure-table heap allocation on every call. With + // millions of calls (and most names having neither `[` nor `_T_`) that + // allocation churn dominated the whole transform stage. 
This is behaviour- + // identical to the previous index_u8(`[`)>0 / contains('_T_') / ends_with('_T') + // version, just without the per-call allocations. + mut end := name.len + for i in 0 .. name.len { + if name[i] == `[` { + if i > 0 { + end = i + } + break + } } - if base_name.contains('_T_') { - return base_name.all_before('_T_') + for i := 0; i + 3 <= end; i++ { + if name[i] == `_` && name[i + 1] == `T` && name[i + 2] == `_` { + return name[..i] + } } - if base_name.ends_with('_T') { - return base_name[..base_name.len - 2] + if end >= 2 && name[end - 1] == `T` && name[end - 2] == `_` { + return name[..end - 2] } - return base_name + if end == name.len { + return name + } + return name[..end] +} + +// method_short_name returns the final `__`-separated segment of s (after a +// `.`->`__` normalization). It matches the short form method_key_matches_type_name +// compares, so it can be used to bucket method keys for fast candidate lookup. +fn method_short_name(s string) string { + norm := if s.index_u8(`.`) >= 0 { s.replace('.', '__') } else { s } + d := last_double_underscore(norm) + return if d >= 0 { norm[d + 2..] } else { norm } +} + +// last_double_underscore returns the index of the last `__` in s, or -1. +// Hand-rolled (no allocation) replacement for s.contains('__') / .all_after_last('__'), +// which build KMP tables / allocate; used in hot per-call-site name matching. 
+fn last_double_underscore(s string) int { + mut i := s.len - 2 + for i >= 0 { + if s[i] == `_` && s[i + 1] == `_` { + return i + } + i-- + } + return -1 } fn (mut t Transformer) index_generic_fn_decl_for_monomorphize(mut decl_owner map[string]int, mut decl_node map[string]ast.FnDecl, decl ast.FnDecl, file_idx int, module_name string) { @@ -1759,6 +1902,33 @@ fn (mut t Transformer) collect_generic_call_specs(files []ast.File) { old_scope := t.scope old_file_idx := t.cur_generic_call_file_idx old_import_aliases := t.cur_import_aliases.clone() + t.build_generic_fn_decl_index(files) + for fi, file in files { + t.cur_file_name = file.name + t.cur_module = file.mod + t.cur_generic_call_file_idx = fi + t.cur_import_aliases = import_aliases_for_generic_collect(file.imports) + if scope := t.get_module_scope(file.mod) { + t.scope = scope + } else { + t.scope = unsafe { nil } + } + for stmt in file.stmts { + t.collect_generic_call_specs_in_stmt(stmt) + } + } + t.cur_module = old_module + t.cur_file_name = old_file + t.scope = old_scope + t.cur_generic_call_file_idx = old_file_idx + t.cur_import_aliases = old_import_aliases.clone() +} + +// build_generic_fn_decl_index (re)builds t.generic_fn_decl_index from every +// generic FnDecl across all files. Generic declarations never change during the +// fixpoint (monomorphize_pass only appends concrete clones), but the index maps +// keys to file indices, so it is rebuilt against the current file set. +fn (mut t Transformer) build_generic_fn_decl_index(files []ast.File) { t.generic_fn_decl_index = map[string]ast.FnDecl{} mut dummy_owner := map[string]int{} for fi, file in files { @@ -1772,7 +1942,31 @@ fn (mut t Transformer) collect_generic_call_specs(files []ast.File) { } } } +} + +// collect_generic_call_specs_in_new_clones is the incremental counterpart of +// collect_generic_call_specs. 
After monomorphize_pass appends concrete clones, +// only those clones can introduce generic calls the previous scan did not see — +// every other statement is byte-for-byte unchanged and was already walked. So +// this rebuilds the generic-decl index (cheap, shallow) but deep-walks only the +// clones recorded in t.last_mono_clones, each under its owning file's module, +// scope and import context. For v2 self-host this replaces a full ~6s/~700MB +// rescan with a walk of a few dozen functions. +fn (mut t Transformer) collect_generic_call_specs_in_new_clones(files []ast.File) { + if t.last_mono_clones.len == 0 { + return + } + old_module := t.cur_module + old_file := t.cur_file_name + old_scope := t.scope + old_file_idx := t.cur_generic_call_file_idx + old_import_aliases := t.cur_import_aliases.clone() + t.build_generic_fn_decl_index(files) for fi, file in files { + clones := t.last_mono_clones[fi] or { continue } + if clones.len == 0 { + continue + } t.cur_file_name = file.name t.cur_module = file.mod t.cur_generic_call_file_idx = fi @@ -1782,7 +1976,7 @@ fn (mut t Transformer) collect_generic_call_specs(files []ast.File) { } else { t.scope = unsafe { nil } } - for stmt in file.stmts { + for stmt in clones { t.collect_generic_call_specs_in_stmt(stmt) } } diff --git a/vlib/v2/transformer/struct.v b/vlib/v2/transformer/struct.v index db7570544..b0197a9f6 100644 --- a/vlib/v2/transformer/struct.v +++ b/vlib/v2/transformer/struct.v @@ -75,6 +75,27 @@ fn (t &Transformer) lookup_struct_field_type(struct_name string, field_name stri || !transformer_string_has_valid_data(field_name) { return none } + // Memoize: this is called per field-access expression and re-derives the type + // each time. The result is a pure function of (cur_module, struct_name, + // field_name) — cur_module only matters for the unqualified-name path, but + // keying on it unconditionally is always correct. A Void value is the + // "resolved to none" sentinel (struct fields are never void-typed). 
+ cache_key := '${t.cur_module}\x01${struct_name}\x01${field_name}' + if cached := t.field_type_cache[cache_key] { + if cached is types.Void { + return none + } + return cached + } + result := t.lookup_struct_field_type_uncached(struct_name, field_name) + unsafe { + mut mt := &Transformer(voidptr(t)) + mt.field_type_cache[cache_key] = result or { types.Type(types.void_) } + } + return result +} + +fn (t &Transformer) lookup_struct_field_type_uncached(struct_name string, field_name string) ?types.Type { if field_type := t.lookup_struct_field_generic_decl_type(struct_name, field_name) { return field_type } diff --git a/vlib/v2/transformer/transformer.v b/vlib/v2/transformer/transformer.v index 48bed68ac..6752ceb72 100644 --- a/vlib/v2/transformer/transformer.v +++ b/vlib/v2/transformer/transformer.v @@ -123,13 +123,47 @@ mut: cached_methods map[string][]&types.Fn cached_method_keys []string cached_fn_scopes map[string]&types.Scope + // cached_method_base_index maps type_name -> generic-base method name -> + // FnType, precomputed once from cached_methods. lookup_method_cached used to + // linearly scan every method of a type and recompute its base name (via + // generic_base_name_without_specialization) on every call site — O(calls x + // methods) with a string scan inner loop, the single biggest transform cost. + // This makes the lookup O(1). Flat map keyed by "type_name#base_name" -> the + // types.Type sum (callers smartcast to FnType). Flat (not nested) so a lookup + // doesn't copy an inner map, and stored as the sum (not the FnType variant) + // because v2's own codegen mishandles a map valued by a bare sum-variant. + cached_method_base_index map[string]types.Type + // cached_method_keys_by_short buckets cached_method_keys by their short + // (final `__`-segment) name. 
The fuzzy method-key fallback loops only ever + // match a key whose short name equals the receiver's short name, so they can + // scan just this bucket instead of every method key (O(all_keys) per call). + cached_method_keys_by_short map[string][]string + // field_type_cache memoizes lookup_struct_field_type, which is called per + // field-access expression and re-derives the field type (scope lookups + + // types.Type-by-value copies + a scan-all-scopes fallback) every time. Keyed + // by "cur_module\x01struct\x01field" because the unqualified-name path + // consults cur_module. Written through an unsafe const->mut cast (the result + // is pure given the key, so this is benign interior mutability); each parallel + // worker gets its own empty cache, so there is no cross-thread sharing. + field_type_cache map[string]types.Type // Accumulated synth types for deferred application (thread-safe). // Instead of writing directly to env.set_expr_type during parallel transform, // store here and apply after merge. synth_types map[int]types.Type // Generic monomorphization clones generic FnDecls per env.generic_types // binding before code generation, so backends receive concrete functions. - monomorphized_specs map[string]bool + monomorphized_specs map[string]bool + // inject_changed_files records whether the last + // inject_generic_struct_specializations call actually appended new struct + // specializations (i.e. returned a modified file set). The monomorphize + // fixpoint uses it to skip the next iteration's full-program collect scan + // when nothing changed. + inject_changed_files bool + // last_mono_clones maps file index -> the FnDecl clone stmts that the most + // recent monomorphize_pass appended to that file. The fixpoint rescans only + // these freshly-materialized clones (the rest of the program was already + // scanned) instead of re-walking all files. Empty when mono_pass added none. 
+ last_mono_clones map[int][]ast.Stmt generic_spec_owner_file map[string]int deferred_generic_call_specs []DeferredGenericCallSpec generic_struct_specs map[string]GenericStructSpec @@ -337,6 +371,9 @@ pub fn (t &Transformer) new_worker_clone(worker_idx int) &Transformer { file_set: unsafe { t.file_set } cached_scopes: t.cached_scopes.clone() cached_methods: t.cached_methods.clone() + cached_method_base_index: t.cached_method_base_index.clone() + cached_method_keys_by_short: t.cached_method_keys_by_short.clone() + field_type_cache: map[string]types.Type{} cached_method_keys: t.cached_method_keys.clone() cached_fn_scopes: t.cached_fn_scopes.clone() synth_types: t.synth_types.clone() @@ -1498,6 +1535,38 @@ fn (mut t Transformer) cache_env_maps() { t.cached_methods = t.env.snapshot_methods() t.cached_method_keys = t.cached_methods.keys() t.cached_fn_scopes = t.env.snapshot_fn_scopes() + t.build_cached_method_base_index() + mut by_short := map[string][]string{} + for key in t.cached_method_keys { + by_short[method_short_name(key)] << key + } + t.cached_method_keys_by_short = by_short.move() +} + +// build_cached_method_base_index precomputes, for each type, a base-method-name +// -> FnType map so lookup_method_cached is O(1) instead of scanning every method +// and recomputing base names per call site. For each base name it keeps the +// first method (in snapshot order) whose type is an FnType, matching the old +// linear scan's "first base-or-exact match that is a FnType wins" semantics. +fn (mut t Transformer) build_cached_method_base_index() { + // Iterate via keys()+index, not `for k, v in t.cached_methods`: v2's own + // codegen mistypes the value of a `for k, v` over map[string][]&types.Fn as + // []types.Fn (value array), breaking self-host. 
+ mut index := map[string]types.Type{} + for type_name in t.cached_methods.keys() { + methods := t.cached_methods[type_name] or { continue } + for method in methods { + typ := method.get_typ() + if typ is types.FnType { + base := generic_base_name_without_specialization(method.get_name()) + key := '${type_name}#${base}' + if key !in index { + index[key] = typ + } + } + } + } + t.cached_method_base_index = index.move() } // GeneratedFnsParts is the pure-computation bundle produced by @@ -2242,13 +2311,18 @@ pub fn (mut t Transformer) inject_embed_file_helper_to_flat(mut out ast.FlatBuil // transform_files transforms all files and returns transformed copies pub fn (mut t Transformer) transform_files(files []ast.File) []ast.File { + t_print_mem('enter') t.pre_pass(files) + t_print_mem('after pre_pass') files_to_transform := t.prepare_files_for_transform(files) + t_print_mem('after prepare/monomorphize') mut result := []ast.File{cap: files_to_transform.len} for file in files_to_transform { result << t.transform_file(file) } + t_print_mem('after per-file loop') t.post_pass(mut result) + t_print_mem('after post_pass') return result } diff --git a/vlib/v2/transformer/types.v b/vlib/v2/transformer/types.v index 43454ea00..63508a5a0 100644 --- a/vlib/v2/transformer/types.v +++ b/vlib/v2/transformer/types.v @@ -26,17 +26,12 @@ fn (t &Transformer) get_synth_type(pos token.Pos) ?types.Type { // lookup_method_cached looks up a method by receiver type name and method name // using cached_methods (lock-free) instead of env.lookup_method. fn (t &Transformer) lookup_method_cached(type_name string, method_name string) ?types.FnType { - methods := t.cached_methods[type_name] or { return none } + // O(1) via the precomputed base-name index (built in build_cached_method_base_index). + // Equivalent to the old linear scan: match by generic base name, first FnType wins. 
base_method_name := generic_base_name_without_specialization(method_name) - for method in methods { - cached_name := method.get_name() - if cached_name == method_name - || generic_base_name_without_specialization(cached_name) == base_method_name { - typ := method.get_typ() - if typ is types.FnType { - return typ - } - } + typ := t.cached_method_base_index['${type_name}#${base_method_name}'] or { return none } + if typ is types.FnType { + return typ } return none } -- 2.39.5