From de365a1fc6ab9c8cecbfd38bb4333bd24f887344 Mon Sep 17 00:00:00 2001
From: Alexander Medvednikov <alexander@medvednikov.com>
Date: Wed, 3 Jun 2026 23:35:52 +0300
Subject: [PATCH] v2: speed up the transform stage ~15x (42s -> ~2.85s on
 self-compile) (#27333)

---
 vlib/v2/builder/builder.v            |  13 +-
 vlib/v2/builder/mem_darwin.c.v       |  27 ++++
 vlib/v2/builder/transform_parallel.v | 153 +++++++++++++++++--
 vlib/v2/transformer/fn.v             |  63 +++++---
 vlib/v2/transformer/mem.v            |  22 +++
 vlib/v2/transformer/mem_darwin.c.v   |  25 +++
 vlib/v2/transformer/monomorphize.v   | 220 +++++++++++++++++++++++++--
 vlib/v2/transformer/struct.v         |  21 +++
 vlib/v2/transformer/transformer.v    |  76 ++++++++-
 vlib/v2/transformer/types.v          |  15 +-
 10 files changed, 575 insertions(+), 60 deletions(-)
 create mode 100644 vlib/v2/builder/mem_darwin.c.v
 create mode 100644 vlib/v2/transformer/mem.v
 create mode 100644 vlib/v2/transformer/mem_darwin.c.v

diff --git a/vlib/v2/builder/builder.v b/vlib/v2/builder/builder.v
index 75c40355e..061e3c186 100644
--- a/vlib/v2/builder/builder.v
+++ b/vlib/v2/builder/builder.v
@@ -160,8 +160,17 @@ fn print_rss(stage string) {
 	if os.getenv('V2_MEM') == '' {
 		return
 	}
-	bytes := runtime.used_memory() or { 0 }
-	eprintln('  [mem] ${stage}: ${bytes / (1024 * 1024)} MB')
+	rss := runtime.used_memory() or { 0 }
+	$if macos {
+		// Under -gc none nothing is freed, so `live` is monotonic and its
+		// per-phase delta is the exact bytes that phase allocated. `peak` is
+		// the high-water mark. Both are stable run-to-run, unlike `rss`.
+		live, peak := darwin_live_malloc_bytes()
+		mb := u64(1024 * 1024)
+		eprintln('  [mem] ${stage}: live ${live / mb} MB  peak ${peak / mb} MB  (rss ${rss / mb} MB)')
+		return
+	}
+	eprintln('  [mem] ${stage}: ${rss / (1024 * 1024)} MB')
 }
 
 // print_heap reports retained heap size after a forced GC, in MB. Unlike
diff --git a/vlib/v2/builder/mem_darwin.c.v b/vlib/v2/builder/mem_darwin.c.v
new file mode 100644
index 000000000..b7f9f8799
--- /dev/null
+++ b/vlib/v2/builder/mem_darwin.c.v
@@ -0,0 +1,27 @@
+// Copyright (c) 2020-2024 Joe Conigliaro. All rights reserved.
+// Use of this source code is governed by an MIT license
+// that can be found in the LICENSE file.
+module builder
+
+#include <malloc/malloc.h>
+
+struct C.malloc_statistics_t {
+	blocks_in_use   u32
+	size_in_use     usize
+	max_size_in_use usize
+	size_allocated  usize
+}
+
+fn C.malloc_default_zone() voidptr
+fn C.malloc_zone_statistics(zone voidptr, stats &C.malloc_statistics_t)
+
+// darwin_live_malloc_bytes returns (current live malloc bytes, peak live bytes)
+// from the default malloc zone. Under `-gc none` (no frees) the current value
+// is monotonic across phases, so per-phase deltas are the exact number of bytes
+// each phase allocated and never released. This is the reliable counterpart to
+// runtime.used_memory(), whose resident_size reading is distorted by OS paging.
+fn darwin_live_malloc_bytes() (u64, u64) {
+	mut st := C.malloc_statistics_t{}
+	C.malloc_zone_statistics(C.malloc_default_zone(), &st)
+	return u64(st.size_in_use), u64(st.max_size_in_use)
+}
diff --git a/vlib/v2/builder/transform_parallel.v b/vlib/v2/builder/transform_parallel.v
index ff11c7aa8..74cf3133c 100644
--- a/vlib/v2/builder/transform_parallel.v
+++ b/vlib/v2/builder/transform_parallel.v
@@ -6,6 +6,8 @@ module builder
 import v2.ast
 import v2.transformer
 import runtime
+import os
+import time
 
 $if !windows {
 	struct TransformChunkArgs {
@@ -28,6 +30,8 @@ $if !windows {
 	fn transform_chunk_thread(arg voidptr) voidptr {
 		a := unsafe { &TransformChunkArgs(arg) }
 		t := unsafe { &transformer.Transformer(a.t) }
+		wprof := os.getenv('V2_TTIME') != ''
+		mut wsw := time.new_stopwatch()
 		mut w := t.new_worker_clone(a.worker_idx)
 		if unsafe { a.flat != nil } {
 			// Streaming rehydration: rehydrate one file at a time, transform it,
@@ -52,6 +56,9 @@ $if !windows {
 		for i := 0; i < a.files.len; i++ {
 			result << w.transform_file_pub(a.files[i])
 		}
+		if wprof {
+			eprintln('  [ttime] worker ${a.worker_idx}: ${a.files.len} files in ${wsw.elapsed().milliseconds()}ms')
+		}
 		unsafe {
 			*(&[]ast.File(a.result_ptr)) = result
 			*(&voidptr(a.worker_ptr)) = voidptr(w)
@@ -61,8 +68,17 @@ $if !windows {
 }
 
 fn (mut b Builder) transform_files_parallel(mut trans transformer.Transformer) []ast.File {
+	timing := os.getenv('V2_TTIME') != ''
+	mut sw := time.new_stopwatch()
 	mut result := b.transform_files_parallel_no_post_pass(mut trans)
+	if timing {
+		eprintln('  [ttime] (parallel) prepare+fanout: ${sw.elapsed().milliseconds()}ms')
+		sw = time.new_stopwatch()
+	}
 	trans.post_pass(mut result)
+	if timing {
+		eprintln('  [ttime] (parallel) post_pass: ${sw.elapsed().milliseconds()}ms')
+	}
 	return result
 }
 
@@ -93,6 +109,8 @@ fn (mut b Builder) transform_files_parallel_no_post_pass_impl(mut trans transfor
 	} else {
 		trans.pre_pass(b.files)
 	}
+	timing_impl := os.getenv('V2_TTIME') != ''
+	mut sw_impl := time.new_stopwatch()
 	mut stream_files_from_flat := stream_from_flat
 	mut files_to_transform := []ast.File{}
 	if trans.needs_full_files_for_transform() {
@@ -102,6 +120,15 @@ fn (mut b Builder) transform_files_parallel_no_post_pass_impl(mut trans transfor
 	} else if !stream_from_flat {
 		files_to_transform = b.files.clone()
 	}
+	if timing_impl {
+		eprintln('  [ttime] prepare_files_for_transform total: ${sw_impl.elapsed().milliseconds()}ms')
+		sw_impl = time.new_stopwatch()
+	}
+	defer {
+		if timing_impl {
+			eprintln('  [ttime] per-file fanout: ${sw_impl.elapsed().milliseconds()}ms')
+		}
+	}
 
 	// In flat mode, workers stream the rehydration per file (one legacy
 	// ast.File in flight per worker at a time). Otherwise b.files is the
@@ -143,8 +170,32 @@ fn (mut b Builder) transform_files_parallel_no_post_pass_impl(mut trans transfor
 			return result
 		}
 
-		// Split files into chunks and spawn workers via pthreads
-		chunk_size := (n_files + n_jobs - 1) / n_jobs // ceiling division
+		// Assign files to workers. Contiguous chunks badly unbalance the load:
+		// the few huge files (transformer.v, monomorphize.v, the cleanc gen
+		// files, ...) cluster into adjacent chunks, so 2-3 workers run ~10s
+		// while the rest finish in <0.5s and idle. For the non-flat path we
+		// instead use longest-processing-time-first (LPT) bucketing keyed on a
+		// cheap size proxy, then scatter each worker's results back to their
+		// original file index after the join (no concurrent writes — workers
+		// each fill their own chunk_results slot, the merge happens serially).
+		mut bucket_indices := [][]int{len: n_jobs}
+		if stream_files_from_flat {
+			// Flat streaming still uses contiguous [start,end) ranges.
+			chunk_size := (n_files + n_jobs - 1) / n_jobs
+			mut i := 0
+			mut w := 0
+			for i < n_files {
+				end := if i + chunk_size < n_files { i + chunk_size } else { n_files }
+				for j in i .. end {
+					bucket_indices[w] << j
+				}
+				i = end
+				w++
+			}
+		} else {
+			bucket_indices = lpt_buckets(files_to_transform, n_jobs)
+		}
+
 		mut chunk_results := [][]ast.File{len: n_jobs}
 		mut worker_ptrs := []voidptr{len: n_jobs, init: unsafe { nil }}
 		mut thread_ids := []C.pthread_t{len: n_jobs}
@@ -159,21 +210,26 @@ fn (mut b Builder) transform_files_parallel_no_post_pass_impl(mut trans transfor
 		C.pthread_attr_setstacksize(attr, 64 * 1024 * 1024)
 
 		mut chunk_idx := 0
-		mut i := 0
-		for i < n_files {
-			end := if i + chunk_size < n_files { i + chunk_size } else { n_files }
+		for w in 0 .. n_jobs {
+			idxs := bucket_indices[w]
+			if idxs.len == 0 {
+				continue
+			}
 			if stream_files_from_flat {
 				args << TransformChunkArgs{
 					t:          unsafe { voidptr(trans) }
 					flat:       unsafe { &b.flat }
-					flat_start: i
-					flat_end:   end
+					flat_start: idxs[0]
+					flat_end:   idxs[idxs.len - 1] + 1
 					result_ptr: unsafe { voidptr(&chunk_results[chunk_idx]) }
 					worker_ptr: unsafe { voidptr(&worker_ptrs[chunk_idx]) }
 					worker_idx: chunk_idx
 				}
 			} else {
-				chunk := files_to_transform[i..end].clone()
+				mut chunk := []ast.File{cap: idxs.len}
+				for fi in idxs {
+					chunk << files_to_transform[fi]
+				}
 				args << TransformChunkArgs{
 					t:          unsafe { voidptr(trans) }
 					files:      chunk
@@ -184,7 +240,6 @@ fn (mut b Builder) transform_files_parallel_no_post_pass_impl(mut trans transfor
 			}
 			C.pthread_create(unsafe { &thread_ids[chunk_idx] }, attr, transform_chunk_thread,
 				unsafe { voidptr(&args[chunk_idx]) })
-			i = end
 			chunk_idx++
 		}
 		C.pthread_attr_destroy(attr)
@@ -194,15 +249,25 @@ fn (mut b Builder) transform_files_parallel_no_post_pass_impl(mut trans transfor
 			C.pthread_join(thread_ids[ci], unsafe { nil })
 		}
 
-		// Collect results in chunk order and merge worker accumulated state
-		mut result := []ast.File{cap: n_files}
-		for ci := 0; ci < chunk_idx; ci++ {
+		// Scatter each worker's results back to original file order and merge
+		// accumulated state. bucket_indices[w] lists the original indices of bucket w in
+		// the order its worker produced results; empty buckets spawn no worker (see ci).
+		mut result := []ast.File{len: n_files}
+		mut ci := 0
+		for w in 0 .. n_jobs {
+			idxs := bucket_indices[w]
+			if idxs.len == 0 {
+				continue
+			}
 			chunk_files := chunk_results[ci]
-			for k := 0; k < chunk_files.len; k++ {
-				result << chunk_files[k]
+			for k, fi in idxs {
+				if k < chunk_files.len {
+					result[fi] = chunk_files[k]
+				}
 			}
-			w := unsafe { &transformer.Transformer(worker_ptrs[ci]) }
-			trans.merge_worker(w)
+			worker := unsafe { &transformer.Transformer(worker_ptrs[ci]) }
+			trans.merge_worker(worker)
+			ci++
 		}
 		// Set synth_pos_counter past all worker ranges to avoid ID collisions in post_pass.
 		trans.set_synth_pos_counter(-(chunk_idx * 100_000) - 1)
@@ -210,6 +275,62 @@ fn (mut b Builder) transform_files_parallel_no_post_pass_impl(mut trans transfor
 	}
 }
 
+// lpt_buckets distributes file indices across n_jobs workers using the
+// longest-processing-time-first heuristic: process files largest-first and
+// always append to the currently least-loaded worker. This keeps the heaviest
+// files on separate workers so the fan-out wall time approaches
+// total_work / n_jobs instead of being pinned to one overloaded contiguous
+// chunk. The cost proxy is each file's top-level statement count plus the
+// body statement count of every top-level fn (cheap). Deterministic: files are
+// ordered by (cost desc, index asc) and ties pick the lowest worker index.
+fn lpt_buckets(files []ast.File, n_jobs int) [][]int {
+	n := files.len
+	mut cost := []int{len: n}
+	for i in 0 .. n {
+		// Cost proxy: count function bodies, not just top-level declarations, so
+		// a file of a few huge functions (transformer.v, the cleanc gen files)
+		// outranks one with many tiny ones. Deterministic; one level deep is
+		// enough to separate the heavyweight files that drove the imbalance.
+		mut c := 1
+		for stmt in files[i].stmts {
+			c++
+			if stmt is ast.FnDecl {
+				c += stmt.stmts.len
+			}
+		}
+		cost[i] = c
+	}
+	// order = file indices by cost descending. Implemented as a plain insertion
+	// sort (n is small, a few hundred) rather than sort_with_compare: this file
+	// must self-host through every backend, and capturing closures / pointer
+	// comparators are not reliably codegen'd by the v2 cleanc and arm64 paths.
+	// Stable on index (only shifts on strictly-greater), so deterministic.
+	mut order := []int{len: n, init: index}
+	for i in 1 .. n {
+		key := order[i]
+		kc := cost[key]
+		mut j := i - 1
+		for j >= 0 && cost[order[j]] < kc {
+			order[j + 1] = order[j]
+			j--
+		}
+		order[j + 1] = key
+	}
+	mut buckets := [][]int{len: n_jobs}
+	mut load := []i64{len: n_jobs}
+	for fi in order {
+		mut mw := 0
+		for w in 1 .. n_jobs {
+			if load[w] < load[mw] {
+				mw = w
+			}
+		}
+		buckets[mw] << fi
+		load[mw] += i64(cost[fi])
+	}
+	return buckets
+}
+
 // transform_files_parallel_to_flat is the parallel counterpart of
 // Transformer.transform_files_to_flat. Today it composes the existing
 // parallel transform with a boundary flatten_files() — same total work
diff --git a/vlib/v2/transformer/fn.v b/vlib/v2/transformer/fn.v
index 35ce1d2d4..10ee6cfbf 100644
--- a/vlib/v2/transformer/fn.v
+++ b/vlib/v2/transformer/fn.v
@@ -1172,26 +1172,31 @@ fn (t &Transformer) method_key_matches_type_name(method_key string, type_name st
 		|| !transformer_string_has_valid_data(type_name) {
 		return false
 	}
-	normalized_key := method_key.replace('.', '__')
-	normalized_type := type_name.replace('.', '__')
+	// Avoid .replace/.contains here: replace always allocates and contains builds
+	// a KMP failure table per call. This runs inside O(method_keys) fallback loops
+	// per call site, so those per-call allocations were a large transform cost.
+	// Only normalize when a '.' is actually present (index_u8 does not allocate),
+	// and locate `__` with a hand-rolled scan.
+	normalized_key := if method_key.index_u8(`.`) >= 0 {
+		method_key.replace('.', '__')
+	} else {
+		method_key
+	}
+	normalized_type := if type_name.index_u8(`.`) >= 0 {
+		type_name.replace('.', '__')
+	} else {
+		type_name
+	}
 	if normalized_key == normalized_type {
 		return true
 	}
-	key_is_qualified := normalized_key.contains('__')
-	type_is_qualified := normalized_type.contains('__')
-	if key_is_qualified && type_is_qualified {
+	key_dunder := last_double_underscore(normalized_key)
+	type_dunder := last_double_underscore(normalized_type)
+	if key_dunder >= 0 && type_dunder >= 0 {
 		return false
 	}
-	short_type := if normalized_type.contains('__') {
-		normalized_type.all_after_last('__')
-	} else {
-		normalized_type
-	}
-	short_key := if normalized_key.contains('__') {
-		normalized_key.all_after_last('__')
-	} else {
-		normalized_key
-	}
+	short_type := if type_dunder >= 0 { normalized_type[type_dunder + 2..] } else { normalized_type }
+	short_key := if key_dunder >= 0 { normalized_key[key_dunder + 2..] } else { normalized_key }
 	if short_key == short_type {
 		return true
 	}
@@ -1210,6 +1215,28 @@ fn (t &Transformer) method_key_matches_type_name(method_key string, type_name st
 	return false
 }
 
+// candidate_method_keys returns the cached method keys that could fuzzy-match any
+// of `names` — i.e. those sharing a receiver short name. A method_key_matches_type_name
+// match always implies equal short names, so the fuzzy fallback loops can scan
+// these candidates instead of every method key (O(all_keys) per call site).
+fn (t &Transformer) candidate_method_keys(names []string) []string {
+	mut cand := []string{}
+	mut shorts_done := []string{}
+	for name in names {
+		if name == '' {
+			continue
+		}
+		sh := method_short_name(name)
+		if sh in shorts_done {
+			continue
+		}
+		shorts_done << sh
+		keys := t.cached_method_keys_by_short[sh] or { continue }
+		cand << keys
+	}
+	return cand
+}
+
 fn (t &Transformer) lookup_method_return_type(type_names []string, method_name string) ?types.Type {
 	if method_name == '' {
 		return none
@@ -1229,7 +1256,7 @@ fn (t &Transformer) lookup_method_return_type(type_names []string, method_name s
 			}
 		}
 	}
-	for key in t.cached_method_keys {
+	for key in t.candidate_method_keys(seen) {
 		mut matches_receiver := false
 		for type_name in seen {
 			if t.method_key_matches_type_name(key, type_name) {
@@ -1330,7 +1357,7 @@ fn (t &Transformer) lookup_method_exists(type_names []string, method_name string
 			return true
 		}
 	}
-	for key in t.cached_method_keys {
+	for key in t.candidate_method_keys(seen) {
 		mut matches_receiver := false
 		for type_name in seen {
 			if t.method_key_matches_type_name(key, type_name) {
@@ -4454,7 +4481,7 @@ fn (t &Transformer) resolve_method_call_name(receiver ast.Expr, method_name stri
 		}
 	}
 	// Fuzzy fallback: iterate method keys to find matching receiver types
-	for key in t.cached_method_keys {
+	for key in t.candidate_method_keys(lookup_names) {
 		mut matches_receiver := false
 		for name in lookup_names {
 			if t.method_key_matches_type_name(key, name) {
diff --git a/vlib/v2/transformer/mem.v b/vlib/v2/transformer/mem.v
new file mode 100644
index 000000000..85020e576
--- /dev/null
+++ b/vlib/v2/transformer/mem.v
@@ -0,0 +1,22 @@
+// Copyright (c) 2020-2024 Joe Conigliaro. All rights reserved.
+// Use of this source code is governed by an MIT license
+// that can be found in the LICENSE file.
+module transformer
+
+import os
+
+// t_print_mem reports live malloc bytes at a transformer sub-phase boundary.
+// Gated on V2_MEM. Under -gc none the value is monotonic on macOS, so deltas
+// between stages are the exact bytes each sub-phase allocated. Defined for all
+// platforms (transform_files calls it everywhere); the malloc-statistics probe
+// is macOS-only, so other platforms just print the stage marker.
+fn t_print_mem(stage string) {
+	if os.getenv('V2_MEM') == '' {
+		return
+	}
+	$if macos {
+		eprintln('  [mem]   transform/${stage}: live ${darwin_transform_live_mb()} MB')
+	} $else {
+		eprintln('  [mem]   transform/${stage}')
+	}
+}
diff --git a/vlib/v2/transformer/mem_darwin.c.v b/vlib/v2/transformer/mem_darwin.c.v
new file mode 100644
index 000000000..1735f17a5
--- /dev/null
+++ b/vlib/v2/transformer/mem_darwin.c.v
@@ -0,0 +1,25 @@
+// Copyright (c) 2020-2024 Joe Conigliaro. All rights reserved.
+// Use of this source code is governed by an MIT license
+// that can be found in the LICENSE file.
+module transformer
+
+#include <malloc/malloc.h>
+
+struct C.malloc_statistics_t {
+	blocks_in_use   u32
+	size_in_use     usize
+	max_size_in_use usize
+	size_allocated  usize
+}
+
+fn C.malloc_default_zone() voidptr
+fn C.malloc_zone_statistics(zone voidptr, stats &C.malloc_statistics_t)
+
+// darwin_transform_live_mb returns the default malloc zone's in-use bytes,
+// in whole MB, on macOS. Used by t_print_mem; only referenced from its
+// `$if macos` branch, so the macOS-only C calls here never reach other platforms.
+fn darwin_transform_live_mb() u64 {
+	mut st := C.malloc_statistics_t{}
+	C.malloc_zone_statistics(C.malloc_default_zone(), &st)
+	return u64(st.size_in_use) / (1024 * 1024)
+}
diff --git a/vlib/v2/transformer/monomorphize.v b/vlib/v2/transformer/monomorphize.v
index cd1b54fd7..2426e2282 100644
--- a/vlib/v2/transformer/monomorphize.v
+++ b/vlib/v2/transformer/monomorphize.v
@@ -8,6 +8,8 @@ module transformer
 import v2.ast
 import v2.token
 import v2.types
+import os
+import time
 
 struct CloneComptimeFieldCtx {
 	var_name       string
@@ -59,24 +61,105 @@ pub fn (mut t Transformer) prepare_files_for_transform(files []ast.File) []ast.F
 	mut prepared := files.clone()
 	t.collect_declared_method_fns(prepared)
 	t.collect_struct_field_generic_decl_types(prepared)
-	for _ in 0 .. 64 {
+	timing := os.getenv('V2_TTIME') != ''
+	mut sw := time.new_stopwatch()
+	// prepared_dirty tracks whether `prepared` changed since it was last scanned
+	// by collect_generic_call_specs. collect is a pure (idempotent) function of
+	// the program: scanning an unchanged file set rediscovers the exact same
+	// specs. So the leading full-program scan is only needed when the program
+	// actually changed since the previous scan — i.e. when the previous
+	// iteration's inject step appended new struct specializations. The
+	// post-mono_pass scan below already covers the clones mono_pass adds, so the
+	// only unscanned source of change between iterations is inject. This removes
+	// the fixpoint's redundant confirmation scan (~6s / ~700MB for v2 self-host).
+	mut prepared_dirty := true
+	for iter in 0 .. 64 {
 		spec_count := t.monomorphized_specs.len
 		generic_count := t.generic_types_spec_count()
 		struct_count := t.generic_struct_specs.len
-		t.collect_generic_call_specs(prepared)
+		ts_start := sw.elapsed().milliseconds()
+		// collect1: full-program scan, only when the program changed since the
+		// last scan (initial pass, or inject appended structs last iteration).
+		if prepared_dirty {
+			t.collect_generic_call_specs(prepared)
+		}
+		ts_collect1 := sw.elapsed().milliseconds()
+		before_mono := t.monomorphized_specs.len
 		prepared = t.monomorphize_pass(prepared)
-		t.collect_generic_call_specs(prepared)
+		ts_mono := sw.elapsed().milliseconds()
+		// collect2: only when monomorphize_pass materialized new clones. Those
+		// clones are the only statements the previous scan has not seen — every
+		// other statement is byte-for-byte unchanged and collect is idempotent —
+		// so rescan just the new clones instead of re-walking all files. This
+		// turns the dominant ~6s/~700MB full rescan into a walk of a few dozen
+		// functions.
+		if t.monomorphized_specs.len != before_mono {
+			t.collect_generic_call_specs_in_new_clones(prepared)
+		}
+		ts_collect2 := sw.elapsed().milliseconds()
 		prepared = t.inject_generic_struct_specializations(prepared)
+		ts_inject := sw.elapsed().milliseconds()
+		// collect2 already rescanned mono_pass's clones, so `prepared` is clean
+		// for the next iteration unless inject just appended new specializations.
+		prepared_dirty = t.inject_changed_files
+		if timing {
+			eprintln('  [ttime] iter ${iter}: collect1=${ts_collect1 - ts_start}ms mono_pass=${ts_mono - ts_collect1}ms collect2=${ts_collect2 - ts_mono}ms inject=${ts_inject - ts_collect2}ms files=${prepared.len}')
+		}
+		t_print_mem('monomorphize iter ${iter}')
 		if t.monomorphized_specs.len == spec_count && t.generic_types_spec_count() == generic_count
 			&& t.generic_struct_specs.len == struct_count {
 			break
 		}
 	}
+	if dump_path := os.getenv_opt('V2_TDUMP') {
+		t.dump_monomorphize_specs(dump_path, prepared)
+	}
 	t.collect_struct_default_decl_infos(prepared)
 	t.collect_concrete_embedded_owner_names(prepared)
 	return prepared
 }
 
+// dump_monomorphize_specs writes a deterministic snapshot of the fixpoint's
+// result (monomorphized fn spec keys, generic struct spec keys, generic binding
+// signatures, per-file stmt counts) to `path`. Gated on V2_TDUMP and used only to
+// validate that fixpoint-loop optimizations leave the dump byte-identical. A failed
+// write is reported on stderr so a stale dump is not mistaken for a fresh one.
+fn (t &Transformer) dump_monomorphize_specs(path string, prepared []ast.File) {
+	mut lines := []string{}
+	mut mspecs := t.monomorphized_specs.keys()
+	mspecs.sort()
+	lines << '# monomorphized_specs (${mspecs.len})'
+	for k in mspecs {
+		lines << 'M ${k}'
+	}
+	mut sspecs := t.generic_struct_specs.keys()
+	sspecs.sort()
+	lines << '# generic_struct_specs (${sspecs.len})'
+	for k in sspecs {
+		lines << 'S ${k}'
+	}
+	mut gkeys := t.env.generic_types.keys()
+	gkeys.sort()
+	lines << '# generic_types'
+	for k in gkeys {
+		blist := t.env.generic_types[k] or { continue }
+		mut sigs := []string{}
+		for b in blist {
+			sigs << generic_bindings_signature(b)
+		}
+		sigs.sort()
+		for sig in sigs {
+			lines << 'G ${k} :: ${sig}'
+		}
+	}
+	// Per-file fingerprint: file name + stmt count, in input order.
+	lines << '# files (${prepared.len})'
+	for f in prepared {
+		lines << 'F ${f.mod}/${f.name} stmts=${f.stmts.len}'
+	}
+	os.write_file(path, lines.join('\n')) or { eprintln('V2_TDUMP: cannot write ${path}') }
+}
+
 fn (t &Transformer) generic_types_spec_count() int {
 	mut count := 0
 	for _, bindings_list in t.env.generic_types {
@@ -607,6 +690,7 @@ fn (mut t Transformer) clone_generic_struct_decl(decl ast.StructDecl, spec Gener
 }
 
 fn (mut t Transformer) inject_generic_struct_specializations(files []ast.File) []ast.File {
+	t.inject_changed_files = false
 	if t.generic_struct_specs.len == 0 {
 		return files
 	}
@@ -626,6 +710,22 @@ fn (mut t Transformer) inject_generic_struct_specializations(files []ast.File) [
 	spec_keys := t.generic_struct_specs.keys()
 	mut sorted_keys := spec_keys.clone()
 	sorted_keys.sort()
+	// Fast path: if every struct spec is already present in `files`, nothing
+	// will be injected and the loop below would only rebuild an identical file
+	// set. Return the input unchanged to avoid duplicating every file's stmt
+	// list (~1.5GB under -gc none on the fixpoint loop's confirmation pass).
+	// Mirrors monomorphize_pass's `per_file_clones.len == 0 { return files }`.
+	mut any_pending := false
+	for key in sorted_keys {
+		spec := t.generic_struct_specs[key] or { continue }
+		if spec.concrete_c_name !in existing {
+			any_pending = true
+			break
+		}
+	}
+	if !any_pending {
+		return files
+	}
 	mut out := []ast.File{cap: files.len}
 	for file in files {
 		mut stmts := []ast.Stmt{cap: file.stmts.len}
@@ -661,6 +761,7 @@ fn (mut t Transformer) inject_generic_struct_specializations(files []ast.File) [
 			selector_names: file.selector_names
 		}
 	}
+	t.inject_changed_files = true
 	return out
 }
 
@@ -901,6 +1002,9 @@ pub fn (mut t Transformer) monomorphize_pass(files []ast.File) []ast.File {
 	deferred_specs := t.deferred_generic_call_specs.clone()
 	t.deferred_generic_call_specs = old_deferred_specs.clone()
 	t.flush_deferred_generic_call_specs(deferred_specs)
+	// Record the freshly-materialized clones so the fixpoint can rescan only
+	// them (collect_generic_call_specs_in_new_clones) instead of all files.
+	t.last_mono_clones = per_file_clones.clone()
 	if per_file_clones.len == 0 {
 		return files
 	}
@@ -1636,18 +1740,57 @@ fn monomorphized_clone_name(fn_key string, decl ast.FnDecl, spec_name string) st
 }
 
 fn generic_base_name_without_specialization(name string) string {
-	mut base_name := name
-	bracket_pos := base_name.index_u8(`[`)
-	if bracket_pos > 0 {
-		base_name = base_name[..bracket_pos]
+	// Hot path: called per-method per-call-site during generic collection.
+	// Hand-rolled byte scans replace string.contains/.index/.all_before, each of
+	// which builds a fresh KMP failure-table heap allocation on every call. With
+	// millions of calls (and most names having neither `[` nor `_T_`) that
+	// allocation churn dominated the whole transform stage. This is behaviour-
+	// identical to the previous index_u8(`[`)>0 / contains('_T_') / ends_with('_T')
+	// version, just without the per-call allocations.
+	mut end := name.len
+	for i in 0 .. name.len {
+		if name[i] == `[` {
+			if i > 0 {
+				end = i
+			}
+			break
+		}
 	}
-	if base_name.contains('_T_') {
-		return base_name.all_before('_T_')
+	for i := 0; i + 3 <= end; i++ {
+		if name[i] == `_` && name[i + 1] == `T` && name[i + 2] == `_` {
+			return name[..i]
+		}
 	}
-	if base_name.ends_with('_T') {
-		return base_name[..base_name.len - 2]
+	if end >= 2 && name[end - 1] == `T` && name[end - 2] == `_` {
+		return name[..end - 2]
 	}
-	return base_name
+	if end == name.len {
+		return name
+	}
+	return name[..end]
+}
+
+// method_short_name returns the final `__`-separated segment of s (after a
+// `.`->`__` normalization). It matches the short form method_key_matches_type_name
+// compares, so it can be used to bucket method keys for fast candidate lookup.
+fn method_short_name(s string) string {
+	norm := if s.index_u8(`.`) >= 0 { s.replace('.', '__') } else { s }
+	d := last_double_underscore(norm)
+	return if d >= 0 { norm[d + 2..] } else { norm }
+}
+
+// last_double_underscore returns the index of the last `__` in s, or -1.
+// Hand-rolled (no allocation) replacement for s.contains('__') / .all_after_last('__'),
+// which build KMP tables / allocate; used in hot per-call-site name matching.
+fn last_double_underscore(s string) int {
+	mut i := s.len - 2
+	for i >= 0 {
+		if s[i] == `_` && s[i + 1] == `_` {
+			return i
+		}
+		i--
+	}
+	return -1
 }
 
 fn (mut t Transformer) index_generic_fn_decl_for_monomorphize(mut decl_owner map[string]int, mut decl_node map[string]ast.FnDecl, decl ast.FnDecl, file_idx int, module_name string) {
@@ -1759,6 +1902,33 @@ fn (mut t Transformer) collect_generic_call_specs(files []ast.File) {
 	old_scope := t.scope
 	old_file_idx := t.cur_generic_call_file_idx
 	old_import_aliases := t.cur_import_aliases.clone()
+	t.build_generic_fn_decl_index(files)
+	for fi, file in files {
+		t.cur_file_name = file.name
+		t.cur_module = file.mod
+		t.cur_generic_call_file_idx = fi
+		t.cur_import_aliases = import_aliases_for_generic_collect(file.imports)
+		if scope := t.get_module_scope(file.mod) {
+			t.scope = scope
+		} else {
+			t.scope = unsafe { nil }
+		}
+		for stmt in file.stmts {
+			t.collect_generic_call_specs_in_stmt(stmt)
+		}
+	}
+	t.cur_module = old_module
+	t.cur_file_name = old_file
+	t.scope = old_scope
+	t.cur_generic_call_file_idx = old_file_idx
+	t.cur_import_aliases = old_import_aliases.clone()
+}
+
+// build_generic_fn_decl_index (re)builds t.generic_fn_decl_index from every
+// generic FnDecl across all files. Generic declarations never change during the
+// fixpoint (monomorphize_pass only appends concrete clones); the index maps
+// keys to ast.FnDecl nodes and is rebuilt from scratch against the given files.
+fn (mut t Transformer) build_generic_fn_decl_index(files []ast.File) {
 	t.generic_fn_decl_index = map[string]ast.FnDecl{}
 	mut dummy_owner := map[string]int{}
 	for fi, file in files {
@@ -1772,7 +1942,31 @@ fn (mut t Transformer) collect_generic_call_specs(files []ast.File) {
 			}
 		}
 	}
+}
+
+// collect_generic_call_specs_in_new_clones is the incremental counterpart of
+// collect_generic_call_specs. After monomorphize_pass appends concrete clones,
+// only those clones can introduce generic calls the previous scan did not see —
+// every other statement is byte-for-byte unchanged and was already walked. So
+// this rebuilds the generic-decl index (cheap, shallow) but deep-walks only the
+// clones recorded in t.last_mono_clones, each under its owning file's module,
+// scope and import context. For v2 self-host this replaces a full ~6s/~700MB
+// rescan with a walk of a few dozen functions.
+fn (mut t Transformer) collect_generic_call_specs_in_new_clones(files []ast.File) {
+	if t.last_mono_clones.len == 0 {
+		return
+	}
+	old_module := t.cur_module
+	old_file := t.cur_file_name
+	old_scope := t.scope
+	old_file_idx := t.cur_generic_call_file_idx
+	old_import_aliases := t.cur_import_aliases.clone()
+	t.build_generic_fn_decl_index(files)
 	for fi, file in files {
+		clones := t.last_mono_clones[fi] or { continue }
+		if clones.len == 0 {
+			continue
+		}
 		t.cur_file_name = file.name
 		t.cur_module = file.mod
 		t.cur_generic_call_file_idx = fi
@@ -1782,7 +1976,7 @@ fn (mut t Transformer) collect_generic_call_specs(files []ast.File) {
 		} else {
 			t.scope = unsafe { nil }
 		}
-		for stmt in file.stmts {
+		for stmt in clones {
 			t.collect_generic_call_specs_in_stmt(stmt)
 		}
 	}
diff --git a/vlib/v2/transformer/struct.v b/vlib/v2/transformer/struct.v
index db7570544..b0197a9f6 100644
--- a/vlib/v2/transformer/struct.v
+++ b/vlib/v2/transformer/struct.v
@@ -75,6 +75,27 @@ fn (t &Transformer) lookup_struct_field_type(struct_name string, field_name stri
 		|| !transformer_string_has_valid_data(field_name) {
 		return none
 	}
+	// Memoize: this is called per field-access expression and re-derives the type
+	// each time. The result is a pure function of (cur_module, struct_name,
+	// field_name) — cur_module only matters for the unqualified-name path, but
+	// keying on it unconditionally is always correct. A Void value is the
+	// "resolved to none" sentinel (struct fields are never void-typed).
+	cache_key := '${t.cur_module}\x01${struct_name}\x01${field_name}'
+	if cached := t.field_type_cache[cache_key] {
+		if cached is types.Void {
+			return none
+		}
+		return cached
+	}
+	result := t.lookup_struct_field_type_uncached(struct_name, field_name)
+	unsafe {
+		mut mt := &Transformer(voidptr(t))
+		mt.field_type_cache[cache_key] = result or { types.Type(types.void_) }
+	}
+	return result
+}
+
+fn (t &Transformer) lookup_struct_field_type_uncached(struct_name string, field_name string) ?types.Type {
 	if field_type := t.lookup_struct_field_generic_decl_type(struct_name, field_name) {
 		return field_type
 	}
diff --git a/vlib/v2/transformer/transformer.v b/vlib/v2/transformer/transformer.v
index 48bed68ac..6752ceb72 100644
--- a/vlib/v2/transformer/transformer.v
+++ b/vlib/v2/transformer/transformer.v
@@ -123,13 +123,47 @@ mut:
 	cached_methods     map[string][]&types.Fn
 	cached_method_keys []string
 	cached_fn_scopes   map[string]&types.Scope
+	// cached_method_base_index maps type_name -> generic-base method name ->
+	// FnType, precomputed once from cached_methods. lookup_method_cached used to
+	// linearly scan every method of a type and recompute its base name (via
+	// generic_base_name_without_specialization) on every call site — O(calls x
+	// methods) with a string scan inner loop, the single biggest transform cost.
+	// This makes the lookup O(1). Flat map keyed by "type_name#base_name" -> the
+	// types.Type sum (callers smartcast to FnType). Flat (not nested) so a lookup
+	// doesn't copy an inner map, and stored as the sum (not the FnType variant)
+	// because v2's own codegen mishandles a map valued by a bare sum-variant.
+	cached_method_base_index map[string]types.Type
+	// cached_method_keys_by_short buckets cached_method_keys by their short
+	// (final `__`-segment) name. The fuzzy method-key fallback loops only ever
+	// match a key whose short name equals the receiver's short name, so they can
+	// scan just this bucket instead of every method key (O(all_keys) per call).
+	cached_method_keys_by_short map[string][]string
+	// field_type_cache memoizes lookup_struct_field_type, which is called per
+	// field-access expression and re-derives the field type (scope lookups +
+	// types.Type-by-value copies + a scan-all-scopes fallback) every time. Keyed
+	// by "cur_module\x01struct\x01field" because the unqualified-name path
+	// consults cur_module. Written through an unsafe const->mut cast (the result
+	// is pure given the key, so this is benign interior mutability); each parallel
+	// worker gets its own empty cache, so there is no cross-thread sharing.
+	field_type_cache map[string]types.Type
 	// Accumulated synth types for deferred application (thread-safe).
 	// Instead of writing directly to env.set_expr_type during parallel transform,
 	// store here and apply after merge.
 	synth_types map[int]types.Type
 	// Generic monomorphization clones generic FnDecls per env.generic_types
 	// binding before code generation, so backends receive concrete functions.
-	monomorphized_specs                map[string]bool
+	monomorphized_specs map[string]bool
+	// inject_changed_files records whether the last
+	// inject_generic_struct_specializations call actually appended new struct
+	// specializations (i.e. returned a modified file set). The monomorphize
+	// fixpoint uses it to skip the next iteration's full-program collect scan
+	// when nothing changed.
+	inject_changed_files bool
+	// last_mono_clones maps file index -> the FnDecl clone stmts that the most
+	// recent monomorphize_pass appended to that file. The fixpoint rescans only
+	// these freshly-materialized clones (the rest of the program was already
+	// scanned) instead of re-walking all files. Empty when mono_pass added none.
+	last_mono_clones                   map[int][]ast.Stmt
 	generic_spec_owner_file            map[string]int
 	deferred_generic_call_specs        []DeferredGenericCallSpec
 	generic_struct_specs               map[string]GenericStructSpec
@@ -337,6 +371,9 @@ pub fn (t &Transformer) new_worker_clone(worker_idx int) &Transformer {
 		file_set:                           unsafe { t.file_set }
 		cached_scopes:                      t.cached_scopes.clone()
 		cached_methods:                     t.cached_methods.clone()
+		cached_method_base_index:           t.cached_method_base_index.clone()
+		cached_method_keys_by_short:        t.cached_method_keys_by_short.clone()
+		field_type_cache:                   map[string]types.Type{}
 		cached_method_keys:                 t.cached_method_keys.clone()
 		cached_fn_scopes:                   t.cached_fn_scopes.clone()
 		synth_types:                        t.synth_types.clone()
@@ -1498,6 +1535,38 @@ fn (mut t Transformer) cache_env_maps() {
 	t.cached_methods = t.env.snapshot_methods()
 	t.cached_method_keys = t.cached_methods.keys()
 	t.cached_fn_scopes = t.env.snapshot_fn_scopes()
+	t.build_cached_method_base_index()
+	mut by_short := map[string][]string{}
+	for key in t.cached_method_keys {
+		by_short[method_short_name(key)] << key
+	}
+	t.cached_method_keys_by_short = by_short.move()
+}
+
+// build_cached_method_base_index rebuilds t.cached_method_base_index from
+// t.cached_methods: one flat map keyed by "type_name#base_name", so that
+// lookup_method_cached is a single map probe instead of a per-call method scan.
+// Per key it keeps the first method (in cached_methods order) whose type is an
+// FnType, matching the old scan's "first base-or-exact FnType match wins" rule.
+fn (mut t Transformer) build_cached_method_base_index() {
+	// Iterate via keys()+index, not `for k, v in t.cached_methods`: v2's own
+	// codegen mistypes the value of a `for k, v` over map[string][]&types.Fn as
+	// []types.Fn (value array), breaking self-host.
+	mut index := map[string]types.Type{}
+	for type_name in t.cached_methods.keys() {
+		methods := t.cached_methods[type_name] or { continue }
+		for method in methods {
+			typ := method.get_typ()
+			if typ is types.FnType {
+				base := generic_base_name_without_specialization(method.get_name())
+				key := '${type_name}#${base}'
+				if key !in index {
+					index[key] = typ
+				}
+			}
+		}
+	}
+	t.cached_method_base_index = index.move()
 }
 
 // GeneratedFnsParts is the pure-computation bundle produced by
@@ -2242,13 +2311,18 @@ pub fn (mut t Transformer) inject_embed_file_helper_to_flat(mut out ast.FlatBuil
 
 // transform_files transforms all files and returns transformed copies
 pub fn (mut t Transformer) transform_files(files []ast.File) []ast.File {
+	t_print_mem('enter')
 	t.pre_pass(files)
+	t_print_mem('after pre_pass')
 	files_to_transform := t.prepare_files_for_transform(files)
+	t_print_mem('after prepare/monomorphize')
 	mut result := []ast.File{cap: files_to_transform.len}
 	for file in files_to_transform {
 		result << t.transform_file(file)
 	}
+	t_print_mem('after per-file loop')
 	t.post_pass(mut result)
+	t_print_mem('after post_pass')
 	return result
 }
 
diff --git a/vlib/v2/transformer/types.v b/vlib/v2/transformer/types.v
index 43454ea00..63508a5a0 100644
--- a/vlib/v2/transformer/types.v
+++ b/vlib/v2/transformer/types.v
@@ -26,17 +26,12 @@ fn (t &Transformer) get_synth_type(pos token.Pos) ?types.Type {
 // lookup_method_cached looks up a method by receiver type name and method name
 // using cached_methods (lock-free) instead of env.lookup_method.
 fn (t &Transformer) lookup_method_cached(type_name string, method_name string) ?types.FnType {
-	methods := t.cached_methods[type_name] or { return none }
+	// Single probe of the flat "type_name#base_name" index (see build_cached_method_base_index).
+	// Same result as the old scan: an exact-name match implies a base-name match; first FnType wins.
 	base_method_name := generic_base_name_without_specialization(method_name)
-	for method in methods {
-		cached_name := method.get_name()
-		if cached_name == method_name
-			|| generic_base_name_without_specialization(cached_name) == base_method_name {
-			typ := method.get_typ()
-			if typ is types.FnType {
-				return typ
-			}
-		}
+	typ := t.cached_method_base_index['${type_name}#${base_method_name}'] or { return none }
+	if typ is types.FnType {
+		return typ
 	}
 	return none
 }
-- 
2.39.5