From 150e87c0d6179a1a72b51f3256549c41e10bfcd9 Mon Sep 17 00:00:00 2001 From: Alexander Medvednikov Date: Tue, 17 Mar 2026 01:38:43 +0300 Subject: [PATCH] v2: dead-strip unused functions in arm64 linker, reduce binary from 9.4MB to 5.85MB - Add linker-level dead function stripping: BFS reachability from roots (_main, __v_init_consts_*), compact text section, fix up relocations and symbol addresses - Add should_skip_store optimization: skip redundant store-then-load when value is consumed by the immediately next instruction (arithmetic, cmp, store, load, GEP, trunc, zext), with operand[1] forwarding via x10 - SP-relative addressing during sp_adjusted periods (sp_adjust_amt tracking) - Merge fn_starts/fn_ends/fn_names/fn_sym_ids and stats in parallel workers - should_skip_store: skip store-to-stack when value is consumed by the next instruction (arith, cmp, store, load, GEP, trunc, zext); includes operand[1] forwarding to x10 for binary ops. Eliminates 57K/211K stores - Remove redundant canonicalize_narrow_int_result after cache hits (value already canonicalized before store). Saves ~58K SXTW/AND instructions - Non-invalidating last_store cache: value stays in register after consumption, enabling multi-use of same cached value - SP-relative addressing during sp_adjusted periods via sp_adjust_amt tracking, avoiding 2-3 instruction FP-relative fallback --- vlib/v2/builder/builder.v | 2 +- vlib/v2/builder/cache_headers.v | 30 ++ vlib/v2/builder/parse.v | 9 +- vlib/v2/builder/parse_d_parallel.v | 11 +- vlib/v2/gen/arm64/arm64.v | 755 +++++++++++++++++++++++++---- vlib/v2/gen/arm64/asm.v | 40 ++ vlib/v2/gen/cleanc/expr.v | 34 +- vlib/v2/gen/cleanc/fn.v | 5 + 8 files changed, 738 insertions(+), 148 deletions(-) diff --git a/vlib/v2/builder/builder.v b/vlib/v2/builder/builder.v index 91adc26ba..f4a983ed7 100644 --- a/vlib/v2/builder/builder.v +++ b/vlib/v2/builder/builder.v @@ -837,7 +837,7 @@ fn (mut b Builder) gen_cleanc_source_with_cache_init_calls(modules []string, cac fn (mut b Builder) gen_cleanc_source_with_options(modules []string, export_const_symbols bool, cache_bundle_name string, cached_init_calls []string, use_markused bool) string { mut gen_files := b.files.clone() - if cached_init_calls.len > 0 && b.can_use_cached_core_headers() { + if cached_init_calls.len > 0 && b.can_use_cached_core_headers_for_parse() { mut p := parser.Parser.new(b.pref) header_files := p.parse_files(b.core_cached_parse_paths(), mut b.file_set) gen_files << header_files diff --git a/vlib/v2/builder/cache_headers.v b/vlib/v2/builder/cache_headers.v index 075c19b69..97188bfaa 100644 --- a/vlib/v2/builder/cache_headers.v +++ b/vlib/v2/builder/cache_headers.v @@ -238,6 +238,36 @@ fn (b &Builder) header_stamp_for_modules(modules []string) string { return lines.join('\n') } +// can_use_cached_core_headers_for_parse checks whether .vh header files +// exist, are non-empty, and their stamp matches the current source/compiler +// file timestamps. The .o stamp validation is skipped (the gen phase +// rebuilds stale .o files via ensure_cached_module_object), but the .vh +// stamp IS validated so that stale headers trigger a full parse — otherwise +// the gen phase would regenerate .o from incomplete .vh ASTs. +fn (b &Builder) can_use_cached_core_headers_for_parse() bool { + if b.pref.no_cache || b.pref.skip_builtin { + return false + } + if !b.ensure_core_cache_dir() { + return false + } + // Validate .vh header stamp (no cc/cc_flags — only source + compiler timestamps). + expected_header_stamp := b.header_stamp_for_modules(core_cached_module_paths) + current_header_stamp := os.read_file(b.core_headers_stamp_path()) or { return false } + if current_header_stamp != expected_header_stamp { + return false + } + for header_path in b.core_header_paths() { + if !os.exists(header_path) { + return false + } + if os.file_size(header_path) == 0 { + return false + } + } + return true +} + fn (b &Builder) can_use_cached_core_headers() bool { if b.pref.no_cache || b.pref.skip_builtin { return false diff --git a/vlib/v2/builder/parse.v b/vlib/v2/builder/parse.v index dc24430e7..0699ac8f8 100644 --- a/vlib/v2/builder/parse.v +++ b/vlib/v2/builder/parse.v @@ -13,12 +13,9 @@ fn (mut b Builder) parse_files(files []string) []ast.File { skip_builtin := b.pref.skip_builtin mut use_core_headers := false if !skip_builtin { - use_core_headers = false - // SSA/C and native backends need full core module bodies (not .vh summaries), - // otherwise runtime helpers can be lowered to stubs. - if b.pref.backend in [.c, .cleanc, .x64, .arm64] { - use_core_headers = false - } + // -prod builds with a valid header cache can use lightweight .vh + // summaries instead of fully parsing every core module source file. + use_core_headers = b.pref.is_prod && b.can_use_cached_core_headers_for_parse() if use_core_headers { core_files := b.core_cached_parse_paths() parsed_core_files := parser_reused.parse_files(core_files, mut b.file_set) diff --git a/vlib/v2/builder/parse_d_parallel.v b/vlib/v2/builder/parse_d_parallel.v index 423e961b1..5a646556c 100644 --- a/vlib/v2/builder/parse_d_parallel.v +++ b/vlib/v2/builder/parse_d_parallel.v @@ -65,14 +65,9 @@ fn (mut b Builder) parse_files_parallel(files []string) []ast.File { } skip_builtin := b.pref.skip_builtin if !skip_builtin { - use_core_headers := false - // SSA/C and native backends need full core module bodies (not .vh summaries), - // otherwise runtime helpers can be lowered to stubs. - use_core_headers2 := if b.pref.backend in [.c, .cleanc, .x64, .arm64] { - false - } else { - use_core_headers - } + // -prod builds with a valid header cache can use lightweight .vh + // summaries instead of fully parsing every core module source file. + use_core_headers2 := b.pref.is_prod && b.can_use_cached_core_headers_for_parse() // Parse builtin and its dependencies // Mark them as parsed first to prevent re-parsing via imports for module_path in core_cached_module_paths { diff --git a/vlib/v2/gen/arm64/arm64.v b/vlib/v2/gen/arm64/arm64.v index d13869c98..73ab9cf8a 100644 --- a/vlib/v2/gen/arm64/arm64.v +++ b/vlib/v2/gen/arm64/arm64.v @@ -93,9 +93,28 @@ pub mut: // so that fp - N = sp + (sp_base_offset - N) for positive sp-relative offsets. sp_base_offset int sp_adjusted bool // true when sp is temporarily modified (call arg push) + sp_adjust_amt int // how much SP was decremented (valid when sp_adjusted) // Reverse map: val_id → block_id for block-kind values. // Value.index is unreliable in ARM64-compiled binaries, so use this instead. val_to_block []int + // Last-store cache for eliminating redundant store-then-load sequences. + // After store_reg_to_val records (reg, val_id), the next load_val_to_reg + // for the same val_id can reuse the register instead of loading from stack. + // Cleared at block boundaries, function calls, and any register write. + last_store_reg int = -1 + last_store_val int + // Current block instruction list and index, for lookahead optimizations. + cur_blk_instrs []int + cur_blk_instr_idx int + // Function boundaries for dead-stripping. + fn_starts []int // text offset where each function begins + fn_ends []int // text offset where each function ends + fn_names []string // symbol name of each function + fn_sym_ids []int // symbol index in macho.symbols + // Stats counters for optimization analysis. + stats_total_stores int + stats_skipped_stores int + stats_cache_hits int } pub fn Gen.new(mod &mir.Module) &Gen { @@ -135,6 +154,69 @@ pub fn Gen.new(mod &mir.Module) &Gen { } } +// Clear the last-store cache (block boundary, call, etc.). +fn (mut g Gen) invalidate_last_store() { + g.last_store_reg = -1 + g.last_store_val = 0 +} + +// Check if a store to stack can be skipped for val_id because the value +// will be consumed from the last-store cache by the very next instruction. +// Returns: 0 = must store, 1 = skip (consumed as operand[0]), +// 2 = forward to x10 (consumed as operand[1]). +fn (mut g Gen) should_skip_store(val_id int) int { + if val_id <= 0 || val_id >= g.mod.values.len { + return 0 + } + v := g.mod.values[val_id] + if v.uses.len != 1 { + return 0 + } + // Check that the next instruction in the block is the single consumer. + next_idx := g.cur_blk_instr_idx + 1 + if next_idx >= g.cur_blk_instrs.len { + return 0 + } + next_vid := g.cur_blk_instrs[next_idx] + if v.uses[0] != next_vid { + return 0 + } + if next_vid <= 0 || next_vid >= g.mod.values.len { + return 0 + } + nv := g.mod.values[next_vid] + if nv.kind != .instruction { + return 0 + } + ni := g.mod.instrs[nv.index] + // Only allow pure ops that load operands via get_operand_reg first + // and DON'T re-load operands afterwards. + is_arith := ni.op in [.add, .sub, .mul, .sdiv, .udiv, .srem, .urem, .and_, .or_, .xor, .shl, + .ashr, .lshr] + is_int_cmp := ni.op in [.eq, .ne, .lt, .gt, .le, .ge, .ult, .ugt, .ule, .uge] && v.typ > 0 + && v.typ < g.mod.type_store.types.len && g.mod.type_store.types[v.typ].kind != .float_t + is_mem_or_ptr := ni.op in [.store, .load, .get_element_ptr] + is_int_conv := ni.op in [.trunc, .zext] && v.typ > 0 && v.typ < g.mod.type_store.types.len + && g.mod.type_store.types[v.typ].kind != .float_t + if !is_arith && !is_int_cmp && !is_mem_or_ptr && !is_int_conv { + return 0 + } + if ni.operands.len == 0 { + return 0 + } + // Operand[0] match: skip store entirely, value stays in cache register. + if ni.operands[0] == val_id { + return 1 + } + // Operand[1] match for binary ops: forward to x10 so it survives + // operand[0] loading into x8. Only for arithmetic/comparison which + // load operand[1] via get_operand_reg(op1, 9). + if (is_arith || is_int_cmp) && ni.operands.len >= 2 && ni.operands[1] == val_id { + return 2 + } + return 0 +} + pub fn (mut g Gen) gen() { g.gen_pre_pass() for fi := 0; fi < g.mod.funcs.len; fi++ { @@ -193,9 +275,192 @@ pub fn (mut g Gen) gen_pre_pass() { g.pre_populate_type_caches() } +// dead_strip_functions removes unreachable functions from the text section. +// Builds a call graph from relocations, marks reachable functions starting +// from _main and __v_init_consts_* roots, then compacts the text section. +fn (mut g Gen) dead_strip_functions() { + if g.fn_starts.len == 0 { + return + } + n_fns := g.fn_starts.len + // Build sym_idx → fn_idx for resolving relocation targets. + mut sym_to_fn := map[int]int{} + for fi := 0; fi < n_fns; fi++ { + sym_to_fn[g.fn_sym_ids[fi]] = fi + } + // Build call graph from relocations using binary search on sorted fn_starts. + mut callees := [][]int{len: n_fns} + for reloc in g.macho.relocs { + mut lo := 0 + mut hi := n_fns - 1 + mut src_fn := -1 + for lo <= hi { + mid := (lo + hi) / 2 + if reloc.addr < g.fn_starts[mid] { + hi = mid - 1 + } else if reloc.addr >= g.fn_ends[mid] { + lo = mid + 1 + } else { + src_fn = mid + break + } + } + if src_fn < 0 { + continue + } + if tgt_fn := sym_to_fn[reloc.sym_idx] { + callees[src_fn] << tgt_fn + } + } + // Mark reachable functions starting from roots via BFS. + mut reachable := []bool{len: n_fns} + mut worklist := []int{cap: 256} + for fi := 0; fi < n_fns; fi++ { + name := g.fn_names[fi] + if name == '__main' || name == '_main' || name == '_main__main' + || name.starts_with('___v_init_consts_') + || name.starts_with('_builtin____v_init_consts_') || name == '___unresolved_stub' { + if !reachable[fi] { + reachable[fi] = true + worklist << fi + } + } + } + for worklist.len > 0 { + fi := worklist.pop() + for callee in callees[fi] { + if !reachable[callee] { + reachable[callee] = true + worklist << callee + } + } + } + // Force-strip unused backend modules by name prefix. + // When compiling for arm64, cleanc/c/eval/x64 functions are never called + // at runtime. Their symbols redirect to ___unresolved_stub (returns 0). + for fi2 := 0; fi2 < n_fns; fi2++ { + if !reachable[fi2] { + continue + } + n2 := g.fn_names[fi2] + if n2.len > 1 { + sn := n2[1..] // strip leading underscore + if sn.starts_with('cleanc__') || sn.starts_with('c__Gen') || sn.starts_with('eval__') + || sn.starts_with('x64__') { + reachable[fi2] = false + } + } + } + // Compute cumulative shift: how many bytes of dead code precede each function. + mut fn_shift := []int{len: n_fns} + mut cum_shift := 0 + mut dead_count := 0 + mut dead_bytes := 0 + for fi := 0; fi < n_fns; fi++ { + fn_shift[fi] = cum_shift + if !reachable[fi] { + dead_count++ + fn_size := g.fn_ends[fi] - g.fn_starts[fi] + dead_bytes += fn_size + cum_shift += fn_size + } + } + if dead_count == 0 { + return + } + eprintln('ARM64 DEADSTRIP: ${dead_count} dead functions, ${dead_bytes} bytes (${dead_bytes / 1024}KB)') + // Build compacted text section: copy prefix, kept functions, suffix. + old_text := g.macho.text_data.clone() + mut new_text := []u8{cap: old_text.len - dead_bytes} + // Copy prefix bytes before first function (if any). + for i := 0; i < g.fn_starts[0]; i++ { + new_text << old_text[i] + } + // Copy kept functions in order. + for fi := 0; fi < n_fns; fi++ { + if !reachable[fi] { + continue + } + for off := g.fn_starts[fi]; off < g.fn_ends[fi]; off++ { + new_text << old_text[off] + } + } + // Copy suffix bytes after last function (e.g. unresolved stub added by gen_post_pass). + last_end := g.fn_ends[n_fns - 1] + for off := last_end; off < old_text.len; off++ { + new_text << old_text[off] + } + // Fix up relocation addresses and drop relocations inside dead functions. + mut new_relocs := []RelocationInfo{cap: g.macho.relocs.len} + for reloc in g.macho.relocs { + // Binary search for source function (using original boundaries). + mut lo := 0 + mut hi := n_fns - 1 + mut src_fn := -1 + for lo <= hi { + mid := (lo + hi) / 2 + if reloc.addr < g.fn_starts[mid] { + hi = mid - 1 + } else if reloc.addr >= g.fn_ends[mid] { + lo = mid + 1 + } else { + src_fn = mid + break + } + } + if src_fn >= 0 && !reachable[src_fn] { + continue // drop relocations in dead functions + } + mut new_addr := reloc.addr + if src_fn >= 0 { + new_addr = reloc.addr - fn_shift[src_fn] + } else if reloc.addr >= last_end { + new_addr = reloc.addr - cum_shift + } + new_relocs << RelocationInfo{ + addr: new_addr + sym_idx: reloc.sym_idx + pcrel: reloc.pcrel + length: reloc.length + extern: reloc.extern + type_: reloc.type_ + } + } + g.macho.relocs = new_relocs + // Fix up symbol addresses. + // Dead function symbols redirect to ___unresolved_stub (returns 0). + mut stub_new_addr := u64(0) + if stub_si := g.macho.sym_by_name['___unresolved_stub'] { + // Stub is in suffix region, shift by total dead bytes. + stub_new_addr = g.macho.symbols[stub_si].value - u64(cum_shift) + } + for si := 0; si < g.macho.symbols.len; si++ { + old_val := int(g.macho.symbols[si].value) + if fi := sym_to_fn[si] { + if reachable[fi] { + g.macho.symbols[si].value = u64(old_val - fn_shift[fi]) + } else { + g.macho.symbols[si].value = stub_new_addr + } + } else if old_val >= last_end { + // Symbol in suffix region. + g.macho.symbols[si].value = u64(old_val - cum_shift) + } + } + // Update function boundaries (for any downstream use). + for fi := 0; fi < n_fns; fi++ { + if reachable[fi] { + g.fn_starts[fi] -= fn_shift[fi] + g.fn_ends[fi] -= fn_shift[fi] + } + } + g.macho.text_data = new_text +} + // gen_post_pass emits the unresolved stub, global data, and patches symbol addresses. // Must be called after all gen_func calls. pub fn (mut g Gen) gen_post_pass() { + eprintln('ARM64 STATS: stores=${g.stats_total_stores} skipped=${g.stats_skipped_stores} cache_hits=${g.stats_cache_hits}') // Add return-zero stub for unresolved symbols. // When the linker can't resolve a symbol, it redirects calls here instead of // letting them jump to the Mach-O header which corrupts memory. @@ -204,6 +469,9 @@ pub fn (mut g Gen) gen_post_pass() { g.emit(0xD2800001) // mov x1, #0 g.emit(0xD65F03C0) // ret + // Dead-strip unreachable functions. + g.dead_strip_functions() + // Globals in __data (Section 3) - emit actual data for gvar in g.mod.globals { // Skip external globals (defined elsewhere) @@ -429,6 +697,17 @@ pub fn (mut g Gen) merge_worker(w &Gen) { type_: rel.type_ } } + // Merge function boundary tracking for dead-stripping. + for fi := 0; fi < w.fn_starts.len; fi++ { + g.fn_starts << w.fn_starts[fi] + text_base + g.fn_ends << w.fn_ends[fi] + text_base + g.fn_names << w.fn_names[fi] + g.fn_sym_ids << sym_remap[w.fn_sym_ids[fi]] + } + // Merge stats. + g.stats_total_stores += w.stats_total_stores + g.stats_skipped_stores += w.stats_skipped_stores + g.stats_cache_hits += w.stats_cache_hits } pub fn (mut g Gen) gen_func(func mir.Function) { @@ -439,12 +718,15 @@ pub fn (mut g Gen) gen_func(func mir.Function) { } if func.blocks.len == 0 { // Emit a minimal stub: just a ret instruction. - // This handles functions registered in Phase 3 but not built in Phase 4 - // (dead code elimination), or functions with empty bodies. - g.curr_offset = g.macho.text_data.len + fn_start := g.macho.text_data.len + g.curr_offset = fn_start sym_name := '_' + func.name - g.macho.add_symbol(sym_name, u64(g.curr_offset), false, 1) + sym_idx := g.macho.add_symbol(sym_name, u64(fn_start), false, 1) g.emit(0xd65f03c0) // ret + g.fn_starts << fn_start + g.fn_ends << g.macho.text_data.len + g.fn_names << sym_name + g.fn_sym_ids << sym_idx return } g.curr_offset = g.macho.text_data.len @@ -595,13 +877,14 @@ pub fn (mut g Gen) gen_func(func mir.Function) { for i, blk_id in func.blocks { g.next_blk = if i + 1 < func.blocks.len { func.blocks[i + 1] } else { -1 } blk := g.mod.blocks[blk_id] - for val_id in blk.instrs { + for pp_idx, val_id in blk.instrs { val := g.mod.values[val_id] if val.kind != .instruction { continue } instr := g.mod.instrs[val.index] opcode := g.selected_opcode(instr) + _ = pp_idx // Phi lowering can leave placeholder bitcasts/copies that are fully dead. // Do not reserve stack slots for these values; in large recursive // functions this can cause pathological frame growth and stack overflow. @@ -746,7 +1029,13 @@ pub fn (mut g Gen) gen_func(func mir.Function) { } } - // Assign slot for result of instruction (or pointer for alloca) + // Skip stack slot for callee-saved register values (always in reg_map). + if val_id in g.reg_map && g.reg_map[val_id] != 0xFF { + continue + } + // Align to 8 bytes before assigning scalar slot so that + // SP-relative scaled-immediate addressing always works. + slot_offset = (slot_offset + 7) & ~0x7 g.stack_map[val_id] = -slot_offset slot_offset += 8 } @@ -856,7 +1145,9 @@ pub fn (mut g Gen) gen_func(func mir.Function) { } eprintln('ARM64 BLOCKS ${func.name} end') } - g.macho.add_symbol('_' + func.name, u64(g.curr_offset), true, 1) + fn_sym_name := '_' + func.name + fn_sym_idx := g.macho.add_symbol(fn_sym_name, u64(g.curr_offset), true, 1) + fn_start_off := g.macho.text_data.len // Prologue g.emit(asm_stp_fp_lr_pre()) @@ -878,6 +1169,7 @@ pub fn (mut g Gen) gen_func(func mir.Function) { // sp = fp - callee_saved_size - stack_size, so fp - N = sp + (sp_base_offset + N). g.sp_base_offset = callee_saved_size + g.stack_size g.sp_adjusted = false + g.sp_adjust_amt = 0 // Save x8 if this function returns a large struct // x8 contains the indirect return pointer from the caller @@ -1010,6 +1302,7 @@ pub fn (mut g Gen) gen_func(func mir.Function) { } for i := 0; i < func.blocks.len; i++ { + g.invalidate_last_store() blk_id := int(func.blocks[i]) g.next_blk = if i + 1 < func.blocks.len { int(func.blocks[i + 1]) } else { -1 } g.cur_blk_id = blk_id @@ -1027,22 +1320,24 @@ pub fn (mut g Gen) gen_func(func mir.Function) { instr := g.read_u32(abs_off) mut new_instr := u32(0) - // Check for CBNZ (0xB5...) vs B (0x14...) vs B.cond (0x54...) - if (instr & 0xFF000000) == 0xB5000000 { - // CBNZ - new_instr = (instr & 0xFF000000) | ((u32(rel) & 0x7FFFF) << 5) | (instr & 0x1F) + // Check for CBZ (0xB4...) / CBNZ (0xB5...) vs B (0x14...) vs B.cond (0x54...) + if (instr & 0xFE000000) == 0xB4000000 { + // CBZ / CBNZ (both use imm19 at bits [23:5]) + new_instr = (instr & 0xFF00001F) | ((u32(rel) & 0x7FFFF) << 5) } else if (instr & 0xFC000000) == 0x14000000 { // B imm26 new_instr = (instr & 0xFC000000) | (u32(rel) & 0x3FFFFFF) } else { // B.cond - new_instr = (instr & 0xFF000000) | ((u32(rel) & 0x7FFFF) << 5) | (instr & 0x1F) + new_instr = (instr & 0xFF00001F) | ((u32(rel) & 0x7FFFF) << 5) } g.write_u32(abs_off, new_instr) g.total_resolved++ } - for val_id in blk.instrs { + g.cur_blk_instrs = blk.instrs + for instr_idx, val_id in blk.instrs { + g.cur_blk_instr_idx = instr_idx g.gen_instr(val_id) } } @@ -1050,6 +1345,10 @@ pub fn (mut g Gen) gen_func(func mir.Function) { if unresolved > 0 { eprintln('BRANCH: fn=${func.name} pending=${g.total_pending} resolved=${g.total_resolved} unresolved=${unresolved} pending_blks_len=${g.pending_label_blks.len}') } + g.fn_starts << fn_start_off + g.fn_ends << g.macho.text_data.len + g.fn_names << fn_sym_name + g.fn_sym_ids << fn_sym_idx } fn (mut g Gen) gen_instr(val_id int) { @@ -1075,6 +1374,18 @@ fn (mut g Gen) gen_instr(val_id int) { } eprintln('ARM64 INSTR fn=${g.cur_func_name} val=${val_id} op=${op} orig=${instr.op} sel=${instr.selected_op} typ=${typ_id} kind=${kind} width=${width} unsigned=${is_unsigned} ops=${instr.operands}') } + // Dead value elimination: skip pure operations whose results are never used. + if g.mod.values[val_id].uses.len == 0 { + match op { + .add, .sub, .mul, .sdiv, .udiv, .srem, .urem, .and_, .or_, .xor, .shl, .ashr, .lshr, + .eq, .ne, .lt, .gt, .le, .ge, .ult, .ugt, .ule, .uge, .fadd, .fsub, .fmul, .fdiv, + .frem, .fptosi, .sitofp, .trunc, .zext, .sext, .bitcast, .extractvalue, + .get_element_ptr, .insertvalue, .struct_init, .load { + return + } + else {} + } + } match op { .fadd, .fsub, .fmul, .fdiv, .frem { // Float operations using scalar SIMD instructions (d0-d7) @@ -1209,7 +1520,8 @@ fn (mut g Gen) gen_instr(val_id int) { mut rhs_reg := 9 // Default scratch for RHS op1 := g.mod.values[instr.operands[1]] - if op1.kind == .constant && op in [.add, .sub] { + if op1.kind == .constant + && op in [.add, .sub, .eq, .ne, .lt, .gt, .le, .ge, .ult, .ugt, .ule, .uge] { v := g.get_const_int(instr.operands[1]) if v >= 0 && v < 4096 { is_imm = true @@ -1334,7 +1646,13 @@ fn (mut g Gen) gen_instr(val_id int) { use_32bit := lhs_typ > 0 && lhs_typ < g.mod.type_store.types.len && g.mod.type_store.types[lhs_typ].kind == .int_t && g.mod.type_store.types[lhs_typ].width == 32 - if use_32bit { + if is_imm { + if use_32bit { + g.emit(asm_cmp_imm_w(Reg(lhs_reg), u32(imm_val))) + } else { + g.emit(asm_cmp_imm(Reg(lhs_reg), u32(imm_val))) + } + } else if use_32bit { g.emit(asm_cmp_reg_w(Reg(lhs_reg), Reg(rhs_reg))) } else { g.emit(asm_cmp_reg(Reg(lhs_reg), Reg(rhs_reg))) @@ -1364,11 +1682,15 @@ fn (mut g Gen) gen_instr(val_id int) { } // Keep narrow integer results (i1/i8/i16/i32) canonical after 64-bit // ALU ops so upper garbage bits do not leak through later uses. - result_typ_id := g.mod.values[val_id].typ - if result_typ_id > 0 && result_typ_id < g.mod.type_store.types.len { - result_typ := g.mod.type_store.types[result_typ_id] - if result_typ.kind == .int_t && !emitted_types_sum_is_check { - g.canonicalize_narrow_int_result(dest_reg, result_typ_id) + // Skip for comparison ops (cset always produces 0 or 1, already canonical). + is_cmp := op in [.eq, .ne, .lt, .gt, .le, .ge, .ult, .ugt, .ule, .uge] + if !is_cmp { + result_typ_id := g.mod.values[val_id].typ + if result_typ_id > 0 && result_typ_id < g.mod.type_store.types.len { + result_typ := g.mod.type_store.types[result_typ_id] + if result_typ.kind == .int_t && !emitted_types_sum_is_check { + g.canonicalize_narrow_int_result(dest_reg, result_typ_id) + } } } // If dest_reg was not the allocated one (e.g. was 8), move it. @@ -2120,6 +2442,7 @@ fn (mut g Gen) gen_instr(val_id int) { } } .call { + g.invalidate_last_store() fn_val := g.mod.values[instr.operands[0]] fn_name := fn_val.name trace_call := g.env_trace_call.len > 0 @@ -2164,6 +2487,7 @@ fn (mut g Gen) gen_instr(val_id int) { stack_space := ((num_variadic * 8) + 15) & ~0xF if stack_space > 0 { g.sp_adjusted = true + g.sp_adjust_amt = stack_space g.emit_sub_sp(stack_space) } @@ -2201,6 +2525,7 @@ fn (mut g Gen) gen_instr(val_id int) { g.emit(asm_add_sp_reg(Reg(10))) } g.sp_adjusted = false + g.sp_adjust_amt = 0 } } else { // Non-variadic call: @@ -2243,6 +2568,7 @@ fn (mut g Gen) gen_instr(val_id int) { stack_space := ((total_stack_slots * 8) + 15) & ~0xF if stack_space > 0 { g.sp_adjusted = true + g.sp_adjust_amt = stack_space g.emit_sub_sp(stack_space) mut stack_idx := 0 for a in 0 .. num_args { @@ -2327,6 +2653,7 @@ fn (mut g Gen) gen_instr(val_id int) { g.emit(asm_add_sp_reg(Reg(10))) } g.sp_adjusted = false + g.sp_adjust_amt = 0 } } @@ -2395,6 +2722,7 @@ fn (mut g Gen) gen_instr(val_id int) { } } .call_indirect { + g.invalidate_last_store() // Indirect call through function pointer // operands[0] is the function pointer, rest are arguments num_args := instr.operands.len - 1 @@ -2417,6 +2745,7 @@ fn (mut g Gen) gen_instr(val_id int) { stack_space := ((num_stack_slots * 8) + 15) & ~0xF if stack_space > 0 { g.sp_adjusted = true + g.sp_adjust_amt = stack_space g.emit_sub_sp(stack_space) mut stack_idx := 0 for a in 0 .. num_args { @@ -2476,6 +2805,7 @@ fn (mut g Gen) gen_instr(val_id int) { g.emit(asm_add_sp_reg(Reg(10))) } g.sp_adjusted = false + g.sp_adjust_amt = 0 } ci_result_typ_id := g.mod.values[val_id].typ @@ -2498,6 +2828,7 @@ fn (mut g Gen) gen_instr(val_id int) { } } .call_sret { + g.invalidate_last_store() // Call with struct return lowered by ABI pass. // operands: [fn, arg1, arg2, ...], destination is val_id's stack slot. num_args := instr.operands.len - 1 @@ -2534,6 +2865,7 @@ fn (mut g Gen) gen_instr(val_id int) { stack_space := ((sr_num_stack_slots * 8) + 15) & ~0xF if stack_space > 0 { g.sp_adjusted = true + g.sp_adjust_amt = stack_space g.emit_sub_sp(stack_space) mut stack_idx := 0 for a in 0 .. num_args { @@ -2598,6 +2930,7 @@ fn (mut g Gen) gen_instr(val_id int) { g.emit(asm_add_sp_reg(Reg(10))) } g.sp_adjusted = false + g.sp_adjust_amt = 0 } } .ret { @@ -2902,8 +3235,18 @@ fn (mut g Gen) gen_instr(val_id int) { && g.mod.type_store.types[cond_val.typ].kind == .int_t && g.mod.type_store.types[cond_val.typ].width == 1 if cond_is_i1 { - g.emit_mov_imm64(9, 1) - g.emit(asm_and(Reg(8), Reg(8), Reg(9))) + // Skip AND if condition is from a comparison (cset produces 0/1). + mut need_and := true + if cond_val.kind == .instruction { + cond_instr := g.mod.instrs[cond_val.index] + cond_op := g.selected_opcode(cond_instr) + if cond_op in [.eq, .ne, .lt, .gt, .le, .ge, .ult, .ugt, .ule, .uge] { + need_and = false + } + } + if need_and { + g.emit(asm_and_imm_1(Reg(8), Reg(8))) + } } true_blk := if instr.operands[1] >= 0 && instr.operands[1] < g.val_to_block.len { @@ -2920,37 +3263,21 @@ fn (mut g Gen) gen_instr(val_id int) { has_phis := g.block_has_phis(true_blk) || g.block_has_phis(false_blk) if has_phis { // When target blocks have phi nodes (e.g. -O0 mode), we must emit - // phi copies on each branch path separately. Structure: - // CBZ x8, false_path - // - // B true_blk - // false_path: - // - // B false_blk (or fall-through) - cbz_off := g.macho.text_data.len - g.curr_offset - g.emit(asm_cbz(Reg(8), 0)) // placeholder, will patch - - // True path: emit phi copies then branch to true block - g.emit_phi_copies(true_blk) - if true_blk >= 0 && true_blk < g.block_offsets.len - && g.block_offsets[true_blk] != -1 { - off := g.block_offsets[true_blk] - rel := (off - (g.macho.text_data.len - g.curr_offset)) / 4 - g.emit(asm_b(rel)) - } else { - g.record_pending_label(true_blk) - g.emit(asm_b(0)) - } - - // Patch CBZ to jump here (false path) - false_path_off := g.macho.text_data.len - g.curr_offset - cbz_rel := (false_path_off - cbz_off) / 4 - cbz_abs := g.curr_offset + cbz_off - g.write_u32(cbz_abs, asm_cbz(Reg(8), cbz_rel)) - - // False path: emit phi copies then branch to false block - g.emit_phi_copies(false_blk) - if false_blk != g.next_blk { + // phi copies on each branch path separately. + if true_blk == g.next_blk { + // Optimize: true_blk is next block (fall-through). + // Structure: + // CBNZ x8, true_path + // + // B false_blk (or fall-through if false_blk == next) + // true_path: + // + // (fall through to true_blk) + cbnz_off := g.macho.text_data.len - g.curr_offset + g.emit(asm_cbnz(Reg(8), 0)) // placeholder, will patch + + // False path: emit phi copies then branch to false block + g.emit_phi_copies(false_blk) if false_blk >= 0 && false_blk < g.block_offsets.len && g.block_offsets[false_blk] != -1 { off := g.block_offsets[false_blk] @@ -2960,38 +3287,119 @@ fn (mut g Gen) gen_instr(val_id int) { g.record_pending_label(false_blk) g.emit(asm_b(0)) } + + // Patch CBNZ to jump here (true path) + true_path_off := g.macho.text_data.len - g.curr_offset + cbnz_rel := (true_path_off - cbnz_off) / 4 + cbnz_abs := g.curr_offset + cbnz_off + g.write_u32(cbnz_abs, asm_cbnz(Reg(8), cbnz_rel)) + + // True path: emit phi copies, then fall through to true_blk (next block) + g.emit_phi_copies(true_blk) + } else { + // Standard structure: + // CBZ x8, false_path + // + // B true_blk + // false_path: + // + // B false_blk (or fall-through) + cbz_off := g.macho.text_data.len - g.curr_offset + g.emit(asm_cbz(Reg(8), 0)) // placeholder, will patch + + // True path: emit phi copies then branch to true block + g.emit_phi_copies(true_blk) + if true_blk >= 0 && true_blk < g.block_offsets.len + && g.block_offsets[true_blk] != -1 { + off := g.block_offsets[true_blk] + rel := (off - (g.macho.text_data.len - g.curr_offset)) / 4 + g.emit(asm_b(rel)) + } else { + g.record_pending_label(true_blk) + g.emit(asm_b(0)) + } + + // Patch CBZ to jump here (false path) + false_path_off := g.macho.text_data.len - g.curr_offset + cbz_rel := (false_path_off - cbz_off) / 4 + cbz_abs := g.curr_offset + cbz_off + g.write_u32(cbz_abs, asm_cbz(Reg(8), cbz_rel)) + + // False path: emit phi copies then branch to false block + g.emit_phi_copies(false_blk) + if false_blk != g.next_blk { + if false_blk >= 0 && false_blk < g.block_offsets.len + && g.block_offsets[false_blk] != -1 { + off := g.block_offsets[false_blk] + rel := (off - (g.macho.text_data.len - g.curr_offset)) / 4 + g.emit(asm_b(rel)) + } else { + g.record_pending_label(false_blk) + g.emit(asm_b(0)) + } + } } } else { - // No phi nodes — use efficient branch pattern - if true_blk >= 0 && true_blk < g.block_offsets.len - && g.block_offsets[true_blk] != -1 { - off := g.block_offsets[true_blk] - rel := (off - (g.macho.text_data.len - g.curr_offset)) / 4 - if rel >= -262144 && rel < 262144 { - g.emit(asm_cbnz(Reg(8), rel)) - } else { - // Branch target too far for CBNZ (19-bit range). - // Use trampoline: CBZ skip; B target; skip: - g.emit(asm_cbz(Reg(8), 2)) // skip over next B instruction - g.emit(asm_b(rel - 1)) // adjust for the extra CBZ instruction + // No phi nodes — use efficient branch pattern. + // Optimize: if true_blk is next (fall-through), just CBZ to false_blk. + if true_blk == g.next_blk { + // Condition true → fall through to next block. + // Condition false → branch to false_blk. + if false_blk != g.next_blk { + if false_blk >= 0 && false_blk < g.block_offsets.len + && g.block_offsets[false_blk] != -1 { + off := g.block_offsets[false_blk] + rel := (off - (g.macho.text_data.len - g.curr_offset)) / 4 + if rel >= -262144 && rel < 262144 { + g.emit(asm_cbz(Reg(8), rel)) + } else { + g.emit(asm_cbnz(Reg(8), 2)) + g.emit(asm_b(rel - 1)) + } + } else { + g.record_pending_label(false_blk) + g.emit(asm_cbz(Reg(8), 0)) + } } + // else: both true and false are next block — no branch needed } else { - // Forward reference: use trampoline pattern to avoid 19-bit overflow. - // CBZ x8, skip; B target; skip: - g.emit(asm_cbz(Reg(8), 2)) // skip over next B instruction - g.record_pending_label(true_blk) - g.emit(asm_b(0)) - } - - if false_blk != g.next_blk { - if false_blk >= 0 && false_blk < g.block_offsets.len - && g.block_offsets[false_blk] != -1 { - off := g.block_offsets[false_blk] + if true_blk >= 0 && true_blk < g.block_offsets.len + && g.block_offsets[true_blk] != -1 { + off := g.block_offsets[true_blk] rel := (off - (g.macho.text_data.len - g.curr_offset)) / 4 - g.emit(asm_b(rel)) + if rel >= -262144 && rel < 262144 { + g.emit(asm_cbnz(Reg(8), rel)) + } else { + // Branch target too far for CBNZ (19-bit range). + // Use trampoline: CBZ skip; B target; skip: + g.emit(asm_cbz(Reg(8), 2)) // skip over next B instruction + g.emit(asm_b(rel - 1)) // adjust for the extra CBZ instruction + } } else { - g.record_pending_label(false_blk) - g.emit(asm_b(0)) + // Forward reference — check if false_blk is next (common pattern). + // If so, use CBNZ directly to true_blk (1 instruction). + if false_blk == g.next_blk { + g.record_pending_label(true_blk) + g.emit(asm_cbnz(Reg(8), 0)) + } else { + // Neither block is next: use trampoline pattern. + // CBZ x8, skip; B true_target; skip: + g.emit(asm_cbz(Reg(8), 2)) // skip over next B instruction + g.record_pending_label(true_blk) + g.emit(asm_b(0)) + } + } + + if false_blk != g.next_blk { + if false_blk >= 0 && false_blk < g.block_offsets.len + && g.block_offsets[false_blk] != -1 { + off := g.block_offsets[false_blk] + rel := (off - (g.macho.text_data.len - g.curr_offset)) / 4 + g.emit(asm_b(rel)) + } else { + g.record_pending_label(false_blk) + g.emit(asm_b(0)) + } } } } @@ -4862,7 +5270,7 @@ fn (mut g Gen) get_dest_reg(val_id int) int { } fn (mut g Gen) get_operand_reg(val_id int, fallback int) int { - // If value is in a register, return it + // If value is in a callee-saved register, return it if val_id in g.reg_map { r := g.reg_map[val_id] if r != 0xFF { @@ -5106,6 +5514,21 @@ fn (mut g Gen) load_call_arg_to_reg(reg int, val_id int, arg_idx int, instr mir. g.load_val_to_reg(reg, val_id) } +// val_is_cmp_result returns true if the value is a comparison instruction result (cset). +// Comparison results are always 0 or 1 and don't need narrow int canonicalization. +fn (g Gen) val_is_cmp_result(val_id int) bool { + if val_id <= 0 || val_id >= g.mod.values.len { + return false + } + val := g.mod.values[val_id] + if val.kind != .instruction { + return false + } + instr := g.mod.instrs[val.index] + op := g.selected_opcode(instr) + return op in [.eq, .ne, .lt, .gt, .le, .ge, .ult, .ugt, .ule, .uge] +} + fn (mut g Gen) canonicalize_narrow_int_result(reg int, typ_id ssa.TypeID) { if typ_id <= 0 || typ_id >= g.mod.type_store.types.len { return @@ -5116,26 +5539,39 @@ fn (mut g Gen) canonicalize_narrow_int_result(reg int, typ_id ssa.TypeID) { } if typ.width == 1 { // i1 arithmetic is modulo 2; force truncation to the low bit. - g.emit_mov_imm64(11, 1) - g.emit(asm_and(Reg(reg), Reg(reg), Reg(11))) + g.emit(asm_and_imm_1(Reg(reg), Reg(reg))) } else if typ.is_unsigned { - // Unsigned narrow integers: zero-extend by masking. - if typ.width <= 32 { + // Unsigned narrow integers: zero-extend using bitmask immediate AND. + if typ.width == 8 { + g.emit(asm_and_imm_0xff(Reg(reg), Reg(reg))) + } else if typ.width == 16 { + g.emit(asm_and_imm_0xffff(Reg(reg), Reg(reg))) + } else if typ.width == 32 { + g.emit(asm_and_imm_0xffffffff(Reg(reg), Reg(reg))) + } else if typ.width < 32 { mask := (u64(1) << typ.width) - 1 g.emit_mov_imm64(11, i64(mask)) g.emit(asm_and(Reg(reg), Reg(reg), Reg(11))) } // Unsigned 33-63 bit: no action needed (upper bits already zero) } else { - // For signed types, sign-extend via LSL + ASR - shift := 64 - typ.width - mut shreg := 11 - if reg == shreg { - shreg = 12 + // For signed types, sign-extend using dedicated instructions. + if typ.width == 8 { + g.emit(asm_sxtb(Reg(reg), Reg(reg))) + } else if typ.width == 16 { + g.emit(asm_sxth(Reg(reg), Reg(reg))) + } else if typ.width == 32 { + g.emit(asm_sxtw(Reg(reg), Reg(reg))) + } else { + shift := 64 - typ.width + mut shreg := 11 + if reg == shreg { + shreg = 12 + } + g.emit_mov_imm64(shreg, shift) + g.emit(asm_lslv(Reg(reg), Reg(reg), Reg(shreg))) + g.emit(asm_asrv(Reg(reg), Reg(reg), Reg(shreg))) } - g.emit_mov_imm64(shreg, shift) - g.emit(asm_lslv(Reg(reg), Reg(reg), Reg(shreg))) - g.emit(asm_asrv(Reg(reg), Reg(reg), Reg(shreg))) } } @@ -5450,6 +5886,25 @@ fn (mut g Gen) load_val_to_reg(reg int, val_id int) { return } } + // Check scratch register cache for this val_id. + if val_id > 0 { + if g.last_store_val == val_id && g.last_store_reg >= 0 { + g.stats_cache_hits++ + src := g.last_store_reg + // Don't invalidate — the value is still in src register until + // something overwrites it (checked at line below: last_store_reg == reg). + if src != reg { + g.emit(asm_mov_reg(Reg(reg), Reg(src))) + } + // Must still canonicalize narrow integers (stored value is raw). + g.canonicalize_narrow_int_result(reg, g.mod.values[val_id].typ) + return + } + } + // Invalidate cache if this load overwrites the cached register. + if g.last_store_reg == reg { + g.invalidate_last_store() + } if val_id <= 0 || val_id >= g.mod.values.len { g.emit_mov_imm64(reg, 0) return @@ -5742,24 +6197,48 @@ fn (mut g Gen) store_reg_to_val(reg int, val_id int) { if val_typ.kind in [.struct_t, .array_t] { val_size := g.type_size(val_typ_id) if val_size > 8 && val_size <= 16 { - // For 9-16 byte structs, the register holds a pointer to the - // stack location containing the struct data. Copy it. if trace_storeval { eprintln('ARM64 STOREVAL ptr-copy fn=${g.cur_func_name} val=${val_id} typ=${val_typ_id}/${val_typ.kind} size=${val_size} reg=${stored_reg} off=${offset}') } g.copy_ptr_to_fp_bytes(stored_reg, offset, val_size) + g.invalidate_last_store() return } } if val_typ.kind == .struct_t && g.type_size(val_typ_id) <= 8 { g.emit_str_reg_offset(stored_reg, 29, offset) + g.last_store_reg = stored_reg + g.last_store_val = val_id return } } } - // Always store to stack even when value is register-allocated, - // to ensure correctness with the block-local interval approximation. - g.emit_str_reg_offset(stored_reg, 29, offset) + g.stats_total_stores++ + skip := g.should_skip_store(val_id) + if skip == 1 { + // Skip the store — value will be consumed as operand[0] from cache. + g.stats_skipped_stores++ + } else if skip == 2 { + // Forward to x9: value will be consumed as operand[1]. + // MOV x9, stored_reg so it survives operand[0] loading into x8. + // x9 is the default register for operand[1] in arithmetic/comparison, + // so the cache hit at (9, val_id) gives src==reg with no MOV needed. + if stored_reg != 9 { + g.emit(asm_mov_reg(Reg(9), Reg(stored_reg))) + } + g.stats_skipped_stores++ + // Set cache to (x9, val_id) so get_operand_reg finds it. + g.last_store_reg = 9 + g.last_store_val = val_id + return + } else { + g.emit_str_reg_offset(stored_reg, 29, offset) + } + } + // Record for store-load elimination: stored_reg now holds val_id. + if val_id > 0 { + g.last_store_reg = stored_reg + g.last_store_val = val_id } } @@ -5888,10 +6367,10 @@ fn (mut g Gen) emit_phi_copies(target_blk_id int) { fn (mut g Gen) emit_add_fp_imm(rd int, imm int) { val := -imm // val is the positive distance below FP - // SP-relative: fp - val = sp + (sp_base_offset - val) + // SP-relative: fp - val = sp + (sp_base_offset + sp_adjust_amt - val) // Only use SP-relative when it produces fewer instructions than FP-relative. - if !g.sp_adjusted && g.sp_base_offset > 0 { - sp_off := g.sp_base_offset - val + if g.sp_base_offset > 0 { + sp_off := g.sp_base_offset + g.sp_adjust_amt - val if sp_off >= 0 { if sp_off <= 0xFFF { // 1 instruction (vs 1-2 for FP) — always better or equal @@ -5936,8 +6415,9 @@ fn (mut g Gen) emit_str_reg_offset(rt int, rn int, offset int) { fn (mut g Gen) emit_str_reg_offset_sized(rt int, rn int, offset int, size int) { // SP-relative addressing: convert fp-relative negative offsets to sp-relative positive. - if rn == 29 && offset < -255 && !g.sp_adjusted && g.sp_base_offset > 0 { - sp_off := g.sp_base_offset + offset + // When sp is adjusted for call args, add the adjustment amount. + if rn == 29 && offset < -255 && g.sp_base_offset > 0 { + sp_off := g.sp_base_offset + offset + g.sp_adjust_amt if sp_off >= 0 { // Try 1-instruction: unsigned scaled immediate scaled := match size { @@ -5966,7 +6446,7 @@ fn (mut g Gen) emit_str_reg_offset_sized(rt int, rn int, offset int, size int) { } return } - // Try 2-instruction: add scratch, sp, #imm; str rt, [scratch] + // Try 2-instruction: add scratch, sp, #imm; str rt, [scratch, #off] // Better than 3-instruction FP-relative for large offsets neg := -offset if neg > 0xFFF && sp_off <= 0xFFFFFF { @@ -5996,6 +6476,36 @@ fn (mut g Gen) emit_str_reg_offset_sized(rt int, rn int, offset int, size int) { } return } + // sp_low > 255: try scaled immediate from the high-aligned base + if sp_high > 0 { + sp_low_scaled := match size { + 8 { + if sp_low % 8 == 0 && sp_low / 8 < 4096 { sp_low / 8 } else { -1 } + } + 4 { + if sp_low % 4 == 0 && sp_low / 4 < 4096 { sp_low / 4 } else { -1 } + } + 2 { + if sp_low % 2 == 0 && sp_low / 2 < 4096 { sp_low / 2 } else { -1 } + } + 1 { + if sp_low < 4096 { sp_low } else { -1 } + } + else { + -1 + } + } + if sp_low_scaled >= 0 { + g.emit(asm_add_imm_lsl12(Reg(scratch), sp, u32(sp_high))) + match size { + 1 { g.emit(asm_str_imm_b(Reg(rt), Reg(scratch), u32(sp_low_scaled))) } + 2 { g.emit(asm_str_imm_h(Reg(rt), Reg(scratch), u32(sp_low_scaled))) } + 4 { g.emit(asm_str_imm_w(Reg(rt), Reg(scratch), u32(sp_low_scaled))) } + else { g.emit(asm_str_imm(Reg(rt), Reg(scratch), u32(sp_low_scaled))) } + } + return + } + } } } } @@ -6115,8 +6625,9 @@ fn (mut g Gen) emit_ldr_reg_offset(rt int, rn int, offset int) { fn (mut g Gen) emit_ldr_reg_offset_sized(rt int, rn int, offset int, size int) { // SP-relative addressing: convert fp-relative negative offsets to sp-relative positive. - if rn == 29 && offset < -255 && !g.sp_adjusted && g.sp_base_offset > 0 { - sp_off := g.sp_base_offset + offset + // When sp is adjusted for call args, add the adjustment amount. + if rn == 29 && offset < -255 && g.sp_base_offset > 0 { + sp_off := g.sp_base_offset + offset + g.sp_adjust_amt if sp_off >= 0 { // Try 1-instruction: unsigned scaled immediate scaled := match size { @@ -6175,6 +6686,36 @@ fn (mut g Gen) emit_ldr_reg_offset_sized(rt int, rn int, offset int, size int) { } return } + // sp_low > 255: try scaled immediate from the high-aligned base + if sp_high > 0 { + sp_low_scaled := match size { + 8 { + if sp_low % 8 == 0 && sp_low / 8 < 4096 { sp_low / 8 } else { -1 } + } + 4 { + if sp_low % 4 == 0 && sp_low / 4 < 4096 { sp_low / 4 } else { -1 } + } + 2 { + if sp_low % 2 == 0 && sp_low / 2 < 4096 { sp_low / 2 } else { -1 } + } + 1 { + if sp_low < 4096 { sp_low } else { -1 } + } + else { + -1 + } + } + if sp_low_scaled >= 0 { + g.emit(asm_add_imm_lsl12(Reg(scratch), sp, u32(sp_high))) + match size { + 1 { g.emit(asm_ldr_imm_b(Reg(rt), Reg(scratch), u32(sp_low_scaled))) } + 2 { g.emit(asm_ldr_imm_h(Reg(rt), Reg(scratch), u32(sp_low_scaled))) } + 4 { g.emit(asm_ldr_imm_w(Reg(rt), Reg(scratch), u32(sp_low_scaled))) } + else { g.emit(asm_ldr_imm(Reg(rt), Reg(scratch), u32(sp_low_scaled))) } + } + return + } + } } } } @@ -6701,7 +7242,9 @@ fn (mut g Gen) emit_mov_imm64(rd int, val i64) { } fn (mut g Gen) emit_mov_reg(rd int, rm int) { - g.emit(asm_mov_reg(Reg(rd), Reg(rm))) + if rd != rm { + g.emit(asm_mov_reg(Reg(rd), Reg(rm))) + } } struct Interval { diff --git a/vlib/v2/gen/arm64/asm.v b/vlib/v2/gen/arm64/asm.v index 43f20d6e5..7b7a30277 100644 --- a/vlib/v2/gen/arm64/asm.v +++ b/vlib/v2/gen/arm64/asm.v @@ -117,6 +117,26 @@ fn asm_and(rd Reg, rn Reg, rm Reg) u32 { return 0x8A000000 | (u32(rm) << 16) | (u32(rn) << 5) | u32(rd) } +// and rd, rn, #1 — truncate to 1 bit (AND Xd, Xn, #1: N=1, immr=0, imms=0x00) +fn asm_and_imm_1(rd Reg, rn Reg) u32 { + return 0x92400000 | (u32(rn) << 5) | u32(rd) +} + +// and rd, rn, #0xFF — zero-extend u8 (AND Xd, Xn, #0xFF: N=1, immr=0, imms=0x07) +fn asm_and_imm_0xff(rd Reg, rn Reg) u32 { + return 0x92401C00 | (u32(rn) << 5) | u32(rd) +} + +// and rd, rn, #0xFFFF — zero-extend u16 (AND Xd, Xn, #0xFFFF: N=1, immr=0, imms=0x0F) +fn asm_and_imm_0xffff(rd Reg, rn Reg) u32 { + return 0x92403C00 | (u32(rn) << 5) | u32(rd) +} + +// and rd, rn, #0xFFFFFFFF — zero-extend u32 (AND Xd, Xn, #0xFFFFFFFF: N=1, immr=0, imms=0x1F) +fn asm_and_imm_0xffffffff(rd Reg, rn Reg) u32 { + return 0x92407C00 | (u32(rn) << 5) | u32(rd) +} + // orr rd, rn, rm fn asm_orr(rd Reg, rn Reg, rm Reg) u32 { return 0xAA000000 | (u32(rm) << 16) | (u32(rn) << 5) | u32(rd) @@ -167,6 +187,16 @@ fn asm_sxtw(rd Reg, rn Reg) u32 { return 0x93407C00 | (u32(rn) << 5) | u32(rd) } +// sxth rd, rn — sign-extend 16-bit value to 64-bit (SBFM Xd, Xn, #0, #15) +fn asm_sxth(rd Reg, rn Reg) u32 { + return 0x93403C00 | (u32(rn) << 5) | u32(rd) +} + +// sxtb rd, rn — sign-extend 8-bit value to 64-bit (SBFM Xd, Xn, #0, #7) +fn asm_sxtb(rd Reg, rn Reg) u32 { + return 0x93401C00 | (u32(rn) << 5) | u32(rd) +} + // === Compare === // cmp rn, rm (subs xzr, rn, rm) — 64-bit @@ -174,6 +204,16 @@ fn asm_cmp_reg(rn Reg, rm Reg) u32 { return 0xEB00001F | (u32(rm) << 16) | (u32(rn) << 5) } +// cmp xn, #imm12 (subs xzr, xn, #imm12) — 64-bit immediate compare +fn asm_cmp_imm(rn Reg, imm12 u32) u32 { + return 0xF100001F | (imm12 << 10) | (u32(rn) << 5) +} + +// cmp wn, #imm12 (subs wzr, wn, #imm12) — 32-bit immediate compare +fn asm_cmp_imm_w(rn Reg, imm12 u32) u32 { + return 0x7100001F | (imm12 << 10) | (u32(rn) << 5) +} + // cmp wn, wm (subs wzr, wn, wm) — 32-bit, sign-aware for i32 fn asm_cmp_reg_w(rn Reg, rm Reg) u32 { return 0x6B00001F | (u32(rm) << 16) | (u32(rn) << 5) diff --git a/vlib/v2/gen/cleanc/expr.v b/vlib/v2/gen/cleanc/expr.v index f91e51b63..b2060e1b4 100644 --- a/vlib/v2/gen/cleanc/expr.v +++ b/vlib/v2/gen/cleanc/expr.v @@ -2319,7 +2319,7 @@ fn (mut g Gen) gen_index_expr(node ast.IndexExpr) { } } if raw_type is types.Map { - g.gen_map_index_fallback(node, raw_type) + g.panic_map_index_expr(node) return } if raw_type is types.String { @@ -2366,7 +2366,7 @@ fn (mut g Gen) gen_index_expr(node ast.IndexExpr) { g.sb.write_string(']') return } else if raw_type.base_type is types.Map { - g.gen_map_index_fallback(node, raw_type.base_type) + g.panic_map_index_expr(node) return } else if raw_type.base_type is types.Pointer || raw_type.base_type is types.String { // Pointer to pointer (e.g. &&char) or pointer to string (e.g. &string used as array): @@ -2409,7 +2409,7 @@ fn (mut g Gen) gen_index_expr(node ast.IndexExpr) { // Try to resolve the full Map type for fallback code generation. if map_raw := g.get_raw_type(node.lhs) { if map_raw is types.Map { - g.gen_map_index_fallback(node, map_raw) + g.panic_map_index_expr(node) return } } @@ -2535,30 +2535,10 @@ fn (mut g Gen) gen_index_expr(node ast.IndexExpr) { g.sb.write_string(']') } -// gen_map_index_fallback generates C code for a map read that the transformer -// failed to lower. Produces: -// ({ val_type _mget_key_N = key; val_type _mget_zero_N = {0}; -// *(val_type*)map__get(&map, (void*)&_mget_key_N, (void*)&_mget_zero_N); }) -fn (mut g Gen) gen_map_index_fallback(node ast.IndexExpr, map_type types.Map) { - val_c := g.types_type_to_c(map_type.value_type) - key_c := g.types_type_to_c(map_type.key_type) - tmp := g.tmp_counter - g.tmp_counter++ - g.sb.write_string('({ ${key_c} _mget_key_${tmp} = ') - g.expr(node.expr) - g.sb.write_string('; ${val_c} _mget_zero_${tmp} = {0}; *(${val_c}*)map__get(') - // map__get expects a pointer to the map. - mut lhs_is_ptr := false - if lhs_raw := g.get_raw_type(node.lhs) { - lhs_is_ptr = lhs_raw is types.Pointer - } - if lhs_is_ptr { - g.expr(node.lhs) - } else { - g.sb.write_string('&') - g.expr(node.lhs) - } - g.sb.write_string(', (void*)&_mget_key_${tmp}, (void*)&_mget_zero_${tmp}); })') +fn (mut g Gen) panic_map_index_expr(node ast.IndexExpr) { + lhs_type := g.get_expr_type(node.lhs) + idx_src := '${node.lhs.name()}[${node.expr.name()}]' + panic('bug in v2 compiler: map IndexExpr should have been lowered in v2.transformer (file=${g.cur_file_name} fn=${g.cur_fn_name} pos=${node.pos} idx=${idx_src} lhs=${node.lhs.name()} lhs_type=${lhs_type})') } fn (g &Gen) eval_comptime_flag(name string) bool { diff --git a/vlib/v2/gen/cleanc/fn.v b/vlib/v2/gen/cleanc/fn.v index b4a17a244..48972b558 100644 --- a/vlib/v2/gen/cleanc/fn.v +++ b/vlib/v2/gen/cleanc/fn.v @@ -84,11 +84,16 @@ fn (g &Gen) should_emit_fn_decl(module_name string, decl ast.FnDecl) bool { // Methods on array types ([]T) and other types with unresolvable receivers // may produce 'unknown' receiver in the markused key, causing them to be // incorrectly pruned. Always emit methods whose receiver can't be resolved. + // Also always emit methods on array receivers ([]T), since the markused + // key for these can differ between the walker and the gen lookup. if decl.is_method { key2 := markused.decl_key(module_name, decl, g.env) if key2.contains('|unknown|') { return true } + if decl.receiver.typ is ast.Type && decl.receiver.typ is ast.ArrayType { + return true + } } // Check if this function was force-requested by generated code (e.g. map str functions). if g.force_emit_fn_names.len > 0 && decl.name == 'str' && decl.is_method { -- 2.39.5