v / vlib / v2 / gen / arm64 / arm64.v
9533 lines · 9175 sloc · 313.85 KB
Raw
1// Copyright (c) 2026 Alexander Medvednikov. All rights reserved.
2// Use of this source code is governed by an MIT license
3// that can be found in the LICENSE file.
4
5module arm64
6
7import v2.mir
8import v2.ssa
9import v2.types
10import encoding.binary
11import os
12import time
13
// Gen is the ARM64 code generator state. It holds the MIR module being
// compiled (`mod`), the Mach-O object being built (`macho`), per-function
// scratch state (stack map, label fix-ups, register map), module-wide lookup
// caches, cached debug-trace environment variables, and function boundary
// tables used for dead-stripping.
pub struct Gen {
pub:
	mod &mir.Module
mut:
	macho &MachOObject
pub mut:
	stack_map      map[int]int
	alloca_offsets map[int]int
	stack_size     int
	curr_offset    int

	block_offsets      []int // indexed by block_id, -1 = not yet visited
	pending_label_blks []int
	pending_label_offs []int
	// Per-block pending label index (linked list head per block id).
	// pending_head[blk_id] = first index into pending_label_offs (-1 = none).
	// pending_next[i] = next pending entry for same block (-1 = last).
	pending_head   []int // indexed by block_id, -1 = no pending labels for this block
	pending_next   []int
	func_count     int
	total_pending  int
	total_resolved int

	// Register allocation
	reg_map    map[int]int
	used_regs  []int
	next_blk   int
	cur_blk_id int // current block being generated (for phi copy emission)

	// Track which string literals have been materialized (value_id -> str_data offset)
	string_literal_offsets map[int]int

	// Cache for parsed constant integer values (value_id -> parsed i64)
	const_cache map[int]i64

	// Current function's return type (for handling struct returns)
	cur_func_ret_type int
	cur_func_name     string

	// Stack offset where x8 (indirect return pointer) is saved for large struct returns
	x8_save_offset int
	// Cache for deduplicating string data in cstring section (content -> offset)
	string_data_cache map[string]int
	// alloca values whose addresses are stored in sumtype `_data` fields and
	// therefore must outlive the current stack frame.
	sumtype_data_heap_allocas map[int]bool
	// Type layout caches/guards to avoid recursive size/alignment loops.
	type_size_cache  []int  // indexed by type_id, 0 = not cached (valid sizes are > 0 or == 0 only for void)
	type_align_cache []int  // indexed by type_id, 0 = not cached
	type_size_stack  []bool // indexed by type_id (recursion guard)
	type_align_stack []bool // indexed by type_id (recursion guard)
	// Cache for struct field offset calculations (key: typ_id << 16 | field_idx)
	struct_field_offset_cache map[int]int
	// Lookup caches for O(1) name resolution
	func_by_name   map[string]int // function name → index in g.mod.funcs
	global_by_name map[string]int // global name → index in g.mod.globals
	// Per-function cache for alloca pointer analysis (cleared per function)
	alloca_ptr_cache map[int]u8 // alloca_id → 1=has_ptrs, 2=no_ptrs
	// Cached environment variables for debug tracing (read once at init)
	env_dump_funcrefs     string
	env_trace_skip_dead   string
	env_dump_stackmap     string
	env_dump_blocks       string
	env_trace_paramspill  string
	env_trace_val         string
	env_trace_instr       string
	env_trace_cmp         string
	env_trace_store       string
	env_trace_load        string
	env_trace_call        string
	env_trace_ret         string
	env_trace_bitcast     string
	env_trace_assign      string
	env_trace_extract     string
	env_trace_struct_init string
	env_trace_agg_copy    string
	env_trace_insert      string
	env_trace_callcount   string
	env_trace_callarg     string
	env_trace_struct_addr string
	env_trace_strlit      string
	env_trace_storeval    string
	env_trace_regalloc    string
	env_no_regalloc       bool
	// SP-relative addressing: sp_base_offset = callee_saved_size + stack_size
	// so that fp - N = sp + (sp_base_offset - N) for positive sp-relative offsets.
	sp_base_offset int
	sp_adjusted    bool // true when sp is temporarily modified (call arg push)
	sp_adjust_amt  int  // how much SP was decremented (valid when sp_adjusted)
	// Last-store cache for eliminating redundant store-then-load sequences.
	// After store_reg_to_val records (reg, val_id), the next load_val_to_reg
	// for the same val_id can reuse the register instead of loading from stack.
	// Cleared at block boundaries, function calls, and any register write.
	last_store_reg            int = -1
	last_store_val            int
	last_store_blk            int = -1
	last_store_next_instr_idx int = -1
	// Current block instruction list and index, for lookahead optimizations.
	cur_blk_instrs    []int
	cur_blk_instr_idx int
	// BasicBlock is large and contains array fields. Cache the per-block
	// instruction slices so self-hosted ARM codegen does not repeatedly copy it.
	block_instrs [][]int
	// Function is large and contains array/string fields. Cache the metadata used by
	// codegen so self-hosted ARM codegen can avoid copying Function values.
	func_names            []string
	func_blocks           [][]int
	func_params           [][]int
	func_typs             []ssa.TypeID
	func_is_c_extern      []bool
	func_abi_ret_indirect []bool
	func_abi_param_class  [][]mir.AbiArgClass
	func_ref_to_func_idx  []int
	type_kinds            []ssa.TypeKind
	type_elem_types       []ssa.TypeID
	type_lens             []int
	type_is_unsigned      []bool
	// Function boundaries for dead-stripping.
	fn_starts  []int    // text offset where each function begins
	fn_ends    []int    // text offset where each function ends
	fn_names   []string // symbol name of each function
	fn_sym_ids []int    // symbol index in macho.symbols
	// Stats counters for optimization analysis.
	stats_total_stores   int
	stats_skipped_stores int
	stats_cache_hits     int
	// Profiling timers for gen_func sub-stages.
	t_setup_ms    f64
	t_prepass_ms  f64
	t_prologue_ms f64
	t_main_ms     f64
	t_regalloc_ms f64
}
147
// Gen.new creates a code generator for `mod` with a fresh MachOObject.
// The type size/alignment caches and their recursion guards are sized to the
// number of types in the module's type store, and every V2_ARM64_* debug
// environment variable is read once here so later code can consult the
// cached value instead of calling os.getenv again.
pub fn Gen.new(mod &mir.Module) &Gen {
	n_types := mod.type_store.types.len
	return &Gen{
		mod:                   mod
		macho:                 MachOObject.new()
		type_size_cache:       []int{len: n_types}
		type_align_cache:      []int{len: n_types}
		type_size_stack:       []bool{len: n_types}
		type_align_stack:      []bool{len: n_types}
		env_dump_funcrefs:     os.getenv('V2_ARM64_DUMP_FUNCREFS')
		env_trace_skip_dead:   os.getenv('V2_ARM64_TRACE_SKIP_DEAD')
		env_dump_stackmap:     os.getenv('V2_ARM64_DUMP_STACKMAP')
		env_dump_blocks:       os.getenv('V2_ARM64_DUMP_BLOCKS')
		env_trace_paramspill:  os.getenv('V2_ARM64_TRACE_PARAMSPILL')
		env_trace_val:         os.getenv('V2_ARM64_TRACE_VAL')
		env_trace_instr:       os.getenv('V2_ARM64_TRACE_INSTR')
		env_trace_cmp:         os.getenv('V2_ARM64_TRACE_CMP')
		env_trace_store:       os.getenv('V2_ARM64_TRACE_STORE')
		env_trace_load:        os.getenv('V2_ARM64_TRACE_LOAD')
		env_trace_call:        os.getenv('V2_ARM64_TRACE_CALL')
		env_trace_ret:         os.getenv('V2_ARM64_TRACE_RET')
		env_trace_bitcast:     os.getenv('V2_ARM64_TRACE_BITCAST')
		env_trace_assign:      os.getenv('V2_ARM64_TRACE_ASSIGN')
		env_trace_extract:     os.getenv('V2_ARM64_TRACE_EXTRACT')
		env_trace_struct_init: os.getenv('V2_ARM64_TRACE_STRUCT_INIT')
		env_trace_agg_copy:    os.getenv('V2_ARM64_TRACE_AGG_COPY')
		env_trace_insert:      os.getenv('V2_ARM64_TRACE_INSERT')
		env_trace_callcount:   os.getenv('V2_ARM64_TRACE_CALLCOUNT')
		env_trace_callarg:     os.getenv('V2_ARM64_TRACE_CALLARG')
		env_trace_struct_addr: os.getenv('V2_ARM64_TRACE_STRUCT_ADDR')
		env_trace_strlit:      os.getenv('V2_ARM64_TRACE_STRLIT')
		env_trace_storeval:    os.getenv('V2_ARM64_TRACE_STOREVAL')
		env_trace_regalloc:    os.getenv('V2_ARM64_TRACE_REGALLOC')
		// Any non-empty value disables register allocation.
		env_no_regalloc: os.getenv('V2_ARM64_NO_REGALLOC').len > 0
	}
}
184
// invalidate_last_store resets the last-store cache to its empty state
// (block boundary, call, etc.): no register, no value, no block, and no
// expected next-instruction index.
fn (mut g Gen) invalidate_last_store() {
	g.last_store_next_instr_idx = -1
	g.last_store_blk = -1
	g.last_store_val = 0
	g.last_store_reg = -1
}
192
// last_store_cache_enabled reports whether the last-store cache optimization
// is active. It currently always returns true; callers such as
// should_skip_store consult it before relying on the cache.
fn (g &Gen) last_store_cache_enabled() bool {
	return true
}
196
// is_cacheable_last_store_reg reports whether `reg` may be recorded in the
// last-store cache: it must be a valid (non-negative) register number other
// than 8 or 9.
// NOTE(review): 8 and 9 are presumably excluded because they are clobbered as
// scratch/indirect-result registers — confirm against the emit helpers.
fn (g &Gen) is_cacheable_last_store_reg(reg int) bool {
	if reg < 0 {
		return false
	}
	return reg !in [8, 9]
}
200
// should_skip_store decides whether the stack store for `val_id` can be
// omitted because the value will be taken from the last-store cache by the
// very next instruction of the current block.
//
// Returns 1 when the store may be skipped (the value is operand[0] of the
// next instruction), and 0 when the store must be emitted.
fn (mut g Gen) should_skip_store(val_id int) int {
	if !g.last_store_cache_enabled() {
		return 0
	}
	if val_id <= 0 || val_id >= g.mod.values.len {
		return 0
	}
	producer := g.mod.values[val_id]
	// The value must have exactly one use ...
	if producer.uses.len != 1 {
		return 0
	}
	// ... and that use must be the instruction immediately following the
	// current one in this block.
	lookahead := g.cur_blk_instr_idx + 1
	if lookahead >= g.cur_blk_instrs.len {
		return 0
	}
	consumer_id := g.cur_blk_instrs[lookahead]
	if producer.uses[0] != consumer_id {
		return 0
	}
	if consumer_id <= 0 || consumer_id >= g.mod.values.len {
		return 0
	}
	consumer := g.mod.values[consumer_id]
	if consumer.kind != .instruction {
		return 0
	}
	instr := g.mod.instrs[consumer.index]
	// Comparisons and integer conversions only qualify when the produced
	// value has a valid, non-float type.
	produces_non_float := producer.typ > 0 && producer.typ < g.mod.type_store.types.len
		&& g.mod.type_store.types[producer.typ].kind != .float_t
	// Only allow pure ops that load operands via get_operand_reg first
	// and DON'T re-load operands afterwards.
	eligible := match instr.op {
		.add, .sub, .mul, .sdiv, .udiv, .srem, .urem, .and_, .or_, .xor, .shl, .ashr, .lshr {
			true
		}
		.store, .load, .get_element_ptr {
			true
		}
		.eq, .ne, .lt, .gt, .le, .ge, .ult, .ugt, .ule, .uge, .trunc, .zext {
			produces_non_float
		}
		else {
			false
		}
	}
	if !eligible {
		return 0
	}
	if instr.operands.len == 0 {
		return 0
	}
	// Operand[0] match: skip store entirely, value stays in cache register.
	return if instr.operands[0] == val_id { 1 } else { 0 }
}
253
// gen runs the whole backend pipeline: the pre-pass, code generation for
// every function in the module, and the post-pass. Wall-clock time of each
// stage is printed to stderr; when V2_ARM64_TIME_DETAIL is set to a non-empty
// value the accumulated gen_func sub-stage timers are printed as well.
pub fn (mut g Gen) gen() {
	pre_started := time.now()
	g.gen_pre_pass()
	pre_ms := f64(time.since(pre_started)) / f64(time.millisecond)

	funcs_started := time.now()
	for func_idx in 0 .. g.mod.funcs.len {
		g.gen_func(func_idx)
	}
	funcs_ms := f64(time.since(funcs_started)) / f64(time.millisecond)

	post_started := time.now()
	g.gen_post_pass()
	post_ms := f64(time.since(post_started)) / f64(time.millisecond)

	eprintln('ARM64 gen sub: pre=${pre_ms:.1}ms funcs=${funcs_ms:.1}ms post=${post_ms:.1}ms')
	want_detail := os.getenv('V2_ARM64_TIME_DETAIL') != ''
	if want_detail {
		eprintln('ARM64 gen_func subs: setup=${g.t_setup_ms:.0}ms prepass=${g.t_prepass_ms:.0}ms prologue=${g.t_prologue_ms:.0}ms main=${g.t_main_ms:.0}ms regalloc=${g.t_regalloc_ms:.0}ms')
	}
}
271
// release_scratch_after_gen drops codegen lookup/cache tables after all machine
// code and relocations have been emitted into g.macho.
//
// It works in two phases: first every scratch map/array is explicitly freed
// inside an `unsafe` block, then each field is reassigned to a fresh empty
// container so the Gen no longer refers to the freed storage.
// g.macho itself is not touched. `func_names` and `fn_names` are neither freed
// nor reset here.
pub fn (mut g Gen) release_scratch_after_gen() {
	// Phase 1: release the backing storage.
	unsafe {
		g.stack_map.free()
		g.alloca_offsets.free()
		g.block_offsets.free()
		g.pending_label_blks.free()
		g.pending_label_offs.free()
		g.pending_head.free()
		g.pending_next.free()
		g.reg_map.free()
		g.used_regs.free()
		g.string_literal_offsets.free()
		g.const_cache.free()
		g.string_data_cache.free()
		g.sumtype_data_heap_allocas.free()
		g.type_size_cache.free()
		g.type_align_cache.free()
		g.type_size_stack.free()
		g.type_align_stack.free()
		g.struct_field_offset_cache.free()
		g.func_by_name.free()
		g.global_by_name.free()
		g.alloca_ptr_cache.free()
		g.block_instrs.free()
		g.func_blocks.free()
		g.func_params.free()
		g.func_typs.free()
		g.func_is_c_extern.free()
		g.func_abi_ret_indirect.free()
		g.func_abi_param_class.free()
		g.func_ref_to_func_idx.free()
		g.type_kinds.free()
		g.type_elem_types.free()
		g.type_lens.free()
		g.type_is_unsigned.free()
		g.fn_starts.free()
		g.fn_ends.free()
		g.fn_sym_ids.free()
	}
	// Phase 2: point every field at a fresh empty container.
	g.stack_map = map[int]int{}
	g.alloca_offsets = map[int]int{}
	g.block_offsets = []int{}
	g.pending_label_blks = []int{}
	g.pending_label_offs = []int{}
	g.pending_head = []int{}
	g.pending_next = []int{}
	g.reg_map = map[int]int{}
	g.used_regs = []int{}
	g.string_literal_offsets = map[int]int{}
	g.const_cache = map[int]i64{}
	g.string_data_cache = map[string]int{}
	g.sumtype_data_heap_allocas = map[int]bool{}
	g.type_size_cache = []int{}
	g.type_align_cache = []int{}
	g.type_size_stack = []bool{}
	g.type_align_stack = []bool{}
	g.struct_field_offset_cache = map[int]int{}
	g.func_by_name = map[string]int{}
	g.global_by_name = map[string]int{}
	g.alloca_ptr_cache = map[int]u8{}
	// cur_blk_instrs is only reset, not freed above.
	// NOTE(review): presumably because it aliases an entry of block_instrs — confirm.
	g.cur_blk_instrs = []int{}
	g.block_instrs = [][]int{}
	g.func_blocks = [][]int{}
	g.func_params = [][]int{}
	g.func_typs = []ssa.TypeID{}
	g.func_is_c_extern = []bool{}
	g.func_abi_ret_indirect = []bool{}
	g.func_abi_param_class = [][]mir.AbiArgClass{}
	g.func_ref_to_func_idx = []int{}
	g.type_kinds = []ssa.TypeKind{}
	g.type_elem_types = []ssa.TypeID{}
	g.type_lens = []int{}
	g.type_is_unsigned = []bool{}
	g.fn_starts = []int{}
	g.fn_ends = []int{}
	g.fn_sym_ids = []int{}
}
351
// gen_pre_pass registers global symbols and builds lookup caches.
// Must be called before any gen_func calls.
//
// Steps, in order:
//  1. cache per-function metadata (names, blocks, params, types, ABI info),
//  2. pre-register a `__data` symbol (section 3) for every non-external
//     global, assigning 8-byte-aligned offsets,
//  3. map each func_ref value to the index of the function it names,
//  4. cache per-type metadata (kind, element type, length, signedness),
//  5. pre-populate the type size/alignment caches,
//  6. cache the instruction list of every basic block.
//
// The offsets assigned in step 2 must match the layout that gen_post_pass
// emits, so this loop skips exactly the globals that gen_post_pass skips:
// external globals and globals whose name collides with a function name.
pub fn (mut g Gen) gen_pre_pass() {
	// Cache function metadata first so that name lookups made while
	// registering globals below can already see every function name.
	n_funcs := g.mod.funcs.len
	g.func_names = []string{len: n_funcs}
	g.func_blocks = [][]int{len: n_funcs}
	g.func_params = [][]int{len: n_funcs}
	g.func_typs = []ssa.TypeID{len: n_funcs}
	g.func_is_c_extern = []bool{len: n_funcs}
	g.func_abi_ret_indirect = []bool{len: n_funcs}
	g.func_abi_param_class = [][]mir.AbiArgClass{len: n_funcs}
	for fi := 0; fi < n_funcs; fi++ {
		func_name := g.mod.funcs[fi].name
		g.func_names[fi] = func_name
		g.func_blocks[fi] = g.mod.funcs[fi].blocks
		g.func_params[fi] = g.mod.funcs[fi].params
		g.func_typs[fi] = g.mod.funcs[fi].typ
		g.func_is_c_extern[fi] = g.mod.funcs[fi].is_c_extern
		g.func_abi_ret_indirect[fi] = g.mod.funcs[fi].abi_ret_indirect
		g.func_abi_param_class[fi] = g.mod.funcs[fi].abi_param_class
	}

	// Pre-register global symbols BEFORE generating functions
	// This ensures add_undefined() finds existing symbols instead of creating undefined ones
	mut data_offset := u64(0)
	for gi := 0; gi < g.mod.globals.len; gi++ {
		// Skip external globals (defined elsewhere, e.g. __stdoutp)
		if g.mod.globals[gi].linkage == .external {
			continue
		}
		// Skip globals that collide with function names. gen_post_pass emits no
		// data for them, so reserving an offset here would shift the symbol of
		// every following global away from its actual data.
		if g.has_function_named(g.mod.globals[gi].name) {
			continue
		}
		// Align to 8 bytes
		data_offset = (data_offset + 7) & ~7
		g.macho.add_symbol('_' + g.mod.globals[gi].name, data_offset, true, 3)
		// Globals with raw initial data occupy exactly that many bytes;
		// everything else occupies the size of its type.
		size := if g.mod.globals[gi].initial_data.len > 0 {
			g.mod.globals[gi].initial_data.len
		} else {
			g.type_size(g.mod.globals[gi].typ)
		}
		data_offset += u64(size)
	}

	// Resolve func_ref values to function indices (-1 = no matching function).
	g.func_ref_to_func_idx = []int{len: g.mod.values.len, init: -1}
	for vi := 0; vi < g.mod.values.len; vi++ {
		if g.mod.values[vi].kind == .func_ref {
			fn_name := g.mod.values[vi].name
			fi := g.find_func_idx_by_name(fn_name)
			if fi >= 0 {
				g.func_ref_to_func_idx[vi] = fi
			}
		}
	}

	// Cache per-type metadata in flat arrays indexed by type id.
	n_types := g.mod.type_store.types.len
	g.type_kinds = []ssa.TypeKind{len: n_types}
	g.type_elem_types = []ssa.TypeID{len: n_types}
	g.type_lens = []int{len: n_types}
	g.type_is_unsigned = []bool{len: n_types}
	for ti := 0; ti < n_types; ti++ {
		g.type_kinds[ti] = g.mod.type_store.types[ti].kind
		g.type_elem_types[ti] = g.mod.type_store.types[ti].elem_type
		g.type_lens[ti] = g.mod.type_store.types[ti].len
		g.type_is_unsigned[ti] = g.mod.type_store.types[ti].is_unsigned
	}

	// Pre-populate type size/align caches so parallel workers can share them read-only
	g.pre_populate_type_caches()
	// Cache each block's instruction list, indexed by block id.
	g.block_instrs = [][]int{len: g.mod.blocks.len}
	for bid := 0; bid < g.mod.blocks.len; bid++ {
		g.block_instrs[bid] = g.mod.blocks[bid].instrs
	}
}
423
// dead_strip_functions removes unreachable functions from the text section.
// Builds a call graph from relocations, marks reachable functions starting
// from _main and __v_init_consts_* roots, then compacts the text section.
//
// The function relies on g.fn_starts/g.fn_ends being sorted by text offset
// (they are binary-searched). After compaction it rewrites:
//  - relocation addresses (relocations inside dead functions are dropped),
//  - symbol values (dead function symbols are redirected to
//    ___unresolved_stub; text symbols after the last function are shifted),
//  - the fn_starts/fn_ends entries of the surviving functions,
//  - g.macho.text_data.
// It returns early, changing nothing, when there are no recorded functions or
// no dead ones.
fn (mut g Gen) dead_strip_functions() {
	if g.fn_starts.len == 0 {
		return
	}
	ds_t0 := time.now()
	n_fns := g.fn_starts.len
	// Build sym_idx → fn_idx for resolving relocation targets.
	mut sym_to_fn := map[int]int{}
	for fi := 0; fi < n_fns; fi++ {
		sym_to_fn[g.fn_sym_ids[fi]] = fi
	}
	// Build call graph from relocations using binary search on sorted fn_starts.
	mut callees := [][]int{len: n_fns}
	for reloc in g.macho.relocs {
		mut lo := 0
		mut hi := n_fns - 1
		mut src_fn := -1
		for lo <= hi {
			mid := (lo + hi) / 2
			if reloc.addr < g.fn_starts[mid] {
				hi = mid - 1
			} else if reloc.addr >= g.fn_ends[mid] {
				lo = mid + 1
			} else {
				src_fn = mid
				break
			}
		}
		// Relocation outside every recorded function: not a call graph edge.
		if src_fn < 0 {
			continue
		}
		// Only relocations that target a function symbol create an edge.
		if tgt_fn := sym_to_fn[reloc.sym_idx] {
			callees[src_fn] << tgt_fn
		}
	}
	// Mark reachable functions starting from roots via BFS.
	mut reachable := []bool{len: n_fns}
	mut worklist := []int{}
	for fi := 0; fi < n_fns; fi++ {
		name := g.fn_names[fi]
		if name == '__main' || name == '_main' || name == '_main__main'
			|| name.starts_with('___v_init_consts_')
			|| name.starts_with('_builtin____v_init_consts_') || name == '___unresolved_stub' {
			if !reachable[fi] {
				reachable[fi] = true
				worklist << fi
			}
		}
	}
	for worklist.len > 0 {
		fi := worklist.pop()
		for callee in callees[fi] {
			if !reachable[callee] {
				reachable[callee] = true
				worklist << callee
			}
		}
	}
	// Force-strip unused backend modules by name prefix.
	// When compiling for arm64, cleanc/c/eval/x64 functions are never called
	// at runtime. Their symbols redirect to ___unresolved_stub (returns 0).
	for fi2 := 0; fi2 < n_fns; fi2++ {
		if !reachable[fi2] {
			continue
		}
		n2 := g.fn_names[fi2]
		if n2.len > 1 {
			sn := n2[1..] // strip leading underscore
			if sn.starts_with('cleanc__') || sn.starts_with('c__Gen') || sn.starts_with('eval__')
				|| sn.starts_with('x64__') {
				reachable[fi2] = false
			}
		}
	}
	// Compute cumulative shift: how many bytes of dead code precede each function.
	mut fn_shift := []int{len: n_fns}
	mut cum_shift := 0
	mut dead_count := 0
	mut dead_bytes := 0
	for fi := 0; fi < n_fns; fi++ {
		fn_shift[fi] = cum_shift
		if !reachable[fi] {
			dead_count++
			fn_size := g.fn_ends[fi] - g.fn_starts[fi]
			dead_bytes += fn_size
			cum_shift += fn_size
		}
	}
	if dead_count == 0 {
		return
	}
	eprintln('ARM64 DEADSTRIP: ${dead_count} dead functions, ${dead_bytes} bytes (${dead_bytes / 1024}KB)')
	// Build compacted text section: copy prefix, kept functions, suffix.
	old_text := g.macho.text_data
	mut new_text := []u8{cap: old_text.len - dead_bytes}
	// Copy prefix bytes before first function (if any).
	if g.fn_starts[0] > 0 {
		new_text << old_text[..g.fn_starts[0]]
	}
	// Copy kept functions in order.
	for fi := 0; fi < n_fns; fi++ {
		if !reachable[fi] {
			continue
		}
		new_text << old_text[g.fn_starts[fi]..g.fn_ends[fi]]
	}
	// Copy suffix bytes after last function (e.g. unresolved stub added by gen_post_pass).
	last_end := g.fn_ends[n_fns - 1]
	if last_end < old_text.len {
		new_text << old_text[last_end..]
	}
	// Fix up relocation addresses and drop relocations inside dead functions.
	mut new_relocs := []RelocationInfo{cap: g.macho.relocs.len}
	for reloc in g.macho.relocs {
		// Binary search for source function (using original boundaries).
		mut lo := 0
		mut hi := n_fns - 1
		mut src_fn := -1
		for lo <= hi {
			mid := (lo + hi) / 2
			if reloc.addr < g.fn_starts[mid] {
				hi = mid - 1
			} else if reloc.addr >= g.fn_ends[mid] {
				lo = mid + 1
			} else {
				src_fn = mid
				break
			}
		}
		if src_fn >= 0 && !reachable[src_fn] {
			continue // drop relocations in dead functions
		}
		// Inside a kept function: shift by the dead bytes preceding it.
		// In the suffix: shift by all dead bytes. In the prefix: unchanged.
		mut new_addr := reloc.addr
		if src_fn >= 0 {
			new_addr = reloc.addr - fn_shift[src_fn]
		} else if reloc.addr >= last_end {
			new_addr = reloc.addr - cum_shift
		}
		new_relocs << RelocationInfo{
			addr:    new_addr
			sym_idx: reloc.sym_idx
			pcrel:   reloc.pcrel
			length:  reloc.length
			extern:  reloc.extern
			type_:   reloc.type_
		}
	}
	g.macho.relocs = new_relocs
	// Fix up symbol addresses.
	// Dead function symbols redirect to ___unresolved_stub (returns 0).
	mut stub_new_addr := u64(0)
	if stub_si := g.macho.sym_by_name['___unresolved_stub'] {
		// Stub is in suffix region, shift by total dead bytes.
		stub_new_addr = g.macho.symbols[stub_si].value - u64(cum_shift)
	}
	for si := 0; si < g.macho.symbols.len; si++ {
		old_val := int(g.macho.symbols[si].value)
		if fi := sym_to_fn[si] {
			if reachable[fi] {
				g.macho.symbols[si].value = u64(old_val - fn_shift[fi])
			} else {
				g.macho.symbols[si].value = stub_new_addr
			}
		} else if g.macho.symbols[si].sect == 1 && old_val >= last_end {
			// Text symbol in suffix region. The section check matters: this runs
			// before gen_post_pass rebases section 2/3 symbols, so their values
			// are still section-relative offsets and must not be compared with,
			// or shifted by, text offsets.
			g.macho.symbols[si].value = u64(old_val - cum_shift)
		}
	}
	// Update function boundaries (for any downstream use).
	// Entries of dead functions are left at their original values.
	for fi := 0; fi < n_fns; fi++ {
		if reachable[fi] {
			g.fn_starts[fi] -= fn_shift[fi]
			g.fn_ends[fi] -= fn_shift[fi]
		}
	}
	g.macho.text_data = new_text
	ds_ms := f64(time.since(ds_t0)) / f64(time.millisecond)
	eprintln('ARM64 deadstrip ms=${ds_ms:.1}')
}
606
// gen_post_pass emits the unresolved stub, global data, and patches symbol addresses.
// Must be called after all gen_func calls.
//
// Order of work: print store statistics, append the return-zero stub (also
// exported as `_tcc_backtrace`) and the `_v_os_execute_capture_start` stub to
// the text section, dead-strip unreachable functions, emit the bytes of every
// non-external global into the data section, and finally rebase section 2
// and section 3 symbol values onto their final base addresses.
pub fn (mut g Gen) gen_post_pass() {
	eprintln('ARM64 STATS: stores=${g.stats_total_stores} skipped=${g.stats_skipped_stores} cache_hits=${g.stats_cache_hits}')
	// Add return-zero stub for unresolved symbols.
	// When the linker can't resolve a symbol, it redirects calls here instead of
	// letting them jump to the Mach-O header which corrupts memory.
	stub_off := u64(g.macho.text_data.len)
	g.macho.add_symbol('___unresolved_stub', stub_off, false, 1)
	g.macho.add_symbol('_tcc_backtrace', stub_off, false, 1)
	g.emit(0xD2800000) // mov x0, #0
	g.emit(0xD2800001) // mov x1, #0
	g.emit(0xD65F03C0) // ret
	capture_off := u64(g.macho.text_data.len)
	g.macho.add_symbol('_v_os_execute_capture_start', capture_off, false, 1)
	g.emit(0x12800000) // mov w0, #-1
	g.emit(0xD65F03C0) // ret

	// Dead-strip unreachable functions.
	g.dead_strip_functions()

	// Globals in __data (Section 3) - emit actual data
	for gi in 0 .. g.mod.globals.len {
		// External globals are defined elsewhere.
		if g.mod.globals[gi].linkage == .external {
			continue
		}
		// Skip globals that collide with function names (same as pre-registration loop)
		if g.has_function_named(g.mod.globals[gi].name) {
			continue
		}
		// Every global starts on an 8-byte boundary.
		for g.macho.data_data.len % 8 != 0 {
			g.macho.data_data << 0
		}
		// Constant arrays: emit raw element data directly
		if g.mod.globals[gi].initial_data.len > 0 {
			g.macho.data_data << g.mod.globals[gi].initial_data
			continue
		}
		// Calculate actual size of the global variable based on its type.
		size := g.type_size(g.mod.globals[gi].typ)
		is_constant := g.mod.globals[gi].is_constant
		initial_value := g.mod.globals[gi].initial_value

		// Mutable global without an initial value: all zeros.
		if !is_constant && initial_value == 0 {
			for _ in 0 .. size {
				g.macho.data_data << 0
			}
			continue
		}
		// From here on the global is either a constant (with any value) or a
		// mutable global with a non-zero initial value.
		if size == 1 {
			g.macho.data_data << u8(initial_value)
		} else if size == 2 {
			mut buf := []u8{len: 2}
			binary.little_endian_put_u16(mut buf, u16(initial_value))
			g.macho.data_data << buf
		} else if size == 4 {
			mut buf := []u8{len: 4}
			binary.little_endian_put_u32(mut buf, u32(initial_value))
			g.macho.data_data << buf
		} else if size == 8 {
			mut buf := []u8{len: 8}
			binary.little_endian_put_u64(mut buf, u64(initial_value))
			g.macho.data_data << buf
		} else if !is_constant || (initial_value != 0 && size >= 8) {
			// For struct constants (e.g., sum types), emit initial_value as first 8 bytes
			// (the tag for sum types), then zeros for the rest. Mutable globals
			// with a non-zero value take this path for every remaining size.
			mut buf := []u8{len: 8}
			binary.little_endian_put_u64(mut buf, u64(initial_value))
			g.macho.data_data << buf
			for _ in 0 .. size - 8 {
				g.macho.data_data << 0
			}
		} else {
			// Constant with no usable initial value: zero-fill.
			for _ in 0 .. size {
				g.macho.data_data << 0
			}
		}
	}

	// Patch symbol addresses
	cstring_base := u64(g.macho.text_data.len)
	// Align data section to 8 bytes
	data_base := (cstring_base + u64(g.macho.str_data.len) + 7) & ~7

	for si in 0 .. g.macho.symbols.len {
		sect := g.macho.symbols[si].sect
		if sect == 2 {
			g.macho.symbols[si].value += cstring_base
		} else if sect == 3 {
			g.macho.symbols[si].value += data_base
		}
	}
}
733
// pre_populate_type_caches computes type_size and type_align for ALL types
// in the type store. Parallel workers created by new_worker_clone (which
// share the read-only MIR module and lookup caches but have their own
// MachOObject buffers for independent code emission) therefore start from
// fully populated caches and can treat them as read-only.
pub fn (mut g Gen) pre_populate_type_caches() {
	n_types := g.mod.type_store.types.len
	for type_id in 0 .. n_types {
		// Results are discarded; the calls are made to fill the caches.
		g.type_size(type_id)
		g.type_align(type_id)
	}
}
745
// block_id_from_value maps a value id to the basic block it denotes.
// Returns the block index when `val_id` is in range, refers to a value of
// kind .basic_block, and that value's index is a valid block id; returns -1
// in every other case.
fn (g &Gen) block_id_from_value(val_id int) int {
	val_in_range := val_id >= 0 && val_id < g.mod.values.len
	if val_in_range && g.mod.values[val_id].kind == .basic_block {
		blk := g.mod.values[val_id].index
		if blk >= 0 && blk < g.mod.blocks.len {
			return blk
		}
	}
	return -1
}
759
// new_worker_clone creates a new Gen instance for parallel code generation.
// The worker refers to the same MIR module, receives its own copies of the
// lookup caches and cached metadata arrays, copies the cached debug
// environment settings, and gets a fresh MachOObject for independent code
// emission. Fields not listed below keep their declared defaults.
pub fn (g &Gen) new_worker_clone() &Gen {
	// Clone all maps and arrays to avoid COW data races between threads.
	// V's map/array assignment shares internal data; concurrent reads can
	// trigger internal rehashing/COW writes that race with other threads.
	return &Gen{
		mod:                   g.mod
		macho:                 MachOObject.new()
		func_by_name:          g.func_by_name.clone()
		global_by_name:        g.global_by_name.clone()
		type_size_cache:       g.type_size_cache.clone()
		type_align_cache:      g.type_align_cache.clone()
		type_size_stack:       g.type_size_stack.clone()
		type_align_stack:      g.type_align_stack.clone()
		block_instrs:          g.block_instrs.clone()
		func_names:            g.func_names.clone()
		func_blocks:           g.func_blocks.clone()
		func_params:           g.func_params.clone()
		func_typs:             g.func_typs.clone()
		func_is_c_extern:      g.func_is_c_extern.clone()
		func_abi_ret_indirect: g.func_abi_ret_indirect.clone()
		func_abi_param_class:  g.func_abi_param_class.clone()
		func_ref_to_func_idx:  g.func_ref_to_func_idx.clone()
		type_kinds:            g.type_kinds.clone()
		type_elem_types:       g.type_elem_types.clone()
		type_lens:             g.type_lens.clone()
		type_is_unsigned:      g.type_is_unsigned.clone()
		// Debug-trace settings are plain values and are copied as-is.
		env_dump_funcrefs:     g.env_dump_funcrefs
		env_trace_skip_dead:   g.env_trace_skip_dead
		env_dump_stackmap:     g.env_dump_stackmap
		env_dump_blocks:       g.env_dump_blocks
		env_trace_paramspill:  g.env_trace_paramspill
		env_trace_val:         g.env_trace_val
		env_trace_instr:       g.env_trace_instr
		env_trace_cmp:         g.env_trace_cmp
		env_trace_store:       g.env_trace_store
		env_trace_load:        g.env_trace_load
		env_trace_call:        g.env_trace_call
		env_trace_ret:         g.env_trace_ret
		env_trace_bitcast:     g.env_trace_bitcast
		env_trace_assign:      g.env_trace_assign
		env_trace_extract:     g.env_trace_extract
		env_trace_struct_init: g.env_trace_struct_init
		env_trace_agg_copy:    g.env_trace_agg_copy
		env_trace_insert:      g.env_trace_insert
		env_trace_callcount:   g.env_trace_callcount
		env_trace_callarg:     g.env_trace_callarg
		env_trace_struct_addr: g.env_trace_struct_addr
		env_trace_strlit:      g.env_trace_strlit
		env_trace_storeval:    g.env_trace_storeval
		env_trace_regalloc:    g.env_trace_regalloc
		env_no_regalloc:       g.env_no_regalloc
	}
}
813
// merge_worker merges a parallel worker's output buffers into the main Gen.
// text_data, str_data, symbols, and relocations are concatenated with offset adjustment.
//
// Worker text and string offsets are rebased onto the current end of the main
// buffers, worker symbol indices are remapped into the main symbol table,
// function boundary records are appended for dead-stripping, and the
// store/cache statistics counters are summed.
pub fn (mut g Gen) merge_worker(w &Gen) {
	// Bases must be captured before the worker buffers are appended.
	text_base := g.macho.text_data.len
	str_base := g.macho.str_data.len

	// Append machine code and string literal data.
	g.macho.text_data << w.macho.text_data
	g.macho.str_data << w.macho.str_data

	// Merge symbols: remap worker symbol indices to main symbol table
	mut sym_remap := []int{len: w.macho.symbols.len}
	for wi, wsym in w.macho.symbols {
		// Rebase section-relative values (1 = text, 2 = string data).
		mut rebased := wsym.value
		if wsym.sect == 1 {
			rebased += u64(text_base)
		} else if wsym.sect == 2 {
			rebased += u64(str_base)
		}
		// Local symbols (L_str_*, L_cstr_*) are per-worker and must never be
		// deduplicated — each worker's L_str_0 refers to a different string literal.
		is_local := wsym.name.len > 2 && wsym.name[0] == `L` && wsym.name[1] == `_`
		// Check if symbol already exists in main (e.g., pre-registered global or extern)
		if !is_local {
			if main_idx := g.macho.sym_by_name[wsym.name] {
				// Update existing symbol with definition if this one defines it
				if wsym.type_ != 0x01 { // not N_UNDF
					g.macho.symbols[main_idx].type_ = wsym.type_
					g.macho.symbols[main_idx].sect = wsym.sect
					g.macho.symbols[main_idx].value = rebased
				}
				sym_remap[wi] = main_idx
				continue
			}
		}
		// New symbol: append its name to the string table and the symbol itself
		// to the symbol table.
		appended_idx := g.macho.symbols.len
		sym_remap[wi] = appended_idx
		name_off := g.macho.str_table.len
		g.macho.str_table << wsym.name.bytes()
		g.macho.str_table << 0
		g.macho.symbols << Symbol{
			name:     wsym.name
			type_:    wsym.type_
			sect:     wsym.sect
			desc:     wsym.desc
			value:    rebased
			name_off: name_off
		}
		// Only non-local symbols are made findable by name.
		if !is_local {
			g.macho.sym_by_name[wsym.name] = appended_idx
		}
	}

	// Merge relocations with adjusted addresses and remapped symbol indices
	for wrel in w.macho.relocs {
		g.macho.relocs << RelocationInfo{
			addr:    wrel.addr + text_base
			sym_idx: sym_remap[wrel.sym_idx]
			pcrel:   wrel.pcrel
			length:  wrel.length
			extern:  wrel.extern
			type_:   wrel.type_
		}
	}
	// Merge function boundary tracking for dead-stripping.
	for wfi in 0 .. w.fn_starts.len {
		g.fn_starts << w.fn_starts[wfi] + text_base
		g.fn_ends << w.fn_ends[wfi] + text_base
		g.fn_names << w.fn_names[wfi]
		g.fn_sym_ids << sym_remap[w.fn_sym_ids[wfi]]
	}
	// Merge stats.
	g.stats_cache_hits += w.stats_cache_hits
	g.stats_skipped_stores += w.stats_skipped_stores
	g.stats_total_stores += w.stats_total_stores
}
892
// gen_func emits machine code for the function at index `func_idx`.
//
// Phases, in order:
//  1. Bail out for an out-of-range index or a C extern function; emit a
//     single `ret` stub for a function that has no blocks.
//  2. Reset the per-function generator state (maps, label bookkeeping).
//  3. Run register allocation (`allocate_registers`).
//  4. Pre-pass: assign an fp-relative frame offset to every parameter,
//     used string literal, alloca and instruction result that needs one.
//  5. Emit the prologue: fp/lr push, callee-saved pushes, frame allocation,
//     x8 save, `main`-only setup, parameter spills, string literal init.
//  6. Emit every block in order, patching pending forward branches that
//     target the block being started.
//  7. Record the function's text range and symbol for dead-stripping.
//
// Frame slot convention used by the pre-pass: a value whose entry is
// `-slot_offset` is addressed upwards from `fp - slot_offset`, and
// `slot_offset` is then advanced so that the next value starts below it.
pub fn (mut g Gen) gen_func(func_idx int) {
	if func_idx < 0 || func_idx >= g.func_names.len {
		return
	}
	func_name := g.func_names[func_idx]
	func_blocks := g.func_blocks[func_idx]
	func_params := g.func_params[func_idx]
	func_typ := g.func_typs[func_idx]
	func_abi_param_class := g.func_abi_param_class[func_idx]
	func_abi_ret_indirect := g.func_abi_ret_indirect[func_idx]
	if g.func_is_c_extern[func_idx] {
		// C extern functions are provided by external libraries (libc, etc.).
		// Don't emit any local symbol — let the linker resolve them as undefined externals.
		return
	}
	if func_blocks.len == 0 {
		// Emit a minimal stub: just a ret instruction.
		fn_start := g.macho.text_data.len
		g.curr_offset = fn_start
		sym_name := '_' + func_name
		sym_idx := g.macho.add_symbol(sym_name, u64(fn_start), false, 1)
		g.emit(0xd65f03c0) // ret
		g.fn_starts << fn_start
		g.fn_ends << g.macho.text_data.len
		g.fn_names << sym_name
		g.fn_sym_ids << sym_idx
		return
	}

	// ---- Phase 2: reset per-function state ----
	tf_setup := time.now()
	g.curr_offset = g.macho.text_data.len
	g.stack_map.clear()
	g.alloca_offsets.clear()
	g.alloca_ptr_cache.clear()
	// Reuse block_offsets and pending_head arrays, grow if needed, only reset
	// entries that can still contain state from the previous function.
	n_blks := g.mod.blocks.len
	if g.block_offsets.len < n_blks {
		g.block_offsets = []int{len: n_blks}
		g.pending_head = []int{len: n_blks}
		// Fresh allocation needs full -1 init.
		for bo_idx := 0; bo_idx < n_blks; bo_idx++ {
			g.block_offsets[bo_idx] = -1
			g.pending_head[bo_idx] = -1
		}
	} else {
		// Only reset blocks belonging to this function.
		for fbi := 0; fbi < func_blocks.len; fbi++ {
			bid := func_blocks[fbi]
			if bid >= 0 && bid < g.block_offsets.len {
				g.block_offsets[bid] = -1
			}
		}
		// pending_label_blks still holds the previous function's entries here
		// (it is cleared just below), so this clears exactly the heads it set.
		for pi := 0; pi < g.pending_label_blks.len; pi++ {
			prev_blk := g.pending_label_blks[pi]
			if prev_blk >= 0 && prev_blk < g.pending_head.len {
				g.pending_head[prev_blk] = -1
			}
		}
	}
	g.pending_label_blks.clear()
	g.pending_label_offs.clear()
	g.pending_next.clear()
	g.func_count++
	g.total_pending = 0
	g.total_resolved = 0
	g.reg_map.clear()
	g.used_regs.clear()
	g.string_literal_offsets.clear()
	g.const_cache.clear()
	g.sumtype_data_heap_allocas.clear()
	g.cur_func_ret_type = func_typ
	g.cur_func_name = func_name
	g.x8_save_offset = 0
	g.mark_sumtype_data_heap_allocas(func_idx)

	// ---- Phase 3: register allocation ----
	tf_regalloc := time.now()
	g.t_setup_ms += f64(time.since(tf_setup)) / f64(time.millisecond)
	g.allocate_registers(func_idx)
	tf_prepass := time.now()
	g.t_regalloc_ms += f64(time.since(tf_regalloc)) / f64(time.millisecond)

	// Optional debug dump, enabled when env_dump_funcrefs is '*' or equals
	// this function's name.
	if g.env_dump_funcrefs.len > 0
		&& (g.env_dump_funcrefs == '*' || func_name == g.env_dump_funcrefs) {
		eprintln('ARM64 FUNCREFS fn=${func_name} begin')
		for i, vv in g.mod.values {
			if vv.kind != .func_ref {
				continue
			}
			if vv.name.contains('cleanc__Gen__expr') {
				vv_typ := vv.typ.str()
				eprintln('ARM64 FUNCREF val=${i} name=${vv.name} typ=${vv_typ}')
			}
		}
		for f in g.mod.funcs {
			if f.name.contains('cleanc__Gen__expr') {
				f_typ := f.typ.str()
				eprintln('ARM64 FUNCDECL id=${f.id} name=${f.name} typ=${f_typ} params_len=${f.params.len}')
			}
		}
		eprintln('ARM64 FUNCREFS fn=${func_name} end')
	}

	// ---- Phase 4: frame layout pre-pass ----
	// Check if function requires indirect return pointer preservation in x8.
	fn_ret_typ := g.mod.type_store.types[func_typ]
	fn_ret_size := g.type_size(func_typ)
	needs_x8_save := func_abi_ret_indirect || (fn_ret_typ.kind == .struct_t && fn_ret_size > 16)

	// Callee-saved registers are pushed at [fp - 8], [fp - 16], etc.
	// We need to account for this when computing stack offsets.
	// They are pushed in pairs, so an odd count is rounded up to a full pair.
	callee_saved_size := ((g.used_regs.len + 1) / 2) * 16

	// Stack Frame - start after callee-saved register area
	mut slot_offset := 8 + callee_saved_size

	// If function returns large struct, reserve slot for saving x8
	if needs_x8_save {
		g.x8_save_offset = -slot_offset
		slot_offset += 8
	}

	for pi, pid in func_params {
		// For struct parameters, allocate full struct size on the stack.
		// On ARM64, structs > 16 bytes are passed by pointer (indirect),
		// and structs 9-16 bytes are passed in 2 consecutive registers.
		param_typ := g.mod.values[pid].typ
		param_type_info := g.mod.type_store.types[param_typ]
		param_size := g.type_size(param_typ)
		is_indirect_param := pi < func_abi_param_class.len && func_abi_param_class[pi] == .indirect
		if is_indirect_param || (param_type_info.kind == .struct_t && param_size > 16) {
			// Align to 16 bytes and allocate full struct size
			slot_offset = (slot_offset + 15) & ~0xF
			slot_offset += param_size
			g.stack_map[pid] = -slot_offset
			// Reserve one more scalar slot so following values do not overlap
			// with the first field at the base offset.
			slot_offset += 8
		} else if param_type_info.kind == .struct_t && param_size > 8 {
			// Small struct (9-16 bytes) passed in 2 registers - allocate full size
			slot_offset = (slot_offset + 7) & ~0x7
			slot_offset += param_size
			g.stack_map[pid] = -slot_offset
			slot_offset += 8
		} else {
			g.stack_map[pid] = -slot_offset
			slot_offset += 8
		}
	}

	// Pre-pass: find string_literal values used in this function and allocate stack for them
	mut used_string_literals := map[int]bool{}
	for blk_id in func_blocks {
		blk := g.mod.blocks[blk_id]
		for val_id in blk.instrs {
			val := g.mod.values[val_id]
			if val.kind != .instruction {
				continue
			}
			instr := g.mod.instrs[val.index]
			// Check all operands for string_literal references. This also covers
			// a `ret` that returns a string literal directly, because the
			// returned value is operand 0 of that instruction.
			for op in instr.operands {
				op_val := g.mod.values[op]
				if op_val.kind == .string_literal {
					used_string_literals[op] = true
				}
			}
		}
	}

	// Allocate stack slots for used string_literal values
	for str_lit_id, _ in used_string_literals {
		mut str_size := g.type_size(g.mod.values[str_lit_id].typ)
		if str_size <= 0 {
			str_size = 24
		}
		slot_offset = (slot_offset + 15) & ~0xF
		slot_offset += str_size
		g.stack_map[str_lit_id] = -slot_offset
		// Keep subsequent scalar slots below the aggregate base.
		slot_offset += 8
	}

	trace_skip_dead := g.env_trace_skip_dead.len > 0
		&& (g.env_trace_skip_dead == '*' || func_name == g.env_trace_skip_dead)

	for i, blk_id in func_blocks {
		g.next_blk = if i + 1 < func_blocks.len { func_blocks[i + 1] } else { -1 }
		blk := g.mod.blocks[blk_id]
		for val_id in blk.instrs {
			val := g.mod.values[val_id]
			if val.kind != .instruction {
				continue
			}
			instr := g.mod.instrs[val.index]
			opcode := g.selected_opcode(instr)
			// Phi lowering can leave placeholder bitcasts/copies that are fully dead.
			// Do not reserve stack slots for these values; in large recursive
			// functions this can cause pathological frame growth and stack overflow.
			if val.uses.len == 0 {
				if opcode == .bitcast && instr.operands.len == 0 {
					if trace_skip_dead {
						eprintln('ARM64 SKIP_DEAD fn=${func_name} val=${val_id} op=bitcast ops_len=${instr.operands.len} uses_len=${val.uses.len}')
					}
					continue
				}
				if opcode == .assign {
					if trace_skip_dead {
						eprintln('ARM64 SKIP_DEAD fn=${func_name} val=${val_id} op=assign ops_len=${instr.operands.len} uses_len=${val.uses.len}')
					}
					continue
				}
			}

			if instr.op == .alloca {
				if val_id !in g.sumtype_data_heap_allocas {
					// Calculate allocation size based on the type
					// The alloca result type is ptr(T), so get the element type
					ptr_type := g.mod.type_store.types[val.typ]
					elem_size := g.type_size(ptr_type.elem_type)
					mut alloc_size := if elem_size > 0 { elem_size } else { 8 }
					// Check for array alloca: operand[0] is element count
					if instr.operands.len > 0 {
						count_val := g.mod.values[instr.operands[0]]
						count := int(parse_const_int_literal(count_val.name))
						if count > 1 {
							// Scale the (already defaulted) element size so that an
							// unknown element size cannot yield a zero or negative
							// reservation for an array alloca.
							alloc_size *= count
						}
					}

					// Align to 16 bytes.
					slot_offset = (slot_offset + 15) & ~0xF
					slot_offset += alloc_size
					g.alloca_offsets[val_id] = -slot_offset

					// Ensure the next instruction does not use the slot
					// overlapping with the base of the alloca data.
					slot_offset += 8
				}
				// No `continue`: the alloca result itself still falls through to
				// the generic slot assignment below.
			}

			if instr.op == .inline_string_init {
				// Reserve payload bytes plus a separate pointer slot.
				// The payload size follows the lowered string type layout.
				mut string_size := g.type_size(instr.typ)
				if string_size <= 0 {
					string_size = 24
				}
				slot_offset = (slot_offset + 15) & ~0xF
				slot_offset += string_size // struct data
				slot_offset += 8 // pointer slot (separate from struct)
				g.stack_map[val_id] = -slot_offset
				// Step past the pointer slot. Scalar values are placed at
				// -slot_offset, so without this the next scalar would be given
				// the very same offset as this value.
				slot_offset += 8
				continue
			}

			if instr.op == .insertvalue || instr.op == .struct_init {
				// Tuple/struct needs full ABI size, not just fields.len * 8.
				tuple_typ := g.mod.type_store.types[instr.typ]
				mut tuple_size := g.type_size(instr.typ)
				if tuple_size <= 0 {
					tuple_size = tuple_typ.fields.len * 8
				}
				slot_offset = (slot_offset + 15) & ~0xF
				slot_offset += tuple_size
				g.stack_map[val_id] = -slot_offset
				// Keep following scalar slots from overlapping field 0.
				slot_offset += 8
				continue
			}

			// Keep full stack storage for struct/array values so aggregate copies have
			// stable backing bytes even when values are register-allocated.
			val_typ := g.mod.type_store.types[val.typ]
			if val_typ.kind == .struct_t || val_typ.kind == .array_t {
				mut struct_size := g.type_size(val.typ)
				if struct_size <= 0 {
					struct_size = if val_typ.fields.len > 0 { val_typ.fields.len * 8 } else { 8 }
				}
				// Some large aggregate producers are represented as data pointers in stack
				// slots (one word), not inline bytes. Reserve pointer-sized storage for
				// those values to avoid pathological frame growth in recursive functions.
				if val_typ.kind == .struct_t && struct_size > 16
					&& g.large_struct_stack_value_is_pointer(val_id) {
					g.stack_map[val_id] = -slot_offset
					slot_offset += 8
					continue
				}
				slot_offset = (slot_offset + 15) & ~0xF
				slot_offset += struct_size
				g.stack_map[val_id] = -slot_offset
				// Keep following scalar slots below the aggregate base.
				slot_offset += 8
				continue
			}

			// From here on val.typ is neither a struct nor an array: those
			// results were fully handled (and skipped) by the branch above.
			if instr.op == .call {
				// Check if call returns a tuple
				result_typ := g.mod.type_store.types[val.typ]
				mut is_multi_reg_call := result_typ.kind == .struct_t && result_typ.fields.len > 1
				mut call_tuple_size := g.type_size(val.typ)
				// Also check callee's registered return type
				if !is_multi_reg_call && instr.operands.len > 0 {
					callee_idx := g.func_idx_from_ref_value(instr.operands[0])
					if callee_idx >= 0 && callee_idx < g.func_typs.len {
						callee_typ := g.func_typs[callee_idx]
						callee_ret_typ := g.mod.type_store.types[callee_typ]
						callee_ret_size := g.type_size(callee_typ)
						if callee_ret_typ.kind == .struct_t && callee_ret_size > 8
							&& callee_ret_size <= 16 {
							is_multi_reg_call = true
							call_tuple_size = callee_ret_size
						}
					}
				}
				if is_multi_reg_call {
					if call_tuple_size <= 0 {
						call_tuple_size = 16
					}
					slot_offset = (slot_offset + 15) & ~0xF
					slot_offset += call_tuple_size
					g.stack_map[val_id] = -slot_offset
					// Keep following scalar slots below the aggregate base.
					slot_offset += 8
					continue
				}
			} else if instr.op == .call_sret {
				// call_sret returns an aggregate indirectly into the destination slot.
				result_typ := g.mod.type_store.types[val.typ]
				if result_typ.kind == .struct_t {
					result_size := g.type_size(val.typ)
					slot_offset = (slot_offset + 15) & ~0xF
					slot_offset += result_size
					g.stack_map[val_id] = -slot_offset
					// Keep following scalar slots below the aggregate base.
					slot_offset += 8
					continue
				}
			}

			// Skip stack slot for callee-saved register values (always in reg_map).
			if val_id in g.reg_map && g.reg_map[val_id] != 0xFF {
				continue
			}
			// Align to 8 bytes before assigning scalar slot so that
			// SP-relative scaled-immediate addressing always works.
			slot_offset = (slot_offset + 7) & ~0x7
			g.stack_map[val_id] = -slot_offset
			slot_offset += 8
		}
	}

	// Round the frame up to a multiple of 16 bytes (always adds some slack).
	g.stack_size = (slot_offset + 16) & ~0xF

	// Optional debug dump of the computed frame layout.
	if g.env_dump_stackmap.len > 0
		&& (g.env_dump_stackmap == '*' || func_name == g.env_dump_stackmap) {
		eprintln('ARM64 FRAME ${func_name} stack_size=${g.stack_size} x8_save_offset=${g.x8_save_offset}')
		eprintln('ARM64 STACKMAP ${func_name} begin')
		for vid, off in g.stack_map {
			mut typ_kind := 'na'
			mut typ_size := 0
			mut typ_id := ssa.TypeID(0)
			mut typ_desc := ''
			mut kind := 'na'
			mut name := ''
			mut op := 'na'
			mut blk := ssa.BlockID(-1)
			mut operands := ''
			mut uses := ''
			if vid > 0 && vid < g.mod.values.len {
				vv := g.mod.values[vid]
				kind = '${vv.kind}'
				name = vv.name
				if vv.typ > 0 && vv.typ < g.mod.type_store.types.len {
					typ_id = vv.typ
					typ := g.mod.type_store.types[vv.typ]
					typ_kind = '${typ.kind}'
					typ_size = g.type_size(vv.typ)
					if typ.kind == .struct_t {
						typ_desc = 'fields_len=${typ.field_names.len} ftypes_len=${typ.fields.len}'
					} else if typ.kind == .ptr_t {
						typ_desc = 'elem=${typ.elem_type}'
					}
				}
				if vv.kind == .instruction {
					instr := g.mod.instrs[vv.index]
					op = '${g.selected_opcode(instr)}'
					blk = instr.block
					operands = 'len=${instr.operands.len}'
				}
				uses = 'len=${vv.uses.len}'
			}
			eprintln('ARM64 STACKMAP ${func_name} val=${vid} off=${off} kind=${kind} blk=${blk} op=${op} ops=${operands} uses=${uses} typ=${typ_id}/${typ_kind} size=${typ_size} tdesc=`${typ_desc}` name=`${name}`')
		}
		for vid, off in g.alloca_offsets {
			mut elem_typ_id := ssa.TypeID(0)
			mut elem_typ_kind := 'na'
			mut elem_typ_size := 0
			mut alloca_ops := ''
			if vid > 0 && vid < g.mod.values.len {
				vv := g.mod.values[vid]
				if vv.kind == .instruction {
					instr := g.mod.instrs[vv.index]
					alloca_ops = 'len=${instr.operands.len}'
					if vv.typ > 0 && vv.typ < g.mod.type_store.types.len {
						typ := g.mod.type_store.types[vv.typ]
						if typ.kind == .ptr_t && typ.elem_type > 0
							&& typ.elem_type < g.mod.type_store.types.len {
							elem_typ_id = typ.elem_type
							elem_typ := g.mod.type_store.types[elem_typ_id]
							elem_typ_kind = '${elem_typ.kind}'
							elem_typ_size = g.type_size(elem_typ_id)
						}
					}
				}
			}
			eprintln('ARM64 ALLOCA ${func_name} val=${vid} off=${off} ops=${alloca_ops} elem=${elem_typ_id}/${elem_typ_kind} size=${elem_typ_size}')
		}
		eprintln('ARM64 STACKMAP ${func_name} end')
	}
	// Optional debug dump of the function's blocks and instructions.
	if g.env_dump_blocks.len > 0 && (g.env_dump_blocks == '*' || func_name == g.env_dump_blocks) {
		eprintln('ARM64 BLOCKS ${func_name} begin')
		for bi, blk_id in func_blocks {
			blk := g.mod.blocks[blk_id]
			eprintln('ARM64 BLOCK ${func_name} order=${bi} id=${blk_id} val=${blk.val_id} preds_len=${blk.preds.len} succs_len=${blk.succs.len} instrs_len=${blk.instrs.len}')
			for val_id in blk.instrs {
				if val_id <= 0 || val_id >= g.mod.values.len {
					continue
				}
				val := g.mod.values[val_id]
				mut op := 'na'
				mut operands := '[]'
				if val.kind == .instruction {
					instr := g.mod.instrs[val.index]
					op = '${g.selected_opcode(instr)}'
					operands = 'len=${instr.operands.len}'
				}
				mut callee_info := ''
				if val.kind == .instruction {
					instr := g.mod.instrs[val.index]
					opcode := g.selected_opcode(instr)
					if opcode in [.call, .call_indirect, .call_sret] && instr.operands.len > 0 {
						callee_id := instr.operands[0]
						if callee_id > 0 && callee_id < g.mod.values.len {
							callee_val := g.mod.values[callee_id]
							callee_info = ' callee=${callee_id}:${callee_val.kind}:${callee_val.name}:${callee_val.typ}'
						} else {
							callee_info = ' callee=${callee_id}:invalid'
						}
					}
				}
				eprintln('ARM64 BLOCK INSTR ${func_name} blk=${blk_id} val=${val_id} kind=${val.kind} op=${op} ops=${operands} uses_len=${val.uses.len}${callee_info}')
			}
		}
		eprintln('ARM64 BLOCKS ${func_name} end')
	}

	// ---- Phase 5: symbol + prologue ----
	fn_sym_name := '_' + func_name
	fn_sym_idx := g.macho.add_symbol(fn_sym_name, u64(g.curr_offset), true, 1)
	fn_start_off := g.macho.text_data.len

	tf_prologue := time.now()
	g.t_prepass_ms += f64(time.since(tf_prepass)) / f64(time.millisecond)
	// Prologue
	g.emit(asm_stp_fp_lr_pre())
	g.emit(asm_mov_fp_sp())

	// Save callee-saved regs (pushed below fp using pre-decrement)
	for i := 0; i < g.used_regs.len; i += 2 {
		r1 := g.used_regs[i]
		mut r2 := 31 // xzr
		if i + 1 < g.used_regs.len {
			r2 = g.used_regs[i + 1]
		}
		g.emit(asm_stp_pair_pre(Reg(r1), Reg(r2)))
	}

	g.emit_sub_sp(g.stack_size)

	// Compute sp_base_offset for sp-relative addressing.
	// sp = fp - callee_saved_size - stack_size, so fp - N = sp + (sp_base_offset + N).
	g.sp_base_offset = callee_saved_size + g.stack_size
	g.sp_adjusted = false
	g.sp_adjust_amt = 0

	// Save x8 if this function returns a large struct
	// x8 contains the indirect return pointer from the caller
	// Save it at a fixed offset from fp (below callee-saved registers)
	if g.x8_save_offset != 0 {
		g.emit_str_reg_offset(8, 29, g.x8_save_offset)
	}

	// The Mach-O LC_MAIN entrypoint invokes `main` with C-style argc/argv in
	// x0/x1. Persist them to builtin globals so `os.args` / `arguments()` work.
	if func_name == 'main' {
		g.store_entry_arg_to_global(0, 'builtin__g_main_argc', 4)
		g.store_entry_arg_to_global(1, 'builtin__g_main_argv', 8)
		g.store_entry_arg_to_global(0, 'g_main_argc', 4)
		g.store_entry_arg_to_global(1, 'g_main_argv', 8)
		// Call _vinit to initialize dynamic array constants
		g.emit_call_to_named_fn('_vinit')
	}

	// Spill params
	// ARM64 ABI: args in x0..x7, args 8+ on caller stack.
	// Struct params ≤ 16 bytes occupy ceil(size/8) consecutive registers.
	// Struct params > 16 bytes are passed by pointer (one register).
	// ARM64 ABI: integer params in x0-x7, float params in d0-d7 (independent allocation)
	mut reg_idx := 0
	mut float_reg_idx := 0
	trace_paramspill := g.env_trace_paramspill.len > 0
		&& (g.env_trace_paramspill == '*' || func_name == g.env_trace_paramspill)
	for i, pid in func_params {
		param_typ := g.mod.values[pid].typ
		param_type_info := g.mod.type_store.types[param_typ]
		param_size := g.type_size(param_typ)
		is_indirect_param := i < func_abi_param_class.len && func_abi_param_class[i] == .indirect
		if trace_paramspill {
			eprintln('ARM64 PARAMSPILL fn=${func_name} idx=${i} pid=${pid} typ=${param_typ} kind=${int(param_type_info.kind)} size=${param_size} reg_idx=${reg_idx} indirect=${is_indirect_param}')
		}

		// Float parameters arrive in d-registers; move to x-register for storage
		if param_type_info.kind == .float_t && !is_indirect_param {
			// NOTE(review): a float parameter beyond the eighth is counted but
			// never spilled here, so its slot is left unwritten — confirm
			// whether stack-passed floats are handled elsewhere.
			if float_reg_idx < 8 {
				// fmov xN, dN to get float bits into integer register
				g.emit(asm_fmov_x_d(Reg(9), float_reg_idx))
				offset := g.stack_map[pid]
				g.emit_str_reg_offset(9, 29, offset)
				if pid in g.reg_map {
					g.emit_mov_reg(g.reg_map[pid], 9)
				}
			}
			float_reg_idx++
			continue
		}

		// Arguments past the eighth integer register are read from the
		// caller's frame at [fp + 16 + 8 * (index - 8)] into scratch x9.
		mut src_reg := reg_idx
		if reg_idx >= 8 {
			stack_arg_off := 16 + ((reg_idx - 8) * 8)
			g.emit_ldr_reg_offset(9, 29, stack_arg_off)
			src_reg = 9
		}

		// For large struct parameters (> 16 bytes), the argument value is a pointer.
		// Copy pointed struct bytes into the function-local spill slot.
		if is_indirect_param || (param_type_info.kind == .struct_t && param_size > 16) {
			if src_reg != 9 {
				g.emit_mov_reg(9, src_reg)
			}
			offset := g.stack_map[pid]
			// Copy in 8-byte words: x9 is the source pointer, x10 the scratch.
			num_fields := (param_size + 7) / 8
			for field_idx in 0 .. num_fields {
				g.emit(asm_ldr_imm(Reg(10), Reg(9), u32(field_idx)))
				g.emit_str_reg_offset(10, 29, offset + field_idx * 8)
			}
			// Large/indirect params are represented as addresses in registers.
			// Materialize the local spill address for any register-allocated uses.
			if pid in g.reg_map {
				g.emit_add_fp_imm(g.reg_map[pid], offset)
			}
			reg_idx += 1
		} else if param_type_info.kind == .struct_t && param_size > 8 {
			// Small struct (9-16 bytes) passed in 2 consecutive registers.
			offset := g.stack_map[pid]
			num_regs := (param_size + 7) / 8
			if trace_paramspill {
				eprintln('ARM64 PARAMSPILL fn=${func_name} idx=${i} mode=small_struct offset=${offset} num_regs=${num_regs}')
			}
			for ri in 0 .. num_regs {
				cur_reg := reg_idx + ri
				if cur_reg >= 8 {
					stack_arg_off := 16 + ((cur_reg - 8) * 8)
					g.emit_ldr_reg_offset(9, 29, stack_arg_off)
					g.emit_str_reg_offset(9, 29, offset + ri * 8)
				} else {
					g.emit_str_reg_offset(cur_reg, 29, offset + ri * 8)
				}
			}
			if pid in g.reg_map {
				g.emit_add_fp_imm(g.reg_map[pid], offset)
			}
			reg_idx += num_regs
		} else if pid in g.reg_map {
			// Scalar with an allocated register: spill to its slot and also
			// copy it into the allocated register.
			reg := g.reg_map[pid]
			offset := g.stack_map[pid]
			g.emit_str_reg_offset(src_reg, 29, offset)
			if reg != src_reg {
				g.emit_mov_reg(reg, src_reg)
			}
			reg_idx += 1
		} else {
			offset := g.stack_map[pid]
			g.emit_str_reg_offset(src_reg, 29, offset)
			reg_idx += 1
		}
	}

	// Run SSA lowered global initializers before entering user main.
	// This mirrors the C backend behavior where __v2_global_init() is invoked from main.
	if func_name == 'main' && g.has_function_named('__v2_global_init') {
		sym_idx := g.macho.add_undefined('_' + '__v2_global_init')
		g.macho.add_reloc(g.macho.text_data.len, sym_idx, arm64_reloc_branch26, true)
		g.emit(asm_bl_reloc())
	}

	// Materialize all string literals unconditionally in the function prologue.
	// Relying on first-use codegen order can leave literal slots uninitialized when
	// control flow reaches a "reuse" site before the first emitted init site.
	// The ids are sorted so the emission order does not depend on map order.
	mut lit_ids := []int{}
	for lit_id, _ in used_string_literals {
		lit_ids << lit_id
	}
	lit_ids.sort(a < b)
	for lit_id in lit_ids {
		g.load_val_to_reg(8, lit_id)
	}

	// ---- Phase 6: emit blocks ----
	tf_main := time.now()
	g.t_prologue_ms += f64(time.since(tf_prologue)) / f64(time.millisecond)
	for i := 0; i < func_blocks.len; i++ {
		g.invalidate_last_store()
		blk_id := int(func_blocks[i])
		g.next_blk = if i + 1 < func_blocks.len { int(func_blocks[i + 1]) } else { -1 }
		g.cur_blk_id = blk_id
		// Block offsets are stored relative to the start of this function.
		g.block_offsets[blk_id] = g.macho.text_data.len - g.curr_offset

		// Resolve pending forward branches that target this block via the
		// per-block linked list (head in pending_head, next-pointers in pending_next).
		mut pi := if blk_id >= 0 && blk_id < g.pending_head.len {
			g.pending_head[blk_id]
		} else {
			-1
		}
		// pending_guard bounds the walk so a corrupted (cyclic) list cannot
		// loop forever.
		mut pending_guard := 0
		for pi != -1 {
			if pi < 0 || pi >= g.pending_label_offs.len || pi >= g.pending_next.len
				|| pending_guard > g.pending_label_offs.len {
				break
			}
			off := g.pending_label_offs[pi]
			target := g.block_offsets[blk_id]
			// Branch displacement in instructions (4 bytes each).
			rel := (target - off) / 4
			abs_off := g.curr_offset + off
			instr := g.read_u32(abs_off)

			mut new_instr := u32(0)
			// Check for CBZ (0xB4...) / CBNZ (0xB5...) vs B (0x14...) vs B.cond (0x54...)
			if (instr & 0xFE000000) == 0xB4000000 {
				// CBZ / CBNZ (both use imm19 at bits [23:5])
				new_instr = (instr & 0xFF00001F) | ((u32(rel) & 0x7FFFF) << 5)
			} else if (instr & 0xFC000000) == 0x14000000 {
				// B imm26
				new_instr = (instr & 0xFC000000) | (u32(rel) & 0x3FFFFFF)
			} else {
				// B.cond
				new_instr = (instr & 0xFF00001F) | ((u32(rel) & 0x7FFFF) << 5)
			}
			g.write_u32(abs_off, new_instr)
			g.total_resolved++
			pi = g.pending_next[pi]
			pending_guard++
		}

		g.cur_blk_instrs = if blk_id >= 0 && blk_id < g.block_instrs.len {
			g.block_instrs[blk_id]
		} else {
			[]int{}
		}
		for instr_idx, val_id in g.cur_blk_instrs {
			g.cur_blk_instr_idx = instr_idx
			g.gen_instr(val_id)
		}
	}

	// ---- Phase 7: record function boundaries ----
	g.t_main_ms += f64(time.since(tf_main)) / f64(time.millisecond)
	g.fn_starts << fn_start_off
	g.fn_ends << g.macho.text_data.len
	g.fn_names << fn_sym_name
	g.fn_sym_ids << fn_sym_idx
}
1574
1575fn (mut g Gen) gen_instr(val_id int) {
1576 instr_idx := g.mod.values[val_id].index
1577 if instr_idx < 0 || instr_idx >= g.mod.instrs.len {
1578 return
1579 }
1580 instr := g.mod.instrs[instr_idx]
1581 instr_operands := instr.operands
1582 op := g.selected_opcode(instr)
1583 trace_val := g.env_trace_val.len > 0
1584 && (g.env_trace_val == '*' || g.cur_func_name == g.env_trace_val)
1585 if trace_val {
1586 eprintln('ARM64 VAL fn=${g.cur_func_name} val=${val_id} opi=${int(op)} off=${g.macho.text_data.len - g.curr_offset} ops_len=${instr_operands.len}')
1587 }
1588 trace_instr := g.env_trace_instr.len > 0
1589 && (g.env_trace_instr == '*' || g.cur_func_name == g.env_trace_instr)
1590 if trace_instr {
1591 typ_id := g.mod.values[val_id].typ
1592 mut kind := ssa.TypeKind.void_t
1593 mut width := 0
1594 mut is_unsigned := false
1595 if typ_id > 0 && typ_id < g.mod.type_store.types.len {
1596 typ := g.mod.type_store.types[typ_id]
1597 kind = typ.kind
1598 width = typ.width
1599 is_unsigned = typ.is_unsigned
1600 }
1601 eprintln('ARM64 INSTR fn=${g.cur_func_name} val=${val_id} op=${op} orig=${instr.op} typ=${typ_id} kind=${kind} width=${width} unsigned=${is_unsigned} ops_len=${instr_operands.len}')
1602 }
1603 if op == .store && g.try_emit_simple_scalar_store(instr_idx) {
1604 return
1605 }
1606 match op {
1607 .fadd, .fsub, .fmul, .fdiv, .frem {
1608 // Float operations using scalar SIMD instructions (d0-d7)
1609 dest_reg := g.get_dest_reg(val_id)
1610
1611 // For now, load operands as float constants or from memory
1612 // Load LHS to d0
1613 g.load_float_operand(instr.operands[0], 0) // d0
1614 // Load RHS to d1
1615 g.load_float_operand(instr.operands[1], 1) // d1
1616
1617 // Perform float operation: result in d0
1618 match op {
1619 .fadd {
1620 g.emit(asm_fadd_d0_d0_d1())
1621 }
1622 .fsub {
1623 g.emit(asm_fsub_d0_d0_d1())
1624 }
1625 .fmul {
1626 g.emit(asm_fmul_d0_d0_d1())
1627 }
1628 .fdiv {
1629 g.emit(asm_fdiv_d0_d0_d1())
1630 }
1631 .frem {
1632 // No single instruction for frem on ARM64
1633 // Use: d0 = d0 - trunc(d0/d1) * d1
1634 g.emit(asm_fdiv_d2_d0_d1())
1635 g.emit(asm_frintz_d2())
1636 g.emit(asm_fnmsub_d0_d2_d1_d0())
1637 }
1638 else {}
1639 }
1640
1641 // Convert d0 result back to integer register for storage
1642 // Store the float bits in the result (for later int() conversion)
1643 g.emit(asm_fmov_x_d(Reg(dest_reg), 0))
1644
1645 g.store_reg_to_val(dest_reg, val_id)
1646 }
1647 .fptosi {
1648 // Float to signed integer conversion
1649 dest_reg := g.get_dest_reg(val_id)
1650
1651 // Load float operand to d0
1652 g.load_float_operand(instr.operands[0], 0)
1653
1654 // FCVTZS Xd, Dn (convert to signed int, truncate toward zero)
1655 g.emit(asm_fcvtzs_x_d(Reg(dest_reg), 0))
1656
1657 g.store_reg_to_val(dest_reg, val_id)
1658 }
1659 .sitofp {
1660 // Signed integer to float conversion
1661 dest_reg := g.get_dest_reg(val_id)
1662
1663 // Load integer operand to x8
1664 src_reg := g.get_operand_reg(instr.operands[0], 8)
1665
1666 // Check if target is f32
1667 result_is_f32 := g.mod.values[val_id].typ > 0
1668 && g.mod.values[val_id].typ < g.mod.type_store.types.len
1669 && g.mod.type_store.types[g.mod.values[val_id].typ].kind == .float_t
1670 && g.mod.type_store.types[g.mod.values[val_id].typ].width == 32
1671
1672 // SCVTF Dd, Xn (convert signed int to double)
1673 g.emit(asm_scvtf_d_x(0, Reg(src_reg)))
1674
1675 if result_is_f32 {
1676 // Convert f64→f32 and move to integer register as 32-bit pattern
1677 g.emit(asm_fcvt_s_d(0, 0))
1678 g.emit(asm_fmov_w_s(Reg(dest_reg), 0))
1679 } else {
1680 // FMOV Xd, D0 (copy f64 bit pattern to integer reg for storage)
1681 g.emit(asm_fmov_x_d(Reg(dest_reg), 0))
1682 }
1683
1684 g.store_reg_to_val(dest_reg, val_id)
1685 }
1686 .uitofp {
1687 // Unsigned integer to float conversion
1688 dest_reg := g.get_dest_reg(val_id)
1689
1690 // Load integer operand to x8
1691 src_reg := g.get_operand_reg(instr.operands[0], 8)
1692
1693 // Check if target is f32
1694 result_is_f32 := g.mod.values[val_id].typ > 0
1695 && g.mod.values[val_id].typ < g.mod.type_store.types.len
1696 && g.mod.type_store.types[g.mod.values[val_id].typ].kind == .float_t
1697 && g.mod.type_store.types[g.mod.values[val_id].typ].width == 32
1698
1699 // UCVTF Dd, Xn (convert unsigned int to double)
1700 g.emit(asm_ucvtf_d_x(0, Reg(src_reg)))
1701
1702 if result_is_f32 {
1703 // Convert f64→f32 and move to integer register as 32-bit pattern
1704 g.emit(asm_fcvt_s_d(0, 0))
1705 g.emit(asm_fmov_w_s(Reg(dest_reg), 0))
1706 } else {
1707 // FMOV Xd, D0 (copy float bit pattern to integer reg for storage)
1708 g.emit(asm_fmov_x_d(Reg(dest_reg), 0))
1709 }
1710
1711 g.store_reg_to_val(dest_reg, val_id)
1712 }
1713 .fptoui {
1714 // Float to unsigned integer conversion
1715 dest_reg := g.get_dest_reg(val_id)
1716
1717 // Load float operand to d0
1718 g.load_float_operand(instr.operands[0], 0)
1719
1720 // FCVTZU Xd, Dn (convert to unsigned int, truncate toward zero)
1721 g.emit(asm_fcvtzu_x_d(Reg(dest_reg), 0))
1722
1723 g.store_reg_to_val(dest_reg, val_id)
1724 }
1725 .add, .sub, .mul, .sdiv, .udiv, .srem, .urem, .and_, .or_, .xor, .shl, .ashr, .lshr, .eq,
1726 .ne, .lt, .gt, .le, .ge, .ult, .ugt, .ule, .uge {
1727 // Optimization: Use actual registers if allocated, avoid shuffling to x8/x9
1728 // Dest register
1729 dest_reg := g.get_dest_reg(val_id)
1730
1731 // Op0 (LHS)
1732 lhs_reg := g.get_operand_reg(instr.operands[0], 8)
1733
1734 // Op1 (RHS) - Check immediate optimization
1735 mut is_imm := false
1736 mut imm_val := i64(0)
1737 mut rhs_reg := 9 // Default scratch for RHS
1738
1739 op1 := g.mod.values[instr.operands[1]]
1740 if op1.kind == .constant
1741 && op in [.add, .sub, .eq, .ne, .lt, .gt, .le, .ge, .ult, .ugt, .ule, .uge] {
1742 v := g.get_const_int(instr.operands[1])
1743 if v >= 0 && v < 4096 {
1744 is_imm = true
1745 imm_val = v
1746 }
1747 }
1748
1749 if !is_imm {
1750 // Don't use x8 as scratch if LHS is in x8
1751 scratch := if lhs_reg == 8 { 9 } else { 8 }
1752 rhs_reg = g.get_operand_reg(instr.operands[1], scratch)
1753 }
1754 mut emitted_types_sum_is_check := false
1755
1756 match op {
1757 .add {
1758 // Some frontend paths can leave `is` checks over `types.Type` as malformed
1759 // `add` i1 over sumtype wrappers. Lower these to direct `_tag == variant_tag`
1760 // checks in arm64 so branch conditions remain semantically correct.
1761 if g.try_emit_types_type_ischeck_add(dest_reg, lhs_reg, val_id, instr) {
1762 emitted_types_sum_is_check = true
1763 } else if is_imm {
1764 g.emit(asm_add_imm(Reg(dest_reg), Reg(lhs_reg), u32(imm_val)))
1765 } else {
1766 g.emit(asm_add_reg(Reg(dest_reg), Reg(lhs_reg), Reg(rhs_reg)))
1767 }
1768 }
1769 .sub {
1770 if is_imm {
1771 g.emit(asm_sub_imm(Reg(dest_reg), Reg(lhs_reg), u32(imm_val)))
1772 } else {
1773 g.emit(asm_sub_reg(Reg(dest_reg), Reg(lhs_reg), Reg(rhs_reg)))
1774 }
1775 }
1776 .mul {
1777 g.emit(asm_mul(Reg(dest_reg), Reg(lhs_reg), Reg(rhs_reg)))
1778 }
1779 .sdiv {
1780 g.emit(asm_sdiv(Reg(dest_reg), Reg(lhs_reg), Reg(rhs_reg)))
1781 }
1782 .udiv {
1783 g.emit(asm_udiv(Reg(dest_reg), Reg(lhs_reg), Reg(rhs_reg)))
1784 }
1785 .srem {
1786 // Signed modulo: a % b = a - (a / b) * b
1787 // Choose temp register for quotient that doesn't conflict with inputs
1788 mut temp_reg := 10
1789 if lhs_reg == 10 || rhs_reg == 10 {
1790 temp_reg = 11
1791 if lhs_reg == 11 || rhs_reg == 11 {
1792 temp_reg = 12
1793 }
1794 }
1795 g.emit(asm_sdiv(Reg(temp_reg), Reg(lhs_reg), Reg(rhs_reg)))
1796 g.emit(asm_msub(Reg(dest_reg), Reg(temp_reg), Reg(rhs_reg), Reg(lhs_reg)))
1797 }
1798 .urem {
1799 // Unsigned modulo: a % b = a - (a / b) * b
1800 mut temp_reg := 10
1801 if lhs_reg == 10 || rhs_reg == 10 {
1802 temp_reg = 11
1803 if lhs_reg == 11 || rhs_reg == 11 {
1804 temp_reg = 12
1805 }
1806 }
1807 g.emit(asm_udiv(Reg(temp_reg), Reg(lhs_reg), Reg(rhs_reg)))
1808 g.emit(asm_msub(Reg(dest_reg), Reg(temp_reg), Reg(rhs_reg), Reg(lhs_reg)))
1809 }
1810 .and_ {
1811 g.emit(asm_and(Reg(dest_reg), Reg(lhs_reg), Reg(rhs_reg)))
1812 }
1813 .or_ {
1814 g.emit(asm_orr(Reg(dest_reg), Reg(lhs_reg), Reg(rhs_reg)))
1815 }
1816 .xor {
1817 g.emit(asm_eor(Reg(dest_reg), Reg(lhs_reg), Reg(rhs_reg)))
1818 }
1819 .shl {
1820 g.emit(asm_lslv(Reg(dest_reg), Reg(lhs_reg), Reg(rhs_reg)))
1821 }
1822 .ashr {
1823 g.emit(asm_asrv(Reg(dest_reg), Reg(lhs_reg), Reg(rhs_reg)))
1824 }
1825 .lshr {
1826 g.emit(asm_lsrv(Reg(dest_reg), Reg(lhs_reg), Reg(rhs_reg)))
1827 }
1828 .eq, .ne, .lt, .gt, .le, .ge, .ult, .ugt, .ule, .uge {
1829 trace_cmp := g.env_trace_cmp.len > 0
1830 && (g.env_trace_cmp == '*' || g.cur_func_name == g.env_trace_cmp)
1831 lhs_typ := g.mod.values[instr.operands[0]].typ
1832 is_float := lhs_typ > 0 && lhs_typ < g.mod.type_store.types.len
1833 && g.mod.type_store.types[lhs_typ].kind == .float_t
1834 mut handled_large_struct_zero_cmp := false
1835 mut large_struct_cmp_operand := 0
1836 if is_float {
1837 // Float comparison: load to FP regs, use FCMP
1838 g.load_float_operand(instr.operands[0], 0) // d0
1839 g.load_float_operand(instr.operands[1], 1) // d1
1840 g.emit(asm_fcmp_d(Reg(0), Reg(1)))
1841 } else {
1842 // For `eq/ne` against zero on large struct values, compare the
1843 // struct truth word (`[addr + 0]`) instead of the slot address.
1844 // This keeps comparison semantics consistent with `.br` lowering.
1845 if op in [.eq, .ne] {
1846 lhs_id := instr.operands[0]
1847 rhs_id := instr.operands[1]
1848 if g.value_is_large_struct(lhs_id) && g.is_known_zero_value(rhs_id, 0) {
1849 large_struct_cmp_operand = lhs_id
1850 } else if g.value_is_large_struct(rhs_id)
1851 && g.is_known_zero_value(lhs_id, 0) {
1852 large_struct_cmp_operand = rhs_id
1853 }
1854 if large_struct_cmp_operand > 0 {
1855 g.load_large_struct_truth_word_to_reg(9, large_struct_cmp_operand)
1856 g.emit(asm_cmp_reg(Reg(9), Reg(31)))
1857 handled_large_struct_zero_cmp = true
1858 }
1859 }
1860 if !handled_large_struct_zero_cmp {
1861 // Integer comparison
1862 // Use 32-bit CMP for i32 operands to preserve sign semantics.
1863 use_32bit := lhs_typ > 0 && lhs_typ < g.mod.type_store.types.len
1864 && g.mod.type_store.types[lhs_typ].kind == .int_t
1865 && g.mod.type_store.types[lhs_typ].width == 32
1866 if is_imm {
1867 if use_32bit {
1868 g.emit(asm_cmp_imm_w(Reg(lhs_reg), u32(imm_val)))
1869 } else {
1870 g.emit(asm_cmp_imm(Reg(lhs_reg), u32(imm_val)))
1871 }
1872 } else if use_32bit {
1873 g.emit(asm_cmp_reg_w(Reg(lhs_reg), Reg(rhs_reg)))
1874 } else {
1875 g.emit(asm_cmp_reg(Reg(lhs_reg), Reg(rhs_reg)))
1876 }
1877 }
1878 }
1879 if trace_cmp {
1880 eprintln('ARM64 CMP fn=${g.cur_func_name} val=${val_id} op=${op} lhs_id=${instr.operands[0]} rhs_id=${instr.operands[1]} lhs_reg=${lhs_reg} rhs_reg=${rhs_reg} lhs_typ=${lhs_typ} float=${is_float} large_struct_zero_cmp=${handled_large_struct_zero_cmp} large_struct_id=${large_struct_cmp_operand}')
1881 }
1882
1883 // CSET Rd, cond (works for both integer and float NZCV flags)
1884 match op {
1885 .eq { g.emit(asm_cset_eq(Reg(dest_reg))) }
1886 .ne { g.emit(asm_cset_ne(Reg(dest_reg))) }
1887 .lt { g.emit(asm_cset_lt(Reg(dest_reg))) }
1888 .gt { g.emit(asm_cset_gt(Reg(dest_reg))) }
1889 .le { g.emit(asm_cset_le(Reg(dest_reg))) }
1890 .ge { g.emit(asm_cset_ge(Reg(dest_reg))) }
1891 .ugt { g.emit(asm_cset_hi(Reg(dest_reg))) }
1892 .uge { g.emit(asm_cset_hs(Reg(dest_reg))) }
1893 .ult { g.emit(asm_cset_lo(Reg(dest_reg))) }
1894 .ule { g.emit(asm_cset_ls(Reg(dest_reg))) }
1895 else {}
1896 }
1897 }
1898 else {}
1899 }
1900
1901 // Keep narrow integer results (i1/i8/i16/i32) canonical after 64-bit
1902 // ALU ops so upper garbage bits do not leak through later uses.
1903 // Skip for comparison ops (cset always produces 0 or 1, already canonical).
1904 is_cmp := op in [.eq, .ne, .lt, .gt, .le, .ge, .ult, .ugt, .ule, .uge]
1905 if !is_cmp {
1906 result_typ_id := g.mod.values[val_id].typ
1907 if result_typ_id > 0 && result_typ_id < g.mod.type_store.types.len {
1908 result_typ := g.mod.type_store.types[result_typ_id]
1909 if result_typ.kind == .int_t && !emitted_types_sum_is_check {
1910 g.canonicalize_narrow_int_result(dest_reg, result_typ_id)
1911 }
1912 }
1913 }
1914 // If dest_reg was not the allocated one (e.g. was 8), move it.
1915 g.store_reg_to_val(dest_reg, val_id)
1916 }
.store {
	// Lower an SSA `store`: operand 0 is the value, operand 1 is the destination
	// pointer. Depending on the destination pointee type and the source value type
	// this emits one of:
	//   * a byte copy (or zero fill) for aggregates larger than 16 bytes,
	//   * an 8-byte-chunk copy (or zero fill) for aggregates of 1..16 bytes,
	//   * a single scalar store whose width comes from `mem_access_size_bytes`.
	// Malformed instructions (missing operands, out-of-range value ids) emit nothing.
	if instr_operands.len < 2 {
		return
	}
	src_id := instr_operands[0]
	ptr_id := instr_operands[1]
	trace_store := g.env_trace_store.len > 0
		&& (g.env_trace_store == '*' || g.cur_func_name == g.env_trace_store)
	// ValueID 0 is the SSA null/invalid sentinel.
	if src_id <= 0 || src_id >= g.mod.values.len {
		return
	}
	if ptr_id <= 0 || ptr_id >= g.mod.values.len {
		return
	}
	// `src_id` is known to be in range from here on, so it is not re-checked below.
	val_val := g.mod.values[src_id]

	// Recognize `bitcast(extractvalue(load(string_literal), 0))` as the source.
	// When it matches, the address of the string literal itself is stored instead
	// of the extracted field.
	mut src_addr_override_id := 0
	if val_val.kind == .instruction {
		src_instr2 := g.mod.instrs[val_val.index]
		if src_instr2.op == .bitcast && src_instr2.operands.len > 0 {
			bitcast_src := src_instr2.operands[0]
			if bitcast_src > 0 && bitcast_src < g.mod.values.len {
				bitcast_src_val := g.mod.values[bitcast_src]
				if bitcast_src_val.kind == .instruction {
					extract_instr := g.mod.instrs[bitcast_src_val.index]
					if extract_instr.op == .extractvalue
						&& extract_instr.operands.len >= 2 {
						idx_val_id := extract_instr.operands[1]
						if idx_val_id > 0 && idx_val_id < g.mod.values.len {
							idx_val := g.mod.values[idx_val_id]
							if idx_val.kind == .constant && idx_val.name == '0' {
								base_id := extract_instr.operands[0]
								if base_id > 0 && base_id < g.mod.values.len {
									base_val := g.mod.values[base_id]
									if base_val.kind == .instruction {
										load_instr := g.mod.instrs[base_val.index]
										if load_instr.op == .load
											&& load_instr.operands.len > 0 {
											load_src := load_instr.operands[0]
											if load_src > 0
												&& load_src < g.mod.values.len
												&& g.mod.values[load_src].kind == .string_literal {
												// Sumtype string payload lowering can arrive as:
												// bitcast(extractvalue(load(string_literal), 0)).
												// Preserve pointer-to-string-struct, not string.str.
												src_addr_override_id = load_src
											}
										}
									}
								}
							}
						}
					}
				}
			}
		}
	}

	// Check if we're storing a large struct value (> 16 bytes)
	// In this case, the value is a pointer to the struct and we need to copy
	val_typ := g.mod.type_store.types[val_val.typ]
	val_size := g.type_size(val_val.typ)
	is_undef_aggregate := val_val.kind == .constant && val_val.name == 'undef'
	// A source "has storage" when it lives in a register, in a stack slot, or is a
	// global / string literal. Aggregates without storage are stored as zeros.
	src_has_storage := src_id in g.reg_map || src_id in g.stack_map
		|| val_val.kind in [.global, .string_literal]
	mut dst_struct_size := 0
	mut dst_is_large_struct := false
	mut dst_is_small_struct := false
	mut dst_struct_typ_id := ssa.TypeID(0)
	mut dst_elem_is_ptrlike := false
	ptr_val := g.mod.values[ptr_id]
	// Classify the destination by the pointee type of the pointer operand.
	if ptr_val.typ > 0 && ptr_val.typ < g.mod.type_store.types.len {
		ptr_typ := g.mod.type_store.types[ptr_val.typ]
		if ptr_typ.kind == .ptr_t && ptr_typ.elem_type > 0
			&& ptr_typ.elem_type < g.mod.type_store.types.len {
			elem_typ := g.mod.type_store.types[ptr_typ.elem_type]
			elem_size := g.type_size(ptr_typ.elem_type)
			if elem_typ.kind in [.ptr_t, .func_t] {
				dst_elem_is_ptrlike = true
			}
			if elem_typ.kind == .struct_t || elem_typ.kind == .array_t {
				dst_struct_typ_id = ptr_typ.elem_type
				if elem_size > 16 {
					dst_is_large_struct = true
					dst_struct_size = elem_size
				} else if elem_size > 0 {
					dst_is_small_struct = true
					dst_struct_size = elem_size
				}
			}
		}
	}
	// If the source is a large struct/array (>16 bytes) but the destination
	// pointer's elem_type was not classified as a struct (e.g., GEP through
	// a ptr(ptr(...)) type from array-of-array construction), override.
	if !dst_is_large_struct && val_typ.kind in [.struct_t, .array_t] && val_size > 16 {
		dst_is_large_struct = true
		dst_struct_size = val_size
	}
	if trace_store {
		mut dst_kind := ssa.TypeKind.void_t
		mut dst_size := 0
		mut src_off_dbg := 0
		mut src_has_off_dbg := false
		if src_off := g.stack_map[src_id] {
			src_off_dbg = src_off
			src_has_off_dbg = true
		}
		mut src_op_dbg := 'na'
		if val_val.kind == .instruction {
			src_op_dbg = '${g.selected_opcode(g.mod.instrs[val_val.index])}'
		}
		if ptr_val.typ > 0 && ptr_val.typ < g.mod.type_store.types.len {
			ptr_typ := g.mod.type_store.types[ptr_val.typ]
			if ptr_typ.kind == .ptr_t && ptr_typ.elem_type > 0
				&& ptr_typ.elem_type < g.mod.type_store.types.len {
				dst_kind = g.mod.type_store.types[ptr_typ.elem_type].kind
				dst_size = g.type_size(ptr_typ.elem_type)
			}
		}
		eprintln('ARM64 STORE fn=${g.cur_func_name} src=${src_id} sop=${src_op_dbg} ptr=${ptr_id} styp=${val_val.typ}/${val_typ.kind} ssz=${val_size} src_has_storage=${src_has_storage} src_has_off=${src_has_off_dbg} src_off=${src_off_dbg} dst_kind=${dst_kind} dst_size=${dst_size} dst_small=${dst_is_small_struct} dst_ptrlike=${dst_elem_is_ptrlike}')
	}
	// Large destinations are zero-filled when the source is `undef` or has no
	// storage to copy from.
	mut should_zero_large_store := is_undef_aggregate
	if !should_zero_large_store && (dst_is_large_struct
		|| (val_typ.kind in [.struct_t, .array_t] && val_size > 16 && !dst_elem_is_ptrlike))
		&& !src_has_storage {
		should_zero_large_store = true
	}

	// Load source first, then preserve it in a register that will not be clobbered
	// when loading the destination pointer (which may use x9 plus x11/x12 scratch).
	mut val_reg := if src_addr_override_id > 0 {
		g.get_operand_reg(src_addr_override_id, 8)
	} else {
		g.get_operand_reg(src_id, 8)
	}
	if val_reg == 9 || val_reg == 11 || val_reg == 12 {
		g.emit_mov_reg(8, val_reg)
		val_reg = 8
	}
	ptr_reg := g.get_operand_reg(ptr_id, 9)

	if dst_is_large_struct {
		// Destination expects a large struct by value.
		// Large structs are represented as pointers in registers, so copy pointee bytes.
		if should_zero_large_store {
			g.zero_ptr_bytes(ptr_reg, dst_struct_size)
		} else {
			mut can_copy_from_src_ptr := false
			// Source-pointer scratch: x11, or x12 when the destination pointer is in x11.
			mut src_ptr_reg := if ptr_reg == 11 { 12 } else { 11 }
			mut src_is_unwrapped_wrapper := false
			if val_typ.kind == .struct_t && g.is_sumtype_wrapper_struct_type(val_val.typ)
				&& g.load_sumtype_data_ptr_to_reg(src_ptr_reg, src_id) {
				can_copy_from_src_ptr = true
				src_is_unwrapped_wrapper = true
			}
			if !can_copy_from_src_ptr {
				if src_off := g.stack_map[src_id] {
					if g.large_struct_stack_value_is_pointer(src_id)
						|| g.large_aggregate_stack_value_is_pointer(src_id) {
						// The stack slot holds a pointer to the bytes: load it.
						g.emit_ldr_reg_offset(src_ptr_reg, 29, src_off)
					} else if src_id in g.reg_map {
						if val_reg != src_ptr_reg {
							g.emit_mov_reg(src_ptr_reg, val_reg)
						}
					} else {
						// The stack slot holds the bytes inline: take its address.
						g.emit_add_fp_imm(src_ptr_reg, src_off)
					}
					can_copy_from_src_ptr = true
				} else {
					src_ptr_reg = val_reg
					can_copy_from_src_ptr = true
				}
			}
			// NOTE(review): `can_copy_from_src_ptr` is true on every path above, so the
			// zero-fill fallback below is currently unreachable.
			if can_copy_from_src_ptr {
				mut large_copy_size := dst_struct_size
				if !src_is_unwrapped_wrapper && val_typ.kind in [.struct_t, .array_t] {
					src_inline_size := g.type_size(val_val.typ)
					// Some MIR paths store a small aggregate into a pointer typed as a
					// larger aggregate (e.g. wrapper field updates through opaque GEPs).
					// Copy only initialized source bytes and clear the destination tail.
					if src_inline_size > 0 && src_inline_size < large_copy_size {
						large_copy_size = src_inline_size
					}
				}
				if large_copy_size <= 0 {
					large_copy_size = dst_struct_size
				}
				g.copy_ptr_to_ptr_bytes(src_ptr_reg, ptr_reg, large_copy_size)
				if large_copy_size < dst_struct_size {
					g.zero_ptr_range_bytes(ptr_reg, large_copy_size, dst_struct_size)
				}
			} else {
				g.zero_ptr_bytes(ptr_reg, dst_struct_size)
			}
		}
	} else if dst_is_small_struct {
		// Destination expects a small multi-field struct by value.
		// `num_fields` counts 8-byte chunks of the destination, not struct fields.
		num_fields := (dst_struct_size + 7) / 8
		mut src_points_to_struct := false
		if dst_struct_typ_id > 0 && val_typ.kind == .ptr_t && val_typ.elem_type > 0
			&& val_typ.elem_type < g.mod.type_store.types.len {
			// Only treat pointer sources as by-value struct bytes when the
			// pointee type exactly matches the destination struct type.
			// Size-only matches can copy unrelated payload structs into wrapper
			// structs (e.g. ast.Expr), corrupting tag/data words.
			if val_typ.elem_type == dst_struct_typ_id {
				if g.is_sumtype_wrapper_struct_type(dst_struct_typ_id)
					&& g.scalar_value_is_pointer_payload(src_id, 0) {
					src_points_to_struct = false
				} else {
					src_points_to_struct = true
				}
			}
		}
		// Copy at most as many chunks as the source provides; the rest is zeroed.
		mut src_copy_chunks := num_fields
		if !src_points_to_struct {
			src_size_for_copy := g.aggregate_source_size_bytes(src_id)
			if src_size_for_copy > 0 {
				src_chunks := (src_size_for_copy + 7) / 8
				if src_chunks > 0 && src_chunks < src_copy_chunks {
					src_copy_chunks = src_chunks
				}
			}
		}
		if src_copy_chunks < 1 {
			src_copy_chunks = 1
		}
		if trace_store {
			eprintln('ARM64 STORE_SMALL_COPY fn=${g.cur_func_name} src=${src_id} ptr=${ptr_id} src_chunks=${src_copy_chunks}/${num_fields} src_points_to_struct=${src_points_to_struct}')
		}
		if !src_has_storage && !src_points_to_struct {
			// Nothing to copy from: zero every destination chunk.
			g.emit_mov_reg(10, 31)
			for i in 0 .. num_fields {
				g.emit(asm_str_imm(Reg(10), Reg(ptr_reg), u32(i)))
			}
		} else {
			mut can_copy_from_src_ptr := false
			src_ptr_reg := if ptr_reg == 11 { 12 } else { 11 }
			if src_points_to_struct {
				if val_reg != src_ptr_reg {
					g.emit_mov_reg(src_ptr_reg, val_reg)
				}
				can_copy_from_src_ptr = true
			} else if src_off := g.stack_map[src_id] {
				g.emit_add_fp_imm(src_ptr_reg, src_off)
				can_copy_from_src_ptr = true
			}
			if can_copy_from_src_ptr {
				for i in 0 .. src_copy_chunks {
					g.emit(asm_ldr_imm(Reg(10), Reg(src_ptr_reg), u32(i)))
					g.emit(asm_str_imm(Reg(10), Reg(ptr_reg), u32(i)))
				}
				if src_copy_chunks < num_fields {
					g.emit_mov_reg(10, 31)
					for i in src_copy_chunks .. num_fields {
						g.emit(asm_str_imm(Reg(10), Reg(ptr_reg), u32(i)))
					}
				}
			} else if num_fields == 1 {
				// Single-slot struct values in registers can be stored directly.
				g.emit(asm_str(Reg(val_reg), Reg(ptr_reg)))
			} else if val_typ.kind in [.struct_t, .array_t] {
				// Source is a struct/array value whose register holds a pointer
				// (e.g., from a .load of a struct without a stack slot).
				// Use the register as a source pointer for field-by-field copy.
				if val_val.kind == .instruction
					&& g.mod.instrs[val_val.index].op == .load {
					if val_reg != src_ptr_reg {
						g.emit_mov_reg(src_ptr_reg, val_reg)
					}
					for i in 0 .. src_copy_chunks {
						g.emit(asm_ldr_imm(Reg(10), Reg(src_ptr_reg), u32(i)))
						g.emit(asm_str_imm(Reg(10), Reg(ptr_reg), u32(i)))
					}
					if src_copy_chunks < num_fields {
						g.emit_mov_reg(10, 31)
						for i in src_copy_chunks .. num_fields {
							g.emit(asm_str_imm(Reg(10), Reg(ptr_reg), u32(i)))
						}
					}
				} else {
					g.emit_mov_reg(10, 31)
					for i in 0 .. num_fields {
						g.emit(asm_str_imm(Reg(10), Reg(ptr_reg), u32(i)))
					}
				}
			} else {
				// Keep behavior deterministic when aggregate source bytes are unavailable.
				g.emit_mov_reg(10, 31)
				for i in 0 .. num_fields {
					g.emit(asm_str_imm(Reg(10), Reg(ptr_reg), u32(i)))
				}
			}
		}
	} else if dst_struct_typ_id > 0 && val_typ.kind in [.struct_t, .array_t]
		&& val_size > 16 && !dst_elem_is_ptrlike {
		// Large struct source with non-pointer destination slot:
		// copy pointee bytes into destination memory.
		// NOTE(review): any struct/array source larger than 16 bytes already forces
		// `dst_is_large_struct` above, so this branch looks unreachable today.
		if should_zero_large_store {
			g.zero_ptr_bytes(ptr_reg, val_size)
		} else {
			mut can_copy_from_src_ptr := false
			mut src_ptr_reg := if ptr_reg == 11 { 12 } else { 11 }
			if src_off := g.stack_map[src_id] {
				if g.large_struct_stack_value_is_pointer(src_id)
					|| g.large_aggregate_stack_value_is_pointer(src_id) {
					g.emit_ldr_reg_offset(src_ptr_reg, 29, src_off)
				} else if src_id in g.reg_map {
					if val_reg != src_ptr_reg {
						g.emit_mov_reg(src_ptr_reg, val_reg)
					}
				} else {
					g.emit_add_fp_imm(src_ptr_reg, src_off)
				}
				can_copy_from_src_ptr = true
			} else {
				src_ptr_reg = val_reg
				can_copy_from_src_ptr = true
			}
			if can_copy_from_src_ptr {
				g.copy_ptr_to_ptr_bytes(src_ptr_reg, ptr_reg, val_size)
			} else {
				g.zero_ptr_bytes(ptr_reg, val_size)
			}
		}
	} else if val_typ.kind in [.struct_t, .array_t] && val_size > 0 && val_size <= 16
		&& !dst_elem_is_ptrlike {
		// Small aggregate source stored through an opaque destination pointer.
		// Copy value-sized 8-byte chunks (not field count), otherwise packed
		// structs like `{u32,u8}` can over-copy and corrupt adjacent memory.
		num_chunks := (val_size + 7) / 8
		if !src_has_storage {
			g.emit_mov_reg(10, 31)
			for i in 0 .. num_chunks {
				g.emit(asm_str_imm(Reg(10), Reg(ptr_reg), u32(i)))
			}
		} else if src_off := g.stack_map[src_id] {
			// Source bytes live inline in the frame: copy them chunk by chunk.
			for i in 0 .. num_chunks {
				g.emit_ldr_reg_offset(10, 29, src_off + i * 8)
				g.emit(asm_str_imm(Reg(10), Reg(ptr_reg), u32(i)))
			}
		} else if num_chunks == 1 {
			g.emit(asm_str(Reg(val_reg), Reg(ptr_reg)))
		} else {
			// Two-chunk aggregate held only in a register. The register is used as
			// a source address only when the value comes from a `.load`; any other
			// producer gets a zero fill.
			if val_val.kind == .instruction
				&& g.mod.instrs[val_val.index].op == .load {
				// Pick the scratch the same way the other copy paths do, so a
				// destination pointer living in x11 is not overwritten before
				// the stores are emitted.
				src_ptr_reg := if ptr_reg == 11 { 12 } else { 11 }
				if val_reg != src_ptr_reg {
					g.emit_mov_reg(src_ptr_reg, val_reg)
				}
				for i in 0 .. num_chunks {
					g.emit(asm_ldr_imm(Reg(10), Reg(src_ptr_reg), u32(i)))
					g.emit(asm_str_imm(Reg(10), Reg(ptr_reg), u32(i)))
				}
			} else {
				g.emit_mov_reg(10, 31)
				for i in 0 .. num_chunks {
					g.emit(asm_str_imm(Reg(10), Reg(ptr_reg), u32(i)))
				}
			}
		}
	} else {
		// Scalar store: pick the access width, then emit one STR variant.
		mut store_size := g.mem_access_size_bytes(val_val.typ, ptr_id)
		// The destination slot controls memory width for pointer fields. A nil
		// constant can be typed as a narrow integer, but storing it into a pointer
		// field must clear all 64 bits, otherwise stale upper address bits remain.
		if store_size < 8 && dst_elem_is_ptrlike {
			store_size = 8
		}
		// Sumtype payload pointers can flow through i64-typed SSA values.
		// Do not narrow these stores based on imprecise pointer element widths.
		if store_size < 8 && g.scalar_value_is_pointer_payload(src_id, 0) {
			store_size = 8
		}
		match store_size {
			1 { g.emit(asm_str_b(Reg(val_reg), Reg(ptr_reg))) }
			2 { g.emit(asm_str_h(Reg(val_reg), Reg(ptr_reg))) }
			4 { g.emit(asm_str_w(Reg(val_reg), Reg(ptr_reg))) }
			else { g.emit(asm_str(Reg(val_reg), Reg(ptr_reg))) }
		}
	}
}
.load {
	// Lower an SSA `load` from the pointer in operand 0 into this value.
	//   * invalid pointer id          -> result is the constant 0
	//   * sumtype data-word shortcut  -> result is copied from another value
	//   * aggregate of 9..16 bytes    -> bytes copied into the value's stack slot,
	//                                    or the source address kept if it has none
	//   * aggregate above 16 bytes    -> same, plus a pointer-backed slot form
	//   * everything else             -> one LDR of the computed width
	// A pointer recognized as an effective null constant yields zeros instead of
	// a memory access.
	dest_reg := g.get_dest_reg(val_id)
	ptr_id := instr.operands[0]
	trace_load := g.env_trace_load.len > 0
		&& (g.env_trace_load == '*' || g.cur_func_name == g.env_trace_load)
	// Set once the result has been written straight into its stack slot, which
	// makes the final register write-back unnecessary.
	mut loaded_into_aggregate_slot := false
	// ValueID 0 is the SSA null/invalid sentinel.
	if ptr_id <= 0 || ptr_id >= g.mod.values.len {
		g.emit_mov_imm64(dest_reg, 0)
	} else if data_word_id := g.sumtype_data_word_load_source(ptr_id,
		g.mod.values[val_id].typ)
	{
		if trace_load {
			eprintln('ARM64 LOAD fn=${g.cur_func_name} val=${val_id} ptr=${ptr_id} sumtype_data_word=${data_word_id}')
		}
		g.load_val_to_reg(dest_reg, data_word_id)
	} else {
		// `ptr_id` is a valid value id throughout this branch.
		result_typ_id := g.mod.values[val_id].typ
		ptr_val := g.mod.values[ptr_id]
		ptr_is_null_const := g.is_effective_null_pointer_value(ptr_id)
		mut ptr_reg := g.get_operand_reg(ptr_id, 9)
		ptr_is_alloca := ptr_val.kind == .instruction
			&& g.selected_opcode(g.mod.instrs[ptr_val.index]) == .alloca
		if ptr_is_alloca {
			// For alloca pointers, recompute the address into a scratch register
			// that differs from the destination register.
			ptr_reg = if dest_reg == 9 { 10 } else { 9 }
			g.load_address_of_val_to_reg(ptr_reg, ptr_id)
		}
		if trace_load {
			mut rkind := ssa.TypeKind.void_t
			mut rsize := 0
			mut rtyp := ssa.TypeID(0)
			mut pop := 'na'
			mut gtyp := ssa.TypeID(0)
			mut gkind := ssa.TypeKind.void_t
			mut gsize := 0
			mut gconst := false
			mut ginit_len := 0
			if val_id > 0 && val_id < g.mod.values.len {
				rtyp = g.mod.values[val_id].typ
				if rtyp > 0 && rtyp < g.mod.type_store.types.len {
					rkind = g.mod.type_store.types[rtyp].kind
					rsize = g.type_size(rtyp)
				}
			}
			pkind := ptr_val.kind
			ptyp := ptr_val.typ
			pname := ptr_val.name
			if ptr_val.kind == .instruction {
				pop = '${g.selected_opcode(g.mod.instrs[ptr_val.index])}'
			} else if ptr_val.kind == .global {
				// Describe the first global whose name matches the pointer value.
				for gvar in g.mod.globals {
					if gvar.name == ptr_val.name {
						gtyp = gvar.typ
						gconst = gvar.is_constant
						ginit_len = gvar.initial_data.len
						if gtyp > 0 && gtyp < g.mod.type_store.types.len {
							gkind = g.mod.type_store.types[gtyp].kind
							gsize = g.type_size(gtyp)
						}
						break
					}
				}
			}
			eprintln('ARM64 LOAD fn=${g.cur_func_name} val=${val_id} rtyp=${rtyp} ptr=${ptr_id} ptr_name=`${pname}` ptr_kind=${pkind} ptr_typ=${ptyp} ptr_op=${pop} ptr_reg=${ptr_reg} ptr_null=${ptr_is_null_const} rkind=${rkind} rsize=${rsize} gtyp=${gtyp}/${gkind} gsize=${gsize} gconst=${gconst} ginit=${ginit_len}')
		}
		// Register used as the byte source for aggregate copies. It only differs
		// from `ptr_reg` when the alloca slot itself stores a pointer.
		mut load_src_ptr_reg := ptr_reg
		if ptr_is_alloca {
			slot_has_ptr := g.alloca_slot_stores_pointer_like_values(ptr_id, result_typ_id)
			if trace_load {
				mut rkind := ssa.TypeKind.void_t
				mut rsize := 0
				if result_typ_id > 0 && result_typ_id < g.mod.type_store.types.len {
					rkind = g.mod.type_store.types[result_typ_id].kind
					rsize = g.type_size(result_typ_id)
				}
				eprintln('ARM64 LOAD fn=${g.cur_func_name} val=${val_id} ptr=${ptr_id} alloca_slot_ptr=${slot_has_ptr} rkind=${rkind} rsize=${rsize}')
			}
			if slot_has_ptr {
				// Dereference the slot once; use x12 when the slot address is in x11.
				load_src_ptr_reg = if ptr_reg == 11 { 12 } else { 11 }
				g.emit(asm_ldr(Reg(load_src_ptr_reg), Reg(ptr_reg)))
			}
		}
		if result_typ_id <= 0 || result_typ_id >= g.mod.type_store.types.len {
			// Result type is unknown: fall back to a plain 64-bit load.
			if ptr_is_null_const {
				g.emit_mov_reg(dest_reg, 31)
			} else {
				g.emit(asm_ldr(Reg(dest_reg), Reg(ptr_reg)))
			}
		} else {
			result_typ := g.mod.type_store.types[result_typ_id]
			result_size := g.type_size(result_typ_id)
			is_aggregate := result_typ.kind in [.struct_t, .array_t]
			if is_aggregate && result_size > 8 && result_size <= 16 {
				if result_offset := g.stack_map[val_id] {
					if trace_load {
						eprintln('ARM64 LOAD_SMALL_AGG fn=${g.cur_func_name} val=${val_id} ptr=${ptr_id} result_off=${result_offset} size=${result_size} mode=copy')
					}
					if ptr_is_null_const {
						g.zero_fp_bytes(result_offset, result_size)
					} else {
						g.copy_ptr_to_fp_bytes(load_src_ptr_reg, result_offset, result_size)
					}
					loaded_into_aggregate_slot = true
				} else if dest_reg != load_src_ptr_reg {
					if trace_load {
						eprintln('ARM64 LOAD_SMALL_AGG fn=${g.cur_func_name} val=${val_id} ptr=${ptr_id} size=${result_size} mode=address')
					}
					// Fallback when no aggregate slot is available.
					if ptr_is_null_const {
						g.emit_mov_reg(dest_reg, 31)
					} else {
						g.emit_mov_reg(dest_reg, load_src_ptr_reg)
					}
				}
			} else if is_aggregate && result_size > 16 {
				if result_offset := g.stack_map[val_id] {
					if g.large_struct_stack_value_is_pointer(val_id) {
						// Pointer-backed large struct value: keep source address in the
						// value slot instead of copying full bytes into the frame.
						if ptr_is_null_const {
							g.emit_mov_reg(dest_reg, 31)
						} else if dest_reg != load_src_ptr_reg {
							g.emit_mov_reg(dest_reg, load_src_ptr_reg)
						}
						g.store_reg_to_val(dest_reg, val_id)
					} else {
						// Materialize large load results by value in their stack slot.
						if ptr_is_null_const {
							g.zero_fp_bytes(result_offset, result_size)
						} else {
							g.copy_ptr_to_fp_bytes(load_src_ptr_reg, result_offset,
								result_size)
						}
						// A register-allocated value also gets the slot address.
						if val_id in g.reg_map {
							g.emit_add_fp_imm(dest_reg, result_offset)
						}
					}
					loaded_into_aggregate_slot = true
				} else if dest_reg != load_src_ptr_reg {
					// Fallback when no spill slot is available: keep address form.
					if ptr_is_null_const {
						g.emit_mov_reg(dest_reg, 31)
					} else {
						g.emit_mov_reg(dest_reg, load_src_ptr_reg)
					}
				}
			} else if ptr_is_null_const {
				g.emit_mov_reg(dest_reg, 31)
			} else {
				mut load_size := g.mem_access_size_bytes(result_typ_id, ptr_id)
				// Sumtype payload pointers can be represented as i64 scalar words.
				// Preserve pointer-width loads for these values even when the
				// intermediate pointer type appears byte-sized.
				if load_size < 8 && g.scalar_value_is_pointer_payload(val_id, 0) {
					load_size = 8
				}
				// Scalar loads read through `ptr_reg`, not the dereferenced source.
				if load_size == 1 {
					g.emit(asm_ldr_b(Reg(dest_reg), Reg(ptr_reg)))
				} else if load_size == 2 {
					g.emit(asm_ldr_h(Reg(dest_reg), Reg(ptr_reg)))
				} else if load_size == 4 {
					g.emit(asm_ldr_w(Reg(dest_reg), Reg(ptr_reg)))
				} else {
					g.emit(asm_ldr(Reg(dest_reg), Reg(ptr_reg)))
				}
			}
		}
	}

	if !loaded_into_aggregate_slot {
		g.store_reg_to_val(dest_reg, val_id)
	}
}
.alloca {
	// Lower an SSA `alloca`. Values listed in `sumtype_data_heap_allocas` get
	// zeroed heap memory from `calloc`; every other alloca resolves to its
	// precomputed frame offset.
	if val_id !in g.sumtype_data_heap_allocas {
		// Frame-backed alloca: materialize `fp + offset` in x8.
		frame_off := g.alloca_offsets[val_id]
		g.emit_add_fp_imm(8, frame_off)
		// Zero-initialize large fixed array allocas.
		// The SSA builder skips element-by-element zero-init for arrays > 16 elements,
		// so the codegen must bulk-zero them here.
		slot_typ_id := g.mod.values[val_id].typ
		if slot_typ_id > 0 && slot_typ_id < g.mod.type_store.types.len {
			slot_typ := g.mod.type_store.types[slot_typ_id]
			pointee_id := slot_typ.elem_type
			if slot_typ.kind == .ptr_t && pointee_id > 0
				&& pointee_id < g.mod.type_store.types.len {
				pointee := g.mod.type_store.types[pointee_id]
				if pointee.kind == .array_t && pointee.len > 16 {
					zero_bytes := g.type_size(pointee_id)
					if zero_bytes > 0 {
						g.zero_ptr_bytes(8, zero_bytes)
					}
				}
			}
		}
		g.store_reg_to_val(8, val_id)
	} else {
		// Heap-backed alloca: size is sizeof(pointee), defaulting to 8 bytes when
		// the computed size is not positive.
		heap_ptr_typ := g.mod.type_store.types[g.mod.values[val_id].typ]
		elem_bytes := g.type_size(heap_ptr_typ.elem_type)
		mut total_bytes := if elem_bytes > 0 { elem_bytes } else { 8 }
		// alloca can request multiple elements via operand[0].
		if instr.operands.len > 0 {
			count_id := instr.operands[0]
			if count_id > 0 && count_id < g.mod.values.len {
				elem_count := int(parse_const_int_literal(g.mod.values[count_id].name))
				if elem_count > 1 {
					total_bytes *= elem_count
				}
			}
		}
		// calloc(1, total_bytes): x0 = 1, x1 = byte count; the result comes back in x0.
		g.emit_mov_imm(0, 1)
		g.emit_mov_imm(1, u64(total_bytes))
		calloc_sym := g.macho.add_undefined('_calloc')
		g.macho.add_reloc(g.macho.text_data.len, calloc_sym, arm64_reloc_branch26, true)
		g.emit(asm_bl_reloc())
		g.store_reg_to_val(0, val_id)
	}
}
.heap_alloc {
	// Lower `heap_alloc` to `calloc(1, sizeof(T))`, where T is the pointee of the
	// instruction's ptr(T) result type. The size stays at 8 bytes when the result
	// type is not a usable pointer type or the computed size is not positive.
	result_typ_id := g.mod.values[val_id].typ
	mut byte_count := 8
	if result_typ_id > 0 && result_typ_id < g.mod.type_store.types.len {
		result_typ := g.mod.type_store.types[result_typ_id]
		if result_typ.kind == .ptr_t && result_typ.elem_type > 0 {
			pointee_size := g.type_size(result_typ.elem_type)
			if pointee_size > 0 {
				byte_count = pointee_size
			}
		}
	}
	// Argument setup for calloc(1, size): x0 = 1, x1 = size.
	g.emit_mov_imm(0, 1)
	g.emit_mov_imm(1, u64(byte_count))
	// Record the branch relocation at the current text offset, then emit the BL.
	calloc_sym := g.macho.add_undefined('_calloc')
	g.macho.add_reloc(g.macho.text_data.len, calloc_sym, arm64_reloc_branch26, true)
	g.emit(asm_bl_reloc())
	// The heap pointer is returned in x0.
	g.store_reg_to_val(0, val_id)
}
2574 .get_element_ptr {
2575 // GEP: Base + scaled index (or struct field offset for aggregate pointers)
2576 idx_id := instr.operands[1]
2577 base_typ_id := g.mod.values[instr.operands[0]].typ
2578 mut pointee_typ_id := ssa.TypeID(0)
2579 mut base_elem_typ_id := ssa.TypeID(0)
2580 mut gep_done := false
2581 if base_typ_id > 0 && base_typ_id < g.mod.type_store.types.len {
2582 base_typ := g.mod.type_store.types[base_typ_id]
2583 if base_typ.kind == .ptr_t {
2584 pointee_typ_id = base_typ.elem_type
2585 base_elem_typ_id = base_typ.elem_type
2586 }
2587 }
2588 mut base_reg := g.get_operand_reg(instr.operands[0], 8)
2589
2590 // Struct field GEP with constant index: use real field byte offsets.
2591 // Distinguish from array-style GEP: if the GEP result type equals the
2592 // base pointer type, this is array indexing (ptr(struct)[i] -> ptr(struct)),
2593 // not struct field access (ptr(struct), field_idx -> ptr(field_type)).
2594 mut is_array_gep := false
2595 base_val_typ := g.mod.values[instr.operands[0]].typ
2596 if instr.typ == base_val_typ {
2597 is_array_gep = true
2598 }
2599 // Some lowered flows keep pointer payloads in alloca-backed scalar slots.
2600 // For GEP over such values, first load the payload pointer from the slot.
2601 mut idx_is_zero_const := false
2602 if idx_id > 0 && idx_id < g.mod.values.len {
2603 idx_val_dbg := g.mod.values[idx_id]
2604 if idx_val_dbg.kind == .constant && idx_val_dbg.name == '0' {
2605 idx_is_zero_const = true
2606 }
2607 }
2608 base_val := g.mod.values[instr.operands[0]]
2609 mut gep_base_slot_has_ptr := false
2610 if base_val.kind == .instruction {
2611 base_instr := g.mod.instrs[base_val.index]
2612 if g.selected_opcode(base_instr) == .alloca {
2613 mut target_elem_typ_id := base_elem_typ_id
2614 if instr.typ > 0 && instr.typ < g.mod.type_store.types.len {
2615 res_typ := g.mod.type_store.types[instr.typ]
2616 if res_typ.kind == .ptr_t && res_typ.elem_type > 0
2617 && res_typ.elem_type < g.mod.type_store.types.len {
2618 target_elem_typ_id = res_typ.elem_type
2619 }
2620 }
2621 gep_base_slot_has_ptr = target_elem_typ_id > 0
2622 && g.alloca_slot_stores_pointer_like_values(instr.operands[0], target_elem_typ_id)
2623 if gep_base_slot_has_ptr && !(instr.typ == base_val_typ && idx_is_zero_const) {
2624 g.emit(asm_ldr(Reg(base_reg), Reg(base_reg)))
2625 }
2626 }
2627 }
2628 if !is_array_gep && idx_id > 0 && idx_id < g.mod.values.len && pointee_typ_id > 0
2629 && pointee_typ_id < g.mod.type_store.types.len {
2630 idx_val := g.mod.values[idx_id]
2631 pointee_typ := g.mod.type_store.types[pointee_typ_id]
2632 if idx_val.kind == .constant && pointee_typ.kind == .struct_t {
2633 field_idx := int(parse_const_int_literal(idx_val.name))
2634 field_off := g.struct_field_offset_bytes(pointee_typ_id, field_idx)
2635 if field_off <= 0xFFF {
2636 g.emit(asm_add_imm(Reg(8), Reg(base_reg), u32(field_off)))
2637 } else {
2638 g.emit_mov_imm64(9, i64(field_off))
2639 g.emit(asm_add_reg(Reg(8), Reg(base_reg), Reg(9)))
2640 }
2641 g.store_gep_result_from_addr(8, val_id)
2642 gep_done = true
2643 }
2644 }
2645 if !gep_done {
2646 // Array/pointer-style GEP: scale by element size.
2647 mut scale := 8
2648 mut base_ptr_reg := base_reg
2649 if pointee_typ_id > 0 && pointee_typ_id < g.mod.type_store.types.len {
2650 pointee_typ := g.mod.type_store.types[pointee_typ_id]
2651 elem_size := if pointee_typ.kind == .array_t && !is_array_gep {
2652 g.type_size(pointee_typ.elem_type)
2653 } else {
2654 g.type_size(pointee_typ_id)
2655 }
2656 if elem_size > 0 {
2657 scale = elem_size
2658 }
2659 }
2660 // Ensure index load doesn't clobber base if base is 8
2661 idx_scratch := if base_ptr_reg == 8 { 9 } else { 8 }
2662 idx_reg := g.get_operand_reg(idx_id, idx_scratch)
2663 // Sign-extend index from 32-bit to 64-bit. GEP indices may have
2664 // been stored as 32-bit values (e.g. from map lookups or
2665 // extractvalue of int fields) with undefined upper 32 bits.
2666 // The ARM64 ABI does not guarantee upper bits are zeroed for
2667 // sub-64-bit values, so always extend before scaling.
2668 g.emit(asm_sxtw(Reg(idx_reg), Reg(idx_reg)))
2669 if scale == 8 {
2670 g.emit(asm_add_reg_lsl3(Reg(8), Reg(base_ptr_reg), Reg(idx_reg)))
2671 } else if scale == 1 {
2672 g.emit(asm_add_reg(Reg(8), Reg(base_ptr_reg), Reg(idx_reg)))
2673 } else {
2674 mut scale_reg := 10
2675 if scale_reg == base_ptr_reg || scale_reg == idx_reg {
2676 scale_reg = 11
2677 if scale_reg == base_ptr_reg || scale_reg == idx_reg {
2678 scale_reg = 12
2679 }
2680 }
2681 g.emit_mov_imm64(scale_reg, scale)
2682 g.emit(asm_mul(Reg(scale_reg), Reg(idx_reg), Reg(scale_reg)))
2683 g.emit(asm_add_reg(Reg(8), Reg(base_ptr_reg), Reg(scale_reg)))
2684 }
2685 g.store_gep_result_from_addr(8, val_id)
2686 }
2687 }
2688 .call {
2689 g.invalidate_last_store()
2690 fn_val := g.mod.values[instr.operands[0]]
2691 fn_name := fn_val.name
2692 trace_call := g.env_trace_call.len > 0
2693 && (g.env_trace_call == '*' || g.cur_func_name == g.env_trace_call)
2694 if trace_call {
2695 eprintln('ARM64 CALL fn=${g.cur_func_name} val=${val_id} callee_id=${instr.operands[0]} callee=`${fn_name}` args=${instr.operands.len - 1}')
2696 }
2697 // Skip calls with empty function names (shouldn't happen, but safety check)
2698 if fn_name != '' {
2699 // On ARM64 macOS (Apple Silicon), variadic arguments must be
2700 // passed on the stack, not in registers.
2701 is_variadic := fn_name in ['sprintf', 'printf', 'snprintf', 'fprintf', 'sscanf']
2702 num_fixed_args := if fn_name == 'sprintf' {
2703 2 // buffer, format
2704 } else if fn_name == 'printf' {
2705 1 // format
2706 } else if fn_name in ['snprintf', 'fprintf'] {
2707 3 // buffer/file, size, format
2708 } else if fn_name == 'sscanf' {
2709 2 // string, format
2710 } else {
2711 8 // default: all in registers
2712 }
2713
2714 num_args := instr.operands.len - 1
2715
2716 // Check if return type is a large struct (> 16 bytes) requiring indirect return
2717 result_typ := g.mod.type_store.types[g.mod.values[val_id].typ]
2718 result_size := g.type_size(g.mod.values[val_id].typ)
2719 is_indirect_return := result_typ.kind == .struct_t && result_size > 16
2720 // For indirect struct returns, set x8 to point to result storage BEFORE the call
2721 if is_indirect_return {
2722 result_offset := g.stack_map[val_id]
2723 g.emit_add_fp_imm(8, result_offset)
2724 }
2725
2726 if is_variadic && num_args > num_fixed_args {
2727 // Variadic call: push variadic args to stack, fixed args to registers
2728 num_variadic := num_args - num_fixed_args
2729
2730 // Allocate stack space for variadic args (8 bytes each, 16-byte aligned)
2731 stack_space := ((num_variadic * 8) + 15) & ~0xF
2732 if stack_space > 0 {
2733 g.sp_adjusted = true
2734 g.sp_adjust_amt = stack_space
2735 g.emit_sub_sp(stack_space)
2736 }
2737
2738 // Store variadic arguments to stack (in order)
2739 for i := 0; i < num_variadic; i++ {
2740 arg_idx := num_fixed_args + 1 + i // +1 because operands[0] is the function
2741 // Anonymous variadic args do not have a declared parameter slot.
2742 // Pass their promoted value representation directly instead of
2743 // reusing fixed-arg lowering, which can incorrectly turn a loaded
2744 // scalar into the address of its spill slot.
2745 g.load_val_to_reg(9, instr.operands[arg_idx]) // Use x9 to avoid clobbering x8
2746 // STR x9, [sp, #offset]
2747 offset := i * 8
2748 imm12 := u32(offset / 8)
2749 g.emit(asm_str_imm(Reg(9), sp, imm12))
2750 }
2751
2752 // Load fixed arguments to registers (in reverse order to avoid clobbering)
2753 for i := num_fixed_args; i >= 1; i-- {
2754 g.load_call_arg_to_reg(i - 1, instr.operands[i], i - 1, instr)
2755 }
2756
2757 // Call function
2758 sym_idx := g.macho.add_undefined('_' + fn_name)
2759 g.macho.add_reloc(g.macho.text_data.len, sym_idx, arm64_reloc_branch26, true)
2760 g.emit(asm_bl_reloc())
2761
2762 // Restore stack
2763 if stack_space > 0 {
2764 if stack_space <= 0xFFF {
2765 g.emit(asm_add_imm(sp, sp, u32(stack_space)))
2766 } else {
2767 g.emit_mov_imm(10, u64(stack_space))
2768 g.emit(asm_add_sp_reg(Reg(10)))
2769 }
2770 g.sp_adjusted = false
2771 g.sp_adjust_amt = 0
2772 }
2773 } else {
2774 // Non-variadic call:
2775 // ARM64 ABI: integer args in x0-x7, float args in d0-d7
2776 // Integer and float registers are allocated independently.
2777
2778 // Classify each argument as float or integer
2779 mut is_float_arg := []bool{len: num_args}
2780 mut arg_int_reg := []int{len: num_args, init: -1}
2781 mut arg_float_reg := []int{len: num_args, init: -1}
2782 mut arg_int_cnt := []int{len: num_args}
2783 mut int_reg_idx := 0
2784 mut float_reg_idx := 0
2785
2786 for a in 0 .. num_args {
2787 arg_val := g.mod.values[instr.operands[a + 1]]
2788 mut is_float := false
2789 if arg_val.typ > 0 && int(arg_val.typ) < g.mod.type_store.types.len {
2790 arg_typ := g.mod.type_store.types[arg_val.typ]
2791 if arg_typ.kind == .float_t {
2792 is_float = true
2793 }
2794 }
2795 if is_float {
2796 is_float_arg[a] = true
2797 arg_float_reg[a] = float_reg_idx
2798 float_reg_idx++
2799 } else {
2800 cnt := g.call_arg_reg_count(instr.operands[a + 1], a, instr)
2801 arg_int_reg[a] = int_reg_idx
2802 arg_int_cnt[a] = cnt
2803 int_reg_idx += cnt
2804 }
2805 }
2806
2807 // Handle stack-spilled args: integer words beyond x0-x7 and float args beyond d0-d7
2808 num_int_stack := if int_reg_idx > 8 { int_reg_idx - 8 } else { 0 }
2809 num_float_stack := if float_reg_idx > 8 { float_reg_idx - 8 } else { 0 }
2810 total_stack_slots := num_int_stack + num_float_stack
2811 stack_space := ((total_stack_slots * 8) + 15) & ~0xF
2812 if stack_space > 0 {
2813 g.sp_adjusted = true
2814 g.sp_adjust_amt = stack_space
2815 g.emit_sub_sp(stack_space)
2816 mut stack_idx := 0
2817 for a in 0 .. num_args {
2818 if is_float_arg[a] {
2819 if arg_float_reg[a] >= 8 {
2820 g.load_val_to_reg(9, instr.operands[a + 1])
2821 g.emit(asm_str_imm(Reg(9), sp, u32(stack_idx)))
2822 stack_idx++
2823 }
2824 } else {
2825 cnt := arg_int_cnt[a]
2826 start_reg := arg_int_reg[a]
2827 expected_struct_typ := g.call_param_type(instr, a) or {
2828 ssa.TypeID(0)
2829 }
2830 for ri in 0 .. cnt {
2831 if start_reg + ri < 8 {
2832 continue
2833 }
2834 if cnt > 1 {
2835 g.load_struct_arg_word_to_reg(9, instr.operands[a + 1], ri,
2836 expected_struct_typ, instr.operands[0])
2837 } else {
2838 g.load_call_arg_to_reg(9, instr.operands[a + 1], a, instr)
2839 }
2840 g.emit(asm_str_imm(Reg(9), sp, u32(stack_idx)))
2841 stack_idx++
2842 }
2843 }
2844 }
2845 }
2846
2847 // Load integer args to x-registers (reverse order)
2848 for a := num_args - 1; a >= 0; a-- {
2849 if is_float_arg[a] {
2850 continue
2851 }
2852 reg := arg_int_reg[a]
2853 if reg >= 8 && arg_int_cnt[a] == 1 {
2854 continue // stack arg
2855 }
2856 if arg_int_cnt[a] > 1 {
2857 expected_struct_typ := g.call_param_type(instr, a) or { ssa.TypeID(0) }
2858 for ri := arg_int_cnt[a] - 1; ri >= 0; ri-- {
2859 target_reg := reg + ri
2860 if target_reg >= 8 {
2861 continue
2862 }
2863 g.load_struct_arg_word_to_reg(target_reg, instr.operands[a + 1],
2864 ri, expected_struct_typ, instr.operands[0])
2865 }
2866 } else {
2867 g.load_call_arg_to_reg(reg, instr.operands[a + 1], a, instr)
2868 }
2869 }
2870
2871 // Load float args to d-registers
2872 for a in 0 .. num_args {
2873 if !is_float_arg[a] {
2874 continue
2875 }
2876 freg := arg_float_reg[a]
2877 if freg >= 8 {
2878 continue // stack float arg
2879 }
2880 // Load value bits to x9, then fmov to dN
2881 g.load_val_to_reg(9, instr.operands[a + 1])
2882 g.emit(asm_fmov_d_x(freg, Reg(9)))
2883 }
2884
2885 sym_idx := g.macho.add_undefined('_' + fn_name)
2886 g.macho.add_reloc(g.macho.text_data.len, sym_idx, arm64_reloc_branch26, true)
2887 g.emit(asm_bl_reloc())
2888
2889 if stack_space > 0 {
2890 if stack_space <= 0xFFF {
2891 g.emit(asm_add_imm(sp, sp, u32(stack_space)))
2892 } else {
2893 g.emit_mov_imm(10, u64(stack_space))
2894 g.emit(asm_add_sp_reg(Reg(10)))
2895 }
2896 g.sp_adjusted = false
2897 g.sp_adjust_amt = 0
2898 }
2899 }
2900
2901 if result_typ.kind != .void_t {
2902 // Check if this is a float return: result comes in d0 instead of x0
2903 mut is_float_return := result_typ.kind == .float_t
2904 if !is_float_return {
2905 // Also check the callee's registered return type
2906 callee_idx := g.func_idx_from_ref_value(instr.operands[0])
2907 if callee_idx >= 0 && callee_idx < g.func_typs.len {
2908 callee_ret := g.mod.type_store.types[g.func_typs[callee_idx]]
2909 if callee_ret.kind == .float_t {
2910 is_float_return = true
2911 }
2912 }
2913 }
2914 if is_float_return {
2915 // Float return: move d0 to x0 for integer storage
2916 g.emit(asm_fmov_x_d(Reg(0), 0))
2917 g.store_reg_to_val(0, val_id)
2918 } else {
2919 // Also check callee's registered return type: when the SSA value type
2920 // isn't struct_t (e.g. i64 fallback), use the callee's return type.
2921 mut call_ret_is_multi_reg := result_typ.kind == .struct_t && result_size > 8
2922 mut actual_call_ret_size := result_size
2923 if !call_ret_is_multi_reg && !is_indirect_return
2924 && result_typ.kind == .int_t {
2925 callee_idx := g.func_idx_from_ref_value(instr.operands[0])
2926 if callee_idx >= 0 && callee_idx < g.func_typs.len {
2927 callee_typ := g.func_typs[callee_idx]
2928 callee_ret_typ := g.mod.type_store.types[callee_typ]
2929 callee_ret_size := g.type_size(callee_typ)
2930 if callee_ret_typ.kind == .struct_t && callee_ret_size > 8
2931 && callee_ret_size <= 16 {
2932 call_ret_is_multi_reg = true
2933 actual_call_ret_size = callee_ret_size
2934 }
2935 }
2936 }
2937 if call_ret_is_multi_reg {
2938 if !is_indirect_return {
2939 if val_id in g.stack_map {
2940 result_offset := g.stack_map[val_id]
2941 num_chunks := (actual_call_ret_size + 7) / 8
2942 // Use ABI return size, not SSA value type size. Some MIR
2943 // paths merge small-struct call results to `int`, but the
2944 // call still returns x0/x1 that must both be materialized.
2945 for i in 0 .. num_chunks {
2946 if i < 8 {
2947 g.emit_str_reg_offset(i, 29, result_offset + i * 8)
2948 }
2949 }
2950 } else {
2951 g.store_reg_to_val(0, val_id)
2952 }
2953 }
2954 } else {
2955 g.canonicalize_narrow_int_result(0, g.mod.values[val_id].typ)
2956 g.store_reg_to_val(0, val_id)
2957 }
2958 }
2959 }
2960 }
2961 }
2962 .call_indirect {
2963 g.invalidate_last_store()
2964 // Indirect call through function pointer
2965 // operands[0] is the function pointer, rest are arguments
2966 num_args := instr.operands.len - 1
2967
2968 // Compute register mapping for multi-register struct args.
2969 mut ci_arg_reg_start := []int{len: num_args}
2970 mut ci_arg_reg_cnt := []int{len: num_args}
2971 mut ci_total_reg_slots := 0
2972 for a in 0 .. num_args {
2973 ci_arg_reg_start[a] = ci_total_reg_slots
2974 cnt := g.call_arg_reg_count(instr.operands[a + 1], a, instr)
2975 ci_arg_reg_cnt[a] = cnt
2976 ci_total_reg_slots += cnt
2977 }
2978 num_stack_slots := if ci_total_reg_slots > 8 {
2979 ci_total_reg_slots - 8
2980 } else {
2981 0
2982 }
2983 stack_space := ((num_stack_slots * 8) + 15) & ~0xF
2984 if stack_space > 0 {
2985 g.sp_adjusted = true
2986 g.sp_adjust_amt = stack_space
2987 g.emit_sub_sp(stack_space)
2988 mut stack_idx := 0
2989 for a in 0 .. num_args {
2990 cnt := ci_arg_reg_cnt[a]
2991 start_reg := ci_arg_reg_start[a]
2992 expected_struct_typ := g.call_param_type(instr, a) or { ssa.TypeID(0) }
2993 for ri in 0 .. cnt {
2994 if start_reg + ri < 8 {
2995 continue
2996 }
2997 if cnt > 1 {
2998 g.load_struct_arg_word_to_reg(9, instr.operands[a + 1], ri,
2999 expected_struct_typ, instr.operands[0])
3000 } else {
3001 g.load_call_arg_to_reg(9, instr.operands[a + 1], a, instr)
3002 }
3003 imm12 := u32(stack_idx)
3004 g.emit(asm_str_imm(Reg(9), sp, imm12))
3005 stack_idx++
3006 }
3007 }
3008 }
3009
3010 for a := num_args - 1; a >= 0; a-- {
3011 reg := ci_arg_reg_start[a]
3012 if reg >= 8 && ci_arg_reg_cnt[a] == 1 {
3013 continue
3014 }
3015 if ci_arg_reg_cnt[a] > 1 {
3016 expected_struct_typ := g.call_param_type(instr, a) or { ssa.TypeID(0) }
3017 for ri := ci_arg_reg_cnt[a] - 1; ri >= 0; ri-- {
3018 target_reg := reg + ri
3019 if target_reg >= 8 {
3020 continue
3021 }
3022 g.load_struct_arg_word_to_reg(target_reg, instr.operands[a + 1], ri,
3023 expected_struct_typ, instr.operands[0])
3024 }
3025 } else {
3026 g.load_call_arg_to_reg(reg, instr.operands[a + 1], a, instr)
3027 }
3028 }
3029
3030 // Load function pointer to x9 (scratch register).
3031 // Do not use generic value loading here: it can materialize an address
3032 // for large struct-like values instead of the actual callable pointer.
3033 g.load_fnptr_to_reg(9, instr.operands[0])
3034
3035 // BLR x9 - branch and link to register
3036 g.emit(asm_blr(Reg(9)))
3037
3038 if stack_space > 0 {
3039 if stack_space <= 0xFFF {
3040 g.emit(asm_add_imm(sp, sp, u32(stack_space)))
3041 } else {
3042 g.emit_mov_imm(10, u64(stack_space))
3043 g.emit(asm_add_sp_reg(Reg(10)))
3044 }
3045 g.sp_adjusted = false
3046 g.sp_adjust_amt = 0
3047 }
3048
3049 ci_result_typ_id := g.mod.values[val_id].typ
3050 ci_result_typ := g.mod.type_store.types[ci_result_typ_id]
3051 if ci_result_typ.kind != .void_t {
3052 ci_result_size := g.type_size(ci_result_typ_id)
3053 if ci_result_typ.kind == .struct_t && ci_result_size > 8 {
3054 // Small struct return: store x0, x1 into stack slot
3055 ci_result_offset := g.stack_map[val_id]
3056 num_chunks := (ci_result_size + 7) / 8
3057 for i in 0 .. num_chunks {
3058 if i < 8 {
3059 g.emit_str_reg_offset(i, 29, ci_result_offset + i * 8)
3060 }
3061 }
3062 } else {
3063 g.canonicalize_narrow_int_result(0, ci_result_typ_id)
3064 g.store_reg_to_val(0, val_id)
3065 }
3066 }
3067 }
3068 .call_sret {
3069 g.invalidate_last_store()
3070 // Call with struct return lowered by ABI pass.
3071 // operands: [fn, arg1, arg2, ...], destination is val_id's stack slot.
3072 num_args := instr.operands.len - 1
3073 trace_call := g.env_trace_call.len > 0
3074 && (g.env_trace_call == '*' || g.cur_func_name == g.env_trace_call)
3075 if trace_call {
3076 callee_id := instr.operands[0]
3077 mut callee_name := ''
3078 if callee_id > 0 && callee_id < g.mod.values.len {
3079 callee_name = g.mod.values[callee_id].name
3080 }
3081 eprintln('ARM64 CALL_SRET fn=${g.cur_func_name} val=${val_id} callee_id=${callee_id} callee=`${callee_name}` args=${num_args}')
3082 }
3083
3084 // Keep the destination offset for the hidden indirect-return pointer.
3085 // x8 is scratch for argument materialization, so set it only after
3086 // all arguments have been loaded.
3087 result_offset := g.stack_map[val_id]
3088
3089 // Compute register mapping for multi-register struct args.
3090 mut sr_arg_reg_start := []int{len: num_args}
3091 mut sr_arg_reg_cnt := []int{len: num_args}
3092 mut sr_total_reg_slots := 0
3093 for a in 0 .. num_args {
3094 sr_arg_reg_start[a] = sr_total_reg_slots
3095 cnt := g.call_arg_reg_count(instr.operands[a + 1], a, instr)
3096 sr_arg_reg_cnt[a] = cnt
3097 sr_total_reg_slots += cnt
3098 }
3099 sr_num_stack_slots := if sr_total_reg_slots > 8 {
3100 sr_total_reg_slots - 8
3101 } else {
3102 0
3103 }
3104 stack_space := ((sr_num_stack_slots * 8) + 15) & ~0xF
3105 if stack_space > 0 {
3106 g.sp_adjusted = true
3107 g.sp_adjust_amt = stack_space
3108 g.emit_sub_sp(stack_space)
3109 mut stack_idx := 0
3110 for a in 0 .. num_args {
3111 cnt := sr_arg_reg_cnt[a]
3112 start_reg := sr_arg_reg_start[a]
3113 expected_struct_typ := g.call_param_type(instr, a) or { ssa.TypeID(0) }
3114 for ri in 0 .. cnt {
3115 if start_reg + ri < 8 {
3116 continue
3117 }
3118 if cnt > 1 {
3119 g.load_struct_arg_word_to_reg(9, instr.operands[a + 1], ri,
3120 expected_struct_typ, instr.operands[0])
3121 } else {
3122 g.load_call_arg_to_reg(9, instr.operands[a + 1], a, instr)
3123 }
3124 imm12 := u32(stack_idx)
3125 g.emit(asm_str_imm(Reg(9), sp, imm12))
3126 stack_idx++
3127 }
3128 }
3129 }
3130
3131 for a := num_args - 1; a >= 0; a-- {
3132 reg := sr_arg_reg_start[a]
3133 if reg >= 8 && sr_arg_reg_cnt[a] == 1 {
3134 continue
3135 }
3136 if sr_arg_reg_cnt[a] > 1 {
3137 expected_struct_typ := g.call_param_type(instr, a) or { ssa.TypeID(0) }
3138 for ri := sr_arg_reg_cnt[a] - 1; ri >= 0; ri-- {
3139 target_reg := reg + ri
3140 if target_reg >= 8 {
3141 continue
3142 }
3143 g.load_struct_arg_word_to_reg(target_reg, instr.operands[a + 1], ri,
3144 expected_struct_typ, instr.operands[0])
3145 }
3146 } else {
3147 g.load_call_arg_to_reg(reg, instr.operands[a + 1], a, instr)
3148 }
3149 }
3150
3151 // Set x8 to destination address for indirect return.
3152 g.emit_add_fp_imm(8, result_offset)
3153
3154 fn_val := g.mod.values[instr.operands[0]]
3155 if fn_val.name != '' && fn_val.kind in [.unknown, .func_ref] {
3156 // Direct call by symbol.
3157 sym_idx := g.macho.add_undefined('_' + fn_val.name)
3158 g.macho.add_reloc(g.macho.text_data.len, sym_idx, arm64_reloc_branch26, true)
3159 g.emit(asm_bl_reloc())
3160 } else {
3161 // Indirect call through function pointer value.
3162 g.load_fnptr_to_reg(9, instr.operands[0])
3163 g.emit(asm_blr(Reg(9)))
3164 }
3165
3166 if stack_space > 0 {
3167 if stack_space <= 0xFFF {
3168 g.emit(asm_add_imm(sp, sp, u32(stack_space)))
3169 } else {
3170 g.emit_mov_imm(10, u64(stack_space))
3171 g.emit(asm_add_sp_reg(Reg(10)))
3172 }
3173 g.sp_adjusted = false
3174 g.sp_adjust_amt = 0
3175 }
3176 }
3177 .ret {
3178 if instr.operands.len > 0 {
3179 mut ret_val_id := instr.operands[0]
3180 mut ret_val_typ := g.mod.values[ret_val_id].typ
3181 mut ret_typ := g.mod.type_store.types[ret_val_typ]
3182 trace_ret := g.env_trace_ret.len > 0
3183 && (g.env_trace_ret == '*' || g.cur_func_name == g.env_trace_ret)
3184
3185 // Get the function's declared return type
3186 fn_ret_type := g.cur_func_ret_type
3187 fn_ret_typ := g.mod.type_store.types[fn_ret_type]
3188 fn_ret_size := g.type_size(fn_ret_type)
3189 mut ret_val_size := g.type_size(ret_val_typ)
3190 if trace_ret {
3191 eprintln('ARM64 RET fn=${g.cur_func_name} ret_val=${ret_val_id} rtyp=${ret_val_typ}/${ret_typ.kind} rsz=${ret_val_size} fn_typ=${fn_ret_type}/${fn_ret_typ.kind} fn_sz=${fn_ret_size} roff=${g.stack_map[ret_val_id]}')
3192 }
3193 // Sumtype wrapper returns must produce `{_tag, _data}`.
3194 // When the lowered return value is a payload/pointer/etc., recover
3195 // the originating wrapper from the unwrapped value chain.
3196 if fn_ret_typ.kind == .struct_t && g.is_sumtype_wrapper_struct_type(fn_ret_type)
3197 && fn_ret_size > 0 && fn_ret_size <= 16 && ret_val_typ != fn_ret_type {
3198 if wrapper_id := g.sumtype_wrapper_source_from_unwrapped_value(ret_val_id,
3199 fn_ret_type, 0)
3200 {
3201 ret_val_id = wrapper_id
3202 ret_val_typ = g.mod.values[ret_val_id].typ
3203 ret_typ = g.mod.type_store.types[ret_val_typ]
3204 ret_val_size = g.type_size(ret_val_typ)
3205 }
3206 }
3207 // Some lowered return paths re-wrap an already-optional `types.Type`
3208 // value as `Type(OptionType{ base_type: <Type> })`. For `return inner()`
3209 // this turns `none` into a fake `some`. Recover the original wrapper.
3210 if fn_ret_typ.kind == .struct_t && g.is_sumtype_wrapper_struct_type(fn_ret_type) {
3211 if forwarded_wrapper_id := g.forwarded_optiontype_wrapper_return_source(ret_val_id,
3212 fn_ret_type)
3213 {
3214 if trace_ret {
3215 eprintln('ARM64 RET fn=${g.cur_func_name} rewrite=forward_option_wrapper from=${ret_val_id} to=${forwarded_wrapper_id}')
3216 }
3217 ret_val_id = forwarded_wrapper_id
3218 ret_val_typ = g.mod.values[ret_val_id].typ
3219 ret_typ = g.mod.type_store.types[ret_val_typ]
3220 ret_val_size = g.type_size(ret_val_typ)
3221 }
3222 }
3223
3224 // Check if we're returning a pointer but the function expects a struct
3225 // This happens when returning local struct variables (expr_init returns pointers)
3226 mut is_indirect_struct_return := false
3227 if ret_typ.kind == .ptr_t && fn_ret_typ.kind == .struct_t {
3228 elem_type := ret_typ.elem_type
3229 if elem_type == fn_ret_type {
3230 is_indirect_struct_return = true
3231 }
3232 }
3233
3234 // For large struct returns (> 16 bytes), use indirect return via x8
3235 // The caller provides the destination address in x8
3236 if fn_ret_typ.kind == .struct_t && fn_ret_size > 16 {
3237 // Restore x8 from the saved location (fp-relative)
3238 if g.x8_save_offset != 0 {
3239 g.emit_ldr_reg_offset(8, 29, g.x8_save_offset)
3240 }
3241
3242 // Check if returning a zero/none value (e.g., `return 0` from `return none`).
3243 // In this case, zero-fill the return area instead of trying to copy
3244 // from address 0 (which would be a null pointer dereference).
3245 is_zero_const := g.mod.values[ret_val_id].kind == .constant
3246 && g.mod.values[ret_val_id].name == '0'
3247 if is_zero_const {
3248 num_fields := (fn_ret_size + 7) / 8
3249 for i in 0 .. num_fields {
3250 // STR xzr, [x8, #i*8]
3251 g.emit(asm_str_imm(Reg(31), Reg(8), u32(i)))
3252 }
3253 } else {
3254 // string_literal values need to be materialized on the stack
3255 // before we can copy them to the return pointer.
3256 if g.mod.values[ret_val_id].kind == .string_literal {
3257 g.load_val_to_reg(9, ret_val_id)
3258 }
3259
3260 // Get the source address of the struct
3261 if is_indirect_struct_return {
3262 // Return value is a pointer to struct - use it as source
3263 g.load_val_to_reg(9, ret_val_id)
3264 } else if ret_offset := g.stack_map[ret_val_id] {
3265 if g.large_struct_stack_value_is_pointer(ret_val_id) {
3266 // Some large-struct temporaries are represented as pointers in stack slots.
3267 g.emit_ldr_reg_offset(9, 29, ret_offset)
3268 } else {
3269 // Struct is materialized by value on stack.
3270 g.emit_add_fp_imm(9, ret_offset)
3271 }
3272 } else {
3273 // Fallback
3274 g.load_val_to_reg(9, ret_val_id)
3275 }
3276 // Copy struct from [x9] to [x8] (x8 was restored from saved location)
3277 num_fields := (fn_ret_size + 7) / 8
3278 for i in 0 .. num_fields {
3279 // LDR x10, [x9, #i*8]
3280 g.emit(asm_ldr_imm(Reg(10), Reg(9), u32(i)))
3281 // STR x10, [x8, #i*8]
3282 g.emit(asm_str_imm(Reg(10), Reg(8), u32(i)))
3283 }
3284 }
3285 } else if (ret_typ.kind == .struct_t && g.type_size(ret_val_typ) > 8)
3286 || is_indirect_struct_return
3287 || (fn_ret_typ.kind == .struct_t && fn_ret_size > 8 && fn_ret_size <= 16) {
3288 // Small struct (≤ 16 bytes) - return in registers x0