v / vlib / v2 / ssa / optimize / phi.v
661 lines · 603 sloc · 18.38 KB · 9e3e02d5ea5b1cb300b5f5304cd821f47337b008
Raw
1// Copyright (c) 2026 Alexander Medvednikov. All rights reserved.
2// Use of this source code is governed by an MIT license
3// that can be found in the LICENSE file.
4
5module optimize
6
7import v2.ssa
8
9// --- Simplify Phi Nodes ---
10// Remove trivial phi nodes where all operands are the same or self-referential.
11// A phi is trivial if all its non-self operands resolve to the same value.
12// This reduces unnecessary instructions before phi elimination.
13fn simplify_phi_nodes(mut m ssa.Module) bool {
14 mut any_changed := false
15 mut changed := true
16 for changed {
17 changed = false
18 for fi in 0 .. m.funcs.len {
19 func := m.funcs[fi]
20 n_func_blocks := func.blocks.len
21 for fbi in 0 .. n_func_blocks {
22 blk_id := func.blocks[fbi]
23 blk := m.blocks[blk_id]
24 n_blk_instrs := blk.instrs.len
25 for ii in 0 .. n_blk_instrs {
26 val_id := blk.instrs[ii]
27 val := m.values[val_id]
28 if val.kind != .instruction {
29 continue
30 }
31 instr := m.instrs[val.index]
32 if instr.op != .phi {
33 continue
34 }
35
36 // Dead phi removal: phi with no uses can be removed
37 if val.uses.len == 0 {
38 // Remove uses from this phi's operands
39 for i := 0; i < instr.operands.len; i += 2 {
40 op_val := instr.operands[i]
41 if op_val != val_id { // Don't try to remove self-reference
42 remove_phi_use(mut m, op_val, val_id)
43 }
44 }
45 // Mark phi as dead
46 // Avoid m.instrs[X].field = ... -- chained field assign broken in ARM64 self-hosted
47 mut dead_phi := m.instrs[val.index]
48 dead_phi.op = .bitcast
49 dead_phi.operands = []
50 m.instrs[val.index] = dead_phi
51 changed = true
52 any_changed = true
53 continue
54 }
55
56 // Check if phi is trivial (all non-self operands are the same)
57 mut unique_val := -1
58 mut is_trivial := true
59
60 for i := 0; i < instr.operands.len; i += 2 {
61 op_val := instr.operands[i]
62 // Skip self-references (phi refers to itself)
63 if op_val == val_id {
64 continue
65 }
66 if unique_val == -1 {
67 unique_val = op_val
68 } else if unique_val != op_val {
69 is_trivial = false
70 break
71 }
72 }
73
74 // If trivial and we found a unique value, replace all uses
75 if is_trivial && unique_val != -1 {
76 // Replace all uses of this phi with the unique value
77 m.replace_uses(val_id, unique_val)
78 // Mark phi as dead (will be cleaned up by DCE or ignored)
79 // Avoid m.instrs[X].field = ... -- chained field assign broken in ARM64 self-hosted
80 mut triv_phi := m.instrs[val.index]
81 triv_phi.op = .bitcast
82 triv_phi.operands = []
83 m.instrs[val.index] = triv_phi
84 changed = true
85 any_changed = true
86 }
87 }
88 }
89 }
90 }
91 return any_changed
92}
93
94// Helper to remove a use from a value's uses list (for dead phi removal)
95fn remove_phi_use(mut m ssa.Module, val_id int, user_id int) {
96 if val_id >= m.values.len {
97 return
98 }
99 mut val := &m.values[val_id]
100 for i := val.uses.len - 1; i >= 0; i-- {
101 if val.uses[i] == user_id {
102 val.uses.delete(i)
103 }
104 }
105}
106
107// get_block_val_id returns the value ID for a block, using a pre-built lookup table.
108// This avoids m.blocks[bid].val_id which can return wrong values for the large
109// BasicBlock struct (168 bytes) in ARM64-compiled binaries.
110fn get_block_val_id(_m &ssa.Module, bid int, block_val_ids []int) int {
111 if bid >= 0 && bid < block_val_ids.len {
112 return block_val_ids[bid]
113 }
114 return 0
115}
116
117// build_block_val_ids scans values for basic_block kind and builds block_id → val_id mapping.
118fn build_block_val_ids(m &ssa.Module) []int {
119 n_blks := m.blocks.len
120 mut block_val_ids := []int{len: n_blks}
121 n_vals := m.values.len
122 for vi in 0 .. n_vals {
123 if m.values[vi].kind == .basic_block {
124 bid := m.values[vi].index
125 if bid >= 0 && bid < n_blks {
126 block_val_ids[bid] = vi
127 }
128 }
129 }
130 return block_val_ids
131}
132
133// --- Critical Edge Splitting ---
134fn split_critical_edges(mut m ssa.Module) {
135 mut cfg := build_cfg(mut m)
136
137 for fi in 0 .. m.funcs.len {
138 mut new_blocks := []ssa.BlockID{}
139
140 // Collect edges to split (can't modify while iterating)
141 mut edges_to_split := [][]ssa.BlockID{} // [pred_id, succ_id]
142
143 // Find all critical edges
144 func := m.funcs[fi]
145 func_blocks_len := func.blocks.len
146 for bi in 0 .. func_blocks_len {
147 blk_id := func.blocks[bi]
148 if blk_id < 0 || blk_id >= m.blocks.len {
149 continue
150 }
151 n_blk_succs := cfg.succs[blk_id].len
152 if n_blk_succs > 1 {
153 for si in 0 .. n_blk_succs {
154 succ_id := cfg.succs[blk_id][si]
155 if succ_id < 0 || succ_id >= m.blocks.len {
156 continue
157 }
158 if cfg.preds[succ_id].len > 1 {
159 edges_to_split << [blk_id, succ_id]
160 }
161 }
162 }
163 }
164
165 // Split each critical edge
166 n_edges := edges_to_split.len
167 for ei in 0 .. n_edges {
168 pred_id := edges_to_split[ei][0]
169 succ_id := edges_to_split[ei][1]
170
171 if pred_id < 0 || pred_id >= m.blocks.len || succ_id < 0 || succ_id >= m.blocks.len {
172 continue
173 }
174
175 // Create new intermediate block
176 func2 := m.funcs[fi]
177 split_blk := m.add_block(func2.id, 'split_${pred_id}_${succ_id}')
178 new_blocks << split_blk
179
180 // Re-build block_val_ids after adding the split block
181 // (new blocks get new val_ids that need to be in the table)
182 block_val_ids := build_block_val_ids(m)
183
184 succ_val := get_block_val_id(m, succ_id, block_val_ids)
185 // Add unconditional jump from split block to original successor
186 m.add_instr(.jmp, split_blk, 0, [succ_val])
187
188 // Update predecessor's terminator to jump to split block instead of successor
189 pred_blk := m.blocks[pred_id]
190 if pred_blk.instrs.len > 0 {
191 term_val_id := pred_blk.instrs[pred_blk.instrs.len - 1]
192 if term_val_id >= 0 && term_val_id < m.values.len {
193 term_val := m.values[term_val_id]
194 mut term := &m.instrs[term_val.index]
195
196 old_succ_val := succ_val
197 new_succ_val := get_block_val_id(m, split_blk, block_val_ids)
198
199 // Replace ALL occurrences (handles switch with duplicate targets)
200 for i in 0 .. term.operands.len {
201 if term.operands[i] == old_succ_val {
202 term.operands[i] = new_succ_val
203 }
204 }
205 }
206 }
207
208 // Update phi nodes in successor to reference split block instead of pred
209 succ_blk2 := m.blocks[succ_id]
210 n_succ_instrs := succ_blk2.instrs.len
211 for vi in 0 .. n_succ_instrs {
212 val_id := succ_blk2.instrs[vi]
213 if val_id < 0 || val_id >= m.values.len {
214 continue
215 }
216 val := m.values[val_id]
217 if val.kind != .instruction {
218 continue
219 }
220 idx := val.index
221 if idx < 0 || idx >= m.instrs.len {
222 continue
223 }
224 mut instr := &m.instrs[idx]
225 if instr.op == .phi {
226 old_pred_val := get_block_val_id(m, pred_id, block_val_ids)
227 new_pred_val := get_block_val_id(m, split_blk, block_val_ids)
228 // Replace all occurrences (defensive - handles edge cases)
229 for i := 1; i < instr.operands.len; i += 2 {
230 if instr.operands[i] == old_pred_val {
231 instr.operands[i] = new_pred_val
232 }
233 }
234 }
235 }
236 }
237 }
238
239 // Rebuild CFG after splitting
240 cfg = build_cfg(mut m)
241 _ = cfg
242}
243
244fn eliminate_phi_nodes(mut m ssa.Module) {
245 // First split critical edges to ensure correct copy placement
246 split_critical_edges(mut m)
247
248 n_blocks := m.blocks.len
249 n_funcs := m.funcs.len
250
251 // Build reverse map: val_id → block_id for block-kind values.
252 // Scan values array for basic_block kind instead of using m.blocks[bid].val_id
253 // which returns wrong results in ARM64-compiled binaries (large struct field access bug).
254 mut val_to_block := []int{len: m.values.len, init: -1}
255 n_vals := m.values.len
256 for vi in 0 .. n_vals {
257 if m.values[vi].kind == .basic_block {
258 bid := m.values[vi].index
259 if bid >= 0 && bid < n_blocks {
260 val_to_block[vi] = bid
261 }
262 }
263 }
264
265 // Use flat parallel arrays indexed by block_id instead of [][]ParallelCopy.
266 // pred_copy_dests[blk_id] and pred_copy_srcs[blk_id] store the dest/src pairs.
267 mut pred_copy_dests := [][]int{len: n_blocks}
268 mut pred_copy_srcs := [][]int{len: n_blocks}
269 mut pred_copy_blocks := []int{}
270
271 // Pre-allocate shared src_ref_count array for resolve_parallel_copies_flat.
272 // Sized to m.values.len + headroom for temps. Reused across all calls.
273 mut src_ref_count := []int{len: m.values.len + 1024}
274 mut touched_ids := []int{cap: 256}
275
276 for fi in 0 .. n_funcs {
277 func := m.funcs[fi]
278 func_blocks_len2 := func.blocks.len
279 for bi2 in 0 .. func_blocks_len2 {
280 blk_id := func.blocks[bi2]
281 if blk_id < 0 || blk_id >= m.blocks.len {
282 continue
283 }
284 blk := m.blocks[blk_id]
285 n_instrs := blk.instrs.len
286 for ii in 0 .. n_instrs {
287 val_id := blk.instrs[ii]
288 if val_id < 0 || val_id >= m.values.len {
289 continue
290 }
291 val := m.values[val_id]
292 if val.kind != .instruction {
293 continue
294 }
295 idx := val.index
296 if idx < 0 || idx >= m.instrs.len {
297 continue
298 }
299 instr := m.instrs[idx]
300 if instr.op == .phi {
301 is_do_loop := func.name == 'do_loop'
302 if is_do_loop {
303 eprintln(' do_loop phi: val=${val_id} blk=${blk_id} nops=${instr.operands.len}')
304 }
305 // Phi operands: [val0, blk0, val1, blk1, ...]
306 for i := 0; i < instr.operands.len; i += 2 {
307 if i + 1 >= instr.operands.len {
308 break
309 }
310 val_in := instr.operands[i]
311 blk_val := instr.operands[i + 1]
312 if blk_val < 0 || blk_val >= m.values.len {
313 if is_do_loop {
314 eprintln(' op[${i}]: val_in=${val_in} blk_val=${blk_val} SKIP(oob)')
315 }
316 continue
317 }
318 // Use val_to_block reverse map instead of blk_v.index
319 pred_blk_idx := val_to_block[blk_val]
320 if is_do_loop {
321 blk_kind := m.values[blk_val].kind
322 blk_index := m.values[blk_val].index
323 eprintln(' op[${i}]: val_in=${val_in} blk_val=${blk_val} kind=${blk_kind} index=${blk_index} pred_blk_idx=${pred_blk_idx}')
324 }
325
326 if pred_blk_idx >= 0 && pred_blk_idx < n_blocks {
327 if pred_copy_dests[pred_blk_idx].len == 0 {
328 pred_copy_blocks << pred_blk_idx
329 }
330 mut pcd := pred_copy_dests[pred_blk_idx]
331 pcd << val_id
332 pred_copy_dests[pred_blk_idx] = pcd
333 mut pcs := pred_copy_srcs[pred_blk_idx]
334 pcs << val_in
335 pred_copy_srcs[pred_blk_idx] = pcs
336 }
337 }
338 }
339 }
340 }
341
342 // For each predecessor that has copies, resolve with Briggs algorithm
343 for pki in 0 .. pred_copy_blocks.len {
344 pred_blk := pred_copy_blocks[pki]
345 resolve_parallel_copies_flat(mut m, pred_blk, pred_copy_dests[pred_blk],
346 pred_copy_srcs[pred_blk], mut src_ref_count, mut touched_ids)
347 }
348
349 // Cleanup: clear pred_copies for blocks we touched
350 for pki2 in 0 .. pred_copy_blocks.len {
351 pred_copy_dests[pred_copy_blocks[pki2]] = []
352 pred_copy_srcs[pred_copy_blocks[pki2]] = []
353 }
354 pred_copy_blocks.clear()
355
356 // Remove phi instructions (mark as nop/bitcast with no operands)
357 func2 := m.funcs[fi]
358 func_blocks_len3 := func2.blocks.len
359 for bi3 in 0 .. func_blocks_len3 {
360 blk_id := func2.blocks[bi3]
361 if blk_id < 0 || blk_id >= m.blocks.len {
362 continue
363 }
364 blk := m.blocks[blk_id]
365 n_instrs3 := blk.instrs.len
366 for ii3 in 0 .. n_instrs3 {
367 val_id := blk.instrs[ii3]
368 if val_id < 0 || val_id >= m.values.len {
369 continue
370 }
371 val := m.values[val_id]
372 if val.kind != .instruction {
373 continue
374 }
375 idx := val.index
376 if idx < 0 || idx >= m.instrs.len {
377 continue
378 }
379 if m.instrs[idx].op == .phi {
380 // Avoid m.instrs[X].field = ... -- chained field assign broken in ARM64 self-hosted
381 mut cleanup_instr := m.instrs[idx]
382 cleanup_instr.op = .bitcast
383 cleanup_instr.operands = []
384 m.instrs[idx] = cleanup_instr
385 }
386 }
387 }
388 }
389}
390
391// resolve_parallel_copies_flat sequences a set of parallel copies (dest[i] = src[i])
392// using the Briggs algorithm with worklist-based scheduling.
393// A copy dest←src is "ready" when no other pending copy reads from dest.
394// Uses caller-provided flat array src_ref_count[] indexed by value ID (no maps).
395// touched_ids tracks which entries were modified so we can zero only those on cleanup.
396fn resolve_parallel_copies_flat(mut m ssa.Module, blk_id int, dests []int, srcs []int, mut src_ref_count []int, mut touched_ids []int) {
397 if dests.len == 0 {
398 return
399 }
400
401 n := dests.len
402
403 // Build pending copies, filtering self-copies
404 mut p_dest := []int{cap: n}
405 mut p_src := []int{cap: n}
406 for ci in 0 .. n {
407 if dests[ci] != srcs[ci] {
408 p_dest << dests[ci]
409 p_src << srcs[ci]
410 }
411 }
412
413 np := p_dest.len
414 if np == 0 {
415 return
416 }
417
418 // Ensure src_ref_count is large enough for all value IDs we'll encounter.
419 // Check current max needed against array length.
420 mut need_len := m.values.len + np + 4
421 for i in 0 .. np {
422 if p_dest[i] >= need_len {
423 need_len = p_dest[i] + np + 4
424 }
425 if p_src[i] >= need_len {
426 need_len = p_src[i] + np + 4
427 }
428 }
429 if need_len > src_ref_count.len {
430 // Grow array
431 for _ in 0 .. need_len - src_ref_count.len {
432 src_ref_count << 0
433 }
434 }
435
436 // src_ref_count[val] = how many pending copies use val as source
437 // (array is pre-zeroed or cleaned from previous call via touched_ids)
438 touched_ids.clear()
439 for i in 0 .. np {
440 s := p_src[i]
441 if src_ref_count[s] == 0 {
442 touched_ids << s
443 }
444 src_ref_count[s] = src_ref_count[s] + 1
445 }
446 // Also track dests we'll read from src_ref_count
447 for i in 0 .. np {
448 d := p_dest[i]
449 if src_ref_count[d] == 0 {
450 // dest not in touched_ids yet but we read it; track for safety
451 // (it's already 0, but if cycle-breaking modifies it we need to clean it)
452 }
453 }
454
455 // Use int instead of bool for alive - ARM64 codegen may mishandle []bool{init: true}
456 // alive[i] = 1 if copy i is still pending, 0 if done
457 mut alive := []int{len: np}
458 for ai in 0 .. np {
459 alive[ai] = 1
460 }
461
462 // Worklist: use manual stack pointer instead of .delete() which may be buggy in ARM64
463 mut worklist := []int{len: np + np + 4}
464 mut wl_top := 0 // stack pointer: worklist[0..wl_top] are valid entries
465 for i in 0 .. np {
466 d := p_dest[i]
467 if src_ref_count[d] == 0 {
468 worklist[wl_top] = i
469 wl_top += 1
470 }
471 }
472
473 // Sequenced output copies
474 mut s_dest := []int{cap: np + 4}
475 mut s_src := []int{cap: np + 4}
476
477 mut remaining := np
478 for remaining > 0 {
479 if wl_top > 0 {
480 // Pop a ready copy from the worklist
481 wl_top -= 1
482 idx := worklist[wl_top]
483 if idx < 0 || idx >= np || alive[idx] == 0 {
484 continue
485 }
486 alive[idx] = 0
487 remaining -= 1
488
489 d := p_dest[idx]
490 s := p_src[idx]
491 s_dest << d
492 s_src << s
493
494 // Decrement ref count for the source
495 src_ref_count[s] = src_ref_count[s] - 1
496 if src_ref_count[s] == 0 {
497 // s is no longer used as source; any pending copy with dest==s is now ready
498 for j in 0 .. np {
499 if alive[j] == 1 && p_dest[j] == s {
500 // Grow worklist if needed
501 if wl_top >= worklist.len {
502 worklist << 0
503 }
504 worklist[wl_top] = j
505 wl_top += 1
506 }
507 }
508 }
509 continue
510 }
511
512 // Worklist empty but copies remain → cycle. Break it with a temp.
513 mut ci := -1
514 for k in 0 .. np {
515 if alive[k] == 1 {
516 ci = k
517 break
518 }
519 }
520 if ci < 0 {
521 break
522 }
523
524 cycle_src := p_src[ci]
525 mut typ := 0
526 if cycle_src >= 0 && cycle_src < m.values.len {
527 typ = m.values[cycle_src].typ
528 }
529 if typ == 0 && p_dest[ci] >= 0 && p_dest[ci] < m.values.len {
530 typ = m.values[p_dest[ci]].typ
531 }
532 temp := insert_temp_in_block(mut m, blk_id, cycle_src, typ)
533
534 // Grow src_ref_count if temp exceeds array bounds
535 if temp >= src_ref_count.len {
536 for _ in 0 .. temp + 1 - src_ref_count.len {
537 src_ref_count << 0
538 }
539 }
540
541 // Replace all references to cycle_src with temp in pending copies
542 src_ref_count[temp] = 0
543 touched_ids << temp
544 for i in 0 .. np {
545 if alive[i] == 1 && p_src[i] == cycle_src {
546 p_src[i] = temp
547 src_ref_count[temp] = src_ref_count[temp] + 1
548 }
549 }
550 src_ref_count[cycle_src] = 0
551
552 // Check if any copy whose dest was cycle_src is now ready
553 for j in 0 .. np {
554 if alive[j] == 1 && p_dest[j] == cycle_src {
555 if wl_top >= worklist.len {
556 worklist << 0
557 }
558 worklist[wl_top] = j
559 wl_top += 1
560 }
561 }
562 }
563
564 // Clean up src_ref_count: zero only the entries we touched
565 for ti in 0 .. touched_ids.len {
566 tid := touched_ids[ti]
567 if tid < src_ref_count.len {
568 src_ref_count[tid] = 0
569 }
570 }
571
572 // Emit the sequenced copies
573 for si in 0 .. s_dest.len {
574 insert_copy_in_block(mut m, blk_id, s_dest[si], s_src[si])
575 }
576}
577
578fn insert_temp_in_block(mut m ssa.Module, blk_id int, src int, typ int) int {
579 m.instrs << ssa.Instruction{
580 op: .bitcast
581 block: blk_id
582 typ: typ
583 operands: [ssa.ValueID(src)]
584 }
585 temp_id := m.add_value_node(.instruction, typ, 'phi_tmp_${m.values.len}', m.instrs.len - 1)
586
587 // Read whole struct, modify, write back (chained field assign broken in ARM64)
588 if src < m.values.len && temp_id !in m.values[src].uses {
589 mut sv := m.values[src]
590 sv.uses << temp_id
591 m.values[src] = sv
592 }
593
594 // Insert before terminator — build new instrs array
595 mut blk := m.blocks[blk_id]
596 mut insert_idx := blk.instrs.len
597 if insert_idx > 0 {
598 last_val_id := blk.instrs[blk.instrs.len - 1]
599 last_val := m.values[last_val_id]
600 last_instr := m.instrs[last_val.index]
601 if last_instr.op in [.ret, .br, .jmp, .switch_, .unreachable] {
602 insert_idx = blk.instrs.len - 1
603 }
604 }
605 mut new_instrs := []int{cap: blk.instrs.len + 1}
606 for ii in 0 .. insert_idx {
607 new_instrs << blk.instrs[ii]
608 }
609 new_instrs << temp_id
610 for ii in insert_idx .. blk.instrs.len {
611 new_instrs << blk.instrs[ii]
612 }
613 blk.instrs = new_instrs
614 m.blocks[blk_id] = blk
615 return temp_id
616}
617
618fn insert_copy_in_block(mut m ssa.Module, blk_id int, dest int, src int) {
619 dest_val := m.values[dest]
620 typ := dest_val.typ
621 m.instrs << ssa.Instruction{
622 op: .assign
623 block: blk_id
624 typ: typ
625 operands: [ssa.ValueID(dest), src]
626 }
627 val_id := m.add_value_node(.instruction, typ, 'copy', m.instrs.len - 1)
628 // Read whole struct, modify, write back (chained field assign broken in ARM64)
629 if dest < m.values.len && val_id !in m.values[dest].uses {
630 mut dv := m.values[dest]
631 dv.uses << val_id
632 m.values[dest] = dv
633 }
634 if src < m.values.len && val_id !in m.values[src].uses {
635 mut sv := m.values[src]
636 sv.uses << val_id
637 m.values[src] = sv
638 }
639
640 // Insert before terminator — build new instrs array
641 mut blk := m.blocks[blk_id]
642 mut insert_idx := blk.instrs.len
643 if insert_idx > 0 {
644 last_val_id := blk.instrs[blk.instrs.len - 1]
645 last_val := m.values[last_val_id]
646 last_instr := m.instrs[last_val.index]
647 if last_instr.op in [.ret, .br, .jmp, .switch_, .unreachable] {
648 insert_idx = blk.instrs.len - 1
649 }
650 }
651 mut new_instrs := []int{cap: blk.instrs.len + 1}
652 for ii in 0 .. insert_idx {
653 new_instrs << blk.instrs[ii]
654 }
655 new_instrs << val_id
656 for ii in insert_idx .. blk.instrs.len {
657 new_instrs << blk.instrs[ii]
658 }
659 blk.instrs = new_instrs
660 m.blocks[blk_id] = blk
661}
662