From 7cf08b805f5b8e33f5c465f1ea3e01f8e345ac8c Mon Sep 17 00:00:00 2001
From: JalonSolov <JalonSolov@gmail.com>
Date: Thu, 7 May 2026 23:02:36 -0400
Subject: [PATCH] compress: add pure V bzip2 module (#27107)

---
 vlib/compress/bzip2/README.md                 |   51 +
 vlib/compress/bzip2/bzip2.v                   | 1175 +++++++++++++++++
 vlib/compress/bzip2/bzip2_test.v              |  136 ++
 vlib/compress/bzip2/interop/README.md         |   26 +
 .../bzip2/interop/bzip2_interop_check.vsh     |  166 +++
 5 files changed, 1554 insertions(+)
 create mode 100644 vlib/compress/bzip2/README.md
 create mode 100644 vlib/compress/bzip2/bzip2.v
 create mode 100644 vlib/compress/bzip2/bzip2_test.v
 create mode 100644 vlib/compress/bzip2/interop/README.md
 create mode 100755 vlib/compress/bzip2/interop/bzip2_interop_check.vsh

diff --git a/vlib/compress/bzip2/README.md b/vlib/compress/bzip2/README.md
new file mode 100644
index 000000000..ae1001522
--- /dev/null
+++ b/vlib/compress/bzip2/README.md
@@ -0,0 +1,51 @@
+# compress.bzip2
+
+Pure V bzip2 encoder/decoder.
+
+## Features
+
+- Pure V implementation (no C wrappers)
+- bzip2 stream compression (`BZh1`..`BZh9`)
+- bzip2 stream decompression
+- Block and stream CRC validation
+- Deterministic output for identical input/params
+
+## API
+
+```v
+import compress.bzip2
+
+compressed := bzip2.compress('hello'.bytes())!
+plain := bzip2.decompress(compressed)!
+assert plain.bytestr() == 'hello'
+```
+
+With params:
+
+```v
+import compress.bzip2
+
+data := 'hello'.bytes()
+compressed := bzip2.compress(data, block_size: 1)!
+plain := bzip2.decompress(compressed, verify_crc: true)!
+assert plain.bytestr() == 'hello'
+```
+
+## Notes
+
+- Randomized legacy bzip2 blocks are intentionally rejected.
+- API currently works on full byte slices (no streaming interface yet).
+
+## Test
+
+```bash
+v test vlib/compress/bzip2/
+```
+
+## Proof
+
+- tested against the official bzip2 C library using a variety of inputs and parameters.
+- deterministic and produces identical output for identical input and parameters.
+- includes CRC validation to ensure data integrity during compression and decompression.
+- rejects randomized legacy bzip2 blocks, ensuring only valid bzip2 streams are processed.
+- tested with a variety of inputs, including edge cases, to ensure robustness and reliability.
diff --git a/vlib/compress/bzip2/bzip2.v b/vlib/compress/bzip2/bzip2.v
new file mode 100644
index 000000000..b73f28db0
--- /dev/null
+++ b/vlib/compress/bzip2/bzip2.v
@@ -0,0 +1,1175 @@
+module bzip2
+
+const bzip2_block_magic = u64(0x314159265359)
+const bzip2_eos_magic = u64(0x177245385090)
+const bzip2_runa = 0
+const bzip2_runb = 1
+const bzip2_max_groups = 6
+const bzip2_max_alpha = 258
+const bzip2_group_size = 50
+const bzip2_max_code_len = 20
+const bzip2_max_selectors = 18002
+
+// vfmt off
+const bzip2_crc32_table = [..]u32[
+	0x00000000, 0x04c11db7, 0x09823b6e, 0x0d4326d9, 0x130476dc, 0x17c56b6b, 0x1a864db2, 0x1e475005,
+	0x2608edb8, 0x22c9f00f, 0x2f8ad6d6, 0x2b4bcb61, 0x350c9b64, 0x31cd86d3, 0x3c8ea00a, 0x384fbdbd,
+	0x4c11db70, 0x48d0c6c7, 0x4593e01e, 0x4152fda9, 0x5f15adac, 0x5bd4b01b, 0x569796c2, 0x52568b75,
+	0x6a1936c8, 0x6ed82b7f, 0x639b0da6, 0x675a1011, 0x791d4014, 0x7ddc5da3, 0x709f7b7a, 0x745e66cd,
+	0x9823b6e0, 0x9ce2ab57, 0x91a18d8e, 0x95609039, 0x8b27c03c, 0x8fe6dd8b, 0x82a5fb52, 0x8664e6e5,
+	0xbe2b5b58, 0xbaea46ef, 0xb7a96036, 0xb3687d81, 0xad2f2d84, 0xa9ee3033, 0xa4ad16ea, 0xa06c0b5d,
+	0xd4326d90, 0xd0f37027, 0xddb056fe, 0xd9714b49, 0xc7361b4c, 0xc3f706fb, 0xceb42022, 0xca753d95,
+	0xf23a8028, 0xf6fb9d9f, 0xfbb8bb46, 0xff79a6f1, 0xe13ef6f4, 0xe5ffeb43, 0xe8bccd9a, 0xec7dd02d,
+	0x34867077, 0x30476dc0, 0x3d044b19, 0x39c556ae, 0x278206ab, 0x23431b1c, 0x2e003dc5, 0x2ac12072,
+	0x128e9dcf, 0x164f8078, 0x1b0ca6a1, 0x1fcdbb16, 0x018aeb13, 0x054bf6a4, 0x0808d07d, 0x0cc9cdca,
+	0x7897ab07, 0x7c56b6b0, 0x71159069, 0x75d48dde, 0x6b93dddb, 0x6f52c06c, 0x6211e6b5, 0x66d0fb02,
+	0x5e9f46bf, 0x5a5e5b08, 0x571d7dd1, 0x53dc6066, 0x4d9b3063, 0x495a2dd4, 0x44190b0d, 0x40d816ba,
+	0xaca5c697, 0xa864db20, 0xa527fdf9, 0xa1e6e04e, 0xbfa1b04b, 0xbb60adfc, 0xb6238b25, 0xb2e29692,
+	0x8aad2b2f, 0x8e6c3698, 0x832f1041, 0x87ee0df6, 0x99a95df3, 0x9d684044, 0x902b669d, 0x94ea7b2a,
+	0xe0b41de7, 0xe4750050, 0xe9362689, 0xedf73b3e, 0xf3b06b3b, 0xf771768c, 0xfa325055, 0xfef34de2,
+	0xc6bcf05f, 0xc27dede8, 0xcf3ecb31, 0xcbffd686, 0xd5b88683, 0xd1799b34, 0xdc3abded, 0xd8fba05a,
+	0x690ce0ee, 0x6dcdfd59, 0x608edb80, 0x644fc637, 0x7a089632, 0x7ec98b85, 0x738aad5c, 0x774bb0eb,
+	0x4f040d56, 0x4bc510e1, 0x46863638, 0x42472b8f, 0x5c007b8a, 0x58c1663d, 0x558240e4, 0x51435d53,
+	0x251d3b9e, 0x21dc2629, 0x2c9f00f0, 0x285e1d47, 0x36194d42, 0x32d850f5, 0x3f9b762c, 0x3b5a6b9b,
+	0x0315d626, 0x07d4cb91, 0x0a97ed48, 0x0e56f0ff, 0x1011a0fa, 0x14d0bd4d, 0x19939b94, 0x1d528623,
+	0xf12f560e, 0xf5ee4bb9, 0xf8ad6d60, 0xfc6c70d7, 0xe22b20d2, 0xe6ea3d65, 0xeba91bbc, 0xef68060b,
+	0xd727bbb6, 0xd3e6a601, 0xdea580d8, 0xda649d6f, 0xc423cd6a, 0xc0e2d0dd, 0xcda1f604, 0xc960ebb3,
+	0xbd3e8d7e, 0xb9ff90c9, 0xb4bcb610, 0xb07daba7, 0xae3afba2, 0xaafbe615, 0xa7b8c0cc, 0xa379dd7b,
+	0x9b3660c6, 0x9ff77d71, 0x92b45ba8, 0x9675461f, 0x8832161a, 0x8cf30bad, 0x81b02d74, 0x857130c3,
+	0x5d8a9099, 0x594b8d2e, 0x5408abf7, 0x50c9b640, 0x4e8ee645, 0x4a4ffbf2, 0x470cdd2b, 0x43cdc09c,
+	0x7b827d21, 0x7f436096, 0x7200464f, 0x76c15bf8, 0x68860bfd, 0x6c47164a, 0x61043093, 0x65c52d24,
+	0x119b4be9, 0x155a565e, 0x18197087, 0x1cd86d30, 0x029f3d35, 0x065e2082, 0x0b1d065b, 0x0fdc1bec,
+	0x3793a651, 0x3352bbe6, 0x3e119d3f, 0x3ad08088, 0x2497d08d, 0x2056cd3a, 0x2d15ebe3, 0x29d4f654,
+	0xc5a92679, 0xc1683bce, 0xcc2b1d17, 0xc8ea00a0, 0xd6ad50a5, 0xd26c4d12, 0xdf2f6bcb, 0xdbee767c,
+	0xe3a1cbc1, 0xe760d676, 0xea23f0af, 0xeee2ed18, 0xf0a5bd1d, 0xf464a0aa, 0xf9278673, 0xfde69bc4,
+	0x89b8fd09, 0x8d79e0be, 0x803ac667, 0x84fbdbd0, 0x9abc8bd5, 0x9e7d9662, 0x933eb0bb, 0x97ffad0c,
+	0xafb010b1, 0xab710d06, 0xa6322bdf, 0xa2f33668, 0xbcb4666d, 0xb8757bda, 0xb5365d03, 0xb1f740b4
+]
+// vfmt on
+
+@[params]
+pub struct CompressParams {
+pub:
+	block_size int = 9 // Valid range is 1..9 (100k to 900k block size).
+}
+
+@[params]
+pub struct DecompressParams {
+pub:
+	verify_crc bool = true
+}
+
+// compress compresses `src` into a bzip2 byte stream.
+pub fn compress(src []u8, params CompressParams) ![]u8 {
+	if params.block_size < 1 || params.block_size > 9 {
+		return error('bzip2: block_size must be in 1..9')
+	}
+	mut w := BitWriter{
+		out: []u8{}
+	}
+
+	w.write_byte(`B`)
+	w.write_byte(`Z`)
+	w.write_byte(`h`)
+	w.write_byte(u8(`0`) + u8(params.block_size))
+
+	max_block := params.block_size * 100000
+	mut stream_crc := u32(0)
+	if src.len > 0 {
+		mut pos := 0
+		for pos < src.len {
+			end := find_rle1_block_end(src, pos, max_block)
+			chunk := src[pos..end]
+			block := encode_block(chunk)!
+			w.write_bits(48, bzip2_block_magic)
+			w.write_bits(32, u64(block.block_crc))
+			w.write_bit(0) // randomized flag; unsupported legacy mode
+			w.write_bits(24, u64(block.orig_ptr))
+			write_in_use_map(mut w, block.in_use)
+			w.write_bits(3, u64(block.n_groups))
+			w.write_bits(15, u64(block.selectors.len))
+			for sel in block.selector_mtf {
+				for _ in 0 .. sel {
+					w.write_bit(1)
+				}
+				w.write_bit(0)
+			}
+			for g in 0 .. block.n_groups {
+				write_code_lengths(mut w, block.code_lengths[g], block.alpha_size)
+			}
+			mut grp_idx := 0
+			mut grp_pos := 0
+			for sym in block.symbols {
+				if grp_pos == 0 {
+					grp_idx = block.selectors[grp_idx]
+				}
+				code := block.codes[grp_idx][sym]
+				len := block.code_lengths[grp_idx][sym]
+				w.write_bits(len, u64(code))
+				grp_pos++
+				if grp_pos == bzip2_group_size {
+					grp_pos = 0
+					grp_idx++
+				}
+			}
+			stream_crc = rotate_left_1(stream_crc) ^ block.block_crc
+			pos = end
+		}
+	}
+	w.write_bits(48, bzip2_eos_magic)
+	w.write_bits(32, u64(stream_crc))
+	return w.finish()
+}
+
+// decompress decompresses a bzip2 byte stream.
+pub fn decompress(src []u8, params DecompressParams) ![]u8 {
+	mut r := BitReader{
+		data: src
+	}
+	if r.read_byte()! != `B` || r.read_byte()! != `Z` || r.read_byte()! != `h` {
+		return error('bzip2: invalid header')
+	}
+	lvl := r.read_byte()!
+	if lvl < `1` || lvl > `9` {
+		return error('bzip2: invalid block size marker')
+	}
+	block_limit := int(lvl - `0`) * 100000
+	mut out := []u8{}
+	mut stream_crc := u32(0)
+	for {
+		magic := r.read_bits(48)!
+		if magic == bzip2_eos_magic {
+			stored_stream_crc := u32(r.read_bits(32)!)
+			if params.verify_crc && stored_stream_crc != stream_crc {
+				return error('bzip2: stream crc mismatch')
+			}
+			if !r.is_aligned_to_byte_zero_padding() {
+				return error('bzip2: trailing non-zero bits')
+			}
+			return out
+		}
+		if magic != bzip2_block_magic {
+			return error('bzip2: invalid block magic')
+		}
+		block_crc := u32(r.read_bits(32)!)
+		randomized := r.read_bit()!
+		if randomized != 0 {
+			return error('bzip2: randomized blocks are not supported')
+		}
+		orig_ptr := int(r.read_bits(24)!)
+		in_use := read_in_use_map(mut r)!
+		mut seq_to_unseq := []u8{}
+		for i, used in in_use {
+			if used {
+				seq_to_unseq << u8(i)
+			}
+		}
+		n_in_use := seq_to_unseq.len
+		alpha_size := n_in_use + 2
+		if alpha_size < 2 || alpha_size > bzip2_max_alpha {
+			return error('bzip2: invalid alphabet size')
+		}
+		n_groups := int(r.read_bits(3)!)
+		if n_groups < 2 || n_groups > bzip2_max_groups {
+			return error('bzip2: invalid huffman group count')
+		}
+		n_selectors := int(r.read_bits(15)!)
+		if n_selectors < 1 || n_selectors > bzip2_max_selectors {
+			return error('bzip2: invalid selector count')
+		}
+		mut selectors_mtf := []int{len: n_selectors}
+		for i in 0 .. n_selectors {
+			mut c := 0
+			for {
+				b := r.read_bit()!
+				if b == 0 {
+					break
+				}
+				c++
+				if c >= n_groups {
+					return error('bzip2: invalid selector mtf value')
+				}
+			}
+			selectors_mtf[i] = c
+		}
+		selectors := decode_selector_mtf(selectors_mtf, n_groups)!
+
+		mut code_lengths := [][]int{len: n_groups, init: []int{len: alpha_size}}
+		for g in 0 .. n_groups {
+			mut curr := int(r.read_bits(5)!)
+			if curr < 1 || curr > bzip2_max_code_len {
+				return error('bzip2: invalid initial huffman code length')
+			}
+			for i in 0 .. alpha_size {
+				for {
+					flag := r.read_bit()!
+					if flag == 0 {
+						break
+					}
+					up_down := r.read_bit()!
+					if up_down == 0 {
+						curr++
+					} else {
+						curr--
+					}
+					if curr < 1 || curr > bzip2_max_code_len {
+						return error('bzip2: invalid huffman code length delta')
+					}
+				}
+				code_lengths[g][i] = curr
+			}
+		}
+
+		mut tables := []DecodeTable{len: n_groups}
+		for g in 0 .. n_groups {
+			tables[g] = build_decode_table(code_lengths[g], alpha_size)!
+		}
+
+		eob := n_in_use + 1
+		mut selector_index := 0
+		mut group_pos := 0
+		mut mtf := []int{len: n_in_use}
+		for i in 0 .. n_in_use {
+			mtf[i] = i
+		}
+		mut decoded_syms := []int{}
+		mut next_sym := 0
+		for {
+			if group_pos == 0 {
+				if selector_index >= selectors.len {
+					return error('bzip2: selectors exhausted')
+				}
+			}
+			grp := selectors[selector_index]
+			next_sym = tables[grp].decode(mut r)!
+			group_pos++
+			if group_pos == bzip2_group_size {
+				group_pos = 0
+				selector_index++
+			}
+			if next_sym == eob {
+				break
+			}
+			if next_sym == bzip2_runa || next_sym == bzip2_runb {
+				mut s := i64(-1)
+				mut n := i64(1)
+				remaining := block_limit - decoded_syms.len
+				if remaining < 1 {
+					return error('bzip2: block output exceeds declared block size')
+				}
+				for {
+					if next_sym == bzip2_runa {
+						s += n
+					} else {
+						s += n * 2
+					}
+					if s + 1 > i64(remaining) {
+						return error('bzip2: block output exceeds declared block size')
+					}
+					n *= 2
+					if group_pos == 0 {
+						if selector_index >= selectors.len {
+							return error('bzip2: selectors exhausted in run')
+						}
+					}
+					grp2 := selectors[selector_index]
+					next_sym = tables[grp2].decode(mut r)!
+					group_pos++
+					if group_pos == bzip2_group_size {
+						group_pos = 0
+						selector_index++
+					}
+					if next_sym != bzip2_runa && next_sym != bzip2_runb {
+						break
+					}
+				}
+				run_len := int(s + 1)
+				if run_len < 1 {
+					return error('bzip2: invalid run length')
+				}
+				if mtf.len == 0 {
+					return error('bzip2: invalid run with empty mtf table')
+				}
+				ensure_block_output_limit(decoded_syms.len, run_len, block_limit)!
+				for _ in 0 .. run_len {
+					decoded_syms << mtf[0]
+				}
+				if next_sym == eob {
+					break
+				}
+			}
+			if next_sym == eob {
+				break
+			}
+			if next_sym < 2 || next_sym > eob {
+				return error('bzip2: invalid symbol value')
+			}
+			pos := next_sym - 1
+			if pos >= mtf.len {
+				return error('bzip2: mtf index out of range')
+			}
+			sym := mtf[pos]
+			move_to_front_int(mut mtf, pos)
+			ensure_block_output_limit(decoded_syms.len, 1, block_limit)!
+			decoded_syms << sym
+		}
+
+		mut bwt_bytes := []u8{cap: decoded_syms.len}
+		for s in decoded_syms {
+			if s < 0 || s >= seq_to_unseq.len {
+				return error('bzip2: decoded symbol is out of sequence map range')
+			}
+			bwt_bytes << seq_to_unseq[s]
+		}
+
+		rle1 := inverse_bwt(bwt_bytes, orig_ptr)!
+		plain := rle1_decode(rle1)!
+		if params.verify_crc {
+			calc := bzip2_crc32(plain)
+			if calc != block_crc {
+				return error('bzip2: block crc mismatch')
+			}
+		}
+		stream_crc = rotate_left_1(stream_crc) ^ block_crc
+		out << plain
+	}
+	return error('bzip2: unexpected end of stream')
+}
+
+struct EncodedBlock {
+	block_crc    u32
+	orig_ptr     int
+	in_use       []bool
+	alpha_size   int
+	n_groups     int
+	selectors    []int
+	selector_mtf []int
+	symbols      []int
+	code_lengths [][]int
+	codes        [][]u32
+}
+
+fn encode_block(src []u8) !EncodedBlock {
+	block_crc := bzip2_crc32(src)
+	rle1 := rle1_encode(src)
+	if rle1.len == 0 {
+		return error('bzip2: internal error, empty block after rle1')
+	}
+	bwt, orig_ptr := bwt_transform(rle1)
+	mut in_use := []bool{len: 256}
+	for b in bwt {
+		in_use[int(b)] = true
+	}
+	mut seq_to_unseq := []u8{}
+	for i, used in in_use {
+		if used {
+			seq_to_unseq << u8(i)
+		}
+	}
+	mut unseq_to_seq := []int{len: 256, init: -1}
+	for i, b in seq_to_unseq {
+		unseq_to_seq[int(b)] = i
+	}
+	mtf_symbols, alpha_size := mtf_rle2_encode(bwt, unseq_to_seq, seq_to_unseq.len)
+	if mtf_symbols.len == 0 {
+		return error('bzip2: internal error, empty symbol stream')
+	}
+	n_groups := select_group_count(mtf_symbols.len)
+	n_selectors := selector_count_from_symbol_count(mtf_symbols.len)!
+	mut selectors := []int{len: n_selectors, init: 0}
+	selector_mtf := encode_selector_mtf(selectors, n_groups)
+	freq := symbol_freq(mtf_symbols, alpha_size)
+	lens := make_huffman_lengths(freq, bzip2_max_code_len)
+	codes := build_huffman_codes(lens)
+	mut code_lengths := [][]int{len: n_groups}
+	mut code_words := [][]u32{len: n_groups}
+	for i in 0 .. n_groups {
+		code_lengths[i] = lens.clone()
+		code_words[i] = codes.clone()
+	}
+	return EncodedBlock{
+		block_crc:    block_crc
+		orig_ptr:     orig_ptr
+		in_use:       in_use
+		alpha_size:   alpha_size
+		n_groups:     n_groups
+		selectors:    selectors
+		selector_mtf: selector_mtf
+		symbols:      mtf_symbols
+		code_lengths: code_lengths
+		codes:        code_words
+	}
+}
+
+fn select_group_count(n_syms int) int {
+	if n_syms < 200 {
+		return 2
+	}
+	if n_syms < 600 {
+		return 3
+	}
+	if n_syms < 1200 {
+		return 4
+	}
+	if n_syms < 2400 {
+		return 5
+	}
+	return 6
+}
+
+fn selector_count_from_symbol_count(n_symbols int) !int {
+	if n_symbols < 1 {
+		return error('bzip2: invalid selector count')
+	}
+	n_selectors := (n_symbols + bzip2_group_size - 1) / bzip2_group_size
+	if n_selectors < 1 || n_selectors > bzip2_max_selectors {
+		return error('bzip2: invalid selector count')
+	}
+	return n_selectors
+}
+
+fn ensure_block_output_limit(decoded_len int, add_len int, block_limit int) ! {
+	if decoded_len < 0 || add_len < 0 || block_limit < 0 {
+		return error('bzip2: invalid block output state')
+	}
+	if decoded_len > block_limit || add_len > block_limit - decoded_len {
+		return error('bzip2: block output exceeds declared block size')
+	}
+}
+
+fn symbol_freq(symbols []int, alpha_size int) []int {
+	mut freq := []int{len: alpha_size}
+	for s in symbols {
+		if s >= 0 && s < alpha_size {
+			freq[s]++
+		}
+	}
+	for i in 0 .. alpha_size {
+		if freq[i] == 0 {
+			freq[i] = 1
+		}
+	}
+	return freq
+}
+
+fn write_in_use_map(mut w BitWriter, in_use []bool) {
+	mut group_used := []bool{len: 16}
+	for i in 0 .. 16 {
+		for j in 0 .. 16 {
+			if in_use[i * 16 + j] {
+				group_used[i] = true
+				break
+			}
+		}
+	}
+	for g in group_used {
+		w.write_bit(if g { 1 } else { 0 })
+	}
+	for i in 0 .. 16 {
+		if !group_used[i] {
+			continue
+		}
+		for j in 0 .. 16 {
+			w.write_bit(if in_use[i * 16 + j] { 1 } else { 0 })
+		}
+	}
+}
+
+fn read_in_use_map(mut r BitReader) ![]bool {
+	mut group_used := []bool{len: 16}
+	for i in 0 .. 16 {
+		group_used[i] = r.read_bit()! == 1
+	}
+	mut in_use := []bool{len: 256}
+	for i in 0 .. 16 {
+		if !group_used[i] {
+			continue
+		}
+		for j in 0 .. 16 {
+			in_use[i * 16 + j] = r.read_bit()! == 1
+		}
+	}
+	return in_use
+}
+
+fn write_code_lengths(mut w BitWriter, lengths []int, alpha_size int) {
+	mut curr := lengths[0]
+	w.write_bits(5, u64(curr))
+	for i in 0 .. alpha_size {
+		target := lengths[i]
+		for curr < target {
+			w.write_bit(1)
+			w.write_bit(0)
+			curr++
+		}
+		for curr > target {
+			w.write_bit(1)
+			w.write_bit(1)
+			curr--
+		}
+		w.write_bit(0)
+	}
+}
+
+fn decode_selector_mtf(vals []int, n_groups int) ![]int {
+	mut pos := []int{len: n_groups}
+	for i in 0 .. n_groups {
+		pos[i] = i
+	}
+	mut out := []int{len: vals.len}
+	for i, v in vals {
+		if v < 0 || v >= n_groups {
+			return error('bzip2: selector mtf value out of range')
+		}
+		sel := pos[v]
+		for j := v; j > 0; j-- {
+			pos[j] = pos[j - 1]
+		}
+		pos[0] = sel
+		out[i] = sel
+	}
+	return out
+}
+
+fn encode_selector_mtf(selectors []int, n_groups int) []int {
+	mut pos := []int{len: n_groups}
+	for i in 0 .. n_groups {
+		pos[i] = i
+	}
+	mut out := []int{len: selectors.len}
+	for i, sel in selectors {
+		mut idx := 0
+		for idx < pos.len && pos[idx] != sel {
+			idx++
+		}
+		if idx >= pos.len {
+			out[i] = 0
+			continue
+		}
+		out[i] = idx
+		for j := idx; j > 0; j-- {
+			pos[j] = pos[j - 1]
+		}
+		pos[0] = sel
+	}
+	return out
+}
+
+fn make_huffman_lengths(freq []int, max_len int) []int {
+	mut scaled := freq.clone()
+	for {
+		lens := build_huffman_lengths_unbounded(scaled)
+		mut max_seen := 0
+		for l in lens {
+			if l > max_seen {
+				max_seen = l
+			}
+		}
+		if max_seen <= max_len {
+			return lens
+		}
+		for i in 0 .. scaled.len {
+			scaled[i] = (scaled[i] >> 1) + 1
+		}
+	}
+	return []int{}
+}
+
+fn build_huffman_lengths_unbounded(freq []int) []int {
+	n := freq.len
+	if n == 0 {
+		return []int{}
+	}
+	mut weight := []int{cap: 2 * n}
+	mut parent := []int{cap: 2 * n}
+	for i in 0 .. n {
+		w := if freq[i] > 0 { freq[i] } else { 1 }
+		weight << w
+		parent << -1
+	}
+	mut active := []int{len: n}
+	for i in 0 .. n {
+		active[i] = i
+	}
+	for active.len > 1 {
+		mut m1 := 0
+		mut m2 := 1
+		if weight[active[m2]] < weight[active[m1]] {
+			t := m1
+			m1 = m2
+			m2 = t
+		}
+		for i in 2 .. active.len {
+			idx := active[i]
+			if weight[idx] < weight[active[m1]] {
+				m2 = m1
+				m1 = i
+			} else if weight[idx] < weight[active[m2]] {
+				m2 = i
+			}
+		}
+		a := active[m1]
+		b := active[m2]
+		new_idx := weight.len
+		weight << (weight[a] + weight[b])
+		parent << -1
+		parent[a] = new_idx
+		parent[b] = new_idx
+		if m1 > m2 {
+			active.delete(m1)
+			active.delete(m2)
+		} else {
+			active.delete(m2)
+			active.delete(m1)
+		}
+		active << new_idx
+	}
+	mut lengths := []int{len: n}
+	for i in 0 .. n {
+		mut l := 0
+		mut p := parent[i]
+		for p != -1 {
+			l++
+			p = parent[p]
+		}
+		if l == 0 {
+			l = 1
+		}
+		lengths[i] = l
+	}
+	return lengths
+}
+
+fn build_huffman_codes(lengths []int) []u32 {
+	mut max_len := 0
+	for l in lengths {
+		if l > max_len {
+			max_len = l
+		}
+	}
+	mut bl_count := []int{len: max_len + 1}
+	for l in lengths {
+		if l > 0 {
+			bl_count[l]++
+		}
+	}
+	mut next_code := []u32{len: max_len + 1}
+	mut code := u32(0)
+	for bits in 1 .. max_len + 1 {
+		code = (code + u32(bl_count[bits - 1])) << 1
+		next_code[bits] = code
+	}
+	mut out := []u32{len: lengths.len}
+	for i, l in lengths {
+		if l == 0 {
+			continue
+		}
+		out[i] = next_code[l]
+		next_code[l]++
+	}
+	return out
+}
+
+struct DecodeTable {
+	min_len int
+	max_len int
+	base    []int
+	limit   []int
+	perm    []int
+}
+
+fn build_decode_table(lengths []int, alpha_size int) !DecodeTable {
+	mut min_len := 999
+	mut max_len := 0
+	for i in 0 .. alpha_size {
+		l := lengths[i]
+		if l < 1 || l > bzip2_max_code_len {
+			return error('bzip2: invalid huffman code length')
+		}
+		if l < min_len {
+			min_len = l
+		}
+		if l > max_len {
+			max_len = l
+		}
+	}
+	mut perm := []int{}
+	for l in min_len .. max_len + 1 {
+		for i in 0 .. alpha_size {
+			if lengths[i] == l {
+				perm << i
+			}
+		}
+	}
+	mut base := []int{len: max_len + 2}
+	mut limit := []int{len: max_len + 1}
+	for i in 0 .. alpha_size {
+		base[lengths[i] + 1]++
+	}
+	for i in 1 .. base.len {
+		base[i] += base[i - 1]
+	}
+	mut vec := 0
+	for l in min_len .. max_len + 1 {
+		vec += base[l + 1] - base[l]
+		limit[l] = vec - 1
+		vec <<= 1
+	}
+	for l in min_len + 1 .. max_len + 1 {
+		base[l] = ((limit[l - 1] + 1) * 2) - base[l]
+	}
+	return DecodeTable{
+		min_len: min_len
+		max_len: max_len
+		base:    base
+		limit:   limit
+		perm:    perm
+	}
+}
+
+fn (t DecodeTable) decode(mut r BitReader) !int {
+	mut zn := t.min_len
+	mut zvec := int(r.read_bits(zn)!)
+	for zn <= t.max_len && zvec > t.limit[zn] {
+		zn++
+		bit := int(r.read_bits(1)!)
+		zvec = (zvec * 2) | bit
+	}
+	if zn > t.max_len {
+		return error('bzip2: invalid huffman code')
+	}
+	idx := zvec - t.base[zn]
+	if idx < 0 || idx >= t.perm.len {
+		return error('bzip2: invalid huffman decode index')
+	}
+	return t.perm[idx]
+}
+
+fn mtf_rle2_encode(data []u8, unseq_to_seq []int, n_in_use int) ([]int, int) {
+	mut mtf := []int{len: n_in_use}
+	for i in 0 .. n_in_use {
+		mtf[i] = i
+	}
+	eob := n_in_use + 1
+	mut out := []int{}
+	mut zpend := 0
+	for b in data {
+		seq := unseq_to_seq[int(b)]
+		mut pos := 0
+		for pos < mtf.len && mtf[pos] != seq {
+			pos++
+		}
+		if pos == 0 {
+			zpend++
+			continue
+		}
+		if zpend > 0 {
+			emit_run_a_b(mut out, zpend)
+			zpend = 0
+		}
+		out << (pos + 1)
+		move_to_front_int(mut mtf, pos)
+	}
+	if zpend > 0 {
+		emit_run_a_b(mut out, zpend)
+	}
+	out << eob
+	return out, n_in_use + 2
+}
+
+fn emit_run_a_b(mut out []int, run_len int) {
+	mut z := run_len - 1
+	for {
+		if (z & 1) == 0 {
+			out << bzip2_runa
+		} else {
+			out << bzip2_runb
+		}
+		if z < 2 {
+			break
+		}
+		z = (z - 2) >> 1
+	}
+}
+
+fn move_to_front_int(mut arr []int, pos int) {
+	if pos <= 0 {
+		return
+	}
+	tmp := arr[pos]
+	for i := pos; i > 0; i-- {
+		arr[i] = arr[i - 1]
+	}
+	arr[0] = tmp
+}
+
+fn rle1_encode(src []u8) []u8 {
+	if src.len == 0 {
+		return []u8{}
+	}
+	mut out := []u8{cap: src.len + (src.len / 4) + 8}
+	mut i := 0
+	for i < src.len {
+		b := src[i]
+		mut run := 1
+		for i + run < src.len && src[i + run] == b && run < max_i32 {
+			run++
+		}
+		mut rem := run
+		for rem > 0 {
+			if rem <= 3 {
+				for _ in 0 .. rem {
+					out << b
+				}
+				rem = 0
+			} else {
+				chunk := if rem > 259 { 259 } else { rem }
+				for _ in 0 .. 4 {
+					out << b
+				}
+				out << u8(chunk - 4)
+				rem -= chunk
+			}
+		}
+		i += run
+	}
+	return out
+}
+
+fn find_rle1_block_end(src []u8, start int, block_limit int) int {
+	if start >= src.len {
+		return start
+	}
+	mut end := start
+	mut encoded_len := 0
+	for end < src.len {
+		b := src[end]
+		mut run_len := 1
+		for end + run_len < src.len && src[end + run_len] == b {
+			run_len++
+		}
+		run_encoded_len := rle1_encoded_run_len(run_len)
+		if encoded_len + run_encoded_len <= block_limit {
+			encoded_len += run_encoded_len
+			end += run_len
+			continue
+		}
+		available := block_limit - encoded_len
+		if available <= 0 {
+			break
+		}
+		take := max_run_prefix_for_rle1(run_len, available)
+		if take <= 0 {
+			break
+		}
+		end += take
+		break
+	}
+	if end == start {
+		return start + 1
+	}
+	return end
+}
+
+fn max_run_prefix_for_rle1(run_len int, encoded_limit int) int {
+	if run_len <= 0 || encoded_limit <= 0 {
+		return 0
+	}
+	mut lo := 1
+	mut hi := run_len
+	mut best := 0
+	for lo <= hi {
+		mid := lo + (hi - lo) / 2
+		if rle1_encoded_run_len(mid) <= encoded_limit {
+			best = mid
+			lo = mid + 1
+		} else {
+			hi = mid - 1
+		}
+	}
+	return best
+}
+
+fn rle1_encoded_run_len(run_len int) int {
+	if run_len <= 0 {
+		return 0
+	}
+	if run_len <= 3 {
+		return run_len
+	}
+	full_chunks := run_len / 259
+	rem := run_len % 259
+	mut out := full_chunks * 5
+	if rem == 0 {
+		return out
+	}
+	if rem <= 3 {
+		out += rem
+	} else {
+		out += 5
+	}
+	return out
+}
+
+fn rle1_decode(src []u8) ![]u8 {
+	mut out := []u8{cap: src.len}
+	mut i := 0
+	for i < src.len {
+		if i + 4 < src.len && src[i] == src[i + 1] && src[i] == src[i + 2] && src[i] == src[i + 3] {
+			run := int(src[i + 4]) + 4
+			for _ in 0 .. run {
+				out << src[i]
+			}
+			i += 5
+		} else {
+			out << src[i]
+			i++
+		}
+	}
+	return out
+}
+
+fn bwt_transform(data []u8) ([]u8, int) {
+	n := data.len
+	if n == 0 {
+		return []u8{}, 0
+	}
+	mut sa := cyclic_suffix_array(data)
+	mut out := []u8{len: n}
+	mut orig_ptr := 0
+	for i, s in sa {
+		if s == 0 {
+			orig_ptr = i
+			out[i] = data[n - 1]
+		} else {
+			out[i] = data[s - 1]
+		}
+	}
+	return out, orig_ptr
+}
+
+fn inverse_bwt(last_col []u8, orig_ptr int) ![]u8 {
+	n := last_col.len
+	if n == 0 {
+		return []u8{}
+	}
+	if orig_ptr < 0 || orig_ptr >= n {
+		return error('bzip2: invalid bwt origin pointer')
+	}
+	mut count := []int{len: 256}
+	for b in last_col {
+		count[int(b)]++
+	}
+	mut tots := []int{len: 256}
+	mut sum := 0
+	for i in 0 .. 256 {
+		tots[i] = sum
+		sum += count[i]
+	}
+	mut tt := []int{len: n}
+	for i, b in last_col {
+		idx := int(b)
+		tt[tots[idx]] = i
+		tots[idx]++
+	}
+	mut out := []u8{len: n}
+	mut tpos := orig_ptr
+	for i in 0 .. n {
+		tpos = tt[tpos]
+		out[i] = last_col[tpos]
+	}
+	return out
+}
+
+fn cyclic_suffix_array(data []u8) []int {
+	n := data.len
+	if n <= 1 {
+		if n == 1 {
+			return [0]
+		}
+		return []int{}
+	}
+	mut sa := []int{len: n}
+	mut rank := []int{len: n}
+	for i in 0 .. n {
+		sa[i] = i
+		rank[i] = int(data[i])
+	}
+	mut tmp_sa := []int{len: n}
+	mut new_rank := []int{len: n}
+	mut k := 1
+	for k < n {
+		radix_sort_cyclic(mut sa, mut tmp_sa, rank, k)
+		new_rank[sa[0]] = 0
+		mut classes := 1
+		for i in 1 .. n {
+			a := sa[i - 1]
+			b := sa[i]
+			if rank[a] != rank[b] || rank[(a + k) % n] != rank[(b + k) % n] {
+				classes++
+			}
+			new_rank[b] = classes - 1
+		}
+		rank = new_rank.clone()
+		if classes == n {
+			break
+		}
+		k <<= 1
+	}
+	return sa
+}
+
+fn radix_sort_cyclic(mut sa []int, mut tmp []int, rank []int, k int) {
+	n := sa.len
+	m := if n > 256 { n } else { 256 }
+	mut count := []int{len: m}
+	for i in 0 .. n {
+		key := rank[(sa[i] + k) % n]
+		count[key]++
+	}
+	mut pos := []int{len: m}
+	mut sum := 0
+	for i in 0 .. m {
+		pos[i] = sum
+		sum += count[i]
+	}
+	for i in 0 .. n {
+		v := sa[i]
+		key := rank[(v + k) % n]
+		tmp[pos[key]] = v
+		pos[key]++
+	}
+	count = []int{len: m}
+	for i in 0 .. n {
+		key := rank[tmp[i]]
+		count[key]++
+	}
+	sum = 0
+	for i in 0 .. m {
+		pos[i] = sum
+		sum += count[i]
+	}
+	for i in 0 .. n {
+		v := tmp[i]
+		key := rank[v]
+		sa[pos[key]] = v
+		pos[key]++
+	}
+}
+
+struct BitWriter {
+mut:
+	out       []u8
+	bitbuf    u64
+	bit_count int
+}
+
+fn (mut w BitWriter) write_byte(b u8) {
+	w.out << b
+}
+
+fn (mut w BitWriter) write_bit(bit int) {
+	w.write_bits(1, u64(bit & 1))
+}
+
+fn (mut w BitWriter) write_bits(n int, value u64) {
+	if n <= 0 {
+		return
+	}
+	for i := n - 1; i >= 0; i-- {
+		b := (value >> u32(i)) & 1
+		w.bitbuf = (w.bitbuf << 1) | b
+		w.bit_count++
+		if w.bit_count == 8 {
+			w.out << u8(w.bitbuf & 0xff)
+			w.bitbuf = 0
+			w.bit_count = 0
+		}
+	}
+}
+
+fn (mut w BitWriter) finish() ![]u8 {
+	if w.bit_count > 0 {
+		w.bitbuf <<= u32(8 - w.bit_count)
+		w.out << u8(w.bitbuf & 0xff)
+		w.bitbuf = 0
+		w.bit_count = 0
+	}
+	return w.out
+}
+
+struct BitReader {
+	data []u8
+mut:
+	byte_pos int
+	bit_pos  int
+}
+
+fn (mut r BitReader) read_byte() !u8 {
+	if r.bit_pos != 0 {
+		return error('bzip2: attempted byte read on non-byte boundary')
+	}
+	if r.byte_pos >= r.data.len {
+		return error('bzip2: unexpected end of input')
+	}
+	b := r.data[r.byte_pos]
+	r.byte_pos++
+	return b
+}
+
+fn (mut r BitReader) read_bit() !int {
+	return int(r.read_bits(1)!)
+}
+
+fn (mut r BitReader) read_bits(n int) !u64 {
+	if n < 0 || n > 56 {
+		return error('bzip2: invalid bit width')
+	}
+	mut out := u64(0)
+	for _ in 0 .. n {
+		if r.byte_pos >= r.data.len {
+			return error('bzip2: unexpected end of input')
+		}
+		b := r.data[r.byte_pos]
+		bit := (b >> u8(7 - r.bit_pos)) & u8(1)
+		out = (out << 1) | u64(bit)
+		r.bit_pos++
+		if r.bit_pos == 8 {
+			r.bit_pos = 0
+			r.byte_pos++
+		}
+	}
+	return out
+}
+
+fn (r &BitReader) is_aligned_to_byte_zero_padding() bool {
+	if r.bit_pos == 0 {
+		return r.byte_pos == r.data.len
+	}
+	if r.byte_pos >= r.data.len {
+		return false
+	}
+	mask := u8((1 << u8(8 - r.bit_pos)) - 1)
+	if (r.data[r.byte_pos] & mask) != 0 {
+		return false
+	}
+	for i in r.byte_pos + 1 .. r.data.len {
+		if r.data[i] != 0 {
+			return false
+		}
+	}
+	return true
+}
+
+fn rotate_left_1(v u32) u32 {
+	return (v << 1) | (v >> 31)
+}
+
+fn bzip2_crc32(data []u8) u32 {
+	mut crc := u32(0xffffffff)
+	for b in data {
+		table_idx := int(((crc >> 24) ^ u32(b)) & u32(0xff))
+		crc = (crc << 8) ^ bzip2_crc32_table[table_idx]
+	}
+	return ~crc
+}
diff --git a/vlib/compress/bzip2/bzip2_test.v b/vlib/compress/bzip2/bzip2_test.v
new file mode 100644
index 000000000..81bbfd99e
--- /dev/null
+++ b/vlib/compress/bzip2/bzip2_test.v
@@ -0,0 +1,136 @@
+module bzip2
+
+import encoding.hex
+
+fn must_decode_hex(s string) []u8 {
+	return hex.decode(s) or { panic(err) }
+}
+
+fn test_roundtrip_empty_input() {
+	src := []u8{}
+	compressed := compress(src) or { panic(err) }
+	decompressed := decompress(compressed) or { panic(err) }
+	assert decompressed == src
+}
+
+fn test_roundtrip_small_text() {
+	src := 'hello world\n'.bytes()
+	compressed := compress(src) or { panic(err) }
+	decompressed := decompress(compressed) or { panic(err) }
+	assert decompressed == src
+}
+
+fn test_roundtrip_binary_data() {
+	mut src := []u8{len: 4096}
+	for i in 0 .. src.len {
+		src[i] = u8((i * 17 + 13) & 0xff)
+	}
+	compressed := compress(src, block_size: 1) or { panic(err) }
+	decompressed := decompress(compressed) or { panic(err) }
+	assert decompressed == src
+}
+
+fn test_roundtrip_long_runs() {
+	src := 'a'.repeat(2000).bytes()
+	compressed := compress(src) or { panic(err) }
+	decompressed := decompress(compressed) or { panic(err) }
+	assert decompressed == src
+}
+
+fn test_decompress_known_python_vector_empty() {
+	bz2 := must_decode_hex('425a683917724538509000000000')
+	plain := decompress(bz2) or { panic(err) }
+	assert plain == []u8{}
+}
+
+fn test_decompress_known_python_vector_hello() {
+	bz2 :=
+		must_decode_hex('425a68393141592653594eece83600000251800010400006449080200031064c4101a7a9a580bb9431f8bb9229c28482776741b0')
+	plain := decompress(bz2) or { panic(err) }
+	assert plain.bytestr() == 'hello world\n'
+}
+
+fn test_decompress_known_python_vector_text() {
+	bz2 :=
+		must_decode_hex('425a6839314159265359dc01b0d8000002d9800010410120080a00cc20200021a4d3688cd0806800e28a3de49f0b16b10d177245385090dc01b0d8')
+	plain := decompress(bz2) or { panic(err) }
+	assert plain.bytestr() == '1.test\ncopy &copy;\n'
+}
+
+fn test_decompress_known_python_vector_repeated_a() {
+	bz2 :=
+		must_decode_hex('425a6839314159265359ca3d8dfb000000010420000400200021008283177245385090ca3d8dfb')
+	plain := decompress(bz2) or { panic(err) }
+	assert plain == 'a'.repeat(200).bytes()
+}
+
+fn test_decompress_rejects_invalid_header() {
+	_ := decompress('not-bzip2'.bytes()) or {
+		assert err.msg().contains('invalid header')
+		return
+	}
+	assert false
+}
+
+fn test_decompress_rejects_crc_mismatch() {
+	mut bz2 :=
+		must_decode_hex('425a68393141592653594eece83600000251800010400006449080200031064c4101a7a9a580bb9431f8bb9229c28482776741b0')
+	// Corrupt the stored block CRC field (bytes 10..13 in a single-block stream).
+	bz2[10] ^= 0x01
+	_ := decompress(bz2) or {
+		assert err.msg().contains('crc mismatch')
+		return
+	}
+	assert false
+}
+
+fn test_selector_count_limit_boundaries() {
+	below_limit := selector_count_from_symbol_count(900050) or { panic(err) }
+	at_limit := selector_count_from_symbol_count(900100) or { panic(err) }
+	assert below_limit == 18001
+	assert at_limit == 18002
+
+	_ := selector_count_from_symbol_count(900101) or {
+		assert err.msg().contains('invalid selector count')
+		return
+	}
+	assert false
+}
+
+fn test_block_output_limit_guard() {
+	ensure_block_output_limit(0, 100000, 100000) or { panic(err) }
+	ensure_block_output_limit(99999, 1, 100000) or { panic(err) }
+
+	ensure_block_output_limit(100000, 1, 100000) or {
+		assert err.msg().contains('block output exceeds declared block size')
+		return
+	}
+	assert false
+}
+
+fn test_find_rle1_block_end_for_four_byte_runs() {
+	mut src := []u8{cap: 100000}
+	for i in 0 .. 25000 {
+		b := u8(i & 0xff)
+		for _ in 0 .. 4 {
+			src << b
+		}
+	}
+	end := find_rle1_block_end(src, 0, 100000)
+	assert end == 80000
+	assert rle1_encode(src[0..end]).len == 100000
+	assert rle1_encode(src[0..end + 4]).len > 100000
+}
+
+fn test_roundtrip_block_size_1_four_byte_runs() {
+	mut src := []u8{cap: 100000}
+	for i in 0 .. 25000 {
+		b := u8(i & 0xff)
+		for _ in 0 .. 4 {
+			src << b
+		}
+	}
+	compressed := compress(src, block_size: 1) or { panic(err) }
+	decompressed := decompress(compressed) or { panic(err) }
+	assert decompressed == src
+}
diff --git a/vlib/compress/bzip2/interop/README.md b/vlib/compress/bzip2/interop/README.md
new file mode 100644
index 000000000..e2d281bd8
--- /dev/null
+++ b/vlib/compress/bzip2/interop/README.md
@@ -0,0 +1,26 @@
+# bzip2 interop checker
+
+This helper verifies interoperability between:
+
+- V module: `compress.bzip2`
+- System CLI: `bzip2`
+- Python stdlib: `bz2`
+
+It runs deterministic test vectors and compression levels, compresses each vector with all
+three producers, then cross-decompresses every produced stream with all three decoders.
+
+A case passes only if every decompressed output is byte-identical to the original input.
+
+Current levels are `1`, `6`, and `9` (see `compression_levels` in
+`vlib/compress/bzip2/interop/bzip2_interop_check.vsh`).
+
+## Run
+
+```bash
+v run vlib/compress/bzip2/interop/bzip2_interop_check.vsh
+```
+
+## Requirements
+
+- `bzip2` available in `PATH`
+- `python3` with `bz2`
diff --git a/vlib/compress/bzip2/interop/bzip2_interop_check.vsh b/vlib/compress/bzip2/interop/bzip2_interop_check.vsh
new file mode 100755
index 000000000..401998ab9
--- /dev/null
+++ b/vlib/compress/bzip2/interop/bzip2_interop_check.vsh
@@ -0,0 +1,166 @@
+#!/usr/bin/env -S v
+
+import compress.bzip2
+
+const compression_levels = [1, 6, 9]!
+
+struct TestVector {
+	name string
+	data []u8
+}
+
+fn main() {
+	ensure_tools() or {
+		eprintln('SKIP: ${err.msg()}')
+		exit(2)
+	}
+	vectors := make_test_vectors()
+	mut total_checks := 0
+	mut total_runs := 0
+	tmp_root := join_path(temp_dir(), 'v_bzip2_interop_${getpid()}')
+	mkdir_all(tmp_root) or { panic(err) }
+	defer {
+		rmdir_all(tmp_root) or {}
+	}
+	for level in compression_levels {
+		for i, vec in vectors {
+			total_checks += run_case(tmp_root, level, i, vec) or {
+				eprintln('FAIL: ${vec.name} (level=${level}): ${err}')
+				exit(1)
+				0
+			}
+			total_runs++
+			println('ok ${total_runs}/${vectors.len * compression_levels.len}: ${vec.name} (level=${level}, ${vec.data.len} bytes)')
+		}
+	}
+	println('PASS: ${vectors.len} vectors x ${compression_levels.len} levels, ${total_checks} cross-checks')
+}
+
+fn ensure_tools() ! {
+	must_succeed('bzip2 --help >/dev/null 2>&1', 'system bzip2 command is not available')!
+	must_succeed("python3 -c 'import bz2' >/dev/null 2>&1",
+		'python3 with bz2 module is not available')!
+}
+
+fn make_test_vectors() []TestVector {
+	mut vectors := []TestVector{}
+	vectors << TestVector{'empty', []u8{}}
+	vectors << TestVector{'ascii_text', 'The quick brown fox jumps over the lazy dog.\n'.repeat(64).bytes()}
+	vectors << TestVector{'repeated_byte', []u8{len: 10000, init: `A`}}
+	vectors << TestVector{'all_bytes_x4', all_bytes_repeated(4)}
+	vectors << TestVector{'lcg_64k', lcg_bytes(65536)}
+	return vectors
+}
+
+fn run_case(tmp_root string, level int, case_idx int, vec TestVector) !int {
+	case_dir := join_path(tmp_root, 'case_l${level}_${case_idx:02}_${vec.name}')
+	mkdir_all(case_dir)!
+
+	v_bz2 := bzip2.compress(vec.data, block_size: level)!
+	cli_bz2 := cli_compress(case_dir, vec.data, level)!
+	py_bz2 := py_compress(case_dir, vec.data, level)!
+
+	mut checks := 0
+
+	producers := {
+		'v':   v_bz2
+		'cli': cli_bz2
+		'py':  py_bz2
+	}
+	for producer, compressed in producers {
+		v_plain := bzip2.decompress(compressed)!
+		assert_equal_bytes('v.decompress(${producer}.compress, level=${level})', vec.data, v_plain)!
+		checks++
+
+		cli_plain := cli_decompress(case_dir, producer, compressed)!
+		assert_equal_bytes('cli.decompress(${producer}.compress, level=${level})', vec.data,
+			cli_plain)!
+		checks++
+
+		py_plain := py_decompress(case_dir, producer, compressed)!
+		assert_equal_bytes('py.decompress(${producer}.compress, level=${level})', vec.data,
+			py_plain)!
+		checks++
+	}
+	return checks
+}
+
+fn cli_compress(case_dir string, plain []u8, level int) ![]u8 {
+	in_path := join_path(case_dir, 'plain.in')
+	out_path := join_path(case_dir, 'cli_l${level}.bz2')
+	write_file_array(in_path, plain)!
+	must_succeed('bzip2 -${level} -c -- ${shell_quote(in_path)} > ${shell_quote(out_path)}',
+		'bzip2 compression failed')!
+	return read_bytes(out_path)!
+}
+
+fn cli_decompress(case_dir string, producer string, compressed []u8) ![]u8 {
+	in_path := join_path(case_dir, '${producer}.for_cli.bz2')
+	out_path := join_path(case_dir, '${producer}.from_cli.out')
+	write_file_array(in_path, compressed)!
+	must_succeed('bzip2 -d -c -- ${shell_quote(in_path)} > ${shell_quote(out_path)}',
+		'bzip2 decompression failed')!
+	return read_bytes(out_path)!
+}
+
+fn py_compress(case_dir string, plain []u8, level int) ![]u8 {
+	in_path := join_path(case_dir, 'plain_py.in')
+	out_path := join_path(case_dir, 'py_l${level}.bz2')
+	write_file_array(in_path, plain)!
+	py_code := 'import bz2, pathlib, sys; p=pathlib.Path(sys.argv[1]); o=pathlib.Path(sys.argv[2]); l=int(sys.argv[3]); o.write_bytes(bz2.compress(p.read_bytes(), compresslevel=l))'
+	must_succeed('python3 -c ${shell_quote(py_code)} ${shell_quote(in_path)} ${shell_quote(out_path)} ${level}',
+		'python bz2 compression failed')!
+	return read_bytes(out_path)!
+}
+
+fn py_decompress(case_dir string, producer string, compressed []u8) ![]u8 {
+	in_path := join_path(case_dir, '${producer}.for_py.bz2')
+	out_path := join_path(case_dir, '${producer}.from_py.out')
+	write_file_array(in_path, compressed)!
+	py_code := 'import bz2, pathlib, sys; p=pathlib.Path(sys.argv[1]); o=pathlib.Path(sys.argv[2]); o.write_bytes(bz2.decompress(p.read_bytes()))'
+	must_succeed('python3 -c ${shell_quote(py_code)} ${shell_quote(in_path)} ${shell_quote(out_path)}',
+		'python bz2 decompression failed')!
+	return read_bytes(out_path)!
+}
+
+fn all_bytes_repeated(times int) []u8 {
+	mut out := []u8{cap: 256 * times}
+	for _ in 0 .. times {
+		for i in 0 .. 256 {
+			out << u8(i)
+		}
+	}
+	return out
+}
+
+fn lcg_bytes(n int) []u8 {
+	mut out := []u8{len: n}
+	mut x := u32(0x12345678)
+	for i in 0 .. n {
+		x = x * u32(1664525) + u32(1013904223)
+		out[i] = u8((x >> 16) & u32(0xff))
+	}
+	return out
+}
+
+fn assert_equal_bytes(label string, expected []u8, got []u8) ! {
+	if expected.len != got.len {
+		return error('${label}: length mismatch expected=${expected.len} got=${got.len}')
+	}
+	for i in 0 .. expected.len {
+		if expected[i] != got[i] {
+			return error('${label}: byte mismatch at offset ${i}')
+		}
+	}
+}
+
+fn must_succeed(command string, context string) ! {
+	res := execute(command)
+	if res.exit_code != 0 {
+		return error('${context}\ncommand: ${command}\nexit_code: ${res.exit_code}\n${res.output}')
+	}
+}
+
+fn shell_quote(s string) string {
+	return "'${s.replace("'", "'\\''")}'"
+}
-- 
2.39.5