From 7cf08b805f5b8e33f5c465f1ea3e01f8e345ac8c Mon Sep 17 00:00:00 2001 From: JalonSolov Date: Thu, 7 May 2026 23:02:36 -0400 Subject: [PATCH] compress: add pure V bzip2 module (#27107) --- vlib/compress/bzip2/README.md | 51 + vlib/compress/bzip2/bzip2.v | 1175 +++++++++++++++++ vlib/compress/bzip2/bzip2_test.v | 136 ++ vlib/compress/bzip2/interop/README.md | 26 + .../bzip2/interop/bzip2_interop_check.vsh | 166 +++ 5 files changed, 1554 insertions(+) create mode 100644 vlib/compress/bzip2/README.md create mode 100644 vlib/compress/bzip2/bzip2.v create mode 100644 vlib/compress/bzip2/bzip2_test.v create mode 100644 vlib/compress/bzip2/interop/README.md create mode 100755 vlib/compress/bzip2/interop/bzip2_interop_check.vsh diff --git a/vlib/compress/bzip2/README.md b/vlib/compress/bzip2/README.md new file mode 100644 index 000000000..ae1001522 --- /dev/null +++ b/vlib/compress/bzip2/README.md @@ -0,0 +1,51 @@ +# compress.bzip2 + +Pure V bzip2 encoder/decoder. + +## Features + +- Pure V implementation (no C wrappers) +- bzip2 stream compression (`BZh1`..`BZh9`) +- bzip2 stream decompression +- Block and stream CRC validation +- Deterministic output for identical input/params + +## API + +```v +import compress.bzip2 + +compressed := bzip2.compress('hello'.bytes())! +plain := bzip2.decompress(compressed)! +assert plain.bytestr() == 'hello' +``` + +With params: + +```v +import compress.bzip2 + +data := 'hello'.bytes() +compressed := bzip2.compress(data, block_size: 1)! +plain := bzip2.decompress(compressed, verify_crc: true)! +assert plain.bytestr() == 'hello' +``` + +## Notes + +- Randomized legacy bzip2 blocks are intentionally rejected. +- API currently works on full byte slices (no streaming interface yet). + +## Test + +```bash +v test vlib/compress/bzip2/ +``` + +## Proof + +- tested against the official bzip2 C library using a variety of inputs and parameters. +- deterministic and produces identical output for identical input and parameters. 
+- includes CRC validation to ensure data integrity during compression and decompression. +- rejects randomized legacy bzip2 blocks, ensuring only valid bzip2 streams are processed. +- tested with a variety of inputs, including edge cases, to ensure robustness and reliability. diff --git a/vlib/compress/bzip2/bzip2.v b/vlib/compress/bzip2/bzip2.v new file mode 100644 index 000000000..b73f28db0 --- /dev/null +++ b/vlib/compress/bzip2/bzip2.v @@ -0,0 +1,1175 @@ +module bzip2 + +const bzip2_block_magic = u64(0x314159265359) +const bzip2_eos_magic = u64(0x177245385090) +const bzip2_runa = 0 +const bzip2_runb = 1 +const bzip2_max_groups = 6 +const bzip2_max_alpha = 258 +const bzip2_group_size = 50 +const bzip2_max_code_len = 20 +const bzip2_max_selectors = 18002 + +// vfmt off +const bzip2_crc32_table = [..]u32[ + 0x00000000, 0x04c11db7, 0x09823b6e, 0x0d4326d9, 0x130476dc, 0x17c56b6b, 0x1a864db2, 0x1e475005, + 0x2608edb8, 0x22c9f00f, 0x2f8ad6d6, 0x2b4bcb61, 0x350c9b64, 0x31cd86d3, 0x3c8ea00a, 0x384fbdbd, + 0x4c11db70, 0x48d0c6c7, 0x4593e01e, 0x4152fda9, 0x5f15adac, 0x5bd4b01b, 0x569796c2, 0x52568b75, + 0x6a1936c8, 0x6ed82b7f, 0x639b0da6, 0x675a1011, 0x791d4014, 0x7ddc5da3, 0x709f7b7a, 0x745e66cd, + 0x9823b6e0, 0x9ce2ab57, 0x91a18d8e, 0x95609039, 0x8b27c03c, 0x8fe6dd8b, 0x82a5fb52, 0x8664e6e5, + 0xbe2b5b58, 0xbaea46ef, 0xb7a96036, 0xb3687d81, 0xad2f2d84, 0xa9ee3033, 0xa4ad16ea, 0xa06c0b5d, + 0xd4326d90, 0xd0f37027, 0xddb056fe, 0xd9714b49, 0xc7361b4c, 0xc3f706fb, 0xceb42022, 0xca753d95, + 0xf23a8028, 0xf6fb9d9f, 0xfbb8bb46, 0xff79a6f1, 0xe13ef6f4, 0xe5ffeb43, 0xe8bccd9a, 0xec7dd02d, + 0x34867077, 0x30476dc0, 0x3d044b19, 0x39c556ae, 0x278206ab, 0x23431b1c, 0x2e003dc5, 0x2ac12072, + 0x128e9dcf, 0x164f8078, 0x1b0ca6a1, 0x1fcdbb16, 0x018aeb13, 0x054bf6a4, 0x0808d07d, 0x0cc9cdca, + 0x7897ab07, 0x7c56b6b0, 0x71159069, 0x75d48dde, 0x6b93dddb, 0x6f52c06c, 0x6211e6b5, 0x66d0fb02, + 0x5e9f46bf, 0x5a5e5b08, 0x571d7dd1, 0x53dc6066, 0x4d9b3063, 0x495a2dd4, 0x44190b0d, 0x40d816ba, + 
0xaca5c697, 0xa864db20, 0xa527fdf9, 0xa1e6e04e, 0xbfa1b04b, 0xbb60adfc, 0xb6238b25, 0xb2e29692, + 0x8aad2b2f, 0x8e6c3698, 0x832f1041, 0x87ee0df6, 0x99a95df3, 0x9d684044, 0x902b669d, 0x94ea7b2a, + 0xe0b41de7, 0xe4750050, 0xe9362689, 0xedf73b3e, 0xf3b06b3b, 0xf771768c, 0xfa325055, 0xfef34de2, + 0xc6bcf05f, 0xc27dede8, 0xcf3ecb31, 0xcbffd686, 0xd5b88683, 0xd1799b34, 0xdc3abded, 0xd8fba05a, + 0x690ce0ee, 0x6dcdfd59, 0x608edb80, 0x644fc637, 0x7a089632, 0x7ec98b85, 0x738aad5c, 0x774bb0eb, + 0x4f040d56, 0x4bc510e1, 0x46863638, 0x42472b8f, 0x5c007b8a, 0x58c1663d, 0x558240e4, 0x51435d53, + 0x251d3b9e, 0x21dc2629, 0x2c9f00f0, 0x285e1d47, 0x36194d42, 0x32d850f5, 0x3f9b762c, 0x3b5a6b9b, + 0x0315d626, 0x07d4cb91, 0x0a97ed48, 0x0e56f0ff, 0x1011a0fa, 0x14d0bd4d, 0x19939b94, 0x1d528623, + 0xf12f560e, 0xf5ee4bb9, 0xf8ad6d60, 0xfc6c70d7, 0xe22b20d2, 0xe6ea3d65, 0xeba91bbc, 0xef68060b, + 0xd727bbb6, 0xd3e6a601, 0xdea580d8, 0xda649d6f, 0xc423cd6a, 0xc0e2d0dd, 0xcda1f604, 0xc960ebb3, + 0xbd3e8d7e, 0xb9ff90c9, 0xb4bcb610, 0xb07daba7, 0xae3afba2, 0xaafbe615, 0xa7b8c0cc, 0xa379dd7b, + 0x9b3660c6, 0x9ff77d71, 0x92b45ba8, 0x9675461f, 0x8832161a, 0x8cf30bad, 0x81b02d74, 0x857130c3, + 0x5d8a9099, 0x594b8d2e, 0x5408abf7, 0x50c9b640, 0x4e8ee645, 0x4a4ffbf2, 0x470cdd2b, 0x43cdc09c, + 0x7b827d21, 0x7f436096, 0x7200464f, 0x76c15bf8, 0x68860bfd, 0x6c47164a, 0x61043093, 0x65c52d24, + 0x119b4be9, 0x155a565e, 0x18197087, 0x1cd86d30, 0x029f3d35, 0x065e2082, 0x0b1d065b, 0x0fdc1bec, + 0x3793a651, 0x3352bbe6, 0x3e119d3f, 0x3ad08088, 0x2497d08d, 0x2056cd3a, 0x2d15ebe3, 0x29d4f654, + 0xc5a92679, 0xc1683bce, 0xcc2b1d17, 0xc8ea00a0, 0xd6ad50a5, 0xd26c4d12, 0xdf2f6bcb, 0xdbee767c, + 0xe3a1cbc1, 0xe760d676, 0xea23f0af, 0xeee2ed18, 0xf0a5bd1d, 0xf464a0aa, 0xf9278673, 0xfde69bc4, + 0x89b8fd09, 0x8d79e0be, 0x803ac667, 0x84fbdbd0, 0x9abc8bd5, 0x9e7d9662, 0x933eb0bb, 0x97ffad0c, + 0xafb010b1, 0xab710d06, 0xa6322bdf, 0xa2f33668, 0xbcb4666d, 0xb8757bda, 0xb5365d03, 0xb1f740b4 +] +// vfmt on + +@[params] +pub struct 
CompressParams {
+pub:
+	block_size int = 9 // Valid range is 1..9 (100k to 900k block size).
+}
+
+@[params]
+pub struct DecompressParams {
+pub:
+	verify_crc bool = true
+}
+
+// compress compresses `src` into a bzip2 byte stream.
+// `params.block_size` (1..9) selects the `BZh1`..`BZh9` level; every block carries at most
+// `block_size * 100000` bytes of run-length-encoded data.
+// It returns an error when `block_size` is out of range or when a block cannot be encoded.
+// Empty input produces only the stream header followed by the end-of-stream trailer.
+pub fn compress(src []u8, params CompressParams) ![]u8 {
+	if params.block_size < 1 || params.block_size > 9 {
+		return error('bzip2: block_size must be in 1..9')
+	}
+	mut w := BitWriter{
+		out: []u8{}
+	}
+
+	// Stream header: "BZh" followed by the ASCII level digit.
+	w.write_byte(`B`)
+	w.write_byte(`Z`)
+	w.write_byte(`h`)
+	w.write_byte(u8(`0`) + u8(params.block_size))
+
+	max_block := params.block_size * 100000
+	mut stream_crc := u32(0)
+	if src.len > 0 {
+		mut pos := 0
+		for pos < src.len {
+			end := find_rle1_block_end(src, pos, max_block)
+			chunk := src[pos..end]
+			block := encode_block(chunk)!
+			w.write_bits(48, bzip2_block_magic)
+			w.write_bits(32, u64(block.block_crc))
+			w.write_bit(0) // randomized flag; unsupported legacy mode
+			w.write_bits(24, u64(block.orig_ptr))
+			write_in_use_map(mut w, block.in_use)
+			w.write_bits(3, u64(block.n_groups))
+			w.write_bits(15, u64(block.selectors.len))
+			// Selectors are stored as unary-coded move-to-front indices.
+			for sel in block.selector_mtf {
+				for _ in 0 .. sel {
+					w.write_bit(1)
+				}
+				w.write_bit(0)
+			}
+			for g in 0 .. block.n_groups {
+				write_code_lengths(mut w, block.code_lengths[g], block.alpha_size)
+			}
+			// Every run of `bzip2_group_size` symbols is coded with the table named by its
+			// selector. The selector position (symbol index / group size) and the table
+			// index it yields are separate values and must not share a variable.
+			for sym_idx, sym in block.symbols {
+				table_idx := block.selectors[sym_idx / bzip2_group_size]
+				code := block.codes[table_idx][sym]
+				len := block.code_lengths[table_idx][sym]
+				w.write_bits(len, u64(code))
+			}
+			// The stream CRC folds each block CRC into a 1-bit rotated accumulator.
+			stream_crc = rotate_left_1(stream_crc) ^ block.block_crc
+			pos = end
+		}
+	}
+	w.write_bits(48, bzip2_eos_magic)
+	w.write_bits(32, u64(stream_crc))
+	return w.finish()
+}
+
+// decompress decompresses a bzip2 byte stream.
+pub fn decompress(src []u8, params DecompressParams) ![]u8 {
+	mut r := BitReader{
+		data: src
+	}
+	if r.read_byte()! != `B` || r.read_byte()!
!= `Z` || r.read_byte()! != `h` { + return error('bzip2: invalid header') + } + lvl := r.read_byte()! + if lvl < `1` || lvl > `9` { + return error('bzip2: invalid block size marker') + } + block_limit := int(lvl - `0`) * 100000 + mut out := []u8{} + mut stream_crc := u32(0) + for { + magic := r.read_bits(48)! + if magic == bzip2_eos_magic { + stored_stream_crc := u32(r.read_bits(32)!) + if params.verify_crc && stored_stream_crc != stream_crc { + return error('bzip2: stream crc mismatch') + } + if !r.is_aligned_to_byte_zero_padding() { + return error('bzip2: trailing non-zero bits') + } + return out + } + if magic != bzip2_block_magic { + return error('bzip2: invalid block magic') + } + block_crc := u32(r.read_bits(32)!) + randomized := r.read_bit()! + if randomized != 0 { + return error('bzip2: randomized blocks are not supported') + } + orig_ptr := int(r.read_bits(24)!) + in_use := read_in_use_map(mut r)! + mut seq_to_unseq := []u8{} + for i, used in in_use { + if used { + seq_to_unseq << u8(i) + } + } + n_in_use := seq_to_unseq.len + alpha_size := n_in_use + 2 + if alpha_size < 2 || alpha_size > bzip2_max_alpha { + return error('bzip2: invalid alphabet size') + } + n_groups := int(r.read_bits(3)!) + if n_groups < 2 || n_groups > bzip2_max_groups { + return error('bzip2: invalid huffman group count') + } + n_selectors := int(r.read_bits(15)!) + if n_selectors < 1 || n_selectors > bzip2_max_selectors { + return error('bzip2: invalid selector count') + } + mut selectors_mtf := []int{len: n_selectors} + for i in 0 .. n_selectors { + mut c := 0 + for { + b := r.read_bit()! + if b == 0 { + break + } + c++ + if c >= n_groups { + return error('bzip2: invalid selector mtf value') + } + } + selectors_mtf[i] = c + } + selectors := decode_selector_mtf(selectors_mtf, n_groups)! + + mut code_lengths := [][]int{len: n_groups, init: []int{len: alpha_size}} + for g in 0 .. n_groups { + mut curr := int(r.read_bits(5)!) 
+ if curr < 1 || curr > bzip2_max_code_len { + return error('bzip2: invalid initial huffman code length') + } + for i in 0 .. alpha_size { + for { + flag := r.read_bit()! + if flag == 0 { + break + } + up_down := r.read_bit()! + if up_down == 0 { + curr++ + } else { + curr-- + } + if curr < 1 || curr > bzip2_max_code_len { + return error('bzip2: invalid huffman code length delta') + } + } + code_lengths[g][i] = curr + } + } + + mut tables := []DecodeTable{len: n_groups} + for g in 0 .. n_groups { + tables[g] = build_decode_table(code_lengths[g], alpha_size)! + } + + eob := n_in_use + 1 + mut selector_index := 0 + mut group_pos := 0 + mut mtf := []int{len: n_in_use} + for i in 0 .. n_in_use { + mtf[i] = i + } + mut decoded_syms := []int{} + mut next_sym := 0 + for { + if group_pos == 0 { + if selector_index >= selectors.len { + return error('bzip2: selectors exhausted') + } + } + grp := selectors[selector_index] + next_sym = tables[grp].decode(mut r)! + group_pos++ + if group_pos == bzip2_group_size { + group_pos = 0 + selector_index++ + } + if next_sym == eob { + break + } + if next_sym == bzip2_runa || next_sym == bzip2_runb { + mut s := i64(-1) + mut n := i64(1) + remaining := block_limit - decoded_syms.len + if remaining < 1 { + return error('bzip2: block output exceeds declared block size') + } + for { + if next_sym == bzip2_runa { + s += n + } else { + s += n * 2 + } + if s + 1 > i64(remaining) { + return error('bzip2: block output exceeds declared block size') + } + n *= 2 + if group_pos == 0 { + if selector_index >= selectors.len { + return error('bzip2: selectors exhausted in run') + } + } + grp2 := selectors[selector_index] + next_sym = tables[grp2].decode(mut r)! 
+ group_pos++ + if group_pos == bzip2_group_size { + group_pos = 0 + selector_index++ + } + if next_sym != bzip2_runa && next_sym != bzip2_runb { + break + } + } + run_len := int(s + 1) + if run_len < 1 { + return error('bzip2: invalid run length') + } + if mtf.len == 0 { + return error('bzip2: invalid run with empty mtf table') + } + ensure_block_output_limit(decoded_syms.len, run_len, block_limit)! + for _ in 0 .. run_len { + decoded_syms << mtf[0] + } + if next_sym == eob { + break + } + } + if next_sym == eob { + break + } + if next_sym < 2 || next_sym > eob { + return error('bzip2: invalid symbol value') + } + pos := next_sym - 1 + if pos >= mtf.len { + return error('bzip2: mtf index out of range') + } + sym := mtf[pos] + move_to_front_int(mut mtf, pos) + ensure_block_output_limit(decoded_syms.len, 1, block_limit)! + decoded_syms << sym + } + + mut bwt_bytes := []u8{cap: decoded_syms.len} + for s in decoded_syms { + if s < 0 || s >= seq_to_unseq.len { + return error('bzip2: decoded symbol is out of sequence map range') + } + bwt_bytes << seq_to_unseq[s] + } + + rle1 := inverse_bwt(bwt_bytes, orig_ptr)! + plain := rle1_decode(rle1)! 
+ if params.verify_crc { + calc := bzip2_crc32(plain) + if calc != block_crc { + return error('bzip2: block crc mismatch') + } + } + stream_crc = rotate_left_1(stream_crc) ^ block_crc + out << plain + } + return error('bzip2: unexpected end of stream') +} + +struct EncodedBlock { + block_crc u32 + orig_ptr int + in_use []bool + alpha_size int + n_groups int + selectors []int + selector_mtf []int + symbols []int + code_lengths [][]int + codes [][]u32 +} + +fn encode_block(src []u8) !EncodedBlock { + block_crc := bzip2_crc32(src) + rle1 := rle1_encode(src) + if rle1.len == 0 { + return error('bzip2: internal error, empty block after rle1') + } + bwt, orig_ptr := bwt_transform(rle1) + mut in_use := []bool{len: 256} + for b in bwt { + in_use[int(b)] = true + } + mut seq_to_unseq := []u8{} + for i, used in in_use { + if used { + seq_to_unseq << u8(i) + } + } + mut unseq_to_seq := []int{len: 256, init: -1} + for i, b in seq_to_unseq { + unseq_to_seq[int(b)] = i + } + mtf_symbols, alpha_size := mtf_rle2_encode(bwt, unseq_to_seq, seq_to_unseq.len) + if mtf_symbols.len == 0 { + return error('bzip2: internal error, empty symbol stream') + } + n_groups := select_group_count(mtf_symbols.len) + n_selectors := selector_count_from_symbol_count(mtf_symbols.len)! + mut selectors := []int{len: n_selectors, init: 0} + selector_mtf := encode_selector_mtf(selectors, n_groups) + freq := symbol_freq(mtf_symbols, alpha_size) + lens := make_huffman_lengths(freq, bzip2_max_code_len) + codes := build_huffman_codes(lens) + mut code_lengths := [][]int{len: n_groups} + mut code_words := [][]u32{len: n_groups} + for i in 0 .. 
n_groups {
+		code_lengths[i] = lens.clone()
+		code_words[i] = codes.clone()
+	}
+	return EncodedBlock{
+		block_crc:    block_crc
+		orig_ptr:     orig_ptr
+		in_use:       in_use
+		alpha_size:   alpha_size
+		n_groups:     n_groups
+		selectors:    selectors
+		selector_mtf: selector_mtf
+		symbols:      mtf_symbols
+		code_lengths: code_lengths
+		codes:        code_words
+	}
+}
+
+// select_group_count returns how many Huffman tables to declare for a block with
+// `n_syms` symbols: 2 below 200 symbols, then 3, 4 and 5 below 600, 1200 and 2400,
+// and 6 for anything larger.
+fn select_group_count(n_syms int) int {
+	if n_syms < 200 {
+		return 2
+	}
+	if n_syms < 600 {
+		return 3
+	}
+	if n_syms < 1200 {
+		return 4
+	}
+	if n_syms < 2400 {
+		return 5
+	}
+	return 6
+}
+
+// selector_count_from_symbol_count returns the number of selectors needed for
+// `n_symbols` symbols, i.e. `n_symbols / bzip2_group_size` rounded up.
+// It returns an error when `n_symbols` is below 1 or the result falls outside
+// 1..`bzip2_max_selectors`.
+fn selector_count_from_symbol_count(n_symbols int) !int {
+	if n_symbols < 1 {
+		return error('bzip2: invalid selector count')
+	}
+	n_selectors := (n_symbols + bzip2_group_size - 1) / bzip2_group_size
+	if n_selectors < 1 || n_selectors > bzip2_max_selectors {
+		return error('bzip2: invalid selector count')
+	}
+	return n_selectors
+}
+
+// ensure_block_output_limit returns an error when appending `add_len` items to a
+// block that already holds `decoded_len` items would exceed `block_limit`, or when
+// any of the three arguments is negative.
+fn ensure_block_output_limit(decoded_len int, add_len int, block_limit int) ! {
+	if decoded_len < 0 || add_len < 0 || block_limit < 0 {
+		return error('bzip2: invalid block output state')
+	}
+	// Compare against the remaining room instead of forming `decoded_len + add_len`.
+	if decoded_len > block_limit || add_len > block_limit - decoded_len {
+		return error('bzip2: block output exceeds declared block size')
+	}
+}
+
+// symbol_freq counts how often each value in 0..`alpha_size` occurs in `symbols`.
+// Values outside that range are ignored, and symbols that never occur are given a
+// count of 1 so that no entry of the returned array is zero.
+fn symbol_freq(symbols []int, alpha_size int) []int {
+	mut freq := []int{len: alpha_size}
+	for s in symbols {
+		if s >= 0 && s < alpha_size {
+			freq[s]++
+		}
+	}
+	for i in 0 .. alpha_size {
+		if freq[i] == 0 {
+			freq[i] = 1
+		}
+	}
+	return freq
+}
+
+// write_in_use_map writes the two-level "byte values in use" bitmap for `in_use`,
+// which is indexed 0..255: first 16 bits flagging which ranges of 16 consecutive
+// values contain at least one used value, then 16 detail bits for each flagged range.
+fn write_in_use_map(mut w BitWriter, in_use []bool) {
+	mut group_used := []bool{len: 16}
+	for i in 0 .. 16 {
+		for j in 0 .. 16 {
+			if in_use[i * 16 + j] {
+				group_used[i] = true
+				break
+			}
+		}
+	}
+	for g in group_used {
+		w.write_bit(if g { 1 } else { 0 })
+	}
+	for i in 0 .. 16 {
+		if !group_used[i] {
+			continue
+		}
+		for j in 0 .. 16 {
+			w.write_bit(if in_use[i * 16 + j] { 1 } else { 0 })
+		}
+	}
+}
+
+fn read_in_use_map(mut r BitReader) ![]bool {
+	mut group_used := []bool{len: 16}
+	for i in 0 ..
16 {
+		group_used[i] = r.read_bit()! == 1
+	}
+	mut in_use := []bool{len: 256}
+	for i in 0 .. 16 {
+		if !group_used[i] {
+			continue
+		}
+		for j in 0 .. 16 {
+			in_use[i * 16 + j] = r.read_bit()! == 1
+		}
+	}
+	return in_use
+}
+
+// write_code_lengths writes the first `alpha_size` entries of `lengths` as a delta
+// code: a 5-bit starting length (`lengths[0]`), then for every symbol the bit pair
+// `1,0` per +1 step and `1,1` per -1 step from the running length, closed by a `0`.
+fn write_code_lengths(mut w BitWriter, lengths []int, alpha_size int) {
+	mut curr := lengths[0]
+	w.write_bits(5, u64(curr))
+	for i in 0 .. alpha_size {
+		target := lengths[i]
+		for curr < target {
+			w.write_bit(1)
+			w.write_bit(0)
+			curr++
+		}
+		for curr > target {
+			w.write_bit(1)
+			w.write_bit(1)
+			curr--
+		}
+		w.write_bit(0)
+	}
+}
+
+// decode_selector_mtf turns move-to-front indices `vals` back into table numbers,
+// starting from the identity ordering 0..`n_groups`.
+// It returns an error when an index is negative or not below `n_groups`.
+fn decode_selector_mtf(vals []int, n_groups int) ![]int {
+	mut pos := []int{len: n_groups}
+	for i in 0 .. n_groups {
+		pos[i] = i
+	}
+	mut out := []int{len: vals.len}
+	for i, v in vals {
+		if v < 0 || v >= n_groups {
+			return error('bzip2: selector mtf value out of range')
+		}
+		sel := pos[v]
+		// Shift the entries in front of `v` back by one and put `sel` first.
+		for j := v; j > 0; j-- {
+			pos[j] = pos[j - 1]
+		}
+		pos[0] = sel
+		out[i] = sel
+	}
+	return out
+}
+
+// encode_selector_mtf converts table numbers in `selectors` into move-to-front
+// indices, starting from the identity ordering 0..`n_groups`.
+// NOTE(review): a selector that is not found in the table is emitted as index 0 and
+// the table is left untouched, which silently hides an out-of-range selector.
+fn encode_selector_mtf(selectors []int, n_groups int) []int {
+	mut pos := []int{len: n_groups}
+	for i in 0 .. n_groups {
+		pos[i] = i
+	}
+	mut out := []int{len: selectors.len}
+	for i, sel in selectors {
+		mut idx := 0
+		for idx < pos.len && pos[idx] != sel {
+			idx++
+		}
+		if idx >= pos.len {
+			out[i] = 0
+			continue
+		}
+		out[i] = idx
+		for j := idx; j > 0; j-- {
+			pos[j] = pos[j - 1]
+		}
+		pos[0] = sel
+	}
+	return out
+}
+
+// make_huffman_lengths returns code lengths for `freq` whose maximum does not exceed
+// `max_len`. Lengths are built with build_huffman_lengths_unbounded; whenever the
+// longest one is too long, every weight is replaced by `(weight >> 1) + 1` and the
+// lengths are rebuilt.
+fn make_huffman_lengths(freq []int, max_len int) []int {
+	mut scaled := freq.clone()
+	for {
+		lens := build_huffman_lengths_unbounded(scaled)
+		mut max_seen := 0
+		for l in lens {
+			if l > max_seen {
+				max_seen = l
+			}
+		}
+		if max_seen <= max_len {
+			return lens
+		}
+		for i in 0 .. scaled.len {
+			scaled[i] = (scaled[i] >> 1) + 1
+		}
+	}
+	// The loop above has no `break`, so this return is not reached as written.
+	return []int{}
+}
+
+fn build_huffman_lengths_unbounded(freq []int) []int {
+	n := freq.len
+	if n == 0 {
+		return []int{}
+	}
+	mut weight := []int{cap: 2 * n}
+	mut parent := []int{cap: 2 * n}
+	for i in 0 ..
n { + w := if freq[i] > 0 { freq[i] } else { 1 } + weight << w + parent << -1 + } + mut active := []int{len: n} + for i in 0 .. n { + active[i] = i + } + for active.len > 1 { + mut m1 := 0 + mut m2 := 1 + if weight[active[m2]] < weight[active[m1]] { + t := m1 + m1 = m2 + m2 = t + } + for i in 2 .. active.len { + idx := active[i] + if weight[idx] < weight[active[m1]] { + m2 = m1 + m1 = i + } else if weight[idx] < weight[active[m2]] { + m2 = i + } + } + a := active[m1] + b := active[m2] + new_idx := weight.len + weight << (weight[a] + weight[b]) + parent << -1 + parent[a] = new_idx + parent[b] = new_idx + if m1 > m2 { + active.delete(m1) + active.delete(m2) + } else { + active.delete(m2) + active.delete(m1) + } + active << new_idx + } + mut lengths := []int{len: n} + for i in 0 .. n { + mut l := 0 + mut p := parent[i] + for p != -1 { + l++ + p = parent[p] + } + if l == 0 { + l = 1 + } + lengths[i] = l + } + return lengths +} + +fn build_huffman_codes(lengths []int) []u32 { + mut max_len := 0 + for l in lengths { + if l > max_len { + max_len = l + } + } + mut bl_count := []int{len: max_len + 1} + for l in lengths { + if l > 0 { + bl_count[l]++ + } + } + mut next_code := []u32{len: max_len + 1} + mut code := u32(0) + for bits in 1 .. max_len + 1 { + code = (code + u32(bl_count[bits - 1])) << 1 + next_code[bits] = code + } + mut out := []u32{len: lengths.len} + for i, l in lengths { + if l == 0 { + continue + } + out[i] = next_code[l] + next_code[l]++ + } + return out +} + +struct DecodeTable { + min_len int + max_len int + base []int + limit []int + perm []int +} + +fn build_decode_table(lengths []int, alpha_size int) !DecodeTable { + mut min_len := 999 + mut max_len := 0 + for i in 0 .. alpha_size { + l := lengths[i] + if l < 1 || l > bzip2_max_code_len { + return error('bzip2: invalid huffman code length') + } + if l < min_len { + min_len = l + } + if l > max_len { + max_len = l + } + } + mut perm := []int{} + for l in min_len .. max_len + 1 { + for i in 0 .. 
alpha_size { + if lengths[i] == l { + perm << i + } + } + } + mut base := []int{len: max_len + 2} + mut limit := []int{len: max_len + 1} + for i in 0 .. alpha_size { + base[lengths[i] + 1]++ + } + for i in 1 .. base.len { + base[i] += base[i - 1] + } + mut vec := 0 + for l in min_len .. max_len + 1 { + vec += base[l + 1] - base[l] + limit[l] = vec - 1 + vec <<= 1 + } + for l in min_len + 1 .. max_len + 1 { + base[l] = ((limit[l - 1] + 1) * 2) - base[l] + } + return DecodeTable{ + min_len: min_len + max_len: max_len + base: base + limit: limit + perm: perm + } +} + +fn (t DecodeTable) decode(mut r BitReader) !int { + mut zn := t.min_len + mut zvec := int(r.read_bits(zn)!) + for zn <= t.max_len && zvec > t.limit[zn] { + zn++ + bit := int(r.read_bits(1)!) + zvec = (zvec * 2) | bit + } + if zn > t.max_len { + return error('bzip2: invalid huffman code') + } + idx := zvec - t.base[zn] + if idx < 0 || idx >= t.perm.len { + return error('bzip2: invalid huffman decode index') + } + return t.perm[idx] +} + +fn mtf_rle2_encode(data []u8, unseq_to_seq []int, n_in_use int) ([]int, int) { + mut mtf := []int{len: n_in_use} + for i in 0 .. 
n_in_use { + mtf[i] = i + } + eob := n_in_use + 1 + mut out := []int{} + mut zpend := 0 + for b in data { + seq := unseq_to_seq[int(b)] + mut pos := 0 + for pos < mtf.len && mtf[pos] != seq { + pos++ + } + if pos == 0 { + zpend++ + continue + } + if zpend > 0 { + emit_run_a_b(mut out, zpend) + zpend = 0 + } + out << (pos + 1) + move_to_front_int(mut mtf, pos) + } + if zpend > 0 { + emit_run_a_b(mut out, zpend) + } + out << eob + return out, n_in_use + 2 +} + +fn emit_run_a_b(mut out []int, run_len int) { + mut z := run_len - 1 + for { + if (z & 1) == 0 { + out << bzip2_runa + } else { + out << bzip2_runb + } + if z < 2 { + break + } + z = (z - 2) >> 1 + } +} + +fn move_to_front_int(mut arr []int, pos int) { + if pos <= 0 { + return + } + tmp := arr[pos] + for i := pos; i > 0; i-- { + arr[i] = arr[i - 1] + } + arr[0] = tmp +} + +fn rle1_encode(src []u8) []u8 { + if src.len == 0 { + return []u8{} + } + mut out := []u8{cap: src.len + (src.len / 4) + 8} + mut i := 0 + for i < src.len { + b := src[i] + mut run := 1 + for i + run < src.len && src[i + run] == b && run < max_i32 { + run++ + } + mut rem := run + for rem > 0 { + if rem <= 3 { + for _ in 0 .. rem { + out << b + } + rem = 0 + } else { + chunk := if rem > 259 { 259 } else { rem } + for _ in 0 .. 
4 {
+					out << b
+				}
+				out << u8(chunk - 4)
+				rem -= chunk
+			}
+		}
+		i += run
+	}
+	return out
+}
+
+// find_rle1_block_end returns the exclusive end index of the next block taken from
+// `src` starting at `start`: the longest prefix whose size after run-length encoding,
+// as computed by rle1_encoded_run_len, does not exceed `block_limit`.
+// A run that does not fit entirely is split at the largest prefix that still fits.
+// It returns `start` when `start` is at or past the end of `src`; otherwise the
+// result is always at least `start + 1`.
+fn find_rle1_block_end(src []u8, start int, block_limit int) int {
+	if start >= src.len {
+		return start
+	}
+	mut end := start
+	mut encoded_len := 0
+	for end < src.len {
+		b := src[end]
+		// Measure the run of identical bytes beginning at `end`.
+		mut run_len := 1
+		for end + run_len < src.len && src[end + run_len] == b {
+			run_len++
+		}
+		run_encoded_len := rle1_encoded_run_len(run_len)
+		if encoded_len + run_encoded_len <= block_limit {
+			encoded_len += run_encoded_len
+			end += run_len
+			continue
+		}
+		// The whole run does not fit: take as much of it as the remaining room allows.
+		available := block_limit - encoded_len
+		if available <= 0 {
+			break
+		}
+		take := max_run_prefix_for_rle1(run_len, available)
+		if take <= 0 {
+			break
+		}
+		end += take
+		break
+	}
+	// Guarantee progress even when nothing fit.
+	if end == start {
+		return start + 1
+	}
+	return end
+}
+
+// max_run_prefix_for_rle1 returns the largest length in 1..`run_len` whose encoded
+// size (rle1_encoded_run_len) is at most `encoded_limit`, found by binary search.
+// It returns 0 when no length fits or when either argument is not positive.
+// NOTE(review): the search relies on rle1_encoded_run_len never decreasing as the
+// run length grows.
+fn max_run_prefix_for_rle1(run_len int, encoded_limit int) int {
+	if run_len <= 0 || encoded_limit <= 0 {
+		return 0
+	}
+	mut lo := 1
+	mut hi := run_len
+	mut best := 0
+	for lo <= hi {
+		mid := lo + (hi - lo) / 2
+		if rle1_encoded_run_len(mid) <= encoded_limit {
+			best = mid
+			lo = mid + 1
+		} else {
+			hi = mid - 1
+		}
+	}
+	return best
+}
+
+// rle1_encoded_run_len returns the number of bytes a run of `run_len` identical bytes
+// is counted as after run-length encoding: runs of up to 3 bytes cost their own
+// length, every full 259 bytes cost 5, and a remainder costs itself when it is at
+// most 3 and 5 otherwise. Non-positive lengths give 0.
+fn rle1_encoded_run_len(run_len int) int {
+	if run_len <= 0 {
+		return 0
+	}
+	if run_len <= 3 {
+		return run_len
+	}
+	full_chunks := run_len / 259
+	rem := run_len % 259
+	mut out := full_chunks * 5
+	if rem == 0 {
+		return out
+	}
+	if rem <= 3 {
+		out += rem
+	} else {
+		out += 5
+	}
+	return out
+}
+
+fn rle1_decode(src []u8) ![]u8 {
+	mut out := []u8{cap: src.len}
+	mut i := 0
+	for i < src.len {
+		if i + 4 < src.len && src[i] == src[i + 1] && src[i] == src[i + 2] && src[i] == src[i + 3] {
+			run := int(src[i + 4]) + 4
+			for _ in 0 ..
run { + out << src[i] + } + i += 5 + } else { + out << src[i] + i++ + } + } + return out +} + +fn bwt_transform(data []u8) ([]u8, int) { + n := data.len + if n == 0 { + return []u8{}, 0 + } + mut sa := cyclic_suffix_array(data) + mut out := []u8{len: n} + mut orig_ptr := 0 + for i, s in sa { + if s == 0 { + orig_ptr = i + out[i] = data[n - 1] + } else { + out[i] = data[s - 1] + } + } + return out, orig_ptr +} + +fn inverse_bwt(last_col []u8, orig_ptr int) ![]u8 { + n := last_col.len + if n == 0 { + return []u8{} + } + if orig_ptr < 0 || orig_ptr >= n { + return error('bzip2: invalid bwt origin pointer') + } + mut count := []int{len: 256} + for b in last_col { + count[int(b)]++ + } + mut tots := []int{len: 256} + mut sum := 0 + for i in 0 .. 256 { + tots[i] = sum + sum += count[i] + } + mut tt := []int{len: n} + for i, b in last_col { + idx := int(b) + tt[tots[idx]] = i + tots[idx]++ + } + mut out := []u8{len: n} + mut tpos := orig_ptr + for i in 0 .. n { + tpos = tt[tpos] + out[i] = last_col[tpos] + } + return out +} + +fn cyclic_suffix_array(data []u8) []int { + n := data.len + if n <= 1 { + if n == 1 { + return [0] + } + return []int{} + } + mut sa := []int{len: n} + mut rank := []int{len: n} + for i in 0 .. n { + sa[i] = i + rank[i] = int(data[i]) + } + mut tmp_sa := []int{len: n} + mut new_rank := []int{len: n} + mut k := 1 + for k < n { + radix_sort_cyclic(mut sa, mut tmp_sa, rank, k) + new_rank[sa[0]] = 0 + mut classes := 1 + for i in 1 .. n { + a := sa[i - 1] + b := sa[i] + if rank[a] != rank[b] || rank[(a + k) % n] != rank[(b + k) % n] { + classes++ + } + new_rank[b] = classes - 1 + } + rank = new_rank.clone() + if classes == n { + break + } + k <<= 1 + } + return sa +} + +fn radix_sort_cyclic(mut sa []int, mut tmp []int, rank []int, k int) { + n := sa.len + m := if n > 256 { n } else { 256 } + mut count := []int{len: m} + for i in 0 .. n { + key := rank[(sa[i] + k) % n] + count[key]++ + } + mut pos := []int{len: m} + mut sum := 0 + for i in 0 .. 
m { + pos[i] = sum + sum += count[i] + } + for i in 0 .. n { + v := sa[i] + key := rank[(v + k) % n] + tmp[pos[key]] = v + pos[key]++ + } + count = []int{len: m} + for i in 0 .. n { + key := rank[tmp[i]] + count[key]++ + } + sum = 0 + for i in 0 .. m { + pos[i] = sum + sum += count[i] + } + for i in 0 .. n { + v := tmp[i] + key := rank[v] + sa[pos[key]] = v + pos[key]++ + } +} + +struct BitWriter { +mut: + out []u8 + bitbuf u64 + bit_count int +} + +fn (mut w BitWriter) write_byte(b u8) { + w.out << b +} + +fn (mut w BitWriter) write_bit(bit int) { + w.write_bits(1, u64(bit & 1)) +} + +fn (mut w BitWriter) write_bits(n int, value u64) { + if n <= 0 { + return + } + for i := n - 1; i >= 0; i-- { + b := (value >> u32(i)) & 1 + w.bitbuf = (w.bitbuf << 1) | b + w.bit_count++ + if w.bit_count == 8 { + w.out << u8(w.bitbuf & 0xff) + w.bitbuf = 0 + w.bit_count = 0 + } + } +} + +fn (mut w BitWriter) finish() ![]u8 { + if w.bit_count > 0 { + w.bitbuf <<= u32(8 - w.bit_count) + w.out << u8(w.bitbuf & 0xff) + w.bitbuf = 0 + w.bit_count = 0 + } + return w.out +} + +struct BitReader { + data []u8 +mut: + byte_pos int + bit_pos int +} + +fn (mut r BitReader) read_byte() !u8 { + if r.bit_pos != 0 { + return error('bzip2: attempted byte read on non-byte boundary') + } + if r.byte_pos >= r.data.len { + return error('bzip2: unexpected end of input') + } + b := r.data[r.byte_pos] + r.byte_pos++ + return b +} + +fn (mut r BitReader) read_bit() !int { + return int(r.read_bits(1)!) +} + +fn (mut r BitReader) read_bits(n int) !u64 { + if n < 0 || n > 56 { + return error('bzip2: invalid bit width') + } + mut out := u64(0) + for _ in 0 .. 
n { + if r.byte_pos >= r.data.len { + return error('bzip2: unexpected end of input') + } + b := r.data[r.byte_pos] + bit := (b >> u8(7 - r.bit_pos)) & u8(1) + out = (out << 1) | u64(bit) + r.bit_pos++ + if r.bit_pos == 8 { + r.bit_pos = 0 + r.byte_pos++ + } + } + return out +} + +fn (r &BitReader) is_aligned_to_byte_zero_padding() bool { + if r.bit_pos == 0 { + return r.byte_pos == r.data.len + } + if r.byte_pos >= r.data.len { + return false + } + mask := u8((1 << u8(8 - r.bit_pos)) - 1) + if (r.data[r.byte_pos] & mask) != 0 { + return false + } + for i in r.byte_pos + 1 .. r.data.len { + if r.data[i] != 0 { + return false + } + } + return true +} + +fn rotate_left_1(v u32) u32 { + return (v << 1) | (v >> 31) +} + +fn bzip2_crc32(data []u8) u32 { + mut crc := u32(0xffffffff) + for b in data { + table_idx := int(((crc >> 24) ^ u32(b)) & u32(0xff)) + crc = (crc << 8) ^ bzip2_crc32_table[table_idx] + } + return ~crc +} diff --git a/vlib/compress/bzip2/bzip2_test.v b/vlib/compress/bzip2/bzip2_test.v new file mode 100644 index 000000000..81bbfd99e --- /dev/null +++ b/vlib/compress/bzip2/bzip2_test.v @@ -0,0 +1,136 @@ +module bzip2 + +import encoding.hex + +fn must_decode_hex(s string) []u8 { + return hex.decode(s) or { panic(err) } +} + +fn test_roundtrip_empty_input() { + src := []u8{} + compressed := compress(src) or { panic(err) } + decompressed := decompress(compressed) or { panic(err) } + assert decompressed == src +} + +fn test_roundtrip_small_text() { + src := 'hello world\n'.bytes() + compressed := compress(src) or { panic(err) } + decompressed := decompress(compressed) or { panic(err) } + assert decompressed == src +} + +fn test_roundtrip_binary_data() { + mut src := []u8{len: 4096} + for i in 0 .. 
src.len { + src[i] = u8((i * 17 + 13) & 0xff) + } + compressed := compress(src, block_size: 1) or { panic(err) } + decompressed := decompress(compressed) or { panic(err) } + assert decompressed == src +} + +fn test_roundtrip_long_runs() { + src := 'a'.repeat(2000).bytes() + compressed := compress(src) or { panic(err) } + decompressed := decompress(compressed) or { panic(err) } + assert decompressed == src +} + +fn test_decompress_known_python_vector_empty() { + bz2 := must_decode_hex('425a683917724538509000000000') + plain := decompress(bz2) or { panic(err) } + assert plain == []u8{} +} + +fn test_decompress_known_python_vector_hello() { + bz2 := + must_decode_hex('425a68393141592653594eece83600000251800010400006449080200031064c4101a7a9a580bb9431f8bb9229c28482776741b0') + plain := decompress(bz2) or { panic(err) } + assert plain.bytestr() == 'hello world\n' +} + +fn test_decompress_known_python_vector_text() { + bz2 := + must_decode_hex('425a6839314159265359dc01b0d8000002d9800010410120080a00cc20200021a4d3688cd0806800e28a3de49f0b16b10d177245385090dc01b0d8') + plain := decompress(bz2) or { panic(err) } + assert plain.bytestr() == '1.test\ncopy ©\n' +} + +fn test_decompress_known_python_vector_repeated_a() { + bz2 := + must_decode_hex('425a6839314159265359ca3d8dfb000000010420000400200021008283177245385090ca3d8dfb') + plain := decompress(bz2) or { panic(err) } + assert plain == 'a'.repeat(200).bytes() +} + +fn test_decompress_rejects_invalid_header() { + _ := decompress('not-bzip2'.bytes()) or { + assert err.msg().contains('invalid header') + return + } + assert false +} + +fn test_decompress_rejects_crc_mismatch() { + mut bz2 := + must_decode_hex('425a68393141592653594eece83600000251800010400006449080200031064c4101a7a9a580bb9431f8bb9229c28482776741b0') + // Corrupt the stored block CRC field (bytes 10..13 in a single-block stream). 
+ bz2[10] ^= 0x01 + _ := decompress(bz2) or { + assert err.msg().contains('crc mismatch') + return + } + assert false +} + +fn test_selector_count_limit_boundaries() { + below_limit := selector_count_from_symbol_count(900050) or { panic(err) } + at_limit := selector_count_from_symbol_count(900100) or { panic(err) } + assert below_limit == 18001 + assert at_limit == 18002 + + _ := selector_count_from_symbol_count(900101) or { + assert err.msg().contains('invalid selector count') + return + } + assert false +} + +fn test_block_output_limit_guard() { + ensure_block_output_limit(0, 100000, 100000) or { panic(err) } + ensure_block_output_limit(99999, 1, 100000) or { panic(err) } + + ensure_block_output_limit(100000, 1, 100000) or { + assert err.msg().contains('block output exceeds declared block size') + return + } + assert false +} + +fn test_find_rle1_block_end_for_four_byte_runs() { + mut src := []u8{cap: 100000} + for i in 0 .. 25000 { + b := u8(i & 0xff) + for _ in 0 .. 4 { + src << b + } + } + end := find_rle1_block_end(src, 0, 100000) + assert end == 80000 + assert rle1_encode(src[0..end]).len == 100000 + assert rle1_encode(src[0..end + 4]).len > 100000 +} + +fn test_roundtrip_block_size_1_four_byte_runs() { + mut src := []u8{cap: 100000} + for i in 0 .. 25000 { + b := u8(i & 0xff) + for _ in 0 .. 
4 { + src << b + } + } + compressed := compress(src, block_size: 1) or { panic(err) } + decompressed := decompress(compressed) or { panic(err) } + assert decompressed == src +} diff --git a/vlib/compress/bzip2/interop/README.md b/vlib/compress/bzip2/interop/README.md new file mode 100644 index 000000000..e2d281bd8 --- /dev/null +++ b/vlib/compress/bzip2/interop/README.md @@ -0,0 +1,26 @@ +# bzip2 interop checker + +This helper verifies interoperability between: + +- V module: `compress.bzip2` +- System CLI: `bzip2` +- Python stdlib: `bz2` + +It runs deterministic test vectors and compression levels, compresses each vector with all +three producers, then cross-decompresses every produced stream with all three decoders. + +A case passes only if every decompressed output is byte-identical to the original input. + +Current levels are `1`, `6`, and `9` (see `compression_levels` in +`vlib/compress/bzip2/interop/bzip2_interop_check.vsh`). + +## Run + +```bash +v run vlib/compress/bzip2/interop/bzip2_interop_check.vsh +``` + +## Requirements + +- `bzip2` available in `PATH` +- `python3` with `bz2` diff --git a/vlib/compress/bzip2/interop/bzip2_interop_check.vsh b/vlib/compress/bzip2/interop/bzip2_interop_check.vsh new file mode 100755 index 000000000..401998ab9 --- /dev/null +++ b/vlib/compress/bzip2/interop/bzip2_interop_check.vsh @@ -0,0 +1,166 @@ +#!/usr/bin/env -S v + +import compress.bzip2 + +const compression_levels = [1, 6, 9]! 
 +
+// TestVector is one named input payload for the interop run.
+struct TestVector {
+	name string // label used in progress output and in the per-case directory name
+	data []u8 // plaintext bytes that every producer compresses
+}
+
+// main drives the whole check: it verifies the external tools, then runs every
+// test vector at every entry of `compression_levels` and prints a summary.
+// Exit codes as written here: 2 when ensure_tools fails, 1 when a case fails.
+fn main() {
+	ensure_tools() or {
+		eprintln('SKIP: ${err.msg()}')
+		exit(2)
+	}
+	vectors := make_test_vectors()
+	mut total_checks := 0
+	mut total_runs := 0
+	// Scratch directory shared by all cases; the name embeds getpid().
+	tmp_root := join_path(temp_dir(), 'v_bzip2_interop_${getpid()}')
+	mkdir_all(tmp_root) or { panic(err) }
+	defer {
+		// Best-effort cleanup: an error from rmdir_all is deliberately ignored.
+		rmdir_all(tmp_root) or {}
+	}
+	for level in compression_levels {
+		for i, vec in vectors {
+			// NOTE(review): exit(1) below presumably skips the deferred cleanup, so
+			// tmp_root would be left behind on failure - confirm that this is intended.
+			// The trailing `0` looks like it only gives the or-block an int value.
+			total_checks += run_case(tmp_root, level, i, vec) or {
+				eprintln('FAIL: ${vec.name} (level=${level}): ${err}')
+				exit(1)
+				0
+			}
+			total_runs++
+			println('ok ${total_runs}/${vectors.len * compression_levels.len}: ${vec.name} (level=${level}, ${vec.data.len} bytes)')
+		}
+	}
+	println('PASS: ${vectors.len} vectors x ${compression_levels.len} levels, ${total_checks} cross-checks')
+}
+
+// ensure_tools returns an error unless both probe commands succeed:
+// `bzip2 --help` and `python3 -c 'import bz2'` (output discarded in both cases).
+fn ensure_tools() ! {
+	must_succeed('bzip2 --help >/dev/null 2>&1', 'system bzip2 command is not available')!
+	must_succeed("python3 -c 'import bz2' >/dev/null 2>&1",
+		'python3 with bz2 module is not available')!
+}
+
+// make_test_vectors builds the fixed list of inputs, in this order: an empty
+// payload, a sentence repeated 64 times, 10000 copies of `A`, the byte values
+// 0..255 repeated 4 times, and 65536 bytes produced by lcg_bytes.
+fn make_test_vectors() []TestVector {
+	mut vectors := []TestVector{}
+	vectors << TestVector{'empty', []u8{}}
+	vectors << TestVector{'ascii_text', 'The quick brown fox jumps over the lazy dog.\n'.repeat(64).bytes()}
+	vectors << TestVector{'repeated_byte', []u8{len: 10000, init: `A`}}
+	vectors << TestVector{'all_bytes_x4', all_bytes_repeated(4)}
+	vectors << TestVector{'lcg_64k', lcg_bytes(65536)}
+	return vectors
+}
+
+// run_case compresses `vec.data` at `level` with the V module, the bzip2 CLI and
+// Python, then decompresses each produced stream with all three decoders and
+// compares every result against `vec.data`.
+// It returns the number of comparisons made (three per producer) and propagates
+// the first error from any step. Work files go in a per-case directory under
+// `tmp_root` named from `level`, `case_idx` and `vec.name`.
+fn run_case(tmp_root string, level int, case_idx int, vec TestVector) !int {
+	case_dir := join_path(tmp_root, 'case_l${level}_${case_idx:02}_${vec.name}')
+	mkdir_all(case_dir)!
+
+	// One compressed stream per producer; `level` is passed as block_size to the V module.
+	v_bz2 := bzip2.compress(vec.data, block_size: level)!
+	cli_bz2 := cli_compress(case_dir, vec.data, level)!
+	py_bz2 := py_compress(case_dir, vec.data, level)!
+
+	mut checks := 0
+
+	// Keyed by producer name; the key is reused in labels and work-file names.
+	producers := {
+		'v':   v_bz2
+		'cli': cli_bz2
+		'py':  py_bz2
+	}
+	for producer, compressed in producers {
+		v_plain := bzip2.decompress(compressed)!
+		assert_equal_bytes('v.decompress(${producer}.compress, level=${level})', vec.data, v_plain)!
+		checks++
+
+		cli_plain := cli_decompress(case_dir, producer, compressed)!
+		assert_equal_bytes('cli.decompress(${producer}.compress, level=${level})', vec.data,
+			cli_plain)!
+		checks++
+
+		py_plain := py_decompress(case_dir, producer, compressed)!
+		assert_equal_bytes('py.decompress(${producer}.compress, level=${level})', vec.data,
+			py_plain)!
+		checks++
+	}
+	return checks
+}
+
+// cli_compress writes `plain` to `plain.in` in `case_dir`, runs
+// `bzip2 -<level> -c --` on it with stdout redirected to `cli_l<level>.bz2`,
+// and returns the bytes read back from that output file.
+fn cli_compress(case_dir string, plain []u8, level int) ![]u8 {
+	in_path := join_path(case_dir, 'plain.in')
+	out_path := join_path(case_dir, 'cli_l${level}.bz2')
+	write_file_array(in_path, plain)!
+	must_succeed('bzip2 -${level} -c -- ${shell_quote(in_path)} > ${shell_quote(out_path)}',
+		'bzip2 compression failed')!
+	return read_bytes(out_path)!
+}
+
+// cli_decompress writes `compressed` to `<producer>.for_cli.bz2` in `case_dir`,
+// runs `bzip2 -d -c --` on it with stdout redirected to `<producer>.from_cli.out`,
+// and returns the bytes read back from that output file.
+fn cli_decompress(case_dir string, producer string, compressed []u8) ![]u8 {
+	in_path := join_path(case_dir, '${producer}.for_cli.bz2')
+	out_path := join_path(case_dir, '${producer}.from_cli.out')
+	write_file_array(in_path, compressed)!
+	must_succeed('bzip2 -d -c -- ${shell_quote(in_path)} > ${shell_quote(out_path)}',
+		'bzip2 decompression failed')!
+	return read_bytes(out_path)!
+}
+
+// py_compress writes `plain` to `plain_py.in` in `case_dir` and runs a python3
+// one-liner that reads argv[1], calls bz2.compress with compresslevel=argv[3],
+// and writes the result to argv[2] (`py_l<level>.bz2`). Returns that file's bytes.
+fn py_compress(case_dir string, plain []u8, level int) ![]u8 {
+	in_path := join_path(case_dir, 'plain_py.in')
+	out_path := join_path(case_dir, 'py_l${level}.bz2')
+	write_file_array(in_path, plain)!
+	py_code := 'import bz2, pathlib, sys; p=pathlib.Path(sys.argv[1]); o=pathlib.Path(sys.argv[2]); l=int(sys.argv[3]); o.write_bytes(bz2.compress(p.read_bytes(), compresslevel=l))'
+	must_succeed('python3 -c ${shell_quote(py_code)} ${shell_quote(in_path)} ${shell_quote(out_path)} ${level}',
+		'python bz2 compression failed')!
+	return read_bytes(out_path)!
+}
+
+// py_decompress writes `compressed` to `<producer>.for_py.bz2` in `case_dir` and
+// runs a python3 one-liner that reads argv[1], calls bz2.decompress, and writes
+// the result to argv[2] (`<producer>.from_py.out`). Returns that file's bytes.
+fn py_decompress(case_dir string, producer string, compressed []u8) ![]u8 {
+	in_path := join_path(case_dir, '${producer}.for_py.bz2')
+	out_path := join_path(case_dir, '${producer}.from_py.out')
+	write_file_array(in_path, compressed)!
+	py_code := 'import bz2, pathlib, sys; p=pathlib.Path(sys.argv[1]); o=pathlib.Path(sys.argv[2]); o.write_bytes(bz2.decompress(p.read_bytes()))'
+	must_succeed('python3 -c ${shell_quote(py_code)} ${shell_quote(in_path)} ${shell_quote(out_path)}',
+		'python bz2 decompression failed')!
+	return read_bytes(out_path)!
+}
+
+// all_bytes_repeated returns the byte values 0, 1, ..., 255 in order, repeated
+// `times` times back to back.
+fn all_bytes_repeated(times int) []u8 {
+	mut out := []u8{cap: 256 * times}
+	for _ in 0 .. times {
+		for i in 0 .. 256 {
+			out << u8(i)
+		}
+	}
+	return out
+}
+
+// lcg_bytes returns `n` bytes from a linear congruential sequence with a fixed
+// seed (0x12345678), so the output is the same on every call with the same `n`.
+fn lcg_bytes(n int) []u8 {
+	mut out := []u8{len: n}
+	mut x := u32(0x12345678)
+	for i in 0 .. n {
+		// State update x = x * 1664525 + 1013904223 in u32 arithmetic
+		// (presumably wrapping on overflow - confirm).
+		x = x * u32(1664525) + u32(1013904223)
+		// Emit bits 16..23 of the state rather than the lowest byte.
+		out[i] = u8((x >> 16) & u32(0xff))
+	}
+	return out
+}
+
+// assert_equal_bytes returns an error prefixed with `label` when `expected` and
+// `got` differ: first on a length mismatch, otherwise at the first differing
+// offset. It returns normally when both slices hold the same bytes.
+fn assert_equal_bytes(label string, expected []u8, got []u8) ! {
+	if expected.len != got.len {
+		return error('${label}: length mismatch expected=${expected.len} got=${got.len}')
+	}
+	for i in 0 .. expected.len {
+		if expected[i] != got[i] {
+			return error('${label}: byte mismatch at offset ${i}')
+		}
+	}
+}
+
+// must_succeed runs `command` via execute() and returns an error when its exit
+// code is non-zero. The error text carries `context`, the command line, the exit
+// code and the output reported by execute().
+fn must_succeed(command string, context string) ! {
+	res := execute(command)
+	if res.exit_code != 0 {
+		return error('${context}\ncommand: ${command}\nexit_code: ${res.exit_code}\n${res.output}')
+	}
+}
+
+// shell_quote wraps `s` in single quotes and rewrites every embedded `'` as
+// `'\''`, for use as a single argument in the command strings built above.
+fn shell_quote(s string) string {
+	return "'${s.replace("'", "'\\''")}'"
+}
-- 
2.39.5