v / vlib / compress / bzip2 / bzip2.v
1175 lines · 1121 sloc · 25.92 KB · 7cf08b805f5b8e33f5c465f1ea3e01f8e345ac8c
Raw
1module bzip2
2
3const bzip2_block_magic = u64(0x314159265359)
4const bzip2_eos_magic = u64(0x177245385090)
5const bzip2_runa = 0
6const bzip2_runb = 1
7const bzip2_max_groups = 6
8const bzip2_max_alpha = 258
9const bzip2_group_size = 50
10const bzip2_max_code_len = 20
11const bzip2_max_selectors = 18002
12
13// vfmt off
14const bzip2_crc32_table = [..]u32[
15 0x00000000, 0x04c11db7, 0x09823b6e, 0x0d4326d9, 0x130476dc, 0x17c56b6b, 0x1a864db2, 0x1e475005,
16 0x2608edb8, 0x22c9f00f, 0x2f8ad6d6, 0x2b4bcb61, 0x350c9b64, 0x31cd86d3, 0x3c8ea00a, 0x384fbdbd,
17 0x4c11db70, 0x48d0c6c7, 0x4593e01e, 0x4152fda9, 0x5f15adac, 0x5bd4b01b, 0x569796c2, 0x52568b75,
18 0x6a1936c8, 0x6ed82b7f, 0x639b0da6, 0x675a1011, 0x791d4014, 0x7ddc5da3, 0x709f7b7a, 0x745e66cd,
19 0x9823b6e0, 0x9ce2ab57, 0x91a18d8e, 0x95609039, 0x8b27c03c, 0x8fe6dd8b, 0x82a5fb52, 0x8664e6e5,
20 0xbe2b5b58, 0xbaea46ef, 0xb7a96036, 0xb3687d81, 0xad2f2d84, 0xa9ee3033, 0xa4ad16ea, 0xa06c0b5d,
21 0xd4326d90, 0xd0f37027, 0xddb056fe, 0xd9714b49, 0xc7361b4c, 0xc3f706fb, 0xceb42022, 0xca753d95,
22 0xf23a8028, 0xf6fb9d9f, 0xfbb8bb46, 0xff79a6f1, 0xe13ef6f4, 0xe5ffeb43, 0xe8bccd9a, 0xec7dd02d,
23 0x34867077, 0x30476dc0, 0x3d044b19, 0x39c556ae, 0x278206ab, 0x23431b1c, 0x2e003dc5, 0x2ac12072,
24 0x128e9dcf, 0x164f8078, 0x1b0ca6a1, 0x1fcdbb16, 0x018aeb13, 0x054bf6a4, 0x0808d07d, 0x0cc9cdca,
25 0x7897ab07, 0x7c56b6b0, 0x71159069, 0x75d48dde, 0x6b93dddb, 0x6f52c06c, 0x6211e6b5, 0x66d0fb02,
26 0x5e9f46bf, 0x5a5e5b08, 0x571d7dd1, 0x53dc6066, 0x4d9b3063, 0x495a2dd4, 0x44190b0d, 0x40d816ba,
27 0xaca5c697, 0xa864db20, 0xa527fdf9, 0xa1e6e04e, 0xbfa1b04b, 0xbb60adfc, 0xb6238b25, 0xb2e29692,
28 0x8aad2b2f, 0x8e6c3698, 0x832f1041, 0x87ee0df6, 0x99a95df3, 0x9d684044, 0x902b669d, 0x94ea7b2a,
29 0xe0b41de7, 0xe4750050, 0xe9362689, 0xedf73b3e, 0xf3b06b3b, 0xf771768c, 0xfa325055, 0xfef34de2,
30 0xc6bcf05f, 0xc27dede8, 0xcf3ecb31, 0xcbffd686, 0xd5b88683, 0xd1799b34, 0xdc3abded, 0xd8fba05a,
31 0x690ce0ee, 0x6dcdfd59, 0x608edb80, 0x644fc637, 0x7a089632, 0x7ec98b85, 0x738aad5c, 0x774bb0eb,
32 0x4f040d56, 0x4bc510e1, 0x46863638, 0x42472b8f, 0x5c007b8a, 0x58c1663d, 0x558240e4, 0x51435d53,
33 0x251d3b9e, 0x21dc2629, 0x2c9f00f0, 0x285e1d47, 0x36194d42, 0x32d850f5, 0x3f9b762c, 0x3b5a6b9b,
34 0x0315d626, 0x07d4cb91, 0x0a97ed48, 0x0e56f0ff, 0x1011a0fa, 0x14d0bd4d, 0x19939b94, 0x1d528623,
35 0xf12f560e, 0xf5ee4bb9, 0xf8ad6d60, 0xfc6c70d7, 0xe22b20d2, 0xe6ea3d65, 0xeba91bbc, 0xef68060b,
36 0xd727bbb6, 0xd3e6a601, 0xdea580d8, 0xda649d6f, 0xc423cd6a, 0xc0e2d0dd, 0xcda1f604, 0xc960ebb3,
37 0xbd3e8d7e, 0xb9ff90c9, 0xb4bcb610, 0xb07daba7, 0xae3afba2, 0xaafbe615, 0xa7b8c0cc, 0xa379dd7b,
38 0x9b3660c6, 0x9ff77d71, 0x92b45ba8, 0x9675461f, 0x8832161a, 0x8cf30bad, 0x81b02d74, 0x857130c3,
39 0x5d8a9099, 0x594b8d2e, 0x5408abf7, 0x50c9b640, 0x4e8ee645, 0x4a4ffbf2, 0x470cdd2b, 0x43cdc09c,
40 0x7b827d21, 0x7f436096, 0x7200464f, 0x76c15bf8, 0x68860bfd, 0x6c47164a, 0x61043093, 0x65c52d24,
41 0x119b4be9, 0x155a565e, 0x18197087, 0x1cd86d30, 0x029f3d35, 0x065e2082, 0x0b1d065b, 0x0fdc1bec,
42 0x3793a651, 0x3352bbe6, 0x3e119d3f, 0x3ad08088, 0x2497d08d, 0x2056cd3a, 0x2d15ebe3, 0x29d4f654,
43 0xc5a92679, 0xc1683bce, 0xcc2b1d17, 0xc8ea00a0, 0xd6ad50a5, 0xd26c4d12, 0xdf2f6bcb, 0xdbee767c,
44 0xe3a1cbc1, 0xe760d676, 0xea23f0af, 0xeee2ed18, 0xf0a5bd1d, 0xf464a0aa, 0xf9278673, 0xfde69bc4,
45 0x89b8fd09, 0x8d79e0be, 0x803ac667, 0x84fbdbd0, 0x9abc8bd5, 0x9e7d9662, 0x933eb0bb, 0x97ffad0c,
46 0xafb010b1, 0xab710d06, 0xa6322bdf, 0xa2f33668, 0xbcb4666d, 0xb8757bda, 0xb5365d03, 0xb1f740b4
47]
48// vfmt on
49
50@[params]
51pub struct CompressParams {
52pub:
53 block_size int = 9 // Valid range is 1..9 (100k to 900k block size).
54}
55
56@[params]
57pub struct DecompressParams {
58pub:
59 verify_crc bool = true
60}
61
62// compress compresses `src` into a bzip2 byte stream.
63pub fn compress(src []u8, params CompressParams) ![]u8 {
64 if params.block_size < 1 || params.block_size > 9 {
65 return error('bzip2: block_size must be in 1..9')
66 }
67 mut w := BitWriter{
68 out: []u8{}
69 }
70
71 w.write_byte(`B`)
72 w.write_byte(`Z`)
73 w.write_byte(`h`)
74 w.write_byte(u8(`0`) + u8(params.block_size))
75
76 max_block := params.block_size * 100000
77 mut stream_crc := u32(0)
78 if src.len > 0 {
79 mut pos := 0
80 for pos < src.len {
81 end := find_rle1_block_end(src, pos, max_block)
82 chunk := src[pos..end]
83 block := encode_block(chunk)!
84 w.write_bits(48, bzip2_block_magic)
85 w.write_bits(32, u64(block.block_crc))
86 w.write_bit(0) // randomized flag; unsupported legacy mode
87 w.write_bits(24, u64(block.orig_ptr))
88 write_in_use_map(mut w, block.in_use)
89 w.write_bits(3, u64(block.n_groups))
90 w.write_bits(15, u64(block.selectors.len))
91 for sel in block.selector_mtf {
92 for _ in 0 .. sel {
93 w.write_bit(1)
94 }
95 w.write_bit(0)
96 }
97 for g in 0 .. block.n_groups {
98 write_code_lengths(mut w, block.code_lengths[g], block.alpha_size)
99 }
100 mut grp_idx := 0
101 mut grp_pos := 0
102 for sym in block.symbols {
103 if grp_pos == 0 {
104 grp_idx = block.selectors[grp_idx]
105 }
106 code := block.codes[grp_idx][sym]
107 len := block.code_lengths[grp_idx][sym]
108 w.write_bits(len, u64(code))
109 grp_pos++
110 if grp_pos == bzip2_group_size {
111 grp_pos = 0
112 grp_idx++
113 }
114 }
115 stream_crc = rotate_left_1(stream_crc) ^ block.block_crc
116 pos = end
117 }
118 }
119 w.write_bits(48, bzip2_eos_magic)
120 w.write_bits(32, u64(stream_crc))
121 return w.finish()
122}
123
124// decompress decompresses a bzip2 byte stream.
125pub fn decompress(src []u8, params DecompressParams) ![]u8 {
126 mut r := BitReader{
127 data: src
128 }
129 if r.read_byte()! != `B` || r.read_byte()! != `Z` || r.read_byte()! != `h` {
130 return error('bzip2: invalid header')
131 }
132 lvl := r.read_byte()!
133 if lvl < `1` || lvl > `9` {
134 return error('bzip2: invalid block size marker')
135 }
136 block_limit := int(lvl - `0`) * 100000
137 mut out := []u8{}
138 mut stream_crc := u32(0)
139 for {
140 magic := r.read_bits(48)!
141 if magic == bzip2_eos_magic {
142 stored_stream_crc := u32(r.read_bits(32)!)
143 if params.verify_crc && stored_stream_crc != stream_crc {
144 return error('bzip2: stream crc mismatch')
145 }
146 if !r.is_aligned_to_byte_zero_padding() {
147 return error('bzip2: trailing non-zero bits')
148 }
149 return out
150 }
151 if magic != bzip2_block_magic {
152 return error('bzip2: invalid block magic')
153 }
154 block_crc := u32(r.read_bits(32)!)
155 randomized := r.read_bit()!
156 if randomized != 0 {
157 return error('bzip2: randomized blocks are not supported')
158 }
159 orig_ptr := int(r.read_bits(24)!)
160 in_use := read_in_use_map(mut r)!
161 mut seq_to_unseq := []u8{}
162 for i, used in in_use {
163 if used {
164 seq_to_unseq << u8(i)
165 }
166 }
167 n_in_use := seq_to_unseq.len
168 alpha_size := n_in_use + 2
169 if alpha_size < 2 || alpha_size > bzip2_max_alpha {
170 return error('bzip2: invalid alphabet size')
171 }
172 n_groups := int(r.read_bits(3)!)
173 if n_groups < 2 || n_groups > bzip2_max_groups {
174 return error('bzip2: invalid huffman group count')
175 }
176 n_selectors := int(r.read_bits(15)!)
177 if n_selectors < 1 || n_selectors > bzip2_max_selectors {
178 return error('bzip2: invalid selector count')
179 }
180 mut selectors_mtf := []int{len: n_selectors}
181 for i in 0 .. n_selectors {
182 mut c := 0
183 for {
184 b := r.read_bit()!
185 if b == 0 {
186 break
187 }
188 c++
189 if c >= n_groups {
190 return error('bzip2: invalid selector mtf value')
191 }
192 }
193 selectors_mtf[i] = c
194 }
195 selectors := decode_selector_mtf(selectors_mtf, n_groups)!
196
197 mut code_lengths := [][]int{len: n_groups, init: []int{len: alpha_size}}
198 for g in 0 .. n_groups {
199 mut curr := int(r.read_bits(5)!)
200 if curr < 1 || curr > bzip2_max_code_len {
201 return error('bzip2: invalid initial huffman code length')
202 }
203 for i in 0 .. alpha_size {
204 for {
205 flag := r.read_bit()!
206 if flag == 0 {
207 break
208 }
209 up_down := r.read_bit()!
210 if up_down == 0 {
211 curr++
212 } else {
213 curr--
214 }
215 if curr < 1 || curr > bzip2_max_code_len {
216 return error('bzip2: invalid huffman code length delta')
217 }
218 }
219 code_lengths[g][i] = curr
220 }
221 }
222
223 mut tables := []DecodeTable{len: n_groups}
224 for g in 0 .. n_groups {
225 tables[g] = build_decode_table(code_lengths[g], alpha_size)!
226 }
227
228 eob := n_in_use + 1
229 mut selector_index := 0
230 mut group_pos := 0
231 mut mtf := []int{len: n_in_use}
232 for i in 0 .. n_in_use {
233 mtf[i] = i
234 }
235 mut decoded_syms := []int{}
236 mut next_sym := 0
237 for {
238 if group_pos == 0 {
239 if selector_index >= selectors.len {
240 return error('bzip2: selectors exhausted')
241 }
242 }
243 grp := selectors[selector_index]
244 next_sym = tables[grp].decode(mut r)!
245 group_pos++
246 if group_pos == bzip2_group_size {
247 group_pos = 0
248 selector_index++
249 }
250 if next_sym == eob {
251 break
252 }
253 if next_sym == bzip2_runa || next_sym == bzip2_runb {
254 mut s := i64(-1)
255 mut n := i64(1)
256 remaining := block_limit - decoded_syms.len
257 if remaining < 1 {
258 return error('bzip2: block output exceeds declared block size')
259 }
260 for {
261 if next_sym == bzip2_runa {
262 s += n
263 } else {
264 s += n * 2
265 }
266 if s + 1 > i64(remaining) {
267 return error('bzip2: block output exceeds declared block size')
268 }
269 n *= 2
270 if group_pos == 0 {
271 if selector_index >= selectors.len {
272 return error('bzip2: selectors exhausted in run')
273 }
274 }
275 grp2 := selectors[selector_index]
276 next_sym = tables[grp2].decode(mut r)!
277 group_pos++
278 if group_pos == bzip2_group_size {
279 group_pos = 0
280 selector_index++
281 }
282 if next_sym != bzip2_runa && next_sym != bzip2_runb {
283 break
284 }
285 }
286 run_len := int(s + 1)
287 if run_len < 1 {
288 return error('bzip2: invalid run length')
289 }
290 if mtf.len == 0 {
291 return error('bzip2: invalid run with empty mtf table')
292 }
293 ensure_block_output_limit(decoded_syms.len, run_len, block_limit)!
294 for _ in 0 .. run_len {
295 decoded_syms << mtf[0]
296 }
297 if next_sym == eob {
298 break
299 }
300 }
301 if next_sym == eob {
302 break
303 }
304 if next_sym < 2 || next_sym > eob {
305 return error('bzip2: invalid symbol value')
306 }
307 pos := next_sym - 1
308 if pos >= mtf.len {
309 return error('bzip2: mtf index out of range')
310 }
311 sym := mtf[pos]
312 move_to_front_int(mut mtf, pos)
313 ensure_block_output_limit(decoded_syms.len, 1, block_limit)!
314 decoded_syms << sym
315 }
316
317 mut bwt_bytes := []u8{cap: decoded_syms.len}
318 for s in decoded_syms {
319 if s < 0 || s >= seq_to_unseq.len {
320 return error('bzip2: decoded symbol is out of sequence map range')
321 }
322 bwt_bytes << seq_to_unseq[s]
323 }
324
325 rle1 := inverse_bwt(bwt_bytes, orig_ptr)!
326 plain := rle1_decode(rle1)!
327 if params.verify_crc {
328 calc := bzip2_crc32(plain)
329 if calc != block_crc {
330 return error('bzip2: block crc mismatch')
331 }
332 }
333 stream_crc = rotate_left_1(stream_crc) ^ block_crc
334 out << plain
335 }
336 return error('bzip2: unexpected end of stream')
337}
338
339struct EncodedBlock {
340 block_crc u32
341 orig_ptr int
342 in_use []bool
343 alpha_size int
344 n_groups int
345 selectors []int
346 selector_mtf []int
347 symbols []int
348 code_lengths [][]int
349 codes [][]u32
350}
351
352fn encode_block(src []u8) !EncodedBlock {
353 block_crc := bzip2_crc32(src)
354 rle1 := rle1_encode(src)
355 if rle1.len == 0 {
356 return error('bzip2: internal error, empty block after rle1')
357 }
358 bwt, orig_ptr := bwt_transform(rle1)
359 mut in_use := []bool{len: 256}
360 for b in bwt {
361 in_use[int(b)] = true
362 }
363 mut seq_to_unseq := []u8{}
364 for i, used in in_use {
365 if used {
366 seq_to_unseq << u8(i)
367 }
368 }
369 mut unseq_to_seq := []int{len: 256, init: -1}
370 for i, b in seq_to_unseq {
371 unseq_to_seq[int(b)] = i
372 }
373 mtf_symbols, alpha_size := mtf_rle2_encode(bwt, unseq_to_seq, seq_to_unseq.len)
374 if mtf_symbols.len == 0 {
375 return error('bzip2: internal error, empty symbol stream')
376 }
377 n_groups := select_group_count(mtf_symbols.len)
378 n_selectors := selector_count_from_symbol_count(mtf_symbols.len)!
379 mut selectors := []int{len: n_selectors, init: 0}
380 selector_mtf := encode_selector_mtf(selectors, n_groups)
381 freq := symbol_freq(mtf_symbols, alpha_size)
382 lens := make_huffman_lengths(freq, bzip2_max_code_len)
383 codes := build_huffman_codes(lens)
384 mut code_lengths := [][]int{len: n_groups}
385 mut code_words := [][]u32{len: n_groups}
386 for i in 0 .. n_groups {
387 code_lengths[i] = lens.clone()
388 code_words[i] = codes.clone()
389 }
390 return EncodedBlock{
391 block_crc: block_crc
392 orig_ptr: orig_ptr
393 in_use: in_use
394 alpha_size: alpha_size
395 n_groups: n_groups
396 selectors: selectors
397 selector_mtf: selector_mtf
398 symbols: mtf_symbols
399 code_lengths: code_lengths
400 codes: code_words
401 }
402}
403
404fn select_group_count(n_syms int) int {
405 if n_syms < 200 {
406 return 2
407 }
408 if n_syms < 600 {
409 return 3
410 }
411 if n_syms < 1200 {
412 return 4
413 }
414 if n_syms < 2400 {
415 return 5
416 }
417 return 6
418}
419
420fn selector_count_from_symbol_count(n_symbols int) !int {
421 if n_symbols < 1 {
422 return error('bzip2: invalid selector count')
423 }
424 n_selectors := (n_symbols + bzip2_group_size - 1) / bzip2_group_size
425 if n_selectors < 1 || n_selectors > bzip2_max_selectors {
426 return error('bzip2: invalid selector count')
427 }
428 return n_selectors
429}
430
431fn ensure_block_output_limit(decoded_len int, add_len int, block_limit int) ! {
432 if decoded_len < 0 || add_len < 0 || block_limit < 0 {
433 return error('bzip2: invalid block output state')
434 }
435 if decoded_len > block_limit || add_len > block_limit - decoded_len {
436 return error('bzip2: block output exceeds declared block size')
437 }
438}
439
440fn symbol_freq(symbols []int, alpha_size int) []int {
441 mut freq := []int{len: alpha_size}
442 for s in symbols {
443 if s >= 0 && s < alpha_size {
444 freq[s]++
445 }
446 }
447 for i in 0 .. alpha_size {
448 if freq[i] == 0 {
449 freq[i] = 1
450 }
451 }
452 return freq
453}
454
455fn write_in_use_map(mut w BitWriter, in_use []bool) {
456 mut group_used := []bool{len: 16}
457 for i in 0 .. 16 {
458 for j in 0 .. 16 {
459 if in_use[i * 16 + j] {
460 group_used[i] = true
461 break
462 }
463 }
464 }
465 for g in group_used {
466 w.write_bit(if g { 1 } else { 0 })
467 }
468 for i in 0 .. 16 {
469 if !group_used[i] {
470 continue
471 }
472 for j in 0 .. 16 {
473 w.write_bit(if in_use[i * 16 + j] { 1 } else { 0 })
474 }
475 }
476}
477
478fn read_in_use_map(mut r BitReader) ![]bool {
479 mut group_used := []bool{len: 16}
480 for i in 0 .. 16 {
481 group_used[i] = r.read_bit()! == 1
482 }
483 mut in_use := []bool{len: 256}
484 for i in 0 .. 16 {
485 if !group_used[i] {
486 continue
487 }
488 for j in 0 .. 16 {
489 in_use[i * 16 + j] = r.read_bit()! == 1
490 }
491 }
492 return in_use
493}
494
495fn write_code_lengths(mut w BitWriter, lengths []int, alpha_size int) {
496 mut curr := lengths[0]
497 w.write_bits(5, u64(curr))
498 for i in 0 .. alpha_size {
499 target := lengths[i]
500 for curr < target {
501 w.write_bit(1)
502 w.write_bit(0)
503 curr++
504 }
505 for curr > target {
506 w.write_bit(1)
507 w.write_bit(1)
508 curr--
509 }
510 w.write_bit(0)
511 }
512}
513
514fn decode_selector_mtf(vals []int, n_groups int) ![]int {
515 mut pos := []int{len: n_groups}
516 for i in 0 .. n_groups {
517 pos[i] = i
518 }
519 mut out := []int{len: vals.len}
520 for i, v in vals {
521 if v < 0 || v >= n_groups {
522 return error('bzip2: selector mtf value out of range')
523 }
524 sel := pos[v]
525 for j := v; j > 0; j-- {
526 pos[j] = pos[j - 1]
527 }
528 pos[0] = sel
529 out[i] = sel
530 }
531 return out
532}
533
534fn encode_selector_mtf(selectors []int, n_groups int) []int {
535 mut pos := []int{len: n_groups}
536 for i in 0 .. n_groups {
537 pos[i] = i
538 }
539 mut out := []int{len: selectors.len}
540 for i, sel in selectors {
541 mut idx := 0
542 for idx < pos.len && pos[idx] != sel {
543 idx++
544 }
545 if idx >= pos.len {
546 out[i] = 0
547 continue
548 }
549 out[i] = idx
550 for j := idx; j > 0; j-- {
551 pos[j] = pos[j - 1]
552 }
553 pos[0] = sel
554 }
555 return out
556}
557
558fn make_huffman_lengths(freq []int, max_len int) []int {
559 mut scaled := freq.clone()
560 for {
561 lens := build_huffman_lengths_unbounded(scaled)
562 mut max_seen := 0
563 for l in lens {
564 if l > max_seen {
565 max_seen = l
566 }
567 }
568 if max_seen <= max_len {
569 return lens
570 }
571 for i in 0 .. scaled.len {
572 scaled[i] = (scaled[i] >> 1) + 1
573 }
574 }
575 return []int{}
576}
577
578fn build_huffman_lengths_unbounded(freq []int) []int {
579 n := freq.len
580 if n == 0 {
581 return []int{}
582 }
583 mut weight := []int{cap: 2 * n}
584 mut parent := []int{cap: 2 * n}
585 for i in 0 .. n {
586 w := if freq[i] > 0 { freq[i] } else { 1 }
587 weight << w
588 parent << -1
589 }
590 mut active := []int{len: n}
591 for i in 0 .. n {
592 active[i] = i
593 }
594 for active.len > 1 {
595 mut m1 := 0
596 mut m2 := 1
597 if weight[active[m2]] < weight[active[m1]] {
598 t := m1
599 m1 = m2
600 m2 = t
601 }
602 for i in 2 .. active.len {
603 idx := active[i]
604 if weight[idx] < weight[active[m1]] {
605 m2 = m1
606 m1 = i
607 } else if weight[idx] < weight[active[m2]] {
608 m2 = i
609 }
610 }
611 a := active[m1]
612 b := active[m2]
613 new_idx := weight.len
614 weight << (weight[a] + weight[b])
615 parent << -1
616 parent[a] = new_idx
617 parent[b] = new_idx
618 if m1 > m2 {
619 active.delete(m1)
620 active.delete(m2)
621 } else {
622 active.delete(m2)
623 active.delete(m1)
624 }
625 active << new_idx
626 }
627 mut lengths := []int{len: n}
628 for i in 0 .. n {
629 mut l := 0
630 mut p := parent[i]
631 for p != -1 {
632 l++
633 p = parent[p]
634 }
635 if l == 0 {
636 l = 1
637 }
638 lengths[i] = l
639 }
640 return lengths
641}
642
643fn build_huffman_codes(lengths []int) []u32 {
644 mut max_len := 0
645 for l in lengths {
646 if l > max_len {
647 max_len = l
648 }
649 }
650 mut bl_count := []int{len: max_len + 1}
651 for l in lengths {
652 if l > 0 {
653 bl_count[l]++
654 }
655 }
656 mut next_code := []u32{len: max_len + 1}
657 mut code := u32(0)
658 for bits in 1 .. max_len + 1 {
659 code = (code + u32(bl_count[bits - 1])) << 1
660 next_code[bits] = code
661 }
662 mut out := []u32{len: lengths.len}
663 for i, l in lengths {
664 if l == 0 {
665 continue
666 }
667 out[i] = next_code[l]
668 next_code[l]++
669 }
670 return out
671}
672
673struct DecodeTable {
674 min_len int
675 max_len int
676 base []int
677 limit []int
678 perm []int
679}
680
681fn build_decode_table(lengths []int, alpha_size int) !DecodeTable {
682 mut min_len := 999
683 mut max_len := 0
684 for i in 0 .. alpha_size {
685 l := lengths[i]
686 if l < 1 || l > bzip2_max_code_len {
687 return error('bzip2: invalid huffman code length')
688 }
689 if l < min_len {
690 min_len = l
691 }
692 if l > max_len {
693 max_len = l
694 }
695 }
696 mut perm := []int{}
697 for l in min_len .. max_len + 1 {
698 for i in 0 .. alpha_size {
699 if lengths[i] == l {
700 perm << i
701 }
702 }
703 }
704 mut base := []int{len: max_len + 2}
705 mut limit := []int{len: max_len + 1}
706 for i in 0 .. alpha_size {
707 base[lengths[i] + 1]++
708 }
709 for i in 1 .. base.len {
710 base[i] += base[i - 1]
711 }
712 mut vec := 0
713 for l in min_len .. max_len + 1 {
714 vec += base[l + 1] - base[l]
715 limit[l] = vec - 1
716 vec <<= 1
717 }
718 for l in min_len + 1 .. max_len + 1 {
719 base[l] = ((limit[l - 1] + 1) * 2) - base[l]
720 }
721 return DecodeTable{
722 min_len: min_len
723 max_len: max_len
724 base: base
725 limit: limit
726 perm: perm
727 }
728}
729
730fn (t DecodeTable) decode(mut r BitReader) !int {
731 mut zn := t.min_len
732 mut zvec := int(r.read_bits(zn)!)
733 for zn <= t.max_len && zvec > t.limit[zn] {
734 zn++
735 bit := int(r.read_bits(1)!)
736 zvec = (zvec * 2) | bit
737 }
738 if zn > t.max_len {
739 return error('bzip2: invalid huffman code')
740 }
741 idx := zvec - t.base[zn]
742 if idx < 0 || idx >= t.perm.len {
743 return error('bzip2: invalid huffman decode index')
744 }
745 return t.perm[idx]
746}
747
748fn mtf_rle2_encode(data []u8, unseq_to_seq []int, n_in_use int) ([]int, int) {
749 mut mtf := []int{len: n_in_use}
750 for i in 0 .. n_in_use {
751 mtf[i] = i
752 }
753 eob := n_in_use + 1
754 mut out := []int{}
755 mut zpend := 0
756 for b in data {
757 seq := unseq_to_seq[int(b)]
758 mut pos := 0
759 for pos < mtf.len && mtf[pos] != seq {
760 pos++
761 }
762 if pos == 0 {
763 zpend++
764 continue
765 }
766 if zpend > 0 {
767 emit_run_a_b(mut out, zpend)
768 zpend = 0
769 }
770 out << (pos + 1)
771 move_to_front_int(mut mtf, pos)
772 }
773 if zpend > 0 {
774 emit_run_a_b(mut out, zpend)
775 }
776 out << eob
777 return out, n_in_use + 2
778}
779
780fn emit_run_a_b(mut out []int, run_len int) {
781 mut z := run_len - 1
782 for {
783 if (z & 1) == 0 {
784 out << bzip2_runa
785 } else {
786 out << bzip2_runb
787 }
788 if z < 2 {
789 break
790 }
791 z = (z - 2) >> 1
792 }
793}
794
795fn move_to_front_int(mut arr []int, pos int) {
796 if pos <= 0 {
797 return
798 }
799 tmp := arr[pos]
800 for i := pos; i > 0; i-- {
801 arr[i] = arr[i - 1]
802 }
803 arr[0] = tmp
804}
805
806fn rle1_encode(src []u8) []u8 {
807 if src.len == 0 {
808 return []u8{}
809 }
810 mut out := []u8{cap: src.len + (src.len / 4) + 8}
811 mut i := 0
812 for i < src.len {
813 b := src[i]
814 mut run := 1
815 for i + run < src.len && src[i + run] == b && run < max_i32 {
816 run++
817 }
818 mut rem := run
819 for rem > 0 {
820 if rem <= 3 {
821 for _ in 0 .. rem {
822 out << b
823 }
824 rem = 0
825 } else {
826 chunk := if rem > 259 { 259 } else { rem }
827 for _ in 0 .. 4 {
828 out << b
829 }
830 out << u8(chunk - 4)
831 rem -= chunk
832 }
833 }
834 i += run
835 }
836 return out
837}
838
839fn find_rle1_block_end(src []u8, start int, block_limit int) int {
840 if start >= src.len {
841 return start
842 }
843 mut end := start
844 mut encoded_len := 0
845 for end < src.len {
846 b := src[end]
847 mut run_len := 1
848 for end + run_len < src.len && src[end + run_len] == b {
849 run_len++
850 }
851 run_encoded_len := rle1_encoded_run_len(run_len)
852 if encoded_len + run_encoded_len <= block_limit {
853 encoded_len += run_encoded_len
854 end += run_len
855 continue
856 }
857 available := block_limit - encoded_len
858 if available <= 0 {
859 break
860 }
861 take := max_run_prefix_for_rle1(run_len, available)
862 if take <= 0 {
863 break
864 }
865 end += take
866 break
867 }
868 if end == start {
869 return start + 1
870 }
871 return end
872}
873
874fn max_run_prefix_for_rle1(run_len int, encoded_limit int) int {
875 if run_len <= 0 || encoded_limit <= 0 {
876 return 0
877 }
878 mut lo := 1
879 mut hi := run_len
880 mut best := 0
881 for lo <= hi {
882 mid := lo + (hi - lo) / 2
883 if rle1_encoded_run_len(mid) <= encoded_limit {
884 best = mid
885 lo = mid + 1
886 } else {
887 hi = mid - 1
888 }
889 }
890 return best
891}
892
893fn rle1_encoded_run_len(run_len int) int {
894 if run_len <= 0 {
895 return 0
896 }
897 if run_len <= 3 {
898 return run_len
899 }
900 full_chunks := run_len / 259
901 rem := run_len % 259
902 mut out := full_chunks * 5
903 if rem == 0 {
904 return out
905 }
906 if rem <= 3 {
907 out += rem
908 } else {
909 out += 5
910 }
911 return out
912}
913
914fn rle1_decode(src []u8) ![]u8 {
915 mut out := []u8{cap: src.len}
916 mut i := 0
917 for i < src.len {
918 if i + 4 < src.len && src[i] == src[i + 1] && src[i] == src[i + 2] && src[i] == src[i + 3] {
919 run := int(src[i + 4]) + 4
920 for _ in 0 .. run {
921 out << src[i]
922 }
923 i += 5
924 } else {
925 out << src[i]
926 i++
927 }
928 }
929 return out
930}
931
932fn bwt_transform(data []u8) ([]u8, int) {
933 n := data.len
934 if n == 0 {
935 return []u8{}, 0
936 }
937 mut sa := cyclic_suffix_array(data)
938 mut out := []u8{len: n}
939 mut orig_ptr := 0
940 for i, s in sa {
941 if s == 0 {
942 orig_ptr = i
943 out[i] = data[n - 1]
944 } else {
945 out[i] = data[s - 1]
946 }
947 }
948 return out, orig_ptr
949}
950
951fn inverse_bwt(last_col []u8, orig_ptr int) ![]u8 {
952 n := last_col.len
953 if n == 0 {
954 return []u8{}
955 }
956 if orig_ptr < 0 || orig_ptr >= n {
957 return error('bzip2: invalid bwt origin pointer')
958 }
959 mut count := []int{len: 256}
960 for b in last_col {
961 count[int(b)]++
962 }
963 mut tots := []int{len: 256}
964 mut sum := 0
965 for i in 0 .. 256 {
966 tots[i] = sum
967 sum += count[i]
968 }
969 mut tt := []int{len: n}
970 for i, b in last_col {
971 idx := int(b)
972 tt[tots[idx]] = i
973 tots[idx]++
974 }
975 mut out := []u8{len: n}
976 mut tpos := orig_ptr
977 for i in 0 .. n {
978 tpos = tt[tpos]
979 out[i] = last_col[tpos]
980 }
981 return out
982}
983
984fn cyclic_suffix_array(data []u8) []int {
985 n := data.len
986 if n <= 1 {
987 if n == 1 {
988 return [0]
989 }
990 return []int{}
991 }
992 mut sa := []int{len: n}
993 mut rank := []int{len: n}
994 for i in 0 .. n {
995 sa[i] = i
996 rank[i] = int(data[i])
997 }
998 mut tmp_sa := []int{len: n}
999 mut new_rank := []int{len: n}
1000 mut k := 1
1001 for k < n {
1002 radix_sort_cyclic(mut sa, mut tmp_sa, rank, k)
1003 new_rank[sa[0]] = 0
1004 mut classes := 1
1005 for i in 1 .. n {
1006 a := sa[i - 1]
1007 b := sa[i]
1008 if rank[a] != rank[b] || rank[(a + k) % n] != rank[(b + k) % n] {
1009 classes++
1010 }
1011 new_rank[b] = classes - 1
1012 }
1013 rank = new_rank.clone()
1014 if classes == n {
1015 break
1016 }
1017 k <<= 1
1018 }
1019 return sa
1020}
1021
1022fn radix_sort_cyclic(mut sa []int, mut tmp []int, rank []int, k int) {
1023 n := sa.len
1024 m := if n > 256 { n } else { 256 }
1025 mut count := []int{len: m}
1026 for i in 0 .. n {
1027 key := rank[(sa[i] + k) % n]
1028 count[key]++
1029 }
1030 mut pos := []int{len: m}
1031 mut sum := 0
1032 for i in 0 .. m {
1033 pos[i] = sum
1034 sum += count[i]
1035 }
1036 for i in 0 .. n {
1037 v := sa[i]
1038 key := rank[(v + k) % n]
1039 tmp[pos[key]] = v
1040 pos[key]++
1041 }
1042 count = []int{len: m}
1043 for i in 0 .. n {
1044 key := rank[tmp[i]]
1045 count[key]++
1046 }
1047 sum = 0
1048 for i in 0 .. m {
1049 pos[i] = sum
1050 sum += count[i]
1051 }
1052 for i in 0 .. n {
1053 v := tmp[i]
1054 key := rank[v]
1055 sa[pos[key]] = v
1056 pos[key]++
1057 }
1058}
1059
1060struct BitWriter {
1061mut:
1062 out []u8
1063 bitbuf u64
1064 bit_count int
1065}
1066
1067fn (mut w BitWriter) write_byte(b u8) {
1068 w.out << b
1069}
1070
1071fn (mut w BitWriter) write_bit(bit int) {
1072 w.write_bits(1, u64(bit & 1))
1073}
1074
1075fn (mut w BitWriter) write_bits(n int, value u64) {
1076 if n <= 0 {
1077 return
1078 }
1079 for i := n - 1; i >= 0; i-- {
1080 b := (value >> u32(i)) & 1
1081 w.bitbuf = (w.bitbuf << 1) | b
1082 w.bit_count++
1083 if w.bit_count == 8 {
1084 w.out << u8(w.bitbuf & 0xff)
1085 w.bitbuf = 0
1086 w.bit_count = 0
1087 }
1088 }
1089}
1090
1091fn (mut w BitWriter) finish() ![]u8 {
1092 if w.bit_count > 0 {
1093 w.bitbuf <<= u32(8 - w.bit_count)
1094 w.out << u8(w.bitbuf & 0xff)
1095 w.bitbuf = 0
1096 w.bit_count = 0
1097 }
1098 return w.out
1099}
1100
1101struct BitReader {
1102 data []u8
1103mut:
1104 byte_pos int
1105 bit_pos int
1106}
1107
1108fn (mut r BitReader) read_byte() !u8 {
1109 if r.bit_pos != 0 {
1110 return error('bzip2: attempted byte read on non-byte boundary')
1111 }
1112 if r.byte_pos >= r.data.len {
1113 return error('bzip2: unexpected end of input')
1114 }
1115 b := r.data[r.byte_pos]
1116 r.byte_pos++
1117 return b
1118}
1119
1120fn (mut r BitReader) read_bit() !int {
1121 return int(r.read_bits(1)!)
1122}
1123
1124fn (mut r BitReader) read_bits(n int) !u64 {
1125 if n < 0 || n > 56 {
1126 return error('bzip2: invalid bit width')
1127 }
1128 mut out := u64(0)
1129 for _ in 0 .. n {
1130 if r.byte_pos >= r.data.len {
1131 return error('bzip2: unexpected end of input')
1132 }
1133 b := r.data[r.byte_pos]
1134 bit := (b >> u8(7 - r.bit_pos)) & u8(1)
1135 out = (out << 1) | u64(bit)
1136 r.bit_pos++
1137 if r.bit_pos == 8 {
1138 r.bit_pos = 0
1139 r.byte_pos++
1140 }
1141 }
1142 return out
1143}
1144
1145fn (r &BitReader) is_aligned_to_byte_zero_padding() bool {
1146 if r.bit_pos == 0 {
1147 return r.byte_pos == r.data.len
1148 }
1149 if r.byte_pos >= r.data.len {
1150 return false
1151 }
1152 mask := u8((1 << u8(8 - r.bit_pos)) - 1)
1153 if (r.data[r.byte_pos] & mask) != 0 {
1154 return false
1155 }
1156 for i in r.byte_pos + 1 .. r.data.len {
1157 if r.data[i] != 0 {
1158 return false
1159 }
1160 }
1161 return true
1162}
1163
1164fn rotate_left_1(v u32) u32 {
1165 return (v << 1) | (v >> 31)
1166}
1167
1168fn bzip2_crc32(data []u8) u32 {
1169 mut crc := u32(0xffffffff)
1170 for b in data {
1171 table_idx := int(((crc >> 24) ^ u32(b)) & u32(0xff))
1172 crc = (crc << 8) ^ bzip2_crc32_table[table_idx]
1173 }
1174 return ~crc
1175}
1176