From a0632356d23a7c6ee16e85f89a2ef5ca2d360245 Mon Sep 17 00:00:00 2001 From: Richard Wheeler Date: Mon, 8 Jun 2026 20:10:12 -0400 Subject: [PATCH] compress.deflate: build Huffman trees via the shared hash.huffman builder (#27394) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * compress.deflate: build Huffman trees via the shared hash.huffman builder Replace the two in-tree copies of the canonical Huffman code-assignment (RFC 1951 ยง3.2.2 bl_count / next_code) with calls to hash.huffman, the shared builder added for #27358: - Decode: build_huff_tree() now fills its flat LSB-first lookup table via huffman.flat_table(). The table format (entry = (symbol << 5) | length, 0xFFFF_FFFF for invalid) and the decode loop are unchanged. flat_table() assigns each code inline while filling (no intermediate codes array) and skips the invalid pre-fill for complete codes, so it does strictly less work than the old hand-rolled loop on the common path. build_huff_tree() now returns `!HuffTree` so an over-subscribed (malformed) code surfaces as a clean error instead of a silently corrupt table; all call sites propagate with `!`. - Encode: fixed_litlen_encode() takes its reversed codes straight from huffman.build(..., bit_order: .lsb_first).codes. No behavior change for any valid DEFLATE stream (incomplete codes are still accepted). Benchmarked before/after with -prod: decode-dominated throughput (8 MiB fixed-Huffman text, 2 MiB dynamic-Huffman) is unchanged, and the build-dominated paths (thousands of small dynamic-block streams) are ~5-11% faster than the old code, since the fixed trees and most dynamic trees are complete and skip the invalid pre-fill. compress.deflate / gzip / zlib suites stay green. This is the second half of vlang/v#27358 (depends on the hash.huffman module PR). Co-Authored-By: Claude Opus 4.8 * compress.deflate: propagate errors from fixed_litlen_encode instead of panicking Address review on #27394: fixed_litlen_encode() now returns `!([]u32, []int)` and deflate_compress_fixed() returns `![]u8`, propagating the (in practice unreachable) huffman.build error instead of panicking. This matches the decode-side build_huff_tree(), which already returns `!` in this PR, and keeps a clean error path end to end. The public compress* APIs are unchanged (already `![]u8`). Co-Authored-By: Claude Opus 4.8 --------- Co-authored-by: Richard Wheeler Co-authored-by: Claude Opus 4.8 --- vlib/compress/deflate/deflate.v | 8 ++-- vlib/compress/deflate/deflate_compress.v | 40 ++++------------ vlib/compress/deflate/deflate_inflate.v | 60 ++++++++---------------- 3 files changed, 32 insertions(+), 76 deletions(-) diff --git a/vlib/compress/deflate/deflate.v b/vlib/compress/deflate/deflate.v index bbbe5d4db..7bf86f90a 100644 --- a/vlib/compress/deflate/deflate.v +++ b/vlib/compress/deflate/deflate.v @@ -129,12 +129,12 @@ pub fn compress(data []u8, format CompressParams) ![]u8 { return match format.format { .zlib { compress_zlib(data) } .gzip { compress_gzip(data) } - .raw_deflate { deflate_compress_fixed(data) } + .raw_deflate { deflate_compress_fixed(data)! } } } pub fn compress_zlib(data []u8) ![]u8 { - payload := deflate_compress_fixed(data) + payload := deflate_compress_fixed(data)! cksum := adler32.sum(data) mut out := []u8{cap: 2 + payload.len + 4} out << u8(0x78) // CMF: CM=8 deflate, CINFO=7 (32K window) @@ -146,7 +146,7 @@ pub fn compress_zlib(data []u8) ![]u8 { // compress_gzip compresses data into a gzip stream (RFC 1952). pub fn compress_gzip(data []u8) ![]u8 { - payload := deflate_compress_fixed(data) + payload := deflate_compress_fixed(data)! mut out := []u8{cap: 10 + payload.len + 8} // 10-byte gzip header: ID1 ID2 CM FLG MTIME(4) XFL OS out << [u8(0x1f), 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff] @@ -158,7 +158,7 @@ pub fn compress_gzip(data []u8) ![]u8 { // compress_raw compresses data to a raw RFC 1951 DEFLATE stream. pub fn compress_raw(data []u8) ![]u8 { - return deflate_compress_fixed(data) + return deflate_compress_fixed(data)! } // decompress decompresses a zlib (RFC 1950), gzip (RFC 1952), or raw DEFLATE (RFC 1951) stream. diff --git a/vlib/compress/deflate/deflate_compress.v b/vlib/compress/deflate/deflate_compress.v index d49355e0c..6e667071d 100644 --- a/vlib/compress/deflate/deflate_compress.v +++ b/vlib/compress/deflate/deflate_compress.v @@ -1,5 +1,7 @@ module deflate +import hash.huffman + const deflate_hash_bits = 15 const deflate_hash_size = 1 << deflate_hash_bits const deflate_max_chain = 64 @@ -8,36 +10,12 @@ const deflate_max_match = 258 const deflate_window = 32768 // fixed_litlen_encode returns (reversed_codes, code_lengths) for fixed Huffman lit/len. -fn fixed_litlen_encode() ([]u32, []int) { +// The LSB-first (bit-reversed) codes come straight from the shared canonical +// builder, since the encoder writes bits LSB-first. +fn fixed_litlen_encode() !([]u32, []int) { lens := fixed_litlen_lengths() - mut max_bits := 0 - for l in lens { - if l > max_bits { - max_bits = l - } - } - mut bl_count := []int{len: max_bits + 1} - for l in lens { - if l > 0 { - bl_count[l]++ - } - } - mut next_code := []u32{len: max_bits + 1} - mut c := u32(0) - for bits in 1 .. max_bits + 1 { - c = (c + u32(bl_count[bits - 1])) << 1 - next_code[bits] = c - } - mut codes := []u32{len: 288} - for sym in 0 .. 288 { - l := lens[sym] - if l == 0 { - continue - } - codes[sym] = bit_reverse(next_code[l], l) - next_code[l]++ - } - return codes, lens + t := huffman.build(lengths: lens, max_bits: 9, bit_order: .lsb_first)! + return t.codes, lens } // fixed_dist_encode returns (reversed_codes, code_lengths) for fixed Huffman distance. @@ -135,8 +113,8 @@ fn (mut w BitWriter) flush() { // deflate_compress_fixed compresses data to RFC 1951 DEFLATE using fixed Huffman codes. @[direct_array_access] -fn deflate_compress_fixed(data []u8) []u8 { - ll_codes, ll_lens := fixed_litlen_encode() +fn deflate_compress_fixed(data []u8) ![]u8 { + ll_codes, ll_lens := fixed_litlen_encode()! d_codes, d_lens := fixed_dist_encode() mut w := BitWriter{} // BFINAL=1, BTYPE=01 (fixed Huffman) diff --git a/vlib/compress/deflate/deflate_inflate.v b/vlib/compress/deflate/deflate_inflate.v index 1c59e34df..e0dd55d4b 100644 --- a/vlib/compress/deflate/deflate_inflate.v +++ b/vlib/compress/deflate/deflate_inflate.v @@ -1,5 +1,7 @@ module deflate +import hash.huffman + // vfmt off // RFC 1951 length/distance decode tables const length_bases = [3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, 35, 43, 51, 59, @@ -39,7 +41,7 @@ struct HuffTree { max_bits int } -fn build_huff_tree(lengths []int) HuffTree { +fn build_huff_tree(lengths []int) !HuffTree { mut max_bits := 0 for l in lengths { if l > max_bits { @@ -52,36 +54,12 @@ fn build_huff_tree(lengths []int) HuffTree { max_bits: 0 } } - mut bl_count := []int{len: max_bits + 1} - for l in lengths { - if l > 0 { - bl_count[l]++ - } - } - mut next_code := []u32{len: max_bits + 1} - mut c := u32(0) - for bits in 1 .. max_bits + 1 { - c = (c + u32(bl_count[bits - 1])) << 1 - next_code[bits] = c - } - table_size := 1 << max_bits - mut table := []u32{len: table_size, init: 0xffff_ffff} - for sym in 0 .. lengths.len { - l := lengths[sym] - if l == 0 { - continue - } - code := next_code[l] - next_code[l]++ - // Reverse code for LSB-first bit reader - rev := bit_reverse(code, l) - step := 1 << l - mut idx := int(rev) - for idx < table_size { - table[idx] = (u32(sym) << 5) | u32(l) - idx += step - } - } + // Build the flat LSB-first lookup table the decode loop expects directly + // from the lengths via the shared canonical builder: entry = (symbol << 5) | + // length, 0xFFFF_FFFF for invalid. flat_table() allocates no intermediate + // codes array, so this matches the hand-rolled loop's cost. Over-subscribed + // lengths now surface as an error instead of a silently corrupt table. + table := huffman.flat_table(lengths: lengths, max_bits: max_bits, bit_order: .lsb_first)! return HuffTree{ table: table max_bits: max_bits @@ -192,8 +170,8 @@ fn inflate_with_consumed(data []u8) !InflateResult { buf: data } mut out := []u8{} - fixed_ll := build_huff_tree(fixed_litlen_lengths()) - fixed_d := build_huff_tree([]int{len: 32, init: 5}) + fixed_ll := build_huff_tree(fixed_litlen_lengths())! + fixed_d := build_huff_tree([]int{len: 32, init: 5})! for { bfinal := r.read_bits(1)! btype := r.read_bits(2)! @@ -220,7 +198,7 @@ fn inflate_with_consumed(data []u8) !InflateResult { for i in 0 .. hclen { cl_lens[cl_order[i]] = int(r.read_bits(3)!) } - cl_tree := build_huff_tree(cl_lens) + cl_tree := build_huff_tree(cl_lens)! mut all_lens := []int{} for all_lens.len < hlit + hdist { sym := r.huff_decode(cl_tree)! @@ -249,8 +227,8 @@ fn inflate_with_consumed(data []u8) !InflateResult { return error('inflate: bad code length symbol') } } - ll_tree := build_huff_tree(all_lens[..hlit]) - d_tree := build_huff_tree(all_lens[hlit..]) + ll_tree := build_huff_tree(all_lens[..hlit])! + d_tree := build_huff_tree(all_lens[hlit..])! inflate_block(mut r, mut out, ll_tree, d_tree)! } else { @@ -284,8 +262,8 @@ fn inflate_with_callback(data []u8, cb ChunkCallback, userdata voidptr) !Inflate mut out := []u8{} mut state := InflateStreamState{} mut aborted := false - fixed_ll := build_huff_tree(fixed_litlen_lengths()) - fixed_d := build_huff_tree([]int{len: 32, init: 5}) + fixed_ll := build_huff_tree(fixed_litlen_lengths())! + fixed_d := build_huff_tree([]int{len: 32, init: 5})! for { bfinal := r.read_bits(1)! btype := r.read_bits(2)! @@ -318,7 +296,7 @@ fn inflate_with_callback(data []u8, cb ChunkCallback, userdata voidptr) !Inflate for i in 0 .. hclen { cl_lens[cl_order[i]] = int(r.read_bits(3)!) } - cl_tree := build_huff_tree(cl_lens) + cl_tree := build_huff_tree(cl_lens)! mut all_lens := []int{} for all_lens.len < hlit + hdist { sym := r.huff_decode(cl_tree)! @@ -347,8 +325,8 @@ fn inflate_with_callback(data []u8, cb ChunkCallback, userdata voidptr) !Inflate return error('inflate: bad code length symbol') } } - ll_tree := build_huff_tree(all_lens[..hlit]) - d_tree := build_huff_tree(all_lens[hlit..]) + ll_tree := build_huff_tree(all_lens[..hlit])! + d_tree := build_huff_tree(all_lens[hlit..])! if !inflate_block_stream(mut r, mut out, ll_tree, d_tree, cb, userdata, mut state)! { aborted = true } -- 2.39.5