v / vlib / hash / huffman / huffman.v
225 lines · 213 sloc · 8.89 KB · 95861b8bdeeddc71d79c3f09f56a66bf01106ecd
Raw
1// Copyright (c) 2019-2024 Alexander Medvednikov. All rights reserved.
2// Use of this source code is governed by an MIT license
3// that can be found in the LICENSE file.
4
5// Module huffman builds canonical Huffman codes from a per-symbol array of bit
6// lengths. This is the assignment step shared by RFC 1951 (DEFLATE, used by
7// compress.deflate / zlib / gzip) and RFC 7541 Appendix B (HPACK, used by the
8// HTTP/2 code in net.http): given only the lengths, both standards rebuild the
9// exact same codes via the `bl_count` / `next_code` algorithm (RFC 1951 §3.2.2).
10//
11// What the two callers do NOT share — and therefore what this module is
12// parameterized by — is the maximum code length (DEFLATE caps at 15 bits and
13// decodes via a flat 2^max_bits table; HPACK goes up to 30 bits, where a flat
14// table is infeasible) and the bit order (DEFLATE is LSB-first and bit-reverses
15// every code, HPACK is MSB-first). The bit I/O loops and codec-specific
16// semantics (EOS/padding, end-of-block, extra bits, distance alphabets) stay in
17// the callers.
18module huffman
19
20// BitOrder selects how a code's bits are laid out in the returned `u32`.
21pub enum BitOrder {
22 // msb_first keeps the canonical code as-is: the first transmitted bit is
23 // the most-significant bit of the code (RFC 7541 / HPACK).
24 msb_first
25 // lsb_first reverses each code within its length, so the first transmitted
26 // bit is the least-significant bit (RFC 1951 / DEFLATE). This is the form a
27 // flat LSB-first decode table is indexed by.
28 lsb_first
29}
30
31// Config parameterizes build(). All fields are required and have no defaults:
32// the two callers have intentionally different requirements (15 vs 30 bits,
33// LSB vs MSB), so an implicit default would silently fit only one of them.
34@[params]
35pub struct Config {
36pub:
37 lengths []int @[required] // per-symbol code length in bits; 0 marks an unused symbol
38 max_bits int @[required] // maximum allowed code length; must be >= every nonzero length
39 bit_order BitOrder @[required] // .msb_first (HPACK) or .lsb_first (DEFLATE)
40}
41
42// Table is the result of build(): the canonical code for every symbol, plus the
43// metadata a caller needs to drive its own bit I/O and to build a decode
44// structure (flat_table() for small max_bits, decode_map() otherwise).
45@[noinit]
46pub struct Table {
47pub:
48 codes []u32 // per-symbol code, right-aligned in a u32, in `bit_order`
49 lengths []int // per-symbol bit length (a copy of the input)
50 max_bits int
51 bit_order BitOrder
52}
53
54// max_flat_bits is the largest max_bits for which flat_table() will
55// allocate a table (2^max_bits entries). DEFLATE's 15 fits; HPACK's 30 does
56// not and must use decode_map() / a bit-at-a-time decoder instead.
57pub const max_flat_bits = 18
58
59// flat_invalid_entry marks a flat_table() slot that no code maps to.
60pub const flat_invalid_entry = u32(0xffff_ffff)
61
62// flat_length_bits is how many low bits of a flat_table() entry hold the
63// code length; the symbol is stored in the remaining high bits. 5 bits hold
64// lengths up to 31, covering every max_bits <= max_flat_bits.
65pub const flat_length_bits = 5
66
67// next_codes validates `lengths` against `max_bits` and returns the canonical
68// starting code for each length (RFC 1951 §3.2.2): next_code[l] is the code the
69// first symbol of length l receives, and callers post-increment it per symbol.
70// It also reports whether the code is `complete` (uses the whole code space,
71// i.e. the Kraft inequality holds with equality), which lets flat_table() skip
72// pre-filling the table. It is the single source of truth for the code
73// assignment shared by build() and flat_table(). Errors on max_bits < 1 or
74// > 32, a negative length or one exceeding max_bits, or an over-subscribed code
75// (Kraft inequality); an incomplete (under-subscribed) code is allowed, as both
76// DEFLATE and HPACK use.
77fn next_codes(lengths []int, max_bits int) !([]u32, bool) {
78 if max_bits < 1 {
79 return error('huffman: max_bits must be >= 1, got ${max_bits}')
80 }
81 if max_bits > 32 {
82 return error('huffman: max_bits ${max_bits} exceeds 32 (u32 code storage)')
83 }
84 mut bl_count := []int{len: max_bits + 1}
85 for sym, l in lengths {
86 if l < 0 {
87 return error('huffman: negative length ${l} for symbol ${sym}')
88 }
89 if l > max_bits {
90 return error('huffman: length ${l} for symbol ${sym} exceeds max_bits ${max_bits}')
91 }
92 if l > 0 {
93 bl_count[l]++
94 }
95 }
96 // Kraft inequality: sum over used symbols of 2^(max_bits - len) must not
97 // exceed 2^max_bits, i.e. the code must not be over-subscribed. left == 0 at
98 // the end means the code is complete (covers every index of a flat table).
99 mut left := u64(1) << max_bits
100 for bits in 1 .. max_bits + 1 {
101 used := u64(bl_count[bits]) << (max_bits - bits)
102 if used > left {
103 return error('huffman: over-subscribed code (lengths exceed the code space)')
104 }
105 left -= used
106 }
107 mut next_code := []u32{len: max_bits + 1}
108 mut c := u32(0)
109 for bits in 1 .. max_bits + 1 {
110 c = (c + u32(bl_count[bits - 1])) << 1
111 next_code[bits] = c
112 }
113 return next_code, left == 0
114}
115
116// build assigns a canonical Huffman code to every symbol from its bit length.
117// Symbols with length 0 are unused and get code 0. It returns the codes plus
118// the metadata for a decode structure; use it when you need the per-symbol
119// codes (e.g. HPACK encoding) and/or decode_map(). Callers that only want a
120// flat decode table should use flat_table(), which avoids materializing the
121// codes array. See next_codes() for the validation rules.
122pub fn build(cfg Config) !Table {
123 mut next_code, _ := next_codes(cfg.lengths, cfg.max_bits)!
124 mut codes := []u32{len: cfg.lengths.len}
125 for sym, l in cfg.lengths {
126 if l == 0 {
127 continue
128 }
129 code := next_code[l]
130 next_code[l]++
131 codes[sym] = if cfg.bit_order == .lsb_first { bit_reverse(code, l) } else { code }
132 }
133 return Table{
134 codes: codes
135 lengths: cfg.lengths.clone()
136 max_bits: cfg.max_bits
137 bit_order: cfg.bit_order
138 }
139}
140
141// flat_table builds a 2^max_bits lookup table for fast decode of short codes
142// (the DEFLATE strategy), directly from the lengths. Each entry is
143// `(symbol << flat_length_bits) | length`, or flat_invalid_entry for indices no
144// code reaches. The table is indexed by the next max_bits bits read from the
145// stream in `bit_order`. It returns an error if max_bits > max_flat_bits, since
146// the table would be prohibitively large (use build() + decode_map() instead).
147//
148// Unlike build(), this allocates no per-symbol codes array and does not copy
149// the lengths: it assigns each code inline while filling the table, so a hot
150// caller rebuilding a tree per block (compress.deflate) pays no extra
151// allocation over hand-rolling the loop.
152pub fn flat_table(cfg Config) ![]u32 {
153 if cfg.max_bits > max_flat_bits {
154 return error('huffman: max_bits ${cfg.max_bits} > max_flat_bits ${max_flat_bits}; use build() + decode_map()')
155 }
156 mut next_code, complete := next_codes(cfg.lengths, cfg.max_bits)!
157 table_size := 1 << cfg.max_bits
158 // A complete code writes every table slot, so the invalid pre-fill is dead
159 // work; allocate zeroed (vcalloc) and skip it. An incomplete code leaves
160 // gaps that must read back as flat_invalid_entry, so it pays the fill.
161 mut table := if complete {
162 []u32{len: table_size}
163 } else {
164 []u32{len: table_size, init: flat_invalid_entry}
165 }
166 for sym, l in cfg.lengths {
167 if l == 0 {
168 continue
169 }
170 raw := next_code[l]
171 next_code[l]++
172 entry := (u32(sym) << flat_length_bits) | u32(l)
173 // The (max_bits - l) bits beyond the code are don't-cares, so a code of
174 // length l fills 2^(max_bits - l) table slots. Where those slots sit
175 // depends on bit_order: LSB-first codes occupy every index whose low l
176 // bits match the code (stride by 2^l); MSB-first codes occupy a
177 // contiguous block whose high l bits match the code.
178 if cfg.bit_order == .lsb_first {
179 step := 1 << l
180 mut idx := int(bit_reverse(raw, l))
181 for idx < table_size {
182 table[idx] = entry
183 idx += step
184 }
185 } else {
186 block := 1 << (cfg.max_bits - l)
187 base := int(raw) * block
188 for k in 0 .. block {
189 table[base + k] = entry
190 }
191 }
192 }
193 return table
194}
195
196// decode_map builds a map from a packed (length, code) key to its symbol, for
197// codecs whose max_bits is too large for a flat table (HPACK's 30 bits). The
198// key is `(u64(length) << 32) | code`; a decoder accumulates bits MSB-first and
199// looks up after each bit. Only defined for .msb_first tables, where the
200// accumulated value matches the stored code; it returns an error otherwise.
201pub fn (t Table) decode_map() !map[u64]int {
202 if t.bit_order != .msb_first {
203 return error('huffman: decode_map requires .msb_first bit order')
204 }
205 mut m := map[u64]int{}
206 for sym, l in t.lengths {
207 if l == 0 {
208 continue
209 }
210 m[(u64(l) << 32) | u64(t.codes[sym])] = sym
211 }
212 return m
213}
214
215// bit_reverse reverses the low `n` bits of `v` (used to convert a canonical
216// MSB-first code into the LSB-first form a DEFLATE bit reader consumes).
217fn bit_reverse(v u32, n int) u32 {
218 mut r := u32(0)
219 mut val := v
220 for _ in 0 .. n {
221 r = (r << 1) | (val & 1)
222 val >>= 1
223 }
224 return r
225}
226