v2 / vlib / compress / gzip / gzip.v
240 lines · 222 sloc · 8.83 KB · 50e2d0344fb64b27f59f351cfc304a9c3944e05d
Raw
1// [rfc1952](https://datatracker.ietf.org/doc/html/rfc1952) compliant
2// gzip compression/decompression
3
4module gzip
5
6import compress as compr
7import hash.crc32
8
9// CompressFlags
10// TODO: These flags have no use now
11@[flag]
12pub enum CompressFlags {
13 // The low 12 bits will be overwritten by `compression_level`
14 compression_level_overwrite_flag01
15 compression_level_overwrite_flag02
16 compression_level_overwrite_flag03
17 compression_level_overwrite_flag04
18 compression_level_overwrite_flag05
19 compression_level_overwrite_flag06
20 compression_level_overwrite_flag07
21 compression_level_overwrite_flag08
22 compression_level_overwrite_flag09
23 compression_level_overwrite_flag10
24 compression_level_overwrite_flag11
25 compression_level_overwrite_flag12
26
27 // If set, the compressor outputs a zlib header before the deflate data, and the Adler-32 of the source data at the end. Otherwise, you'll get raw deflate data.
28 write_zlib_header //= 0x01000
29 // Always compute the adler-32 of the input data (even when not writing zlib headers).
30 compute_adler32 //= 0x02000
31 // Set to use faster greedy parsing, instead of more efficient lazy parsing.
32 greedy_parsing_flag //= 0x04000
33 // Enable to decrease the compressor's initialization time to the minimum, but the output may vary from run to run given the same input (depending on the contents of memory).
34 nondeterministic_parsing_flag //= 0x08000
35 // Only look for RLE matches (matches with a distance of 1)
36 rle_matches //= 0x10000
37 // Discards matches <= 5 chars if enabled.
38 filter_matches //= 0x20000
39 // Disable usage of optimized Huffman tables.
40 force_all_static_blocks //= 0x40000
41 // Only use raw (uncompressed) deflate blocks.
42 force_all_raw_blocks //= 0x80000
43}
44
45// CompressParams set compression_level for compression:
46// 0: Huffman only;
47// 1: Huffman+LZ (fastest/crap compression);
48// 128: default_max_probes;
49// 4095: Huffman+LZ (slowest/best compression)
50@[params]
51pub struct CompressParams {
52pub:
53 compression_level int = 128 // 0~4095
54 flags CompressFlags
55}
56
57// compresses an array of bytes using gzip and returns the compressed bytes in a new array
58// Example: b := 'abcde'.repeat(1000).bytes(); cmprsd := gzip.compress(b, compression_level:4095)!; assert cmprsd.len == 47
59// Note: compression_level 0~4095
60pub fn compress(data []u8, params CompressParams) ![]u8 {
61 if params.compression_level !in 0..4096 {
62 return error('compression level should in [0,4095]')
63 }
64 // The low 12 bits are reserved to control the max # of hash probes per dictionary lookup.
65 flags := params.compression_level | (int(params.flags) & ~int(4095))
66 compressed := compr.compress(data, flags)!
67 // header
68 mut result := [
69 u8(0x1f), // magic numbers (1F 8B)
70 0x8b,
71 0x08, // deflate
72 0x00, // header flags
73 0x00, // 4-byte timestamp, 0 = no timestamp (00 00 00 00)
74 0x00,
75 0x00,
76 0x00,
77 0x00, // extra flags
78 0xff, // operating system id (0xff = unknown)
79 ] // 10 bytes
80 result << compressed
81 // trailer
82 checksum := crc32.sum(data)
83 length := data.len
84 result << [
85 u8(checksum),
86 u8(checksum >> 8),
87 u8(checksum >> 16),
88 u8(checksum >> 24),
89 u8(length),
90 u8(length >> 8),
91 u8(length >> 16),
92 u8(length >> 24),
93 ] // 8 bytes
94 return result
95}
96
97// DecompressFlags
98// TODO: These flags have no use now
99@[flag]
100pub enum DecompressFlags {
101 // If set, the input has a valid zlib header and ends with an adler32 checksum (it's a valid zlib stream). Otherwise, the input is a raw deflate stream.
102 parse_zlib_header
103 // If set, there are more input bytes available beyond the end of the supplied input buffer. If clear, the input buffer contains all remaining input.
104 has_more_input
105 // If set, the output buffer is large enough to hold the entire decompressed stream. If clear, the output buffer is at least the size of the dictionary (typically 32KB).
106 using_non_wrapping_output_buf
107 // Force adler-32 checksum computation of the decompressed bytes.
108 compute_adler32
109}
110
111// DecompressParams set flags for decompression:
112@[params]
113pub struct DecompressParams {
114pub:
115 verify_header_checksum bool = true
116 verify_length bool = true
117 verify_checksum bool = true
118 flags DecompressFlags
119}
120
121pub const reserved_bits = 0b1110_0000
122pub const ftext = 0b0000_0001
123pub const fextra = 0b0000_0100
124pub const fname = 0b0000_1000
125pub const fcomment = 0b0001_0000
126pub const fhcrc = 0b0000_0010
127
128const min_header_length = 18
129
130@[noinit]
131pub struct GzipHeader {
132pub mut:
133 length int = 10
134 extra []u8
135 filename []u8
136 comment []u8
137 modification_time u32
138 operating_system u8
139}
140
141// validate validates the header and returns its details if valid
142@[direct_array_access]
143pub fn validate(data []u8, params DecompressParams) !GzipHeader {
144 if data.len < min_header_length {
145 return error('data is too short, not gzip compressed?')
146 } else if data[0] != 0x1f || data[1] != 0x8b {
147 return error('wrong magic numbers, not gzip compressed?')
148 } else if data[2] != 0x08 {
149 return error('gzip data is not compressed with DEFLATE')
150 }
151 mut header := GzipHeader{}
152
153 // parse flags, we ignore most of them, but we still need to parse them
154 // correctly, so we dont accidently decompress something that belongs
155 // to the header
156
157 if data[3] & reserved_bits > 0 {
158 // rfc 1952 2.3.1.2 Compliance
159 // A compliant decompressor must give an error indication if any
160 // reserved bit is non-zero, since such a bit could indicate the
161 // presence of a new field that would cause subsequent data to be
162 // interpreted incorrectly.
163 return error('reserved flags are set, unsupported field detected')
164 }
165
166 if data[3] & fextra > 0 {
167 xlen := data[header.length]
168 header.extra = data[header.length + 1..header.length + 1 + xlen]
169 header.length += xlen + 1
170 }
171 if data[3] & fname > 0 {
172 // filename is zero-terminated, so skip until we hit a zero byte
173 for header.length < data.len && data[header.length] != 0x00 {
174 header.filename << data[header.length]
175 header.length++
176 }
177 header.length++
178 }
179 if data[3] & fcomment > 0 {
180 // comment is zero-terminated, so skip until we hit a zero byte
181 for header.length < data.len && data[header.length] != 0x00 {
182 header.comment << data[header.length]
183 header.length++
184 }
185 header.length++
186 }
187 if data[3] & fhcrc > 0 {
188 if header.length + 12 > data.len {
189 return error('data too short')
190 }
191 checksum_header := crc32.sum(data[..header.length])
192 checksum_header_expected := (u32(data[header.length]) << 24) | (u32(data[header.length + 1]) << 16) | (u32(data[
193 header.length + 2]) << 8) | data[header.length + 3]
194 if params.verify_header_checksum && checksum_header != checksum_header_expected {
195 return error('header checksum verification failed')
196 }
197 header.length += 4
198 }
199 if header.length + 8 > data.len {
200 return error('data too short')
201 }
202 header.operating_system = data[9]
203 return header
204}
205
206// decompress an array of bytes using zlib and returns the decompressed bytes in a new array.
207// Example: b := 'abcdef'.repeat(1000).bytes(); cmpr := gzip.compress(b)!; decmpr := gzip.decompress(cmpr)!; assert cmpr.len < b.len; assert b == decmpr
208pub fn decompress(data []u8, params DecompressParams) ![]u8 {
209 gzip_header := validate(data, params)!
210 header_length := gzip_header.length
211
212 decompressed := compr.decompress(data[header_length..data.len - 8], 0)!
213 length_expected := (u32(data[data.len - 1]) << 24) | (u32(data[data.len - 2]) << 16) | (u32(data[data.len - 3]) << 8) | data[data.len - 4]
214 if params.verify_length && decompressed.len != length_expected {
215 return error('length verification failed, got ${decompressed.len}, expected ${length_expected}')
216 }
217 checksum := crc32.sum(decompressed)
218 checksum_expected := (u32(data[data.len - 5]) << 24) | (u32(data[data.len - 6]) << 16) | (u32(data[data.len - 7]) << 8) | data[data.len - 8]
219 if params.verify_checksum && checksum != checksum_expected {
220 return error('checksum verification failed')
221 }
222 return decompressed
223}
224
225// decompress_with_callback decompresses the given `data`, using zlib. It calls `cb` with each chunk of decompressed bytes.
226// A chunk is usually 32 KB or less. Note: the chunk data received by `cb` should be cloned, if you need to store it for later,
227// and not process it right away.
228// The callback function should return the chunk length, if it wants to continue decompressing, or 0, if it wants to abort the decompression early.
229// See also compress.ChunkCallback for more details.
230pub fn decompress_with_callback(data []u8, cb compr.ChunkCallback, userdata voidptr, params DecompressParams) !int {
231 gzip_header := validate(data, params)!
232 header_len := gzip_header.length
233 expected_len := int((u32(data[data.len - 1]) << 24) | (u32(data[data.len - 2]) << 16) | (u32(data[data.len - 3]) << 8) | data[data.len - 4])
234 body := data[header_len..data.len - 8]
235 chunks_len := int(compr.decompress_with_callback(body, cb, userdata, 0)!)
236 if params.verify_length && expected_len != chunks_len {
237 return error('Decompress error: expected length:${expected_len}, got:${chunks_len}')
238 }
239 return chunks_len
240}
241