Gitly


1 // [rfc1952](https://datatracker.ietf.org/doc/html/rfc1952) compliant
2 // gzip compression/decompression
3 
4 module gzip
5 
6 import compress as compr
7 import hash.crc32
8 
9 // CompressFlags
10 // TODO: These flags have no use now
11 @[flag]
12 pub enum CompressFlags {
13     // The low 12 bits will be overwritten by `compression_level`
14     compression_level_overwrite_flag01
15     compression_level_overwrite_flag02
16     compression_level_overwrite_flag03
17     compression_level_overwrite_flag04
18     compression_level_overwrite_flag05
19     compression_level_overwrite_flag06
20     compression_level_overwrite_flag07
21     compression_level_overwrite_flag08
22     compression_level_overwrite_flag09
23     compression_level_overwrite_flag10
24     compression_level_overwrite_flag11
25     compression_level_overwrite_flag12
26 
27     // If set, the compressor outputs a zlib header before the deflate data, and the Adler-32 of the source data at the end. Otherwise, you'll get raw deflate data.
28     write_zlib_header //= 0x01000
29     // Always compute the adler-32 of the input data (even when not writing zlib headers).
30     compute_adler32 //= 0x02000
31     // Set to use faster greedy parsing, instead of more efficient lazy parsing.
32     greedy_parsing_flag //= 0x04000
33     // Enable to decrease the compressor's initialization time to the minimum, but the output may vary from run to run given the same input (depending on the contents of memory).
34     nondeterministic_parsing_flag //= 0x08000
35     // Only look for RLE matches (matches with a distance of 1)
36     rle_matches //= 0x10000
37     // Discards matches <= 5 chars if enabled.
38     filter_matches //= 0x20000
39     // Disable usage of optimized Huffman tables.
40     force_all_static_blocks //= 0x40000
41     // Only use raw (uncompressed) deflate blocks.
42     force_all_raw_blocks //= 0x80000
43 }
44 
45 // CompressParams set compression_level for compression:
46 // 0: Huffman only;
47 // 1: Huffman+LZ (fastest/crap compression);
48 // 128: default_max_probes;
49 // 4095: Huffman+LZ (slowest/best compression)
50 @[params]
51 pub struct CompressParams {
52 pub:
53     compression_level int = 128 // 0~4095
54     flags             CompressFlags
55 }
56 
57 // compresses an array of bytes using gzip and returns the compressed bytes in a new array
58 // Example: b := 'abcde'.repeat(1000).bytes(); cmprsd := gzip.compress(b, compression_level:4095)!; assert cmprsd.len == 47
59 // Note: compression_level 0~4095
60 pub fn compress(data []u8, params CompressParams) ![]u8 {
61     if params.compression_level !in 0..4096 {
62         return error('compression level should in [0,4095]')
63     }
64     // The low 12 bits are reserved to control the max # of hash probes per dictionary lookup.
65     flags := params.compression_level | (int(params.flags) & ~int(4095))
66     compressed := compr.compress(data, flags)!
67     // header
68     mut result := [
69         u8(0x1f), // magic numbers (1F 8B)
70         0x8b,
71         0x08, // deflate
72         0x00, // header flags
73         0x00, // 4-byte timestamp, 0 = no timestamp (00 00 00 00)
74         0x00,
75         0x00,
76         0x00,
77         0x00, // extra flags
78         0xff, // operating system id (0xff = unknown)
79     ] // 10 bytes
80     result << compressed
81     // trailer
82     checksum := crc32.sum(data)
83     length := data.len
84     result << [
85         u8(checksum),
86         u8(checksum >> 8),
87         u8(checksum >> 16),
88         u8(checksum >> 24),
89         u8(length),
90         u8(length >> 8),
91         u8(length >> 16),
92         u8(length >> 24),
93     ] // 8 bytes
94     return result
95 }
96 
97 // DecompressFlags
98 // TODO: These flags have no use now
99 @[flag]
100 pub enum DecompressFlags {
101     // If set, the input has a valid zlib header and ends with an adler32 checksum (it's a valid zlib stream). Otherwise, the input is a raw deflate stream.
102     parse_zlib_header
103     // If set, there are more input bytes available beyond the end of the supplied input buffer. If clear, the input buffer contains all remaining input.
104     has_more_input
105     // If set, the output buffer is large enough to hold the entire decompressed stream. If clear, the output buffer is at least the size of the dictionary (typically 32KB).
106     using_non_wrapping_output_buf
107     // Force adler-32 checksum computation of the decompressed bytes.
108     compute_adler32
109 }
110 
111 // DecompressParams set flags for decompression:
112 @[params]
113 pub struct DecompressParams {
114 pub:
115     verify_header_checksum bool = true
116     verify_length          bool = true
117     verify_checksum        bool = true
118     flags                  DecompressFlags
119 }
120 
121 pub const reserved_bits = 0b1110_0000
122 pub const ftext = 0b0000_0001
123 pub const fextra = 0b0000_0100
124 pub const fname = 0b0000_1000
125 pub const fcomment = 0b0001_0000
126 pub const fhcrc = 0b0000_0010
127 
128 const min_header_length = 18
129 
130 @[noinit]
131 pub struct GzipHeader {
132 pub mut:
133     length            int = 10
134     extra             []u8
135     filename          []u8
136     comment           []u8
137     modification_time u32
138     operating_system  u8
139 }
140 
141 // validate validates the header and returns its details if valid
142 @[direct_array_access]
143 pub fn validate(data []u8, params DecompressParams) !GzipHeader {
144     if data.len < min_header_length {
145         return error('data is too short, not gzip compressed?')
146     } else if data[0] != 0x1f || data[1] != 0x8b {
147         return error('wrong magic numbers, not gzip compressed?')
148     } else if data[2] != 0x08 {
149         return error('gzip data is not compressed with DEFLATE')
150     }
151     mut header := GzipHeader{}
152 
153     // parse flags, we ignore most of them, but we still need to parse them
154     // correctly, so we dont accidently decompress something that belongs
155     // to the header
156 
157     if data[3] & reserved_bits > 0 {
158         // rfc 1952 2.3.1.2 Compliance
159         // A compliant decompressor must give an error indication if any
160         // reserved bit is non-zero, since such a bit could indicate the
161         // presence of a new field that would cause subsequent data to be
162         // interpreted incorrectly.
163         return error('reserved flags are set, unsupported field detected')
164     }
165 
166     if data[3] & fextra > 0 {
167         xlen := data[header.length]
168         header.extra = data[header.length + 1..header.length + 1 + xlen]
169         header.length += xlen + 1
170     }
171     if data[3] & fname > 0 {
172         // filename is zero-terminated, so skip until we hit a zero byte
173         for header.length < data.len && data[header.length] != 0x00 {
174             header.filename << data[header.length]
175             header.length++
176         }
177         header.length++
178     }
179     if data[3] & fcomment > 0 {
180         // comment is zero-terminated, so skip until we hit a zero byte
181         for header.length < data.len && data[header.length] != 0x00 {
182             header.comment << data[header.length]
183             header.length++
184         }
185         header.length++
186     }
187     if data[3] & fhcrc > 0 {
188         if header.length + 12 > data.len {
189             return error('data too short')
190         }
191         checksum_header := crc32.sum(data[..header.length])
192         checksum_header_expected := (u32(data[header.length]) << 24) | (u32(data[header.length + 1]) << 16) | (u32(data[
193             header.length + 2]) << 8) | data[header.length + 3]
194         if params.verify_header_checksum && checksum_header != checksum_header_expected {
195             return error('header checksum verification failed')
196         }
197         header.length += 4
198     }
199     if header.length + 8 > data.len {
200         return error('data too short')
201     }
202     header.operating_system = data[9]
203     return header
204 }
205 
206 // decompress an array of bytes using zlib and returns the decompressed bytes in a new array.
207 // Example: b := 'abcdef'.repeat(1000).bytes(); cmpr := gzip.compress(b)!; decmpr := gzip.decompress(cmpr)!; assert cmpr.len < b.len; assert b == decmpr
208 pub fn decompress(data []u8, params DecompressParams) ![]u8 {
209     gzip_header := validate(data, params)!
210     header_length := gzip_header.length
211 
212     decompressed := compr.decompress(data[header_length..data.len - 8], 0)!
213     length_expected := (u32(data[data.len - 1]) << 24) | (u32(data[data.len - 2]) << 16) | (u32(data[data.len - 3]) << 8) | data[data.len - 4]
214     if params.verify_length && decompressed.len != length_expected {
215         return error('length verification failed, got ${decompressed.len}, expected ${length_expected}')
216     }
217     checksum := crc32.sum(decompressed)
218     checksum_expected := (u32(data[data.len - 5]) << 24) | (u32(data[data.len - 6]) << 16) | (u32(data[data.len - 7]) << 8) | data[data.len - 8]
219     if params.verify_checksum && checksum != checksum_expected {
220         return error('checksum verification failed')
221     }
222     return decompressed
223 }
224 
225 // decompress_with_callback decompresses the given `data`, using zlib. It calls `cb` with each chunk of decompressed bytes.
226 // A chunk is usually 32 KB or less. Note: the chunk data received by `cb` should be cloned, if you need to store it for later,
227 // and not process it right away.
228 // The callback function should return the chunk length, if it wants to continue decompressing, or 0, if it wants to abort the decompression early.
229 // See also compress.ChunkCallback for more details.
230 pub fn decompress_with_callback(data []u8, cb compr.ChunkCallback, userdata voidptr, params DecompressParams) !int {
231     gzip_header := validate(data, params)!
232     header_len := gzip_header.length
233     expected_len := int((u32(data[data.len - 1]) << 24) | (u32(data[data.len - 2]) << 16) | (u32(data[data.len - 3]) << 8) | data[data.len - 4])
234     body := data[header_len..data.len - 8]
235     chunks_len := int(compr.decompress_with_callback(body, cb, userdata, 0)!)
236     if params.verify_length && expected_len != chunks_len {
237         return error('Decompress error: expected length:${expected_len}, got:${chunks_len}')
238     }
239     return chunks_len
240 }
241

1	// [rfc1952](https://datatracker.ietf.org/doc/html/rfc1952) compliant
2	// gzip compression/decompression
3
4	module gzip
5
6	import compress as compr
7	import hash.crc32
8
9	// CompressFlags
10	// TODO: These flags have no use now
11	@[flag]
12	pub enum CompressFlags {
13	// The low 12 bits will be overwritten by `compression_level`
14	compression_level_overwrite_flag01
15	compression_level_overwrite_flag02
16	compression_level_overwrite_flag03
17	compression_level_overwrite_flag04
18	compression_level_overwrite_flag05
19	compression_level_overwrite_flag06
20	compression_level_overwrite_flag07
21	compression_level_overwrite_flag08
22	compression_level_overwrite_flag09
23	compression_level_overwrite_flag10
24	compression_level_overwrite_flag11
25	compression_level_overwrite_flag12
26
27	// If set, the compressor outputs a zlib header before the deflate data, and the Adler-32 of the source data at the end. Otherwise, you'll get raw deflate data.
28	write_zlib_header //= 0x01000
29	// Always compute the adler-32 of the input data (even when not writing zlib headers).
30	compute_adler32 //= 0x02000
31	// Set to use faster greedy parsing, instead of more efficient lazy parsing.
32	greedy_parsing_flag //= 0x04000
33	// Enable to decrease the compressor's initialization time to the minimum, but the output may vary from run to run given the same input (depending on the contents of memory).
34	nondeterministic_parsing_flag //= 0x08000
35	// Only look for RLE matches (matches with a distance of 1)
36	rle_matches //= 0x10000
37	// Discards matches <= 5 chars if enabled.
38	filter_matches //= 0x20000
39	// Disable usage of optimized Huffman tables.
40	force_all_static_blocks //= 0x40000
41	// Only use raw (uncompressed) deflate blocks.
42	force_all_raw_blocks //= 0x80000
43	}
44
45	// CompressParams set compression_level for compression:
46	// 0: Huffman only;
47	// 1: Huffman+LZ (fastest/crap compression);
48	// 128: default_max_probes;
49	// 4095: Huffman+LZ (slowest/best compression)
50	@[params]
51	pub struct CompressParams {
52	pub:
53	compression_level int = 128 // 0~4095
54	flags CompressFlags
55	}
56
57	// compresses an array of bytes using gzip and returns the compressed bytes in a new array
58	// Example: b := 'abcde'.repeat(1000).bytes(); cmprsd := gzip.compress(b, compression_level:4095)!; assert cmprsd.len == 47
59	// Note: compression_level 0~4095
60	pub fn compress(data []u8, params CompressParams) ![]u8 {
61	if params.compression_level !in 0..4096 {
62	return error('compression level should in [0,4095]')
63	}
64	// The low 12 bits are reserved to control the max # of hash probes per dictionary lookup.
65	flags := params.compression_level \| (int(params.flags) & ~int(4095))
66	compressed := compr.compress(data, flags)!
67	// header
68	mut result := [
69	u8(0x1f), // magic numbers (1F 8B)
70	0x8b,
71	0x08, // deflate
72	0x00, // header flags
73	0x00, // 4-byte timestamp, 0 = no timestamp (00 00 00 00)
74	0x00,
75	0x00,
76	0x00,
77	0x00, // extra flags
78	0xff, // operating system id (0xff = unknown)
79	] // 10 bytes
80	result << compressed
81	// trailer
82	checksum := crc32.sum(data)
83	length := data.len
84	result << [
85	u8(checksum),
86	u8(checksum >> 8),
87	u8(checksum >> 16),
88	u8(checksum >> 24),
89	u8(length),
90	u8(length >> 8),
91	u8(length >> 16),
92	u8(length >> 24),
93	] // 8 bytes
94	return result
95	}
96
97	// DecompressFlags
98	// TODO: These flags have no use now
99	@[flag]
100	pub enum DecompressFlags {
101	// If set, the input has a valid zlib header and ends with an adler32 checksum (it's a valid zlib stream). Otherwise, the input is a raw deflate stream.
102	parse_zlib_header
103	// If set, there are more input bytes available beyond the end of the supplied input buffer. If clear, the input buffer contains all remaining input.
104	has_more_input
105	// If set, the output buffer is large enough to hold the entire decompressed stream. If clear, the output buffer is at least the size of the dictionary (typically 32KB).
106	using_non_wrapping_output_buf
107	// Force adler-32 checksum computation of the decompressed bytes.
108	compute_adler32
109	}
110
111	// DecompressParams set flags for decompression:
112	@[params]
113	pub struct DecompressParams {
114	pub:
115	verify_header_checksum bool = true
116	verify_length bool = true
117	verify_checksum bool = true
118	flags DecompressFlags
119	}
120
121	pub const reserved_bits = 0b1110_0000
122	pub const ftext = 0b0000_0001
123	pub const fextra = 0b0000_0100
124	pub const fname = 0b0000_1000
125	pub const fcomment = 0b0001_0000
126	pub const fhcrc = 0b0000_0010
127
128	const min_header_length = 18
129
130	@[noinit]
131	pub struct GzipHeader {
132	pub mut:
133	length int = 10
134	extra []u8
135	filename []u8
136	comment []u8
137	modification_time u32
138	operating_system u8
139	}
140
141	// validate validates the header and returns its details if valid
142	@[direct_array_access]
143	pub fn validate(data []u8, params DecompressParams) !GzipHeader {
144	if data.len < min_header_length {
145	return error('data is too short, not gzip compressed?')
146	} else if data[0] != 0x1f \|\| data[1] != 0x8b {
147	return error('wrong magic numbers, not gzip compressed?')
148	} else if data[2] != 0x08 {
149	return error('gzip data is not compressed with DEFLATE')
150	}
151	mut header := GzipHeader{}
152
153	// parse flags, we ignore most of them, but we still need to parse them
154	// correctly, so we dont accidently decompress something that belongs
155	// to the header
156
157	if data[3] & reserved_bits > 0 {
158	// rfc 1952 2.3.1.2 Compliance
159	// A compliant decompressor must give an error indication if any
160	// reserved bit is non-zero, since such a bit could indicate the
161	// presence of a new field that would cause subsequent data to be
162	// interpreted incorrectly.
163	return error('reserved flags are set, unsupported field detected')
164	}
165
166	if data[3] & fextra > 0 {
167	xlen := data[header.length]
168	header.extra = data[header.length + 1..header.length + 1 + xlen]
169	header.length += xlen + 1
170	}
171	if data[3] & fname > 0 {
172	// filename is zero-terminated, so skip until we hit a zero byte
173	for header.length < data.len && data[header.length] != 0x00 {
174	header.filename << data[header.length]
175	header.length++
176	}
177	header.length++
178	}
179	if data[3] & fcomment > 0 {
180	// comment is zero-terminated, so skip until we hit a zero byte
181	for header.length < data.len && data[header.length] != 0x00 {
182	header.comment << data[header.length]
183	header.length++
184	}
185	header.length++
186	}
187	if data[3] & fhcrc > 0 {
188	if header.length + 12 > data.len {
189	return error('data too short')
190	}
191	checksum_header := crc32.sum(data[..header.length])
192	checksum_header_expected := (u32(data[header.length]) << 24) \| (u32(data[header.length + 1]) << 16) \| (u32(data[
193	header.length + 2]) << 8) \| data[header.length + 3]
194	if params.verify_header_checksum && checksum_header != checksum_header_expected {
195	return error('header checksum verification failed')
196	}
197	header.length += 4
198	}
199	if header.length + 8 > data.len {
200	return error('data too short')
201	}
202	header.operating_system = data[9]
203	return header
204	}
205
206	// decompress an array of bytes using zlib and returns the decompressed bytes in a new array.
207	// Example: b := 'abcdef'.repeat(1000).bytes(); cmpr := gzip.compress(b)!; decmpr := gzip.decompress(cmpr)!; assert cmpr.len < b.len; assert b == decmpr
208	pub fn decompress(data []u8, params DecompressParams) ![]u8 {
209	gzip_header := validate(data, params)!
210	header_length := gzip_header.length
211
212	decompressed := compr.decompress(data[header_length..data.len - 8], 0)!
213	length_expected := (u32(data[data.len - 1]) << 24) \| (u32(data[data.len - 2]) << 16) \| (u32(data[data.len - 3]) << 8) \| data[data.len - 4]
214	if params.verify_length && decompressed.len != length_expected {
215	return error('length verification failed, got ${decompressed.len}, expected ${length_expected}')
216	}
217	checksum := crc32.sum(decompressed)
218	checksum_expected := (u32(data[data.len - 5]) << 24) \| (u32(data[data.len - 6]) << 16) \| (u32(data[data.len - 7]) << 8) \| data[data.len - 8]
219	if params.verify_checksum && checksum != checksum_expected {
220	return error('checksum verification failed')
221	}
222	return decompressed
223	}
224
225	// decompress_with_callback decompresses the given `data`, using zlib. It calls `cb` with each chunk of decompressed bytes.
226	// A chunk is usually 32 KB or less. Note: the chunk data received by `cb` should be cloned, if you need to store it for later,
227	// and not process it right away.
228	// The callback function should return the chunk length, if it wants to continue decompressing, or 0, if it wants to abort the decompression early.
229	// See also compress.ChunkCallback for more details.
230	pub fn decompress_with_callback(data []u8, cb compr.ChunkCallback, userdata voidptr, params DecompressParams) !int {
231	gzip_header := validate(data, params)!
232	header_len := gzip_header.length
233	expected_len := int((u32(data[data.len - 1]) << 24) \| (u32(data[data.len - 2]) << 16) \| (u32(data[data.len - 3]) << 8) \| data[data.len - 4])
234	body := data[header_len..data.len - 8]
235	chunks_len := int(compr.decompress_with_callback(body, cb, userdata, 0)!)
236	if params.verify_length && expected_len != chunks_len {
237	return error('Decompress error: expected length:${expected_len}, got:${chunks_len}')
238	}
239	return chunks_len
240	}
241