| 1 | // [rfc1952](https://datatracker.ietf.org/doc/html/rfc1952) compliant |
| 2 | // gzip compression/decompression |
| 3 | |
| 4 | module gzip |
| 5 | |
| 6 | import compress as compr |
| 7 | import hash.crc32 |
| 8 | |
| 9 | // CompressFlags |
| 10 | // TODO: These flags have no use now |
| 11 | @[flag] |
| 12 | pub enum CompressFlags { |
| 13 | // The low 12 bits will be overwritten by `compression_level` |
| 14 | compression_level_overwrite_flag01 |
| 15 | compression_level_overwrite_flag02 |
| 16 | compression_level_overwrite_flag03 |
| 17 | compression_level_overwrite_flag04 |
| 18 | compression_level_overwrite_flag05 |
| 19 | compression_level_overwrite_flag06 |
| 20 | compression_level_overwrite_flag07 |
| 21 | compression_level_overwrite_flag08 |
| 22 | compression_level_overwrite_flag09 |
| 23 | compression_level_overwrite_flag10 |
| 24 | compression_level_overwrite_flag11 |
| 25 | compression_level_overwrite_flag12 |
| 26 | |
| 27 | // If set, the compressor outputs a zlib header before the deflate data, and the Adler-32 of the source data at the end. Otherwise, you'll get raw deflate data. |
| 28 | write_zlib_header //= 0x01000 |
| 29 | // Always compute the adler-32 of the input data (even when not writing zlib headers). |
| 30 | compute_adler32 //= 0x02000 |
| 31 | // Set to use faster greedy parsing, instead of more efficient lazy parsing. |
| 32 | greedy_parsing_flag //= 0x04000 |
| 33 | // Enable to decrease the compressor's initialization time to the minimum, but the output may vary from run to run given the same input (depending on the contents of memory). |
| 34 | nondeterministic_parsing_flag //= 0x08000 |
| 35 | // Only look for RLE matches (matches with a distance of 1) |
| 36 | rle_matches //= 0x10000 |
| 37 | // Discards matches <= 5 chars if enabled. |
| 38 | filter_matches //= 0x20000 |
| 39 | // Disable usage of optimized Huffman tables. |
| 40 | force_all_static_blocks //= 0x40000 |
| 41 | // Only use raw (uncompressed) deflate blocks. |
| 42 | force_all_raw_blocks //= 0x80000 |
| 43 | } |
| 44 | |
| 45 | // CompressParams set compression_level for compression: |
| 46 | // 0: Huffman only; |
| 47 | // 1: Huffman+LZ (fastest/crap compression); |
| 48 | // 128: default_max_probes; |
| 49 | // 4095: Huffman+LZ (slowest/best compression) |
| 50 | @[params] |
| 51 | pub struct CompressParams { |
| 52 | pub: |
| 53 | compression_level int = 128 // 0~4095 |
| 54 | flags CompressFlags |
| 55 | } |
| 56 | |
| 57 | // compresses an array of bytes using gzip and returns the compressed bytes in a new array |
| 58 | // Example: b := 'abcde'.repeat(1000).bytes(); cmprsd := gzip.compress(b, compression_level:4095)!; assert cmprsd.len == 47 |
| 59 | // Note: compression_level 0~4095 |
| 60 | pub fn compress(data []u8, params CompressParams) ![]u8 { |
| 61 | if params.compression_level !in 0..4096 { |
| 62 | return error('compression level should in [0,4095]') |
| 63 | } |
| 64 | // The low 12 bits are reserved to control the max # of hash probes per dictionary lookup. |
| 65 | flags := params.compression_level | (int(params.flags) & ~int(4095)) |
| 66 | compressed := compr.compress(data, flags)! |
| 67 | // header |
| 68 | mut result := [ |
| 69 | u8(0x1f), // magic numbers (1F 8B) |
| 70 | 0x8b, |
| 71 | 0x08, // deflate |
| 72 | 0x00, // header flags |
| 73 | 0x00, // 4-byte timestamp, 0 = no timestamp (00 00 00 00) |
| 74 | 0x00, |
| 75 | 0x00, |
| 76 | 0x00, |
| 77 | 0x00, // extra flags |
| 78 | 0xff, // operating system id (0xff = unknown) |
| 79 | ] // 10 bytes |
| 80 | result << compressed |
| 81 | // trailer |
| 82 | checksum := crc32.sum(data) |
| 83 | length := data.len |
| 84 | result << [ |
| 85 | u8(checksum), |
| 86 | u8(checksum >> 8), |
| 87 | u8(checksum >> 16), |
| 88 | u8(checksum >> 24), |
| 89 | u8(length), |
| 90 | u8(length >> 8), |
| 91 | u8(length >> 16), |
| 92 | u8(length >> 24), |
| 93 | ] // 8 bytes |
| 94 | return result |
| 95 | } |
| 96 | |
| 97 | // DecompressFlags |
| 98 | // TODO: These flags have no use now |
| 99 | @[flag] |
| 100 | pub enum DecompressFlags { |
| 101 | // If set, the input has a valid zlib header and ends with an adler32 checksum (it's a valid zlib stream). Otherwise, the input is a raw deflate stream. |
| 102 | parse_zlib_header |
| 103 | // If set, there are more input bytes available beyond the end of the supplied input buffer. If clear, the input buffer contains all remaining input. |
| 104 | has_more_input |
| 105 | // If set, the output buffer is large enough to hold the entire decompressed stream. If clear, the output buffer is at least the size of the dictionary (typically 32KB). |
| 106 | using_non_wrapping_output_buf |
| 107 | // Force adler-32 checksum computation of the decompressed bytes. |
| 108 | compute_adler32 |
| 109 | } |
| 110 | |
| 111 | // DecompressParams set flags for decompression: |
| 112 | @[params] |
| 113 | pub struct DecompressParams { |
| 114 | pub: |
| 115 | verify_header_checksum bool = true |
| 116 | verify_length bool = true |
| 117 | verify_checksum bool = true |
| 118 | flags DecompressFlags |
| 119 | } |
| 120 | |
| 121 | pub const reserved_bits = 0b1110_0000 |
| 122 | pub const ftext = 0b0000_0001 |
| 123 | pub const fextra = 0b0000_0100 |
| 124 | pub const fname = 0b0000_1000 |
| 125 | pub const fcomment = 0b0001_0000 |
| 126 | pub const fhcrc = 0b0000_0010 |
| 127 | |
| 128 | const min_header_length = 18 |
| 129 | |
| 130 | @[noinit] |
| 131 | pub struct GzipHeader { |
| 132 | pub mut: |
| 133 | length int = 10 |
| 134 | extra []u8 |
| 135 | filename []u8 |
| 136 | comment []u8 |
| 137 | modification_time u32 |
| 138 | operating_system u8 |
| 139 | } |
| 140 | |
| 141 | // validate validates the header and returns its details if valid |
| 142 | @[direct_array_access] |
| 143 | pub fn validate(data []u8, params DecompressParams) !GzipHeader { |
| 144 | if data.len < min_header_length { |
| 145 | return error('data is too short, not gzip compressed?') |
| 146 | } else if data[0] != 0x1f || data[1] != 0x8b { |
| 147 | return error('wrong magic numbers, not gzip compressed?') |
| 148 | } else if data[2] != 0x08 { |
| 149 | return error('gzip data is not compressed with DEFLATE') |
| 150 | } |
| 151 | mut header := GzipHeader{} |
| 152 | |
| 153 | // parse flags, we ignore most of them, but we still need to parse them |
| 154 | // correctly, so we dont accidently decompress something that belongs |
| 155 | // to the header |
| 156 | |
| 157 | if data[3] & reserved_bits > 0 { |
| 158 | // rfc 1952 2.3.1.2 Compliance |
| 159 | // A compliant decompressor must give an error indication if any |
| 160 | // reserved bit is non-zero, since such a bit could indicate the |
| 161 | // presence of a new field that would cause subsequent data to be |
| 162 | // interpreted incorrectly. |
| 163 | return error('reserved flags are set, unsupported field detected') |
| 164 | } |
| 165 | |
| 166 | if data[3] & fextra > 0 { |
| 167 | xlen := data[header.length] |
| 168 | header.extra = data[header.length + 1..header.length + 1 + xlen] |
| 169 | header.length += xlen + 1 |
| 170 | } |
| 171 | if data[3] & fname > 0 { |
| 172 | // filename is zero-terminated, so skip until we hit a zero byte |
| 173 | for header.length < data.len && data[header.length] != 0x00 { |
| 174 | header.filename << data[header.length] |
| 175 | header.length++ |
| 176 | } |
| 177 | header.length++ |
| 178 | } |
| 179 | if data[3] & fcomment > 0 { |
| 180 | // comment is zero-terminated, so skip until we hit a zero byte |
| 181 | for header.length < data.len && data[header.length] != 0x00 { |
| 182 | header.comment << data[header.length] |
| 183 | header.length++ |
| 184 | } |
| 185 | header.length++ |
| 186 | } |
| 187 | if data[3] & fhcrc > 0 { |
| 188 | if header.length + 12 > data.len { |
| 189 | return error('data too short') |
| 190 | } |
| 191 | checksum_header := crc32.sum(data[..header.length]) |
| 192 | checksum_header_expected := (u32(data[header.length]) << 24) | (u32(data[header.length + 1]) << 16) | (u32(data[ |
| 193 | header.length + 2]) << 8) | data[header.length + 3] |
| 194 | if params.verify_header_checksum && checksum_header != checksum_header_expected { |
| 195 | return error('header checksum verification failed') |
| 196 | } |
| 197 | header.length += 4 |
| 198 | } |
| 199 | if header.length + 8 > data.len { |
| 200 | return error('data too short') |
| 201 | } |
| 202 | header.operating_system = data[9] |
| 203 | return header |
| 204 | } |
| 205 | |
| 206 | // decompress an array of bytes using zlib and returns the decompressed bytes in a new array. |
| 207 | // Example: b := 'abcdef'.repeat(1000).bytes(); cmpr := gzip.compress(b)!; decmpr := gzip.decompress(cmpr)!; assert cmpr.len < b.len; assert b == decmpr |
| 208 | pub fn decompress(data []u8, params DecompressParams) ![]u8 { |
| 209 | gzip_header := validate(data, params)! |
| 210 | header_length := gzip_header.length |
| 211 | |
| 212 | decompressed := compr.decompress(data[header_length..data.len - 8], 0)! |
| 213 | length_expected := (u32(data[data.len - 1]) << 24) | (u32(data[data.len - 2]) << 16) | (u32(data[data.len - 3]) << 8) | data[data.len - 4] |
| 214 | if params.verify_length && decompressed.len != length_expected { |
| 215 | return error('length verification failed, got ${decompressed.len}, expected ${length_expected}') |
| 216 | } |
| 217 | checksum := crc32.sum(decompressed) |
| 218 | checksum_expected := (u32(data[data.len - 5]) << 24) | (u32(data[data.len - 6]) << 16) | (u32(data[data.len - 7]) << 8) | data[data.len - 8] |
| 219 | if params.verify_checksum && checksum != checksum_expected { |
| 220 | return error('checksum verification failed') |
| 221 | } |
| 222 | return decompressed |
| 223 | } |
| 224 | |
| 225 | // decompress_with_callback decompresses the given `data`, using zlib. It calls `cb` with each chunk of decompressed bytes. |
| 226 | // A chunk is usually 32 KB or less. Note: the chunk data received by `cb` should be cloned, if you need to store it for later, |
| 227 | // and not process it right away. |
| 228 | // The callback function should return the chunk length, if it wants to continue decompressing, or 0, if it wants to abort the decompression early. |
| 229 | // See also compress.ChunkCallback for more details. |
| 230 | pub fn decompress_with_callback(data []u8, cb compr.ChunkCallback, userdata voidptr, params DecompressParams) !int { |
| 231 | gzip_header := validate(data, params)! |
| 232 | header_len := gzip_header.length |
| 233 | expected_len := int((u32(data[data.len - 1]) << 24) | (u32(data[data.len - 2]) << 16) | (u32(data[data.len - 3]) << 8) | data[data.len - 4]) |
| 234 | body := data[header_len..data.len - 8] |
| 235 | chunks_len := int(compr.decompress_with_callback(body, cb, userdata, 0)!) |
| 236 | if params.verify_length && expected_len != chunks_len { |
| 237 | return error('Decompress error: expected length:${expected_len}, got:${chunks_len}') |
| 238 | } |
| 239 | return chunks_len |
| 240 | } |
| 241 | |