From 02d9ab594d19c672bbfc9f290201f39697949f7a Mon Sep 17 00:00:00 2001 From: JalonSolov Date: Sun, 7 Jun 2026 19:15:56 -0400 Subject: [PATCH] compress.gzip: use deflate for RFC compliance (#27320) * compress.gzip: use deflate for RFC compliance * adjust deprecation date, more api compatibility with old code * clean up confusing deprecation msgs --- vlib/compress/README.md | 9 + vlib/compress/compress.c.v | 13 + vlib/compress/deflate/deflate.v | 228 ++++++++++++++---- vlib/compress/deflate/deflate_inflate.v | 170 +++++++++++++ vlib/compress/gzip/gzip.v | 226 ++++------------- vlib/compress/gzip/gzip_test.v | 109 +++++++-- vlib/compress/gzip/read_gz_files_test.v | 62 ----- vlib/compress/zlib/zlib.v | 8 + vlib/compress/zlib/zlib_test.v | 15 ++ ..._to_have_the_module_as_a_substring_keep.vv | 148 +----------- 10 files changed, 527 insertions(+), 461 deletions(-) delete mode 100644 vlib/compress/gzip/read_gz_files_test.v diff --git a/vlib/compress/README.md b/vlib/compress/README.md index f067fd845..81b119cac 100644 --- a/vlib/compress/README.md +++ b/vlib/compress/README.md @@ -7,6 +7,15 @@ At the moment, the following compression algorithms are implemented: - `compress.bzip2` - `compress.deflate` - `compress.gzip` +- `compress.lz` supporting the following variations: + - lz4 + - lz77 + - lz78 + - lzjb + - lzma + - lzma2 + - lzss + - lzw - `compress.snappy` - `compress.zlib` - `compress.zstd` diff --git a/vlib/compress/compress.c.v b/vlib/compress/compress.c.v index 61af110c5..09ff3e208 100644 --- a/vlib/compress/compress.c.v +++ b/vlib/compress/compress.c.v @@ -1,9 +1,13 @@ +@[deprecated: 'use compress.deflate instead'] +@[deprecated_after: '2026-07-31'] module compress #flag -I @VEXEROOT/thirdparty/zip #define MINIZ_NO_ZLIB_COMPATIBLE_NAMES #include "miniz.h" +@[deprecated: 'use max_i32 (from builtin) instead'] +@[deprecated_after: '2026-07-31'] pub const max_size = u64(1 << 31) fn C.tdefl_compress_mem_to_heap(source_buf voidptr, source_buf_len usize, out_len &usize, 
flags i32) voidptr @@ -11,6 +15,8 @@ fn C.tinfl_decompress_mem_to_heap(source_buf voidptr, source_buf_len usize, out_ // compresses an array of bytes based on providing flags and returns the compressed bytes in a new array // NB: this is a low level api, a high level implementation like zlib/gzip should be preferred +@[deprecated: 'use compress.deflate.compress instead'] +@[deprecated_after: '2026-07-31'] @[manualfree] pub fn compress(data []u8, flags int) ![]u8 { if u64(data.len) > max_size { @@ -34,6 +40,8 @@ pub fn compress(data []u8, flags int) ![]u8 { // decompresses an array of bytes based on providing flags and returns the decompressed bytes in a new array // NB: this is a low level api, a high level implementation like zlib/gzip should be preferred +@[deprecated: 'use compress.deflate.decompress instead'] +@[deprecated_after: '2026-07-31'] @[manualfree] pub fn decompress(data []u8, flags int) ![]u8 { mut out_len := usize(0) @@ -58,11 +66,16 @@ pub fn decompress(data []u8, flags int) ![]u8 { // the decompressor to send more chunks, otherwise the decompression stops. // The userdata parameter comes from the call to decompress_with_callback/4, and can be used // to pass arbitrary data, without having to create a closure. + +@[deprecated: 'use compress.deflate.ChunkCallback instead'] +@[deprecated_after: '2026-07-31'] pub type ChunkCallback = fn (chunk []u8, userdata voidptr) int // decompress_with_callback decompresses an array of bytes, based on the provided flags, and a V fn callback to receive decompressed chunks, of at most 32 kilobytes each. // It returns the total decompressed length, or a decompression error. // NB: this is a low level api, a high level implementation like zlib/gzip should be preferred. 
+@[deprecated: 'use compress.deflate.decompress_with_callback instead'] +@[deprecated_after: '2026-07-31'] pub fn decompress_with_callback(data []u8, cb ChunkCallback, userdata voidptr, flags int) !u64 { cbdata := DecompressionCallBackData{ data: data.data diff --git a/vlib/compress/deflate/deflate.v b/vlib/compress/deflate/deflate.v index 890563a30..bbbe5d4db 100644 --- a/vlib/compress/deflate/deflate.v +++ b/vlib/compress/deflate/deflate.v @@ -4,6 +4,8 @@ import encoding.binary import hash.adler32 import hash.crc32 +pub type ChunkCallback = fn (chunk []u8, userdata voidptr) int + // CompressFormat selects the output container around the RFC 1951 payload. pub enum CompressFormat { zlib @@ -17,13 +19,117 @@ pub: format CompressFormat = .zlib } +pub struct RawInflateResult { +pub: + decoded []u8 + consumed int +} + +pub struct ZlibHeader { +pub: + payload_start int = 2 +} + +pub struct GzipHeader { +pub mut: + flags u8 + payload_start int + extra []u8 + filename []u8 + comment []u8 + modification_time u32 + operating_system u8 +} + +// validate_zlib_header validates an RFC 1950 zlib header (minimum length, compression method, header checksum, no preset dictionary) and returns the payload offset. +@[direct_array_access] +pub fn validate_zlib_header(data []u8) !ZlibHeader { + if data.len < 6 { + return error('invalid zlib stream: too short') + } + if data[0] & 0x0f != 8 { + return error('invalid zlib stream: unsupported compression method') + } + if (u32(data[0]) * 256 + u32(data[1])) % 31 != 0 { + return error('invalid zlib stream: bad header checksum') + } + if data[1] & 0x20 != 0 { + return error('invalid zlib stream: preset dictionary not supported') + } + return ZlibHeader{} +} + +// validate_gzip_header validates an RFC 1952 gzip header and returns the parsed fields. 
+@[direct_array_access] +pub fn validate_gzip_header(data []u8) !GzipHeader { + if data.len < 18 { + return error('invalid gzip stream: too short') + } + if data[0] != 0x1f || data[1] != 0x8b { + return error('invalid gzip stream: bad magic') + } + if data[2] != 8 { + return error('invalid gzip stream: unsupported compression method') + } + flg := data[3] + if flg & 0xe0 != 0 { + return error('invalid gzip stream: reserved flags set') + } + mut header := GzipHeader{ + flags: flg + payload_start: 10 + modification_time: binary.little_endian_u32_at(data, 4) + operating_system: data[9] + } + if flg & 0x04 != 0 { + if header.payload_start + 2 > data.len { + return error('invalid gzip stream: truncated extra') + } + xlen := int(u32(data[header.payload_start]) | u32(data[header.payload_start + 1]) << 8) + header.payload_start += 2 + if header.payload_start + xlen > data.len { + return error('invalid gzip stream: truncated extra') + } + header.extra = data[header.payload_start..header.payload_start + xlen] + header.payload_start += xlen + } + if flg & 0x08 != 0 { + for header.payload_start < data.len && data[header.payload_start] != 0 { + header.filename << data[header.payload_start] + header.payload_start++ + } + header.payload_start++ + } + if flg & 0x10 != 0 { + for header.payload_start < data.len && data[header.payload_start] != 0 { + header.comment << data[header.payload_start] + header.payload_start++ + } + header.payload_start++ + } + if flg & 0x02 != 0 { + if header.payload_start + 2 > data.len { + return error('invalid gzip stream: truncated fhcrc') + } + expected_crc16 := u16(data[header.payload_start]) | (u16(data[header.payload_start + 1]) << 8) + actual_crc16 := u16(crc32.sum(data[..header.payload_start]) & 0xffff) + if actual_crc16 != expected_crc16 { + return error('invalid gzip stream: header crc16 mismatch') + } + header.payload_start += 2 + } + if header.payload_start + 8 > data.len { + return error('invalid gzip stream: truncated payload') + } + return 
header +} + // compress compresses data as zlib, gzip, or raw DEFLATE. pub fn compress(data []u8, format CompressParams) ![]u8 { - payload := deflate_compress_fixed(data) - match format.format { - .zlib { return compress_zlib(data) } - .gzip { return compress_gzip(data) } - .raw_deflate { return payload } + return match format.format { + .zlib { compress_zlib(data) } + .gzip { compress_gzip(data) } + .raw_deflate { deflate_compress_fixed(data) } } } @@ -75,19 +181,8 @@ pub fn decompress(data []u8) ![]u8 { // decompress_zlib decompresses a zlib stream (RFC 1950). // It returns the decompressed bytes in a new array. pub fn decompress_zlib(data []u8) ![]u8 { - if data.len < 6 { - return error('invalid zlib stream: too short') - } - if data[0] & 0x0f != 8 { - return error('invalid zlib stream: unsupported compression method') - } - if (u32(data[0]) * 256 + u32(data[1])) % 31 != 0 { - return error('invalid zlib stream: bad header checksum') - } - if data[1] & 0x20 != 0 { - return error('invalid zlib stream: preset dictionary not supported') - } - payload := data[2..data.len - 4] + header := validate_zlib_header(data)! + payload := data[header.payload_start..data.len - 4] expected := binary.big_endian_u32_at(data, data.len - 4) res := inflate_with_consumed(payload)! if res.consumed != payload.len { @@ -101,57 +196,88 @@ pub fn decompress_zlib(data []u8) ![]u8 { // decompress_gzip decompresses a gzip stream (RFC 1952). // It returns the decompressed bytes in a new array. pub fn decompress_gzip(data []u8) ![]u8 { - if data.len < 18 { - return error('invalid gzip stream: too short') + header := validate_gzip_header(data)! + payload := data[header.payload_start..data.len - 8] + expected_crc := binary.little_endian_u32_at(data, data.len - 8) + expected_size := binary.little_endian_u32_at(data, data.len - 4) + res := inflate_with_consumed(payload)! 
+ if res.consumed != payload.len { + return error('invalid gzip stream: trailing data before trailer') } - if data[0] != 0x1f || data[1] != 0x8b { - return error('invalid gzip stream: bad magic') + decoded := res.decoded + if crc32.sum(decoded) != expected_crc { + return error('invalid gzip stream: crc32 mismatch') } - if data[2] != 8 { - return error('invalid gzip stream: unsupported compression method') + if u32(decoded.len) != expected_size { + return error('invalid gzip stream: size mismatch') } - flg := data[3] - mut pos := 10 // fixed header size - if flg & 0x04 != 0 { // FEXTRA - if pos + 2 > data.len { - return error('invalid gzip stream: truncated extra') - } - xlen := int(u32(data[pos]) | u32(data[pos + 1]) << 8) - pos += 2 + xlen + return decoded +} + +// decompress_raw_with_consumed decompresses raw RFC 1951 DEFLATE data and tracks consumed bytes. +pub fn decompress_raw_with_consumed(data []u8) !RawInflateResult { + res := inflate_with_consumed(data)! + return RawInflateResult{ + decoded: res.decoded + consumed: res.consumed } - if flg & 0x08 != 0 { // FNAME - for pos < data.len && data[pos] != 0 { - pos++ +} + +// decompress_with_callback decompresses a zlib (RFC 1950), gzip (RFC 1952), or raw DEFLATE (RFC 1951) stream, using a callback for chunked delivery. +// The callback receives chunks of decompressed data and must return the chunk length to continue; any other value (e.g. 0) aborts the decompression. +// Returns the number of decompressed bytes delivered to the callback. 
+pub fn decompress_with_callback(data []u8, cb ChunkCallback, userdata voidptr) !int { + if data.len >= 2 { + // gzip magic: 0x1f 0x8b + if data[0] == 0x1f && data[1] == 0x8b { + return decompress_gzip_with_callback(data, cb, userdata) } - pos++ - } - if flg & 0x10 != 0 { // FCOMMENT - for pos < data.len && data[pos] != 0 { - pos++ + // zlib: CM=8 and header checksum passes + if data[0] & 0x0f == 8 && (u32(data[0]) * 256 + u32(data[1])) % 31 == 0 { + return decompress_zlib_with_callback(data, cb, userdata) } - pos++ } - if flg & 0x02 != 0 { // FHCRC - pos += 2 + // raw DEFLATE + res := inflate_with_callback(data, cb, userdata)! + return res.delivered +} + +fn decompress_zlib_with_callback(data []u8, cb ChunkCallback, userdata voidptr) !int { + header := validate_zlib_header(data)! + payload := data[header.payload_start..data.len - 4] + expected := binary.big_endian_u32_at(data, data.len - 4) + res := inflate_with_callback(payload, cb, userdata)! + if res.aborted { + return res.delivered } - if pos + 8 > data.len { - return error('invalid gzip stream: truncated payload') + if res.consumed != payload.len { + return error('invalid zlib stream: trailing data before adler32') } - payload := data[pos..data.len - 8] + if adler32.sum(res.decoded) != expected { + return error('invalid zlib stream: adler32 mismatch') + } + return res.delivered +} + +fn decompress_gzip_with_callback(data []u8, cb ChunkCallback, userdata voidptr) !int { + header := validate_gzip_header(data)! + payload := data[header.payload_start..data.len - 8] expected_crc := binary.little_endian_u32_at(data, data.len - 8) expected_size := binary.little_endian_u32_at(data, data.len - 4) - res := inflate_with_consumed(payload)! + res := inflate_with_callback(payload, cb, userdata)! 
+ if res.aborted { + return res.delivered + } if res.consumed != payload.len { return error('invalid gzip stream: trailing data before trailer') } - decoded := res.decoded - if crc32.sum(decoded) != expected_crc { + if crc32.sum(res.decoded) != expected_crc { return error('invalid gzip stream: crc32 mismatch') } - if u32(decoded.len) != expected_size { + if u32(res.decoded.len) != expected_size { return error('invalid gzip stream: size mismatch') } - return decoded + return res.delivered } fn bit_reverse(v u32, n int) u32 { diff --git a/vlib/compress/deflate/deflate_inflate.v b/vlib/compress/deflate/deflate_inflate.v index 93458b953..1c59e34df 100644 --- a/vlib/compress/deflate/deflate_inflate.v +++ b/vlib/compress/deflate/deflate_inflate.v @@ -158,6 +158,33 @@ struct InflateResult { consumed int } +struct InflateStreamResult { + decoded []u8 + consumed int + delivered int + aborted bool +} + +struct InflateStreamState { +mut: + delivered int +} + +const inflate_callback_chunk_size = 32768 + +@[direct_array_access; inline] +fn flush_stream_chunks(out []u8, cb ChunkCallback, userdata voidptr, mut state InflateStreamState) bool { + for out.len - state.delivered >= inflate_callback_chunk_size { + end := state.delivered + inflate_callback_chunk_size + chunk := out[state.delivered..end] + if cb(chunk, userdata) != chunk.len { + return false + } + state.delivered = end + } + return true +} + // inflate_with_consumed decompresses raw RFC 1951 DEFLATE data and reports // how many input bytes were consumed by the DEFLATE bitstream. fn inflate_with_consumed(data []u8) !InflateResult { @@ -248,6 +275,110 @@ fn inflate(data []u8) ![]u8 { return res.decoded } +// inflate_with_callback decompresses raw RFC 1951 DEFLATE data and streams output through `cb`. +// It still tracks consumed input bytes for container-level validation. 
+fn inflate_with_callback(data []u8, cb ChunkCallback, userdata voidptr) !InflateStreamResult { + mut r := BitReader{ + buf: data + } + mut out := []u8{} + mut state := InflateStreamState{} + mut aborted := false + fixed_ll := build_huff_tree(fixed_litlen_lengths()) + fixed_d := build_huff_tree([]int{len: 32, init: 5}) + for { + bfinal := r.read_bits(1)! + btype := r.read_bits(2)! + match btype { + 0 { + r.align_byte() + len_ := int(r.read_byte_raw()!) | (int(u32(r.read_byte_raw()!) << 8)) + nlen := int(r.read_byte_raw()!) | (int(u32(r.read_byte_raw()!) << 8)) + if len_ & 0xffff != (~nlen) & 0xffff { + return error('inflate: bad stored block length') + } + for _ in 0 .. len_ { + out << r.read_byte_raw()! + if !flush_stream_chunks(out, cb, userdata, mut state) { + aborted = true + break + } + } + } + 1 { + if !inflate_block_stream(mut r, mut out, fixed_ll, fixed_d, cb, userdata, mut state)! { + aborted = true + } + } + 2 { + hlit := int(r.read_bits(5)!) + 257 + hdist := int(r.read_bits(5)!) + 1 + hclen := int(r.read_bits(4)!) + 4 + mut cl_lens := []int{len: 19} + for i in 0 .. hclen { + cl_lens[cl_order[i]] = int(r.read_bits(3)!) + } + cl_tree := build_huff_tree(cl_lens) + mut all_lens := []int{} + for all_lens.len < hlit + hdist { + sym := r.huff_decode(cl_tree)! + if sym <= 15 { + all_lens << int(sym) + } else if sym == 16 { + if all_lens.len == 0 { + return error('inflate: repeat with empty history') + } + rep := int(r.read_bits(2)!) + 3 + last := all_lens[all_lens.len - 1] + for _ in 0 .. rep { + all_lens << last + } + } else if sym == 17 { + rep := int(r.read_bits(3)!) + 3 + for _ in 0 .. rep { + all_lens << 0 + } + } else if sym == 18 { + rep := int(r.read_bits(7)!) + 11 + for _ in 0 .. rep { + all_lens << 0 + } + } else { + return error('inflate: bad code length symbol') + } + } + ll_tree := build_huff_tree(all_lens[..hlit]) + d_tree := build_huff_tree(all_lens[hlit..]) + if !inflate_block_stream(mut r, mut out, ll_tree, d_tree, cb, userdata, mut state)! 
{ + aborted = true + } + } + else { + return error('inflate: reserved block type') + } + } + + if aborted || bfinal == 1 { + break + } + } + if !aborted && out.len > state.delivered { + chunk := unsafe { out[state.delivered..] } + if cb(chunk, userdata) != chunk.len { + aborted = true + } else { + state.delivered = out.len + } + } + consumed := r.pos - (r.nbits >> 3) + return InflateStreamResult{ + decoded: out + consumed: consumed + delivered: state.delivered + aborted: aborted + } +} + @[direct_array_access] fn inflate_block(mut r BitReader, mut out []u8, ll HuffTree, dist HuffTree) ! { for { @@ -279,3 +410,42 @@ fn inflate_block(mut r BitReader, mut out []u8, ll HuffTree, dist HuffTree) ! { } } } + +@[direct_array_access] +fn inflate_block_stream(mut r BitReader, mut out []u8, ll HuffTree, dist HuffTree, cb ChunkCallback, userdata voidptr, mut state InflateStreamState) !bool { + for { + sym := r.huff_decode(ll)! + if sym == 256 { + break + } + if sym < 256 { + out << u8(sym) + if !flush_stream_chunks(out, cb, userdata, mut state) { + return false + } + } else { + li := int(sym) - 257 + if li < 0 || li >= length_bases.len { + return error('inflate: invalid length symbol ${sym}') + } + length := length_bases[li] + int(r.read_bits(length_extra_bits[li])!) + dsym := r.huff_decode(dist)! + di := int(dsym) + if di >= dist_bases.len { + return error('inflate: invalid distance symbol ${dsym}') + } + distance := dist_bases[di] + int(r.read_bits(dist_extra_bits[di])!) + if distance > out.len { + return error('inflate: distance past output start') + } + base := out.len - distance + for i in 0 .. 
length { + out << out[base + i] + if !flush_stream_chunks(out, cb, userdata, mut state) { + return false + } + } + } + } + return true +} diff --git a/vlib/compress/gzip/gzip.v b/vlib/compress/gzip/gzip.v index 55db06b0b..b83822cda 100644 --- a/vlib/compress/gzip/gzip.v +++ b/vlib/compress/gzip/gzip.v @@ -3,11 +3,11 @@ module gzip -import compress as compr -import hash.crc32 +import compress.deflate // CompressFlags -// TODO: These flags have no use now +@[deprecated: 'never used'] +@[deprecated_after: '2026-07-31'] @[flag] pub enum CompressFlags { // The low 12 bits will be overwritten by `compression_level` @@ -23,92 +23,48 @@ pub enum CompressFlags { compression_level_overwrite_flag10 compression_level_overwrite_flag11 compression_level_overwrite_flag12 - - // If set, the compressor outputs a zlib header before the deflate data, and the Adler-32 of the source data at the end. Otherwise, you'll get raw deflate data. - write_zlib_header //= 0x01000 - // Always compute the adler-32 of the input data (even when not writing zlib headers). - compute_adler32 //= 0x02000 - // Set to use faster greedy parsing, instead of more efficient lazy parsing. - greedy_parsing_flag //= 0x04000 - // Enable to decrease the compressor's initialization time to the minimum, but the output may vary from run to run given the same input (depending on the contents of memory). - nondeterministic_parsing_flag //= 0x08000 - // Only look for RLE matches (matches with a distance of 1) - rle_matches //= 0x10000 - // Discards matches <= 5 chars if enabled. - filter_matches //= 0x20000 - // Disable usage of optimized Huffman tables. - force_all_static_blocks //= 0x40000 - // Only use raw (uncompressed) deflate blocks. 
- force_all_raw_blocks //= 0x80000 + write_zlib_header + compute_adler32 + greedy_parsing_flag + nondeterministic_parsing_flag + rle_matches + filter_matches + force_all_static_blocks + force_all_raw_blocks } // CompressParams set compression_level for compression: -// 0: Huffman only; -// 1: Huffman+LZ (fastest/crap compression); -// 128: default_max_probes; -// 4095: Huffman+LZ (slowest/best compression) +@[deprecated: 'never used'] +@[deprecated_after: '2026-07-31'] @[params] pub struct CompressParams { pub: - compression_level int = 128 // 0~4095 + compression_level int flags CompressFlags } -// compresses an array of bytes using gzip and returns the compressed bytes in a new array -// Example: b := 'abcde'.repeat(1000).bytes(); cmprsd := gzip.compress(b, compression_level:4095)!; assert cmprsd.len == 47 -// Note: compression_level 0~4095 -pub fn compress(data []u8, params CompressParams) ![]u8 { - if params.compression_level !in 0..4096 { - return error('compression level should in [0,4095]') - } - // The low 12 bits are reserved to control the max # of hash probes per dictionary lookup. - flags := params.compression_level | (int(params.flags) & ~int(4095)) - compressed := compr.compress(data, flags)! - // header - mut result := [ - u8(0x1f), // magic numbers (1F 8B) - 0x8b, - 0x08, // deflate - 0x00, // header flags - 0x00, // 4-byte timestamp, 0 = no timestamp (00 00 00 00) - 0x00, - 0x00, - 0x00, - 0x00, // extra flags - 0xff, // operating system id (0xff = unknown) - ] // 10 bytes - result << compressed - // trailer - checksum := crc32.sum(data) - length := data.len - result << [ - u8(checksum), - u8(checksum >> 8), - u8(checksum >> 16), - u8(checksum >> 24), - u8(length), - u8(length >> 8), - u8(length >> 16), - u8(length >> 24), - ] // 8 bytes - return result +// compress compresses an array of bytes using gzip and returns the compressed bytes in a new array. 
+pub fn compress(data []u8) ![]u8 { + // Delegate to deflate.compress_gzip() which implements RFC 1952 + return deflate.compress_gzip(data) } // DecompressFlags -// TODO: These flags have no use now +// N.B.: only retained for API compatibility. +@[deprecated: 'never used'] +@[deprecated_after: '2026-07-31'] @[flag] pub enum DecompressFlags { - // If set, the input has a valid zlib header and ends with an adler32 checksum (it's a valid zlib stream). Otherwise, the input is a raw deflate stream. parse_zlib_header - // If set, there are more input bytes available beyond the end of the supplied input buffer. If clear, the input buffer contains all remaining input. has_more_input - // If set, the output buffer is large enough to hold the entire decompressed stream. If clear, the output buffer is at least the size of the dictionary (typically 32KB). using_non_wrapping_output_buf - // Force adler-32 checksum computation of the decompressed bytes. compute_adler32 } -// DecompressParams set flags for decompression: +// DecompressParams controls gzip decompression behavior. +// N.B.: only retained for API compatibility. 
+@[deprecated: 'never used'] +@[deprecated_after: '2026-07-31'] @[params] pub struct DecompressParams { pub: @@ -118,123 +74,25 @@ pub: flags DecompressFlags } -pub const reserved_bits = 0b1110_0000 -pub const ftext = 0b0000_0001 -pub const fextra = 0b0000_0100 -pub const fname = 0b0000_1000 -pub const fcomment = 0b0001_0000 -pub const fhcrc = 0b0000_0010 - -const min_header_length = 18 - -@[noinit] -pub struct GzipHeader { -pub mut: - length int = 10 - extra []u8 - filename []u8 - comment []u8 - modification_time u32 - operating_system u8 -} - -// validate validates the header and returns its details if valid -@[direct_array_access] -pub fn validate(data []u8, params DecompressParams) !GzipHeader { - if data.len < min_header_length { - return error('data is too short, not gzip compressed?') - } else if data[0] != 0x1f || data[1] != 0x8b { - return error('wrong magic numbers, not gzip compressed?') - } else if data[2] != 0x08 { - return error('gzip data is not compressed with DEFLATE') - } - mut header := GzipHeader{} - - // parse flags, we ignore most of them, but we still need to parse them - // correctly, so we dont accidently decompress something that belongs - // to the header - - if data[3] & reserved_bits > 0 { - // rfc 1952 2.3.1.2 Compliance - // A compliant decompressor must give an error indication if any - // reserved bit is non-zero, since such a bit could indicate the - // presence of a new field that would cause subsequent data to be - // interpreted incorrectly. 
- return error('reserved flags are set, unsupported field detected') - } - - if data[3] & fextra > 0 { - xlen := data[header.length] - header.extra = data[header.length + 1..header.length + 1 + xlen] - header.length += xlen + 1 - } - if data[3] & fname > 0 { - // filename is zero-terminated, so skip until we hit a zero byte - for header.length < data.len && data[header.length] != 0x00 { - header.filename << data[header.length] - header.length++ - } - header.length++ - } - if data[3] & fcomment > 0 { - // comment is zero-terminated, so skip until we hit a zero byte - for header.length < data.len && data[header.length] != 0x00 { - header.comment << data[header.length] - header.length++ - } - header.length++ - } - if data[3] & fhcrc > 0 { - if header.length + 12 > data.len { - return error('data too short') - } - checksum_header := crc32.sum(data[..header.length]) - checksum_header_expected := (u32(data[header.length]) << 24) | (u32(data[header.length + 1]) << 16) | (u32(data[ - header.length + 2]) << 8) | data[header.length + 3] - if params.verify_header_checksum && checksum_header != checksum_header_expected { - return error('header checksum verification failed') - } - header.length += 4 - } - if header.length + 8 > data.len { - return error('data too short') - } - header.operating_system = data[9] - return header +// validate validates the gzip header of data and returns its parsed details if valid. +// N.B.: only retained for API compatibility, all validation is now performed by the deflate backend. +// The returned header details are not used by the decompressor. +// TODO: remove after the deprecation period. +@[deprecated: 'never used'] +@[deprecated_after: '2026-07-31'] +pub fn validate(data []u8, _ DecompressParams) !deflate.GzipHeader { + return deflate.validate_gzip_header(data)! } -// decompress an array of bytes using zlib and returns the decompressed bytes in a new array. 
-// Example: b := 'abcdef'.repeat(1000).bytes(); cmpr := gzip.compress(b)!; decmpr := gzip.decompress(cmpr)!; assert cmpr.len < b.len; assert b == decmpr -pub fn decompress(data []u8, params DecompressParams) ![]u8 { - gzip_header := validate(data, params)! - header_length := gzip_header.length - - decompressed := compr.decompress(data[header_length..data.len - 8], 0)! - length_expected := (u32(data[data.len - 1]) << 24) | (u32(data[data.len - 2]) << 16) | (u32(data[data.len - 3]) << 8) | data[data.len - 4] - if params.verify_length && decompressed.len != length_expected { - return error('length verification failed, got ${decompressed.len}, expected ${length_expected}') - } - checksum := crc32.sum(decompressed) - checksum_expected := (u32(data[data.len - 5]) << 24) | (u32(data[data.len - 6]) << 16) | (u32(data[data.len - 7]) << 8) | data[data.len - 8] - if params.verify_checksum && checksum != checksum_expected { - return error('checksum verification failed') - } - return decompressed +// decompress decompresses a gzip stream and returns the decompressed bytes in a new array. +pub fn decompress(data []u8) ![]u8 { + return deflate.decompress_gzip(data) } -// decompress_with_callback decompresses the given `data`, using zlib. It calls `cb` with each chunk of decompressed bytes. -// A chunk is usually 32 KB or less. Note: the chunk data received by `cb` should be cloned, if you need to store it for later, -// and not process it right away. -// The callback function should return the chunk length, if it wants to continue decompressing, or 0, if it wants to abort the decompression early. -// See also compress.ChunkCallback for more details. -pub fn decompress_with_callback(data []u8, cb compr.ChunkCallback, userdata voidptr, params DecompressParams) !int { - gzip_header := validate(data, params)! 
- header_len := gzip_header.length - expected_len := int((u32(data[data.len - 1]) << 24) | (u32(data[data.len - 2]) << 16) | (u32(data[data.len - 3]) << 8) | data[data.len - 4]) - body := data[header_len..data.len - 8] - chunks_len := int(compr.decompress_with_callback(body, cb, userdata, 0)!) - if params.verify_length && expected_len != chunks_len { - return error('Decompress error: expected length:${expected_len}, got:${chunks_len}') - } - return chunks_len +// decompress_with_callback decompresses a gzip stream (RFC 1952) using a callback for chunked delivery. +// The callback receives chunks of decompressed data and should return the chunk length to continue, or 0 to abort. +// Returns the total decompressed length. +pub fn decompress_with_callback(data []u8, cb deflate.ChunkCallback, userdata voidptr) !int { + deflate.validate_gzip_header(data)! + return deflate.decompress_with_callback(data, cb, userdata) } diff --git a/vlib/compress/gzip/gzip_test.v b/vlib/compress/gzip/gzip_test.v index b4aab3cf4..1ac0a471b 100644 --- a/vlib/compress/gzip/gzip_test.v +++ b/vlib/compress/gzip/gzip_test.v @@ -1,6 +1,14 @@ module gzip import hash.crc32 +import os + +const test_ftext = u8(0b0000_0001) +const test_fhcrc = u8(0b0000_0010) +const test_fextra = u8(0b0000_0100) +const test_fname = u8(0b0000_1000) +const test_fcomment = u8(0b0001_0000) +const samples_folder = os.join_path(os.dir(@FILE), 'samples') fn test_gzip() { uncompressed := 'Hello world!' @@ -18,24 +26,24 @@ fn assert_decompress_error(data []u8, reason string) ! { } fn test_gzip_invalid_too_short() { - assert_decompress_error([]u8{}, 'data is too short, not gzip compressed?')! + assert_decompress_error([]u8{}, 'invalid gzip stream: too short')! } fn test_gzip_invalid_magic_numbers() { - assert_decompress_error([]u8{len: 100}, 'wrong magic numbers, not gzip compressed?')! + assert_decompress_error([]u8{len: 100}, 'invalid gzip stream: bad magic')! 
} fn test_gzip_invalid_compression() { mut data := []u8{len: 100} data[0] = 0x1f data[1] = 0x8b - assert_decompress_error(data, 'gzip data is not compressed with DEFLATE')! + assert_decompress_error(data, 'invalid gzip stream: unsupported compression method')! } fn test_gzip_with_ftext() { uncompressed := 'Hello world!' mut compressed := compress(uncompressed.bytes())! - compressed[3] |= ftext + compressed[3] |= test_ftext decompressed := decompress(compressed)! assert decompressed == uncompressed.bytes() } @@ -43,7 +51,7 @@ fn test_gzip_with_ftext() { fn test_gzip_with_fname() { uncompressed := 'Hello world!' mut compressed := compress(uncompressed.bytes())! - compressed[3] |= fname + compressed[3] |= test_fname compressed.insert(10, `h`) compressed.insert(11, `i`) compressed.insert(12, 0x00) @@ -54,7 +62,7 @@ fn test_gzip_with_fname() { fn test_gzip_with_fcomment() { uncompressed := 'Hello world!' mut compressed := compress(uncompressed.bytes())! - compressed[3] |= fcomment + compressed[3] |= test_fcomment compressed.insert(10, `h`) compressed.insert(11, `i`) compressed.insert(12, 0x00) @@ -65,7 +73,7 @@ fn test_gzip_with_fcomment() { fn test_gzip_with_fname_fcomment() { uncompressed := 'Hello world!' mut compressed := compress(uncompressed.bytes())! - compressed[3] |= (fname | fcomment) + compressed[3] |= (test_fname | test_fcomment) compressed.insert(10, `h`) compressed.insert(11, `i`) compressed.insert(12, 0x00) @@ -79,10 +87,13 @@ fn test_gzip_with_fname_fcomment() { fn test_gzip_with_fextra() { uncompressed := 'Hello world!' mut compressed := compress(uncompressed.bytes())! - compressed[3] |= fextra - compressed.insert(10, 2) - compressed.insert(11, `h`) - compressed.insert(12, `i`) + compressed[3] |= test_fextra + // XLEN is 2-byte little-endian value + xlen := u16(2) + compressed.insert(10, u8(xlen)) + compressed.insert(11, u8(xlen >> 8)) + compressed.insert(12, `h`) + compressed.insert(13, `i`) decompressed := decompress(compressed)! 
assert decompressed == uncompressed.bytes() } @@ -90,12 +101,12 @@ fn test_gzip_with_fextra() { fn test_gzip_with_hcrc() { uncompressed := 'Hello world!' mut compressed := compress(uncompressed.bytes())! - compressed[3] |= fhcrc + compressed[3] |= test_fhcrc + // FHCRC is 2-byte CRC-16 (low 16 bits of CRC32) in little-endian format checksum := crc32.sum(compressed[..10]) - compressed.insert(10, u8(checksum >> 24)) - compressed.insert(11, u8(checksum >> 16)) - compressed.insert(12, u8(checksum >> 8)) - compressed.insert(13, u8(checksum)) + crc16 := u16(checksum & 0xffff) + compressed.insert(10, u8(crc16)) + compressed.insert(11, u8(crc16 >> 8)) decompressed := decompress(compressed)! assert decompressed == uncompressed.bytes() } @@ -103,34 +114,34 @@ fn test_gzip_with_hcrc() { fn test_gzip_with_invalid_hcrc() { uncompressed := 'Hello world!' mut compressed := compress(uncompressed.bytes())! - compressed[3] |= fhcrc + compressed[3] |= test_fhcrc + // FHCRC is 2-byte CRC-16 (low 16 bits of CRC32) in little-endian format checksum := crc32.sum(compressed[..10]) - compressed.insert(10, u8(checksum >> 24)) - compressed.insert(11, u8(checksum >> 16)) - compressed.insert(12, u8(checksum >> 8)) - compressed.insert(13, u8(checksum + 1)) - assert_decompress_error(compressed, 'header checksum verification failed')! + crc16 := u16(checksum & 0xffff) + compressed.insert(10, u8(crc16)) + compressed.insert(11, u8((crc16 >> 8) + 1)) // corrupt high byte + assert_decompress_error(compressed, 'invalid gzip stream: header crc16 mismatch')! } fn test_gzip_with_invalid_checksum() { uncompressed := 'Hello world!' mut compressed := compress(uncompressed.bytes())! compressed[compressed.len - 5] += 1 - assert_decompress_error(compressed, 'checksum verification failed')! + assert_decompress_error(compressed, 'invalid gzip stream: crc32 mismatch')! } fn test_gzip_with_invalid_length() { uncompressed := 'Hello world!' mut compressed := compress(uncompressed.bytes())! 
compressed[compressed.len - 1] += 1 - assert_decompress_error(compressed, 'length verification failed, got 12, expected 16777228')! + assert_decompress_error(compressed, 'invalid gzip stream: size mismatch')! } fn test_gzip_with_invalid_flags() { uncompressed := 'Hello world!' mut compressed := compress(uncompressed.bytes())! compressed[3] |= 0b1000_0000 - assert_decompress_error(compressed, 'reserved flags are set, unsupported field detected')! + assert_decompress_error(compressed, 'invalid gzip stream: reserved flags set')! } fn test_gzip_decompress_callback() { @@ -147,3 +158,51 @@ fn test_gzip_decompress_callback() { assert decoded == size assert decoded == uncompressed.len } + +fn test_gzip_decompress_callback_rejects_non_gzip() { + z := [u8(0x78), 0x9c, 0x03, 0x00, 0x00, 0x00, 0x01] + decompress_with_callback(z, fn (chunk []u8, _ voidptr) int { + return chunk.len + }, unsafe { nil }) or { + assert err.msg() == 'invalid gzip stream: too short' + return + } + assert false +} + +fn s(fname string) string { + return os.join_path(samples_folder, fname) +} + +fn read_and_decode_file(fpath string) !([]u8, string) { + compressed := os.read_bytes(fpath)! + decoded := decompress(compressed)! + content := decoded.bytestr() + return compressed, content +} + +fn test_reading_and_decoding_a_known_gziped_file() { + compressed, content := read_and_decode_file(s('known.gz'))! + assert compressed#[0..3] == [u8(31), 139, 8] + assert compressed#[-5..] == [u8(127), 115, 1, 0, 0] + assert content.contains('## Description') + assert content.contains('## Examples:') + assert content.ends_with('```\n') +} + +fn test_decoding_all_samples_files() { + for gz_file in os.walk_ext(samples_folder, '.gz') { + _, content := read_and_decode_file(gz_file)! + assert content.len > 0, 'decoded content should not be empty: `${content}`' + } +} + +fn test_reading_gzip_files_compressed_with_different_options() { + _, content1 := read_and_decode_file(s('readme_level_1.gz'))! 
+ _, content5 := read_and_decode_file(s('readme_level_5.gz'))! + _, content9 := read_and_decode_file(s('readme_level_9.gz'))! + _, content9_rsyncable := read_and_decode_file(s('readme_level_9_rsyncable.gz'))! + assert content9_rsyncable == content9 + assert content9 == content5 + assert content5 == content1 +} diff --git a/vlib/compress/gzip/read_gz_files_test.v b/vlib/compress/gzip/read_gz_files_test.v deleted file mode 100644 index 09840096d..000000000 --- a/vlib/compress/gzip/read_gz_files_test.v +++ /dev/null @@ -1,62 +0,0 @@ -import os -import compress.gzip - -const samples_folder = os.join_path(os.dir(@FILE), 'samples') - -fn s(fname string) string { - return os.join_path(samples_folder, fname) -} - -fn read_and_decode_file(fpath string) !([]u8, string) { - compressed := os.read_bytes(fpath)! - decoded := gzip.decompress(compressed)! - content := decoded.bytestr() - return compressed, content -} - -fn test_reading_and_decoding_a_known_gziped_file() { - compressed, content := read_and_decode_file(s('known.gz'))! - assert compressed#[0..3] == [u8(31), 139, 8] - assert compressed#[-5..] == [u8(127), 115, 1, 0, 0] - assert content.contains('## Description') - assert content.contains('## Examples:') - assert content.ends_with('```\n') -} - -fn test_decoding_all_samples_files() { - for gz_file in os.walk_ext(samples_folder, '.gz') { - _, content := read_and_decode_file(gz_file)! - assert content.len > 0, 'decoded content should not be empty: `${content}`' - } -} - -fn test_reading_gzip_files_compressed_with_different_options() { - _, content1 := read_and_decode_file(s('readme_level_1.gz'))! - _, content5 := read_and_decode_file(s('readme_level_5.gz'))! - _, content9 := read_and_decode_file(s('readme_level_9.gz'))! - _, content9_rsyncable := read_and_decode_file(s('readme_level_9_rsyncable.gz'))! 
- assert content9_rsyncable == content9 - assert content9 == content5 - assert content5 == content1 -} - -fn test_compress_with_deferent_level() { - compressed := os.read_bytes(s('readme_level_9.gz'))! - content9 := gzip.decompress(compressed)! - - // compression: Huffman only=0 - compress_0 := gzip.compress(content9, compression_level: 0)! - decompress_0 := gzip.decompress(compress_0)! - - // compression: default_max_probes=128 - compress_128 := gzip.compress(content9)! - decompress_128 := gzip.decompress(compress_128)! - - // compression: Huffman+LZ=4095(slowest/best compression) - compress_4095 := gzip.compress(content9, compression_level: 4095)! - decompress_4095 := gzip.decompress(compress_4095)! - - assert content9 == decompress_0 - assert content9 == decompress_128 - assert content9 == decompress_4095 -} diff --git a/vlib/compress/zlib/zlib.v b/vlib/compress/zlib/zlib.v index 68dd901bb..17e819378 100644 --- a/vlib/compress/zlib/zlib.v +++ b/vlib/compress/zlib/zlib.v @@ -11,3 +11,11 @@ pub fn compress(data []u8) ![]u8 { pub fn decompress(data []u8) ![]u8 { return deflate.decompress_zlib(data) } + +// decompress_with_callback decompresses a zlib stream (RFC 1950) using a callback for chunked delivery. +// The callback receives chunks of decompressed data and should return the chunk length to continue, or 0 to abort. +// Returns the total decompressed length. +pub fn decompress_with_callback(data []u8, cb deflate.ChunkCallback, userdata voidptr) !int { + deflate.validate_zlib_header(data)! + return deflate.decompress_with_callback(data, cb, userdata) +} diff --git a/vlib/compress/zlib/zlib_test.v b/vlib/compress/zlib/zlib_test.v index 3c33f6c99..5c91dde9d 100644 --- a/vlib/compress/zlib/zlib_test.v +++ b/vlib/compress/zlib/zlib_test.v @@ -74,3 +74,18 @@ fn test_zlib_invalid_inserted_bytes_before_adler() { bad << enc[enc.len - 4..] assert_decompress_error(bad, 'invalid zlib stream: trailing data before adler32')! 
} + +fn test_zlib_decompress_callback() { + uncompressed := '321323'.repeat(10_000) + gz := compress(uncompressed.bytes())! + mut size := 0 + mut ref := &size + decoded := decompress_with_callback(gz, fn (chunk []u8, ref &int) int { + unsafe { + *ref += chunk.len + } + return chunk.len + }, ref)! + assert decoded == size + assert decoded == uncompressed.len +} diff --git a/vlib/v/fmt/tests/do_not_change_type_names_that_just_happen_to_have_the_module_as_a_substring_keep.vv b/vlib/v/fmt/tests/do_not_change_type_names_that_just_happen_to_have_the_module_as_a_substring_keep.vv index 0694bee98..f6b379d06 100644 --- a/vlib/v/fmt/tests/do_not_change_type_names_that_just_happen_to_have_the_module_as_a_substring_keep.vv +++ b/vlib/v/fmt/tests/do_not_change_type_names_that_just_happen_to_have_the_module_as_a_substring_keep.vv @@ -1,151 +1,21 @@ -// [rfc1952](https://datatracker.ietf.org/doc/html/rfc1952) compliant -// gzip compression/decompression - module gzip -import compress as z +import compress.gzip import hash.crc32 -// compresses an array of bytes using gzip and returns the compressed bytes in a new array -// Example: compressed := gzip.compress(b)? -pub fn compress(data []u8) ![]u8 { - compressed := compress.compress(data, 0)! 
- // header - mut result := [ - u8(0x1f), // magic numbers (1F 8B) - 0x8b, - 0x08, // deflate - 0x00, // header flags - 0x00, // 4-byte timestamp, 0 = no timestamp (00 00 00 00) - 0x00, - 0x00, - 0x00, - 0x00, // extra flags - 0xff, // operating system id (0xff = unknown) - ] // 10 bytes - result << compressed - // trailer - checksum := crc32.sum(data) - length := data.len - result << [ - u8(checksum >> 24), - u8(checksum >> 16), - u8(checksum >> 8), - u8(checksum), - u8(length >> 24), - u8(length >> 16), - u8(length >> 8), - u8(length), - ] // 8 bytes - return result -} - -@[params] -pub struct DecompressParams { - verify_header_checksum bool = true - verify_length bool = true - verify_checksum bool = true -} - -pub const reserved_bits = 0b1110_0000 -pub const ftext = 0b0000_0001 -pub const fextra = 0b0000_0100 -pub const fname = 0b0000_1000 -pub const fcomment = 0b0001_0000 -pub const fhcrc = 0b0000_0010 - -const min_header_length = 18 - -@[noinit] pub struct GzipHeader { pub mut: - length int = 10 - extra []u8 - filename []u8 - comment []u8 - modification_time u32 - operating_system u8 + length int = 10 + operating_system u8 } -// validate validates the header and returns its details if valid -pub fn validate(data []u8, params DecompressParams) !GzipHeader { - if data.len < min_header_length { - return error('data is too short, not gzip compressed?') - } else if data[0] != 0x1f || data[1] != 0x8b { - return error('wrong magic numbers, not gzip compressed?') - } else if data[2] != 0x08 { - return error('gzip data is not compressed with DEFLATE') - } - mut header := GzipHeader{} - - // parse flags, we ignore most of them, but we still need to parse them - // correctly, so we dont accidently decompress something that belongs - // to the header - - if data[3] & reserved_bits > 0 { - // rfc 1952 2.3.1.2 Compliance - // A compliant decompressor must give an error indication if any - // reserved bit is non-zero, since such a bit could indicate the - // presence of a 
new field that would cause subsequent data to be - // interpreted incorrectly. - return error('reserved flags are set, unsupported field detected') - } - - if data[3] & fextra > 0 { - xlen := data[header.length] - header.extra = data[header.length + 1..header.length + 1 + xlen] - header.length += xlen + 1 - } - if data[3] & fname > 0 { - // filename is zero-terminated, so skip until we hit a zero byte - for header.length < data.len && data[header.length] != 0x00 { - header.filename << data[header.length] - header.length++ - } - header.length++ - } - if data[3] & fcomment > 0 { - // comment is zero-terminated, so skip until we hit a zero byte - for header.length < data.len && data[header.length] != 0x00 { - header.comment << data[header.length] - header.length++ - } - header.length++ - } - if data[3] & fhcrc > 0 { - if header.length + 12 > data.len { - return error('data too short') - } - checksum_header := crc32.sum(data[..header.length]) - checksum_header_expected := (u32(data[header.length]) << 24) | (u32(data[header.length + 1]) << 16) | (u32(data[ - header.length + 2]) << 8) | data[header.length + 3] - if params.verify_header_checksum && checksum_header != checksum_header_expected { - return error('header checksum verification failed') - } - header.length += 4 - } - if header.length + 8 > data.len { - return error('data too short') - } - header.operating_system = data[9] - return header +pub fn compress(data []u8) ![]u8 { + compressed := gzip.compress(data)! + return compressed } -// decompresses an array of bytes using zlib and returns the decompressed bytes in a new array -// Example: decompressed := gzip.decompress(b)? -pub fn decompress(data []u8, params DecompressParams) ![]u8 { - gzip_header := validate(data, params)! - header_length := gzip_header.length - - decompressed := compress.decompress(data[header_length..data.len - 8], 0)! 
- length_expected := (u32(data[data.len - 4]) << 24) | (u32(data[data.len - 3]) << 16) | (u32(data[data.len - 2]) << 8) | data[data.len - 1] - if params.verify_length && decompressed.len != length_expected { - return error('length verification failed, got ${decompressed.len}, expected ${length_expected}') - } - checksum := crc32.sum(decompressed) - checksum_expected := (u32(data[data.len - 8]) << 24) | (u32(data[data.len - 7]) << 16) | (u32(data[data.len - 6]) << 8) | data[data.len - 5] - if params.verify_checksum && checksum != checksum_expected { - return error('checksum verification failed') - } +pub fn decompress(data []u8) ![]u8 { + header := GzipHeader{} + decompressed := gzip.decompress(data)! return decompressed } -- 2.39.5