From 714bc792a1024c3e7eccc72f7312c6dc9821d04f Mon Sep 17 00:00:00 2001 From: JalonSolov Date: Mon, 11 May 2026 21:29:46 -0400 Subject: [PATCH] compress: add pure V lz, supporting multiple variations (#27137) --- vlib/compress/lz/README.md | 34 ++++ vlib/compress/lz/common.v | 201 ++++++++++++++++++++ vlib/compress/lz/interop/README.md | 44 +++++ vlib/compress/lz/interop/lz77_ref.c | 256 ++++++++++++++++++++++++++ vlib/compress/lz/interop/lz77_ref.py | 141 ++++++++++++++ vlib/compress/lz/interop/lz_interop.v | 172 +++++++++++++++++ vlib/compress/lz/lz.v | 53 ++++++ vlib/compress/lz/lz4.v | 18 ++ vlib/compress/lz/lz77.v | 18 ++ vlib/compress/lz/lz78.v | 85 +++++++++ vlib/compress/lz/lz_test.v | 90 +++++++++ vlib/compress/lz/lzjb.v | 18 ++ vlib/compress/lz/lzma.v | 18 ++ vlib/compress/lz/lzss.v | 18 ++ vlib/compress/lz/lzw.v | 92 +++++++++ 15 files changed, 1258 insertions(+) create mode 100644 vlib/compress/lz/README.md create mode 100644 vlib/compress/lz/common.v create mode 100644 vlib/compress/lz/interop/README.md create mode 100644 vlib/compress/lz/interop/lz77_ref.c create mode 100644 vlib/compress/lz/interop/lz77_ref.py create mode 100644 vlib/compress/lz/interop/lz_interop.v create mode 100644 vlib/compress/lz/lz.v create mode 100644 vlib/compress/lz/lz4.v create mode 100644 vlib/compress/lz/lz77.v create mode 100644 vlib/compress/lz/lz78.v create mode 100644 vlib/compress/lz/lz_test.v create mode 100644 vlib/compress/lz/lzjb.v create mode 100644 vlib/compress/lz/lzma.v create mode 100644 vlib/compress/lz/lzss.v create mode 100644 vlib/compress/lz/lzw.v diff --git a/vlib/compress/lz/README.md b/vlib/compress/lz/README.md new file mode 100644 index 000000000..c4b1d0667 --- /dev/null +++ b/vlib/compress/lz/README.md @@ -0,0 +1,34 @@ +## Description + +`compress.lz` provides pure V implementations of several LZ-family codecs. 
+
+Supported formats:
+
+- `lz77`
+- `lz78`
+- `lzw`
+- `lz4`
+- `lzss`
+- `lzma`
+- `lzjb`
+
+Use the generic API when selecting a format dynamically:
+
+```v
+import compress.lz
+
+encoded := lz.compress('hello hello hello'.bytes(), .lz77)!
+decoded := lz.decompress(encoded, .lz77)!
+assert decoded.bytestr() == 'hello hello hello'
+```
+
+Use the format-specific APIs for direct calls:
+
+```v
+import compress.lz
+
+encoded := lz.compress_lzw('banana banana'.bytes())!
+decoded := lz.decompress_lzw(encoded)!
+assert decoded.bytestr() == 'banana banana'
+```
+
diff --git a/vlib/compress/lz/common.v b/vlib/compress/lz/common.v
new file mode 100644
index 000000000..fb6a02e5b
--- /dev/null
+++ b/vlib/compress/lz/common.v
@@ -0,0 +1,245 @@
+module lz
+
+const stream_magic = [u8(0x56), 0x4c, 0x5a, 0x31] // VLZ1
+
+// MatchProfile holds the tuning parameters of one sliding-window codec variant.
+// The stream encoding relies on: min_match >= 3 (match_hash reads 3 bytes),
+// max_match - min_match <= 127 and max_literal <= 128 (7-bit length fields).
+struct MatchProfile {
+	window      int
+	min_match   int
+	max_match   int
+	max_literal int
+}
+
+const match_hash_bits = 16
+const match_hash_size = 1 << match_hash_bits
+const max_match_candidates = 64
+
+// max_initial_capacity bounds the buffer preallocated from the untrusted decoded
+// length found in a stream header; the output array still grows on demand.
+const max_initial_capacity = 1 << 20
+
+// wrap_payload builds a complete stream: magic, format byte, uvarint(source.len), payload.
+fn wrap_payload(format Format, source []u8, payload []u8) []u8 {
+	mut out := []u8{cap: stream_magic.len + 8 + payload.len}
+	out << stream_magic
+	out << u8(format)
+	encode_uvarint(mut out, u64(source.len))
+	out << payload
+	return out
+}
+
+// unwrap_payload validates the stream header written by wrap_payload and returns
+// the payload that follows it together with the announced decoded length.
+fn unwrap_payload(data []u8, format Format) !([]u8, i64) {
+	if data.len < stream_magic.len + 2 {
+		return error('invalid lz stream: too short')
+	}
+	if data[..stream_magic.len] != stream_magic {
+		return error('invalid lz stream: bad magic')
+	}
+	wire_format := data[stream_magic.len]
+	if wire_format != u8(format) {
+		return error('invalid lz stream: format mismatch')
+	}
+	decoded_len_u64, mut pos, ok := decode_uvarint(data, stream_magic.len + 1)
+	if !ok {
+		return error('invalid lz stream: bad length')
+	}
+	if decoded_len_u64 > u64(max_int) {
+		return error('invalid lz stream: decoded length too large')
+	}
+	decoded_len := i64(decoded_len_u64)
+	if pos > data.len {
+		return error('invalid lz stream: truncated payload')
+	}
+	return data[pos..], decoded_len
+}
+
+// compress_with_profile greedily encodes data as literal runs and back-references,
+// using a hash-chain match finder limited by the given profile.
+fn compress_with_profile(data []u8, profile MatchProfile, format Format) []u8 {
+	if data.len == 0 {
+		return wrap_payload(format, data, []u8{})
+	}
+	mut payload := []u8{cap: data.len}
+	mut literals := []u8{cap: profile.max_literal}
+	mut last_match := []int{len: match_hash_size, init: -1}
+	mut prev_match := []int{len: data.len, init: -1}
+	mut pos := 0
+	for pos < data.len {
+		offset, length := find_best_match(data, pos, profile, last_match, prev_match)
+		if length >= profile.min_match {
+			flush_literals(mut payload, mut literals)
+			emit_match(mut payload, offset, length, profile.min_match)
+			for i := pos; i < pos + length; i++ {
+				index_match_position(data, i, mut last_match, mut prev_match)
+			}
+			pos += length
+		} else {
+			literals << data[pos]
+			if literals.len == profile.max_literal {
+				flush_literals(mut payload, mut literals)
+			}
+			index_match_position(data, pos, mut last_match, mut prev_match)
+			pos++
+		}
+	}
+	flush_literals(mut payload, mut literals)
+	return wrap_payload(format, data, payload)
+}
+
+// decompress_with_profile decodes a stream produced by compress_with_profile.
+// A control byte with the high bit clear introduces `(control & 0x7f) + 1` literal
+// bytes; with the high bit set it encodes a match of `(control & 0x7f) + min_match`
+// bytes, followed by a uvarint back-offset into the already decoded output.
+fn decompress_with_profile(data []u8, profile MatchProfile, format Format) ![]u8 {
+	payload, expected_len := unwrap_payload(data, format)!
+	// expected_len is untrusted: do not reserve up to 2 GiB for a tiny corrupt input.
+	initial_cap := if expected_len > max_initial_capacity {
+		max_initial_capacity
+	} else {
+		int(expected_len)
+	}
+	mut out := []u8{cap: initial_cap}
+	mut pos := 0
+	for pos < payload.len {
+		control := payload[pos]
+		pos++
+		if control & 0x80 == 0 {
+			literal_len := int(control & 0x7f) + 1
+			if pos + literal_len > payload.len {
+				return error('invalid lz stream: truncated literal')
+			}
+			// Fail fast instead of decoding more than the header announced.
+			if i64(out.len) + i64(literal_len) > expected_len {
+				return error('invalid lz stream: length mismatch')
+			}
+			out << payload[pos..pos + literal_len]
+			pos += literal_len
+			continue
+		}
+		match_len := int(control & 0x7f) + profile.min_match
+		offset, next_pos, ok := decode_uvarint(payload, pos)
+		if !ok {
+			return error('invalid lz stream: truncated match offset')
+		}
+		pos = next_pos
+		if offset == 0 || offset > u64(max_i64) || i64(offset) > i64(out.len) {
+			return error('invalid lz stream: bad match offset')
+		}
+		if i64(out.len) + i64(match_len) > expected_len {
+			return error('invalid lz stream: length mismatch')
+		}
+		offset_int := int(offset)
+		base := out.len - offset_int
+		// Copy byte by byte: source and destination overlap when offset < match_len.
+		for i in 0 .. match_len {
+			out << out[base + i]
+		}
+	}
+	if i64(out.len) != expected_len {
+		return error('invalid lz stream: length mismatch')
+	}
+	return out
+}
+
+// find_best_match returns the (offset, length) of the longest earlier occurrence of
+// the bytes at pos, or a length below profile.min_match when there is none.
+fn find_best_match(data []u8, pos int, profile MatchProfile, last_match []int, prev_match []int) (int, int) {
+	// match_hash needs 3 readable bytes, whatever min_match says.
+	if pos + profile.min_match > data.len || pos + 3 > data.len {
+		return 0, 0
+	}
+	max_len := if pos + profile.max_match < data.len { profile.max_match } else { data.len - pos }
+	mut best_len := 0
+	mut best_offset := 0
+	hash_idx := match_hash(data, pos)
+	mut candidates_checked := 0
+	mut i := last_match[hash_idx]
+	for i >= 0 && candidates_checked < max_match_candidates {
+		offset := pos - i
+		if offset > profile.window {
+			break
+		}
+		mut current_len := 0
+		for current_len < max_len && data[i + current_len] == data[pos + current_len] {
+			current_len++
+		}
+		if current_len > best_len {
+			best_len = current_len
+			best_offset = offset
+			if best_len == max_len {
+				break
+			}
+		}
+		i = prev_match[i]
+		candidates_checked++
+	}
+	return best_offset, best_len
+}
+
+// index_match_position records pos at the head of the hash chain of its 3-byte prefix.
+fn index_match_position(data []u8, pos int, mut last_match []int, mut prev_match []int) {
+	if pos + 2 >= data.len {
+		return
+	}
+	hash_idx := match_hash(data, pos)
+	prev_match[pos] = last_match[hash_idx]
+	last_match[hash_idx] = pos
+}
+
+// match_hash hashes the 3 bytes starting at pos into a match_hash_bits wide table index.
+fn match_hash(data []u8, pos int) int {
+	v := (u32(data[pos]) << 16) | (u32(data[pos + 1]) << 8) | u32(data[pos + 2])
+	return int((v * u32(2654435761)) >> (32 - match_hash_bits))
+}
+
+// flush_literals emits the pending literal run (length byte + bytes) and clears it.
+fn flush_literals(mut payload []u8, mut literals []u8) {
+	if literals.len == 0 {
+		return
+	}
+	payload << u8(literals.len - 1)
+	payload << literals
+	literals.clear()
+}
+
+// emit_match emits a match token: control byte with the high bit set, then uvarint(offset).
+fn emit_match(mut payload []u8, offset int, length int, min_match int) {
+	payload << u8(0x80 | u8(length - min_match))
+	encode_uvarint(mut payload, u64(offset))
+}
+
+// encode_uvarint appends value as a little-endian base-128 varint.
+fn encode_uvarint(mut out []u8, value u64) {
+	mut v := value
+	for v >= 0x80 {
+		out << u8(v & 0x7f | 0x80)
+		v >>= 7
+	}
+	out << u8(v)
+}
+
+// decode_uvarint reads a varint at data[start..] and returns (value, next position, ok).
+// ok is false for truncated input and for encodings that do not fit in 64 bits.
+fn decode_uvarint(data []u8, start int) (u64, int, bool) {
+	mut value := u64(0)
+	mut shift := u32(0)
+	mut pos := start
+	for pos < data.len && shift <= 63 {
+		b := data[pos]
+		pos++
+		// The 10th byte (shift 63) may only contribute bit 63; more overflows u64.
+		if shift == 63 && b > 1 {
+			return 0, start, false
+		}
+		value |= u64(b & 0x7f) << shift
+		if b & 0x80 == 0 {
+			return value, pos, true
+		}
+		shift += 7
+	}
+	return 0, start, false
+}
diff --git a/vlib/compress/lz/interop/README.md b/vlib/compress/lz/interop/README.md
new file mode 100644
index 000000000..08a68fc85
--- /dev/null
+++ b/vlib/compress/lz/interop/README.md
@@ -0,0 +1,44 @@
+## LZ Interop Validation (V, C, Python)
+
+This tool validates:
+
+- V compress/decompress roundtrips for all formats: `lz77`, `lz78`, `lzw`,
+  `lz4`, `lzss`, `lzma`, `lzjb`
+- a C `lz77`-like reference implementation
+- a Python `lz77`-like reference implementation
+
+The C/Python references are intentionally simple so the validation is easy to run
+without external dependencies.
+
+### Run
+
+From the root of the V repository:
+
+```bash
+./v run vlib/compress/lz/interop/lz_interop.v 2>&1
+```
+
+Optional argument:
+
+1. input size in bytes (default: `524288`)
+
+Example:
+
+```bash
+./v run vlib/compress/lz/interop/lz_interop.v 262144 2>&1
+```
+
+### Output
+
+The tool prints validation status lines only (no timing output).
+
+When helpers are available, it also cross-validates compression/expansion
+interoperability with V (`lz77`) in both directions:
+
+- V compress -> C decompress
+- C compress -> V decompress
+- V compress -> Python decompress
+- Python compress -> V decompress
+
+If no C compiler (`cc`, `gcc` or `clang`) or `python3` is available, the
+corresponding rows are skipped with a message.
diff --git a/vlib/compress/lz/interop/lz77_ref.c b/vlib/compress/lz/interop/lz77_ref.c
new file mode 100644
index 000000000..6e72555b4
--- /dev/null
+++ b/vlib/compress/lz/interop/lz77_ref.c
@@ -0,0 +1,267 @@
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#define MIN_MATCH 3
+#define MAX_LITERAL 128
+
+static const uint8_t STREAM_MAGIC[4] = {0x56, 0x4c, 0x5a, 0x31};
+static const uint8_t FORMAT_LZ77 = 0;
+
+typedef struct {
+	uint8_t *data;
+	size_t len;
+	size_t cap;
+} Buffer;
+
+static void die(const char *msg) {
+	fprintf(stderr, "%s\n", msg);
+	exit(1);
+}
+
+static void buf_init(Buffer *b, size_t cap) {
+	b->data = (uint8_t *)malloc(cap > 0 ? cap : 1);
+	if (!b->data) {
+		die("allocation failed");
+	}
+	b->len = 0;
+	b->cap = cap > 0 ? cap : 1;
+}
+
+static void buf_push(Buffer *b, uint8_t v) {
+	if (b->len >= b->cap) {
+		size_t new_cap = b->cap * 2;
+		uint8_t *n = (uint8_t *)realloc(b->data, new_cap);
+		if (!n) {
+			die("reallocation failed");
+		}
+		b->data = n;
+		b->cap = new_cap;
+	}
+	b->data[b->len++] = v;
+}
+
+static void buf_append(Buffer *b, const uint8_t *src, size_t len) {
+	for (size_t i = 0; i < len; i++) {
+		buf_push(b, src[i]);
+	}
+}
+
+static Buffer read_all(const char *path) {
+	FILE *f = fopen(path, "rb");
+	if (!f) {
+		die("could not open input file");
+	}
+	if (fseek(f, 0, SEEK_END) != 0) {
+		fclose(f);
+		die("could not seek input file");
+	}
+	long sz = ftell(f);
+	if (sz < 0) {
+		fclose(f);
+		die("could not read input file size");
+	}
+	if (fseek(f, 0, SEEK_SET) != 0) {
+		fclose(f);
+		die("could not rewind input file");
+	}
+	Buffer in;
+	buf_init(&in, (size_t)sz + 1);
+	in.len = (size_t)sz;
+	if (in.len > 0 && fread(in.data, 1, in.len, f) != in.len) {
+		fclose(f);
+		free(in.data);
+		die("could not read input file");
+	}
+	fclose(f);
+	return in;
+}
+
+static void write_all(const char *path, const uint8_t *data, size_t len) {
+	FILE *f = fopen(path, "wb");
+	if (!f) {
+		die("could not open output file");
+	}
+	if (len > 0 && fwrite(data, 1, len, f) != len) {
+		fclose(f);
+		die("could not write output file");
+	}
+	/* Buffered data may only hit the disk on close, so check fclose() too. */
+	if (fclose(f) != 0) {
+		die("could not write output file");
+	}
+}
+
+static void write_uvarint(Buffer *out, uint64_t value) {
+	uint64_t v = value;
+	while (v >= 0x80) {
+		buf_push(out, (uint8_t)(v & 0x7f) | 0x80);
+		v >>= 7;
+	}
+	buf_push(out, (uint8_t)v);
+}
+
+static int read_uvarint(const uint8_t *data, size_t len, size_t *pos, uint64_t *value) {
+	uint64_t out = 0;
+	uint32_t shift = 0;
+	while (*pos < len && shift <= 63) {
+		uint8_t b = data[*pos];
+		(*pos)++;
+		/* The 10th byte (shift 63) may only contribute bit 63 of a 64-bit value. */
+		if (shift == 63 && b > 1) {
+			return 0;
+		}
+		out |= ((uint64_t)(b & 0x7f)) << shift;
+		if ((b & 0x80) == 0) {
+			*value = out;
+			return 1;
+		}
+		shift += 7;
+	}
+	return 0;
+}
+
+static Buffer compress_lz77(const uint8_t *in, size_t in_len) {
+	Buffer out;
+	buf_init(&out, in_len + 32);
+	buf_append(&out, STREAM_MAGIC, 4);
+	buf_push(&out, FORMAT_LZ77);
+	write_uvarint(&out, (uint64_t)in_len);
+
+	for (size_t i = 0; i < in_len;) {
+		size_t lit_len = in_len - i;
+		if (lit_len > MAX_LITERAL) {
+			lit_len = MAX_LITERAL;
+		}
+		buf_push(&out, (uint8_t)(lit_len - 1));
+		buf_append(&out, in + i, lit_len);
+		i += lit_len;
+	}
+	return out;
+}
+
+static Buffer decompress_lz77(const uint8_t *in, size_t in_len) {
+	if (in_len < 6 || memcmp(in, STREAM_MAGIC, 4) != 0) {
+		die("bad magic");
+	}
+	if (in[4] != FORMAT_LZ77) {
+		die("format mismatch");
+	}
+	size_t pos = 5;
+	uint64_t expected_len_u64 = 0;
+	if (!read_uvarint(in, in_len, &pos, &expected_len_u64)) {
+		die("bad length varint");
+	}
+	/* Same bound as the V decoder (max_int); also keeps `expected_len + 16` from wrapping. */
+	if (expected_len_u64 > (uint64_t)INT32_MAX) {
+		die("decoded length too large");
+	}
+	size_t expected_len = (size_t)expected_len_u64;
+
+	Buffer out;
+	buf_init(&out, expected_len + 16);
+	while (pos < in_len) {
+		uint8_t control = in[pos++];
+		if ((control & 0x80) == 0) {
+			size_t lit_len = (size_t)(control & 0x7f) + 1;
+			if (pos + lit_len > in_len) {
+				die("truncated literal");
+			}
+			buf_append(&out, in + pos, lit_len);
+			pos += lit_len;
+		} else {
+			size_t match_len = (size_t)(control & 0x7f) + MIN_MATCH;
+			uint64_t offset_u64 = 0;
+			if (!read_uvarint(in, in_len, &pos, &offset_u64)) {
+				die("bad match offset");
+			}
+			size_t offset = (size_t)offset_u64;
+			if (offset == 0 || offset > out.len) {
+				die("bad offset");
+			}
+			size_t base = out.len - offset;
+			for (size_t k = 0; k < match_len; k++) {
+				buf_push(&out, out.data[base + k]);
+			}
+		}
+	}
+	if (out.len != expected_len) {
+		die("length mismatch");
+	}
+	return out;
+}
+
+static int64_t now_ms(void) {
+	return (int64_t)((double)clock() * 1000.0 / (double)CLOCKS_PER_SEC);
+}
+
+int main(int argc, char **argv) {
+	if (argc < 2) {
+		fprintf(stderr,
+			"usage:\n"
+			"  %s bench <input_file> <iterations>\n"
+			"  %s compress <input_file> <output_file>\n"
+			"  %s decompress <input_file> <output_file>\n",
+			argv[0], argv[0], argv[0]);
+		return 1;
+	}
+	if (strcmp(argv[1], "bench") == 0) {
+		if (argc < 4) {
+			fprintf(stderr, "usage: %s bench <input_file> <iterations>\n", argv[0]);
+			return 1;
+		}
+		int iterations = atoi(argv[3]);
+		if (iterations <= 0) {
+			fprintf(stderr, "iterations must be > 0\n");
+			return 1;
+		}
+		Buffer input = read_all(argv[2]);
+		int64_t start = now_ms();
+		for (int i = 0; i < iterations; i++) {
+			Buffer comp = compress_lz77(input.data, input.len);
+			Buffer decomp = decompress_lz77(comp.data, comp.len);
+			if (decomp.len != input.len || memcmp(decomp.data, input.data, input.len) != 0) {
+				fprintf(stderr, "roundtrip mismatch\n");
+				return 1;
+			}
+			free(comp.data);
+			free(decomp.data);
+		}
+		int64_t elapsed = now_ms() - start;
+		printf("ms=%lld\n", (long long)elapsed);
+		free(input.data);
+		return 0;
+	}
+
+	if (strcmp(argv[1], "compress") == 0) {
+		if (argc < 4) {
+			fprintf(stderr, "usage: %s compress <input_file> <output_file>\n", argv[0]);
+			return 1;
+		}
+		Buffer input = read_all(argv[2]);
+		Buffer comp = compress_lz77(input.data, input.len);
+		write_all(argv[3], comp.data, comp.len);
+		free(input.data);
+		free(comp.data);
+		return 0;
+	}
+
+	if (strcmp(argv[1], "decompress") == 0) {
+		if (argc < 4) {
+			fprintf(stderr, "usage: %s decompress <input_file> <output_file>\n", argv[0]);
+			return 1;
+		}
+		Buffer input = read_all(argv[2]);
+		Buffer dec = decompress_lz77(input.data, input.len);
+		write_all(argv[3], dec.data, dec.len);
+		free(input.data);
+		free(dec.data);
+		return 0;
+	}
+
+	fprintf(stderr, "unknown mode: %s\n", argv[1]);
+	return 1;
+}
+
diff --git a/vlib/compress/lz/interop/lz77_ref.py b/vlib/compress/lz/interop/lz77_ref.py
new file mode 100644
index 000000000..29eed3abd
--- /dev/null
+++ b/vlib/compress/lz/interop/lz77_ref.py
@@ -0,0 +1,144 @@
+#!/usr/bin/env python3
+import sys
+import time
+
+MIN_MATCH = 3
+MAX_LITERAL = 128
+STREAM_MAGIC = b'VLZ1'
+FORMAT_LZ77 = 0
+
+
+def encode_uvarint(value: int) -> bytes:
+    out = bytearray()
+    v = value
+    while v >= 0x80:
+        out.append((v & 0x7F) | 0x80)
+        v >>= 7
+    out.append(v)
+    return bytes(out)
+
+
+def decode_uvarint(data: bytes, pos: int) -> tuple[int, int]:
+    value = 0
+    shift = 0
+    i = pos
+    while i < len(data) and shift <= 63:
+        b = data[i]
+        i += 1
+        # Python ints are unbounded: reject a 10th byte that would exceed 64 bits.
+        if shift == 63 and b > 1:
+            raise ValueError('bad length varint')
+        value |= (b & 0x7F) << shift
+        if (b & 0x80) == 0:
+            return value, i
+        shift += 7
+    raise ValueError('bad length varint')
+
+
+def compress_lz77(data: bytes) -> bytes:
+    out = bytearray()
+    out.extend(STREAM_MAGIC)
+    out.append(FORMAT_LZ77)
+    out.extend(encode_uvarint(len(data)))
+    i = 0
+    while i < len(data):
+        lit_len = min(MAX_LITERAL, len(data) - i)
+        out.append(lit_len - 1)
+        out.extend(data[i : i + lit_len])
+        i += lit_len
+    return bytes(out)
+
+
+def decompress_lz77(data: bytes) -> bytes:
+    if len(data) < 6 or data[:4] != STREAM_MAGIC:
+        raise ValueError('bad magic')
+    if data[4] != FORMAT_LZ77:
+        raise ValueError('format mismatch')
+
+    expected_len, pos = decode_uvarint(data, 5)
+    out = bytearray()
+    while pos < len(data):
+        control = data[pos]
+        pos += 1
+        if (control & 0x80) == 0:
+            literal_len = (control & 0x7F) + 1
+            if pos + literal_len > len(data):
+                raise ValueError('truncated literal')
+            out.extend(data[pos : pos + literal_len])
+            pos += literal_len
+        else:
+            length = (control & 0x7F) + MIN_MATCH
+            off, pos = decode_uvarint(data, pos)
+            if off == 0 or off > len(out):
+                raise ValueError('bad offset')
+            base = len(out) - off
+            for k in range(length):
+                out.append(out[base + k])
+    if len(out) != expected_len:
+        raise ValueError('length mismatch')
+    return bytes(out)
+
+
+def main() -> int:
+    if len(sys.argv) < 2:
+        print(
+            f'usage:\n'
+            f'  {sys.argv[0]} bench <input_file> <iterations>\n'
+            f'  {sys.argv[0]} compress <input_file> <output_file>\n'
+            f'  {sys.argv[0]} decompress <input_file> <output_file>',
+            file=sys.stderr,
+        )
+        return 1
+
+    mode = sys.argv[1]
+    if mode == 'bench':
+        if len(sys.argv) < 4:
+            print(f'usage: {sys.argv[0]} bench <input_file> <iterations>', file=sys.stderr)
+            return 1
+        input_path = sys.argv[2]
+        iterations = int(sys.argv[3])
+        if iterations <= 0:
+            print('iterations must be > 0', file=sys.stderr)
+            return 1
+        with open(input_path, 'rb') as f:
+            data = f.read()
+        start = time.perf_counter()
+        for _ in range(iterations):
+            enc = compress_lz77(data)
+            dec = decompress_lz77(enc)
+            if dec != data:
+                print('roundtrip mismatch', file=sys.stderr)
+                return 1
+        elapsed_ms = int((time.perf_counter() - start) * 1000)
+        print(f'ms={elapsed_ms}')
+        return 0
+
+    if mode == 'compress':
+        if len(sys.argv) < 4:
+            print(f'usage: {sys.argv[0]} compress <input_file> <output_file>', file=sys.stderr)
+            return 1
+        with open(sys.argv[2], 'rb') as f:
+            data = f.read()
+        out = compress_lz77(data)
+        with open(sys.argv[3], 'wb') as f:
+            f.write(out)
+        return 0
+
+    if mode == 'decompress':
+        if len(sys.argv) < 4:
+            print(f'usage: {sys.argv[0]} decompress <input_file> <output_file>', file=sys.stderr)
+            return 1
+        with open(sys.argv[2], 'rb') as f:
+            data = f.read()
+        out = decompress_lz77(data)
+        with open(sys.argv[3], 'wb') as f:
+            f.write(out)
+        return 0
+
+    print(f'unknown mode: {mode}', file=sys.stderr)
+    return 1
+
+
+if __name__ == '__main__':
+    raise SystemExit(main())
+
diff --git a/vlib/compress/lz/interop/lz_interop.v b/vlib/compress/lz/interop/lz_interop.v
new file mode 100644
index 000000000..5c010d08f
--- /dev/null
+++ b/vlib/compress/lz/interop/lz_interop.v
@@ -0,0 +1,185 @@
+module main
+
+import compress.lz
+import os
+
+const default_data_size = 512 * 1024
+
+fn main() {
+	mut data_size := default_data_size
+	if os.args.len > 1 {
+		parsed := int(os.args[1].i32())
+		if parsed > 0 {
+			data_size = parsed
+		}
+	}
+	data := `z`.repeat(data_size).bytes()
+
+	println('LZ interop input: ${data.len} bytes')
+
+	for format in [lz.Format.lz77, .lz78, .lzw, .lz4, .lzss, .lzma, .lzjb] {
+		validate_v_roundtrip(data, format) or {
+			eprintln('V validation failed for ${format}: ${err.msg()}')
+			exit(1)
+		}
+		println('V roundtrip (${format}): OK')
+	}
+
+	// The pid keeps concurrent runs from sharing (and deleting) the same directory.
+	tmp_dir := os.join_path(os.temp_dir(), 'v_lz_interop_${os.getpid()}')
+	os.mkdir_all(tmp_dir) or {
+		eprintln('Could not create temp directory ${tmp_dir}: ${err.msg()}')
+		exit(1)
+	}
+	defer {
+		os.rmdir_all(tmp_dir) or {
+			eprintln('Could not remove temp directory ${tmp_dir}: ${err.msg()}')
+		}
+	}
+	input_path := os.join_path(tmp_dir, 'input.bin')
+	os.write_file_array(input_path, data) or {
+		fail_and_cleanup(tmp_dir, 'Could not write input file ${input_path}: ${err.msg()}')
+	}
+	mut c_bin := ''
+	if bin := compile_c_runner(tmp_dir) {
+		c_bin = bin
+	} else {
+		eprintln('Skipping C benchmark: ${err.msg()}')
+	}
+	python_ok := has_python3()
+	if !python_ok {
+		eprintln('Skipping Python benchmark: python3 is not available')
+	}
+
+	if c_bin.len > 0 {
+		cross_validate_v_c(c_bin, data, input_path, tmp_dir) or {
+			fail_and_cleanup(tmp_dir, 'Cross-validation V<->C failed: ${err.msg()}')
+		}
+		println('Cross-validation: V<->C compress/decompress OK')
+	} else {
+		println('Cross-validation: skipped V<->C (requires C compiler)')
+	}
+
+	if python_ok {
+		cross_validate_v_python(data, input_path, tmp_dir) or {
+			fail_and_cleanup(tmp_dir, 'Cross-validation V<->Python failed: ${err.msg()}')
+		}
+		println('Cross-validation: V<->Python compress/decompress OK')
+	} else {
+		println('Cross-validation: skipped V<->Python (requires python3)')
+	}
+}
+
+// fail_and_cleanup prints msg, removes tmp_dir (best effort) and exits with status 1.
+// `defer` blocks do not run when `exit` is called, so the cleanup is done explicitly.
+fn fail_and_cleanup(tmp_dir string, msg string) {
+	eprintln(msg)
+	os.rmdir_all(tmp_dir) or {}
+	exit(1)
+}
+
+// validate_v_roundtrip checks that decompress(compress(data)) == data for format.
+fn validate_v_roundtrip(data []u8, format lz.Format) ! {
+	encoded := lz.compress(data, format)!
+	decoded := lz.decompress(encoded, format)!
+	if decoded != data {
+		return error('roundtrip mismatch for ${format}')
+	}
+}
+
+// compile_c_runner builds lz77_ref.c into out_dir and returns the path of the binary.
+fn compile_c_runner(out_dir string) !string {
+	cc := choose_cc()
+	if cc.len == 0 {
+		return error('no C compiler found (tried cc, gcc, and clang)')
+	}
+	bin_path := os.join_path(out_dir, 'lz77_ref_bench')
+	c_src := os.join_path(@DIR, 'lz77_ref.c')
+	compile_cmd := '${cc} -O3 -std=c99 ${os.quoted_path(c_src)} -o ${os.quoted_path(bin_path)}'
+	compile_res := os.execute(compile_cmd)
+	if compile_res.exit_code != 0 {
+		return error('C compile failed: ${compile_res.output.trim_space()}')
+	}
+	return bin_path
+}
+
+// choose_cc returns the first working C compiler among cc, gcc and clang, or ''.
+fn choose_cc() string {
+	for cc in ['cc', 'gcc', 'clang'] {
+		if os.execute('${cc} --version').exit_code == 0 {
+			return cc
+		}
+	}
+	return ''
+}
+
+// has_python3 reports whether `python3 --version` runs successfully.
+fn has_python3() bool {
+	return os.execute('python3 --version').exit_code == 0
+}
+
+// cross_validate_v_c checks V compress -> C decompress and C compress -> V decompress.
+fn cross_validate_v_c(c_bin string, original []u8, input_path string, tmp_dir string) ! {
+	v_encoded := os.join_path(tmp_dir, 'v_encoded.bin')
+	c_decoded := os.join_path(tmp_dir, 'c_decoded.bin')
+	c_encoded := os.join_path(tmp_dir, 'c_encoded.bin')
+
+	v_stream := lz.compress_lz77(original)!
+	os.write_file_array(v_encoded, v_stream)!
+
+	mut res :=
+		os.execute('${os.quoted_path(c_bin)} decompress ${os.quoted_path(v_encoded)} ${os.quoted_path(c_decoded)}')
+	if res.exit_code != 0 {
+		return error('C decompress(V output) failed: ${res.output.trim_space()}')
+	}
+	validate_equal_files(input_path, c_decoded, 'V->C')!
+
+	res =
+		os.execute('${os.quoted_path(c_bin)} compress ${os.quoted_path(input_path)} ${os.quoted_path(c_encoded)}')
+	if res.exit_code != 0 {
+		return error('C compress failed: ${res.output.trim_space()}')
+	}
+	c_encoded_data := os.read_bytes(c_encoded)!
+	v_decoded := lz.decompress_lz77(c_encoded_data)!
+	if v_decoded != original {
+		return error('C->V output mismatch')
+	}
+}
+
+// cross_validate_v_python checks V compress -> Python decompress and the reverse.
+fn cross_validate_v_python(original []u8, input_path string, tmp_dir string) ! {
+	v_encoded := os.join_path(tmp_dir, 'v_encoded_for_py.bin')
+	py_decoded := os.join_path(tmp_dir, 'py_decoded.bin')
+	py_encoded := os.join_path(tmp_dir, 'py_encoded.bin')
+	py_script := os.join_path(@DIR, 'lz77_ref.py')
+
+	v_stream := lz.compress_lz77(original)!
+	os.write_file_array(v_encoded, v_stream)!
+
+	mut res :=
+		os.execute('python3 ${os.quoted_path(py_script)} decompress ${os.quoted_path(v_encoded)} ${os.quoted_path(py_decoded)}')
+	if res.exit_code != 0 {
+		return error('Python decompress(V output) failed: ${res.output.trim_space()}')
+	}
+	validate_equal_files(input_path, py_decoded, 'V->Python')!
+
+	res =
+		os.execute('python3 ${os.quoted_path(py_script)} compress ${os.quoted_path(input_path)} ${os.quoted_path(py_encoded)}')
+	if res.exit_code != 0 {
+		return error('Python compress failed: ${res.output.trim_space()}')
+	}
+	py_encoded_data := os.read_bytes(py_encoded)!
+	v_decoded := lz.decompress_lz77(py_encoded_data)!
+	if v_decoded != original {
+		return error('Python->V output mismatch')
+	}
+}
+
+// validate_equal_files fails with a tagged error unless both files have equal content.
+fn validate_equal_files(expected_path string, actual_path string, tag string) ! {
+	expected := os.read_bytes(expected_path)!
+	actual := os.read_bytes(actual_path)!
+	if expected != actual {
+		return error('${tag} output mismatch')
+	}
+}
diff --git a/vlib/compress/lz/lz.v b/vlib/compress/lz/lz.v
new file mode 100644
index 000000000..cd9c44135
--- /dev/null
+++ b/vlib/compress/lz/lz.v
@@ -0,0 +1,53 @@
+module lz
+
+// Format identifies which LZ-family codec variant to use; its numeric value is stored in stream headers.
+pub enum Format {
+	lz77
+	lz78
+	lzw
+	lz4
+	lzss
+	lzma
+	lzjb
+}
+
+// format_from_string parses a case-insensitive format name.
+pub fn format_from_string(name string) !Format {
+	key := name.to_lower()
+	return match key {
+		'lz77' { .lz77 }
+		'lz78' { .lz78 }
+		'lzw' { .lzw }
+		'lz4' { .lz4 }
+		'lzss' { .lzss }
+		'lzma' { .lzma }
+		'lzjb' { .lzjb }
+		else { return error('unknown lz format: ${name}') }
+	}
+}
+
+// compress compresses data with the selected LZ format.
+pub fn compress(data []u8, format Format) ![]u8 {
+	return match format {
+		.lz77 { compress_lz77(data) }
+		.lz78 { compress_lz78(data) }
+		.lzw { compress_lzw(data) }
+		.lz4 { compress_lz4(data) }
+		.lzss { compress_lzss(data) }
+		.lzma { compress_lzma(data) }
+		.lzjb { compress_lzjb(data) }
+	}
+}
+
+// decompress decodes a stream produced by compress; format must match the format byte stored in the stream.
+pub fn decompress(data []u8, format Format) ![]u8 {
+	return match format {
+		.lz77 { decompress_lz77(data) }
+		.lz78 { decompress_lz78(data) }
+		.lzw { decompress_lzw(data) }
+		.lz4 { decompress_lz4(data) }
+		.lzss { decompress_lzss(data) }
+		.lzma { decompress_lzma(data) }
+		.lzjb { decompress_lzjb(data) }
+	}
+}
diff --git a/vlib/compress/lz/lz4.v b/vlib/compress/lz/lz4.v
new file mode 100644
index 000000000..210c6cb4f
--- /dev/null
+++ b/vlib/compress/lz/lz4.v
@@ -0,0 +1,18 @@
+module lz
+
+const lz4_profile = MatchProfile{
+	window:      65535
+	min_match:   4
+	max_match:   130
+	max_literal: 128
+}
+
+// compress_lz4 compresses data into a VLZ1 stream tagged `.lz4` (LZ4-like parameters, not the official LZ4 format).
+pub fn compress_lz4(data []u8) ![]u8 {
+	return compress_with_profile(data, lz4_profile, .lz4)
+}
+
+// decompress_lz4 decompresses data produced by compress_lz4.
+pub fn decompress_lz4(data []u8) ![]u8 {
+	return decompress_with_profile(data, lz4_profile, .lz4)
+}
diff --git a/vlib/compress/lz/lz77.v b/vlib/compress/lz/lz77.v
new file mode 100644
index 000000000..0d4364dcf
--- /dev/null
+++ b/vlib/compress/lz/lz77.v
@@ -0,0 +1,18 @@
+module lz
+
+const lz77_profile = MatchProfile{
+	window:      4096
+	min_match:   3
+	max_match:   130
+	max_literal: 128
+}
+
+// compress_lz77 compresses data into a VLZ1 stream tagged `.lz77`, using lz77_profile for match finding.
+pub fn compress_lz77(data []u8) ![]u8 {
+	return compress_with_profile(data, lz77_profile, .lz77)
+}
+
+// decompress_lz77 decompresses data produced by compress_lz77.
+pub fn decompress_lz77(data []u8) ![]u8 {
+	return decompress_with_profile(data, lz77_profile, .lz77)
+}
diff --git a/vlib/compress/lz/lz78.v b/vlib/compress/lz/lz78.v
new file mode 100644
index 000000000..e3884cb49
--- /dev/null
+++ b/vlib/compress/lz/lz78.v
@@ -0,0 +1,99 @@
+module lz
+
+// lz78_max_initial_capacity bounds the buffer preallocated from the untrusted
+// decoded length in the stream header; the output still grows on demand.
+const lz78_max_initial_capacity = 1 << 20
+
+// compress_lz78 compresses data using a pure-V LZ78 dictionary stream.
+pub fn compress_lz78(data []u8) ![]u8 {
+	mut payload := []u8{}
+	mut dict := map[string]int{}
+	mut next_index := 1
+	mut word := []u8{}
+
+	for b in data {
+		mut candidate := word.clone()
+		candidate << b
+		candidate_key := candidate.bytestr()
+		if candidate_key in dict {
+			word = candidate.clone()
+			continue
+		}
+
+		prefix_index := if word.len == 0 { 0 } else { dict[word.bytestr()] }
+		encode_uvarint(mut payload, u64(prefix_index))
+		payload << u8(1)
+		payload << b
+		dict[candidate_key] = next_index
+		next_index++
+		word.clear()
+	}
+
+	if word.len > 0 {
+		final_index := dict[word.bytestr()]
+		encode_uvarint(mut payload, u64(final_index))
+		payload << u8(0)
+	}
+
+	return wrap_payload(.lz78, data, payload)
+}
+
+// decompress_lz78 decompresses data produced by compress_lz78.
+pub fn decompress_lz78(data []u8) ![]u8 {
+	payload, expected_len := unwrap_payload(data, .lz78)!
+	initial_cap := if expected_len > lz78_max_initial_capacity {
+		lz78_max_initial_capacity
+	} else {
+		int(expected_len)
+	}
+	mut out := []u8{cap: initial_cap}
+	mut dict := map[int][]u8{}
+	mut next_index := 1
+	mut pos := 0
+
+	for pos < payload.len {
+		prefix, next_pos, ok := decode_uvarint(payload, pos)
+		if !ok {
+			return error('invalid lz78 stream: bad prefix index')
+		}
+		pos = next_pos
+		if pos >= payload.len {
+			return error('invalid lz78 stream: missing suffix flag')
+		}
+		has_suffix := payload[pos]
+		pos++
+
+		mut phrase := if prefix == 0 {
+			[]u8{}
+		} else {
+			// Compare as u64: casting to int first would wrap huge indexes onto valid ones.
+			if prefix >= u64(next_index) {
+				return error('invalid lz78 stream: unknown prefix index')
+			}
+			dict[int(prefix)].clone()
+		}
+
+		if has_suffix == 1 {
+			if pos >= payload.len {
+				return error('invalid lz78 stream: missing suffix byte')
+			}
+			phrase << payload[pos]
+			pos++
+		} else if has_suffix != 0 {
+			return error('invalid lz78 stream: bad suffix flag')
+		}
+
+		// Fail fast instead of decoding more than the header announced.
+		if i64(out.len) + i64(phrase.len) > expected_len {
+			return error('invalid lz78 stream: length mismatch')
+		}
+		out << phrase
+		dict[next_index] = phrase
+		next_index++
+	}
+
+	if i64(out.len) != expected_len {
+		return error('invalid lz78 stream: length mismatch')
+	}
+	return out
+}
diff --git a/vlib/compress/lz/lz_test.v b/vlib/compress/lz/lz_test.v
new file mode 100644
index 000000000..577d9abc3
--- /dev/null
+++ b/vlib/compress/lz/lz_test.v
@@ -0,0 +1,118 @@
+module lz
+
+const sample_data = ('The quick brown fox jumps over the lazy dog. '.repeat(12) +
+	'aaaaaaaaabbbbbbbbbcccccccccdddddddddeeeeeeeee').bytes()
+
+fn test_roundtrip_all_formats() {
+	formats := [Format.lz77, .lz78, .lzw, .lz4, .lzss, .lzma, .lzjb]
+	for format in formats {
+		compressed := compress(sample_data, format)!
+		decompressed := decompress(compressed, format)!
+		assert decompressed == sample_data
+	}
+}
+
+fn test_format_specific_api_roundtrip() {
+	lz77_data := compress_lz77(sample_data)!
+	assert decompress_lz77(lz77_data)! == sample_data
+
+	lz78_data := compress_lz78(sample_data)!
+	assert decompress_lz78(lz78_data)! == sample_data
+
+	lzw_data := compress_lzw(sample_data)!
+	assert decompress_lzw(lzw_data)! == sample_data
+
+	lz4_data := compress_lz4(sample_data)!
+	assert decompress_lz4(lz4_data)! == sample_data
+
+	lzss_data := compress_lzss(sample_data)!
+	assert decompress_lzss(lzss_data)! == sample_data
+
+	lzma_data := compress_lzma(sample_data)!
+	assert decompress_lzma(lzma_data)! == sample_data
+
+	lzjb_data := compress_lzjb(sample_data)!
+	assert decompress_lzjb(lzjb_data)! == sample_data
+}
+
+fn test_mismatched_format_fails() {
+	compressed := compress(sample_data, .lz77)!
+	decompress(compressed, .lz4) or {
+		assert err.msg().contains('format mismatch')
+		return
+	}
+	assert false
+}
+
+fn test_decoded_length_too_large_fails() {
+	mut corrupt := []u8{}
+	corrupt << stream_magic
+	corrupt << u8(Format.lz77)
+	encode_uvarint(mut corrupt, u64(1) << 31)
+
+	decompress_lz77(corrupt) or {
+		assert err.msg().contains('decoded length too large')
+		return
+	}
+	assert false
+}
+
+fn test_match_offset_too_large_fails() {
+	mut corrupt := []u8{}
+	corrupt << stream_magic
+	corrupt << u8(Format.lz77)
+	encode_uvarint(mut corrupt, u64(4))
+	corrupt << u8(0)
+	corrupt << `A`
+	corrupt << u8(0x80)
+	encode_uvarint(mut corrupt, u64(1) << 63)
+
+	decompress_lz77(corrupt) or {
+		assert err.msg().contains('bad match offset')
+		return
+	}
+	assert false
+}
+
+fn test_high_entropy_roundtrip_large_window_formats() {
+	mut data := []u8{len: 128 * 1024}
+	mut state := u32(0x9e3779b9)
+	for i in 0 .. data.len {
+		state = state * 1664525 + 1013904223
+		data[i] = u8(state >> 24)
+	}
+
+	for format in [Format.lz4, .lzma] {
+		compressed := compress(data, format)!
+		decompressed := decompress(compressed, format)!
+		assert decompressed == data
+	}
+}
+
+fn test_empty_input_roundtrip() {
+	for format in [Format.lz77, .lz78, .lzw, .lz4, .lzss, .lzma, .lzjb] {
+		compressed := compress([]u8{}, format)!
+		decompressed := decompress(compressed, format)!
+		assert decompressed.len == 0
+	}
+}
+
+fn test_lz78_oversized_prefix_index_fails() {
+	mut corrupt := []u8{}
+	corrupt << stream_magic
+	corrupt << u8(Format.lz78)
+	encode_uvarint(mut corrupt, u64(2))
+	// Token 1: no prefix, suffix `A` -> dictionary entry 1.
+	encode_uvarint(mut corrupt, u64(0))
+	corrupt << u8(1)
+	corrupt << `A`
+	// Token 2: prefix (1 << 32) + 1 must not be truncated to the valid index 1.
+	encode_uvarint(mut corrupt, (u64(1) << 32) + 1)
+	corrupt << u8(0)
+
+	decompress_lz78(corrupt) or {
+		assert err.msg().contains('unknown prefix index')
+		return
+	}
+	assert false
+}
diff --git a/vlib/compress/lz/lzjb.v b/vlib/compress/lz/lzjb.v
new file mode 100644
index 000000000..d12a59ed1
--- /dev/null
+++ b/vlib/compress/lz/lzjb.v
@@ -0,0 +1,18 @@
+module lz
+
+const lzjb_profile = MatchProfile{
+	window:      1024
+	min_match:   3
+	max_match:   66
+	max_literal: 128
+}
+
+// compress_lzjb compresses data using a pure-V LZJB-like stream.
+pub fn compress_lzjb(data []u8) ![]u8 {
+	return compress_with_profile(data, lzjb_profile, .lzjb)
+}
+
+// decompress_lzjb decompresses data produced by compress_lzjb.
+pub fn decompress_lzjb(data []u8) ![]u8 {
+	return decompress_with_profile(data, lzjb_profile, .lzjb)
+}
diff --git a/vlib/compress/lz/lzma.v b/vlib/compress/lz/lzma.v
new file mode 100644
index 000000000..d62a122ff
--- /dev/null
+++ b/vlib/compress/lz/lzma.v
@@ -0,0 +1,20 @@
+module lz
+
+// lzma_profile is the match profile behind the `.lzma` format: the shared
+// literal/match token stream with a 32 KiB window, not the official LZMA format.
+const lzma_profile = MatchProfile{
+	window:      32768
+	min_match:   3
+	max_match:   130
+	max_literal: 128
+}
+
+// compress_lzma compresses data using a pure-V LZMA-like stream.
+pub fn compress_lzma(data []u8) ![]u8 {
+	return compress_with_profile(data, lzma_profile, .lzma)
+}
+
+// decompress_lzma decompresses data produced by compress_lzma.
+pub fn decompress_lzma(data []u8) ![]u8 {
+	return decompress_with_profile(data, lzma_profile, .lzma)
+}
diff --git a/vlib/compress/lz/lzss.v b/vlib/compress/lz/lzss.v
new file mode 100644
index 000000000..4105ff6ac
--- /dev/null
+++ b/vlib/compress/lz/lzss.v
@@ -0,0 +1,20 @@
+module lz
+
+// lzss_profile uses the same parameters as lz77_profile; the two formats differ
+// only in the format byte stored in the stream header.
+const lzss_profile = MatchProfile{
+	window:      4096
+	min_match:   3
+	max_match:   130
+	max_literal: 128
+}
+
+// compress_lzss compresses data using a pure-V LZSS style stream.
+pub fn compress_lzss(data []u8) ![]u8 {
+	return compress_with_profile(data, lzss_profile, .lzss)
+}
+
+// decompress_lzss decompresses data produced by compress_lzss.
+pub fn decompress_lzss(data []u8) ![]u8 {
+	return decompress_with_profile(data, lzss_profile, .lzss)
+}
diff --git a/vlib/compress/lz/lzw.v b/vlib/compress/lz/lzw.v
new file mode 100644
index 000000000..1bfb561d9
--- /dev/null
+++ b/vlib/compress/lz/lzw.v
@@ -0,0 +1,101 @@
+module lz
+
+// compress_lzw compresses data using a pure-V LZW dictionary stream.
+pub fn compress_lzw(data []u8) ![]u8 {
+	mut payload := []u8{}
+	mut dict := map[string]int{}
+	for i in 0 .. 256 {
+		dict[[u8(i)].bytestr()] = i
+	}
+	mut next_code := 256
+	mut word := ''
+
+	for b in data {
+		symbol := [b].bytestr()
+		candidate := word + symbol
+		if candidate in dict {
+			word = candidate
+			continue
+		}
+		if word != '' {
+			encode_uvarint(mut payload, u64(dict[word]))
+		}
+		dict[candidate] = next_code
+		next_code++
+		word = symbol
+	}
+
+	if word != '' {
+		encode_uvarint(mut payload, u64(dict[word]))
+	}
+
+	return wrap_payload(.lzw, data, payload)
+}
+
+// decompress_lzw decompresses data produced by compress_lzw.
+pub fn decompress_lzw(data []u8) ![]u8 {
+	payload, expected_len := unwrap_payload(data, .lzw)!
+	if payload.len == 0 {
+		if expected_len == i64(0) {
+			return []u8{}
+		}
+		return error('invalid lzw stream: missing codes')
+	}
+
+	mut dict := map[int][]u8{}
+	for i in 0 .. 256 {
+		dict[i] = [u8(i)]
+	}
+	mut next_code := 256
+	mut pos := 0
+
+	first_code, next_pos, ok := decode_uvarint(payload, pos)
+	if !ok {
+		return error('invalid lzw stream: bad initial code')
+	}
+	pos = next_pos
+	// Compare as u64: casting to int first would wrap huge codes onto valid ones.
+	if first_code >= 256 {
+		return error('invalid lzw stream: unknown initial code')
+	}
+	mut word := dict[int(first_code)].clone()
+	mut out := word.clone()
+
+	for pos < payload.len {
+		code_u64, new_pos, ok_code := decode_uvarint(payload, pos)
+		if !ok_code {
+			return error('invalid lzw stream: bad code')
+		}
+		pos = new_pos
+		// Only codes up to next_code are decodable; reject the rest before truncating.
+		if code_u64 > u64(next_code) {
+			return error('invalid lzw stream: unknown code')
+		}
+		code := int(code_u64)
+		mut entry := []u8{}
+		if code in dict {
+			entry = dict[code].clone()
+		} else if code == next_code {
+			entry = word.clone()
+			entry << word[0]
+		} else {
+			return error('invalid lzw stream: unknown code')
+		}
+
+		// Fail fast instead of decoding more than the header announced.
+		if i64(out.len) + i64(entry.len) > expected_len {
+			return error('invalid lzw stream: length mismatch')
+		}
+		out << entry
+		mut new_entry := word.clone()
+		new_entry << entry[0]
+		dict[next_code] = new_entry
+		next_code++
+		word = entry.clone()
+	}
+
+	if i64(out.len) != expected_len {
+		return error('invalid lzw stream: length mismatch')
+	}
+	return out
+}
-- 
2.39.5