| 1 | module lz |
| 2 | |
| 3 | const stream_magic = [u8(0x56), 0x4c, 0x5a, 0x31] // VLZ1 |
| 4 | |
| 5 | struct MatchProfile { |
| 6 | window int |
| 7 | min_match int |
| 8 | max_match int |
| 9 | max_literal int |
| 10 | } |
| 11 | |
| 12 | const match_hash_bits = 16 |
| 13 | const match_hash_size = 1 << match_hash_bits |
| 14 | const max_match_candidates = 64 |
| 15 | |
| 16 | fn wrap_payload(format Format, source []u8, payload []u8) []u8 { |
| 17 | mut out := []u8{cap: stream_magic.len + 8 + payload.len} |
| 18 | out << stream_magic |
| 19 | out << u8(format) |
| 20 | encode_uvarint(mut out, u64(source.len)) |
| 21 | out << payload |
| 22 | return out |
| 23 | } |
| 24 | |
| 25 | fn unwrap_payload(data []u8, format Format) !([]u8, i64) { |
| 26 | if data.len < stream_magic.len + 2 { |
| 27 | return error('invalid lz stream: too short') |
| 28 | } |
| 29 | if data[..stream_magic.len] != stream_magic { |
| 30 | return error('invalid lz stream: bad magic') |
| 31 | } |
| 32 | wire_format := data[stream_magic.len] |
| 33 | if wire_format != u8(format) { |
| 34 | return error('invalid lz stream: format mismatch') |
| 35 | } |
| 36 | decoded_len_u64, mut pos, ok := decode_uvarint(data, stream_magic.len + 1) |
| 37 | if !ok { |
| 38 | return error('invalid lz stream: bad length') |
| 39 | } |
| 40 | if decoded_len_u64 > u64(max_int) { |
| 41 | return error('invalid lz stream: decoded length too large') |
| 42 | } |
| 43 | decoded_len := i64(decoded_len_u64) |
| 44 | if pos > data.len { |
| 45 | return error('invalid lz stream: truncated payload') |
| 46 | } |
| 47 | return data[pos..], decoded_len |
| 48 | } |
| 49 | |
| 50 | fn compress_with_profile(data []u8, profile MatchProfile, format Format) []u8 { |
| 51 | if data.len == 0 { |
| 52 | return wrap_payload(format, data, []u8{}) |
| 53 | } |
| 54 | mut payload := []u8{cap: data.len} |
| 55 | mut literals := []u8{cap: profile.max_literal} |
| 56 | mut last_match := []int{len: match_hash_size, init: -1} |
| 57 | mut prev_match := []int{len: data.len, init: -1} |
| 58 | mut pos := 0 |
| 59 | for pos < data.len { |
| 60 | offset, length := find_best_match(data, pos, profile, last_match, prev_match) |
| 61 | if length >= profile.min_match { |
| 62 | flush_literals(mut payload, mut literals) |
| 63 | emit_match(mut payload, offset, length, profile.min_match) |
| 64 | for i := pos; i < pos + length; i++ { |
| 65 | index_match_position(data, i, mut last_match, mut prev_match) |
| 66 | } |
| 67 | pos += length |
| 68 | } else { |
| 69 | literals << data[pos] |
| 70 | if literals.len == profile.max_literal { |
| 71 | flush_literals(mut payload, mut literals) |
| 72 | } |
| 73 | index_match_position(data, pos, mut last_match, mut prev_match) |
| 74 | pos++ |
| 75 | } |
| 76 | } |
| 77 | flush_literals(mut payload, mut literals) |
| 78 | return wrap_payload(format, data, payload) |
| 79 | } |
| 80 | |
| 81 | fn decompress_with_profile(data []u8, profile MatchProfile, format Format) ![]u8 { |
| 82 | payload, expected_len := unwrap_payload(data, format)! |
| 83 | mut out := []u8{cap: int(expected_len)} |
| 84 | mut pos := 0 |
| 85 | for pos < payload.len { |
| 86 | control := payload[pos] |
| 87 | pos++ |
| 88 | if control & 0x80 == 0 { |
| 89 | literal_len := int(control & 0x7f) + 1 |
| 90 | if pos + literal_len > payload.len { |
| 91 | return error('invalid lz stream: truncated literal') |
| 92 | } |
| 93 | out << payload[pos..pos + literal_len] |
| 94 | pos += literal_len |
| 95 | continue |
| 96 | } |
| 97 | match_len := int(control & 0x7f) + profile.min_match |
| 98 | offset, next_pos, ok := decode_uvarint(payload, pos) |
| 99 | if !ok { |
| 100 | return error('invalid lz stream: truncated match offset') |
| 101 | } |
| 102 | pos = next_pos |
| 103 | if offset == 0 || offset > u64(max_i64) || i64(offset) > i64(out.len) { |
| 104 | return error('invalid lz stream: bad match offset') |
| 105 | } |
| 106 | offset_int := int(offset) |
| 107 | base := out.len - offset_int |
| 108 | for i in 0 .. match_len { |
| 109 | out << out[base + i] |
| 110 | } |
| 111 | } |
| 112 | if i64(out.len) != expected_len { |
| 113 | return error('invalid lz stream: length mismatch') |
| 114 | } |
| 115 | return out |
| 116 | } |
| 117 | |
| 118 | fn find_best_match(data []u8, pos int, profile MatchProfile, last_match []int, prev_match []int) (int, int) { |
| 119 | if pos + profile.min_match > data.len { |
| 120 | return 0, 0 |
| 121 | } |
| 122 | max_len := if pos + profile.max_match < data.len { profile.max_match } else { data.len - pos } |
| 123 | mut best_len := 0 |
| 124 | mut best_offset := 0 |
| 125 | hash_idx := match_hash(data, pos) |
| 126 | mut candidates_checked := 0 |
| 127 | mut i := last_match[hash_idx] |
| 128 | for i >= 0 && candidates_checked < max_match_candidates { |
| 129 | offset := pos - i |
| 130 | if offset > profile.window { |
| 131 | break |
| 132 | } |
| 133 | mut current_len := 0 |
| 134 | for current_len < max_len && data[i + current_len] == data[pos + current_len] { |
| 135 | current_len++ |
| 136 | } |
| 137 | if current_len > best_len { |
| 138 | best_len = current_len |
| 139 | best_offset = offset |
| 140 | if best_len == max_len { |
| 141 | break |
| 142 | } |
| 143 | } |
| 144 | i = prev_match[i] |
| 145 | candidates_checked++ |
| 146 | } |
| 147 | return best_offset, best_len |
| 148 | } |
| 149 | |
| 150 | fn index_match_position(data []u8, pos int, mut last_match []int, mut prev_match []int) { |
| 151 | if pos + 2 >= data.len { |
| 152 | return |
| 153 | } |
| 154 | hash_idx := match_hash(data, pos) |
| 155 | prev_match[pos] = last_match[hash_idx] |
| 156 | last_match[hash_idx] = pos |
| 157 | } |
| 158 | |
| 159 | fn match_hash(data []u8, pos int) int { |
| 160 | v := (u32(data[pos]) << 16) | (u32(data[pos + 1]) << 8) | u32(data[pos + 2]) |
| 161 | return int((v * u32(2654435761)) >> (32 - match_hash_bits)) |
| 162 | } |
| 163 | |
| 164 | fn flush_literals(mut payload []u8, mut literals []u8) { |
| 165 | if literals.len == 0 { |
| 166 | return |
| 167 | } |
| 168 | payload << u8(literals.len - 1) |
| 169 | payload << literals |
| 170 | literals.clear() |
| 171 | } |
| 172 | |
| 173 | fn emit_match(mut payload []u8, offset int, length int, min_match int) { |
| 174 | payload << u8(0x80 | u8(length - min_match)) |
| 175 | encode_uvarint(mut payload, u64(offset)) |
| 176 | } |
| 177 | |
| 178 | fn encode_uvarint(mut out []u8, value u64) { |
| 179 | mut v := value |
| 180 | for v >= 0x80 { |
| 181 | out << u8(v & 0x7f | 0x80) |
| 182 | v >>= 7 |
| 183 | } |
| 184 | out << u8(v) |
| 185 | } |
| 186 | |
| 187 | fn decode_uvarint(data []u8, start int) (u64, int, bool) { |
| 188 | mut value := u64(0) |
| 189 | mut shift := u32(0) |
| 190 | mut pos := start |
| 191 | for pos < data.len && shift <= 63 { |
| 192 | b := data[pos] |
| 193 | pos++ |
| 194 | value |= u64(b & 0x7f) << shift |
| 195 | if b & 0x80 == 0 { |
| 196 | return value, pos, true |
| 197 | } |
| 198 | shift += 7 |
| 199 | } |
| 200 | return 0, start, false |
| 201 | } |
| 202 | |