v2 / vlib / compress / lz / common.v
201 lines · 187 sloc · 5.28 KB · 714bc792a1024c3e7eccc72f7312c6dc9821d04f
Raw
1module lz
2
3const stream_magic = [u8(0x56), 0x4c, 0x5a, 0x31] // VLZ1
4
5struct MatchProfile {
6 window int
7 min_match int
8 max_match int
9 max_literal int
10}
11
12const match_hash_bits = 16
13const match_hash_size = 1 << match_hash_bits
14const max_match_candidates = 64
15
16fn wrap_payload(format Format, source []u8, payload []u8) []u8 {
17 mut out := []u8{cap: stream_magic.len + 8 + payload.len}
18 out << stream_magic
19 out << u8(format)
20 encode_uvarint(mut out, u64(source.len))
21 out << payload
22 return out
23}
24
25fn unwrap_payload(data []u8, format Format) !([]u8, i64) {
26 if data.len < stream_magic.len + 2 {
27 return error('invalid lz stream: too short')
28 }
29 if data[..stream_magic.len] != stream_magic {
30 return error('invalid lz stream: bad magic')
31 }
32 wire_format := data[stream_magic.len]
33 if wire_format != u8(format) {
34 return error('invalid lz stream: format mismatch')
35 }
36 decoded_len_u64, mut pos, ok := decode_uvarint(data, stream_magic.len + 1)
37 if !ok {
38 return error('invalid lz stream: bad length')
39 }
40 if decoded_len_u64 > u64(max_int) {
41 return error('invalid lz stream: decoded length too large')
42 }
43 decoded_len := i64(decoded_len_u64)
44 if pos > data.len {
45 return error('invalid lz stream: truncated payload')
46 }
47 return data[pos..], decoded_len
48}
49
50fn compress_with_profile(data []u8, profile MatchProfile, format Format) []u8 {
51 if data.len == 0 {
52 return wrap_payload(format, data, []u8{})
53 }
54 mut payload := []u8{cap: data.len}
55 mut literals := []u8{cap: profile.max_literal}
56 mut last_match := []int{len: match_hash_size, init: -1}
57 mut prev_match := []int{len: data.len, init: -1}
58 mut pos := 0
59 for pos < data.len {
60 offset, length := find_best_match(data, pos, profile, last_match, prev_match)
61 if length >= profile.min_match {
62 flush_literals(mut payload, mut literals)
63 emit_match(mut payload, offset, length, profile.min_match)
64 for i := pos; i < pos + length; i++ {
65 index_match_position(data, i, mut last_match, mut prev_match)
66 }
67 pos += length
68 } else {
69 literals << data[pos]
70 if literals.len == profile.max_literal {
71 flush_literals(mut payload, mut literals)
72 }
73 index_match_position(data, pos, mut last_match, mut prev_match)
74 pos++
75 }
76 }
77 flush_literals(mut payload, mut literals)
78 return wrap_payload(format, data, payload)
79}
80
81fn decompress_with_profile(data []u8, profile MatchProfile, format Format) ![]u8 {
82 payload, expected_len := unwrap_payload(data, format)!
83 mut out := []u8{cap: int(expected_len)}
84 mut pos := 0
85 for pos < payload.len {
86 control := payload[pos]
87 pos++
88 if control & 0x80 == 0 {
89 literal_len := int(control & 0x7f) + 1
90 if pos + literal_len > payload.len {
91 return error('invalid lz stream: truncated literal')
92 }
93 out << payload[pos..pos + literal_len]
94 pos += literal_len
95 continue
96 }
97 match_len := int(control & 0x7f) + profile.min_match
98 offset, next_pos, ok := decode_uvarint(payload, pos)
99 if !ok {
100 return error('invalid lz stream: truncated match offset')
101 }
102 pos = next_pos
103 if offset == 0 || offset > u64(max_i64) || i64(offset) > i64(out.len) {
104 return error('invalid lz stream: bad match offset')
105 }
106 offset_int := int(offset)
107 base := out.len - offset_int
108 for i in 0 .. match_len {
109 out << out[base + i]
110 }
111 }
112 if i64(out.len) != expected_len {
113 return error('invalid lz stream: length mismatch')
114 }
115 return out
116}
117
118fn find_best_match(data []u8, pos int, profile MatchProfile, last_match []int, prev_match []int) (int, int) {
119 if pos + profile.min_match > data.len {
120 return 0, 0
121 }
122 max_len := if pos + profile.max_match < data.len { profile.max_match } else { data.len - pos }
123 mut best_len := 0
124 mut best_offset := 0
125 hash_idx := match_hash(data, pos)
126 mut candidates_checked := 0
127 mut i := last_match[hash_idx]
128 for i >= 0 && candidates_checked < max_match_candidates {
129 offset := pos - i
130 if offset > profile.window {
131 break
132 }
133 mut current_len := 0
134 for current_len < max_len && data[i + current_len] == data[pos + current_len] {
135 current_len++
136 }
137 if current_len > best_len {
138 best_len = current_len
139 best_offset = offset
140 if best_len == max_len {
141 break
142 }
143 }
144 i = prev_match[i]
145 candidates_checked++
146 }
147 return best_offset, best_len
148}
149
150fn index_match_position(data []u8, pos int, mut last_match []int, mut prev_match []int) {
151 if pos + 2 >= data.len {
152 return
153 }
154 hash_idx := match_hash(data, pos)
155 prev_match[pos] = last_match[hash_idx]
156 last_match[hash_idx] = pos
157}
158
159fn match_hash(data []u8, pos int) int {
160 v := (u32(data[pos]) << 16) | (u32(data[pos + 1]) << 8) | u32(data[pos + 2])
161 return int((v * u32(2654435761)) >> (32 - match_hash_bits))
162}
163
164fn flush_literals(mut payload []u8, mut literals []u8) {
165 if literals.len == 0 {
166 return
167 }
168 payload << u8(literals.len - 1)
169 payload << literals
170 literals.clear()
171}
172
173fn emit_match(mut payload []u8, offset int, length int, min_match int) {
174 payload << u8(0x80 | u8(length - min_match))
175 encode_uvarint(mut payload, u64(offset))
176}
177
178fn encode_uvarint(mut out []u8, value u64) {
179 mut v := value
180 for v >= 0x80 {
181 out << u8(v & 0x7f | 0x80)
182 v >>= 7
183 }
184 out << u8(v)
185}
186
187fn decode_uvarint(data []u8, start int) (u64, int, bool) {
188 mut value := u64(0)
189 mut shift := u32(0)
190 mut pos := start
191 for pos < data.len && shift <= 63 {
192 b := data[pos]
193 pos++
194 value |= u64(b & 0x7f) << shift
195 if b & 0x80 == 0 {
196 return value, pos, true
197 }
198 shift += 7
199 }
200 return 0, start, false
201}
202