v / vlib / compress / lz / interop / lz77_ref.c
256 lines · 235 sloc · 5.84 KB · 714bc792a1024c3e7eccc72f7312c6dc9821d04f
Raw
1#include <stdint.h>
2#include <stdio.h>
3#include <stdlib.h>
4#include <string.h>
5#include <time.h>
6
7#define MIN_MATCH 3
8#define MAX_LITERAL 128
9
10static const uint8_t STREAM_MAGIC[4] = {0x56, 0x4c, 0x5a, 0x31};
11static const uint8_t FORMAT_LZ77 = 0;
12
13typedef struct {
14 uint8_t *data;
15 size_t len;
16 size_t cap;
17} Buffer;
18
19static void die(const char *msg) {
20 fprintf(stderr, "%s\n", msg);
21 exit(1);
22}
23
24static void buf_init(Buffer *b, size_t cap) {
25 b->data = (uint8_t *)malloc(cap > 0 ? cap : 1);
26 if (!b->data) {
27 die("allocation failed");
28 }
29 b->len = 0;
30 b->cap = cap > 0 ? cap : 1;
31}
32
33static void buf_push(Buffer *b, uint8_t v) {
34 if (b->len >= b->cap) {
35 size_t new_cap = b->cap * 2;
36 uint8_t *n = (uint8_t *)realloc(b->data, new_cap);
37 if (!n) {
38 die("reallocation failed");
39 }
40 b->data = n;
41 b->cap = new_cap;
42 }
43 b->data[b->len++] = v;
44}
45
46static void buf_append(Buffer *b, const uint8_t *src, size_t len) {
47 for (size_t i = 0; i < len; i++) {
48 buf_push(b, src[i]);
49 }
50}
51
52static Buffer read_all(const char *path) {
53 FILE *f = fopen(path, "rb");
54 if (!f) {
55 die("could not open input file");
56 }
57 if (fseek(f, 0, SEEK_END) != 0) {
58 fclose(f);
59 die("could not seek input file");
60 }
61 long sz = ftell(f);
62 if (sz < 0) {
63 fclose(f);
64 die("could not read input file size");
65 }
66 if (fseek(f, 0, SEEK_SET) != 0) {
67 fclose(f);
68 die("could not rewind input file");
69 }
70 Buffer in;
71 buf_init(&in, (size_t)sz + 1);
72 in.len = (size_t)sz;
73 if (in.len > 0 && fread(in.data, 1, in.len, f) != in.len) {
74 fclose(f);
75 free(in.data);
76 die("could not read input file");
77 }
78 fclose(f);
79 return in;
80}
81
82static void write_all(const char *path, const uint8_t *data, size_t len) {
83 FILE *f = fopen(path, "wb");
84 if (!f) {
85 die("could not open output file");
86 }
87 if (len > 0 && fwrite(data, 1, len, f) != len) {
88 fclose(f);
89 die("could not write output file");
90 }
91 fclose(f);
92}
93
94static void write_uvarint(Buffer *out, uint64_t value) {
95 uint64_t v = value;
96 while (v >= 0x80) {
97 buf_push(out, (uint8_t)(v & 0x7f) | 0x80);
98 v >>= 7;
99 }
100 buf_push(out, (uint8_t)v);
101}
102
103static int read_uvarint(const uint8_t *data, size_t len, size_t *pos, uint64_t *value) {
104 uint64_t out = 0;
105 uint32_t shift = 0;
106 while (*pos < len && shift <= 63) {
107 uint8_t b = data[*pos];
108 (*pos)++;
109 out |= ((uint64_t)(b & 0x7f)) << shift;
110 if ((b & 0x80) == 0) {
111 *value = out;
112 return 1;
113 }
114 shift += 7;
115 }
116 return 0;
117}
118
119static Buffer compress_lz77(const uint8_t *in, size_t in_len) {
120 Buffer out;
121 buf_init(&out, in_len + 32);
122 buf_append(&out, STREAM_MAGIC, 4);
123 buf_push(&out, FORMAT_LZ77);
124 write_uvarint(&out, (uint64_t)in_len);
125
126 for (size_t i = 0; i < in_len;) {
127 size_t lit_len = in_len - i;
128 if (lit_len > MAX_LITERAL) {
129 lit_len = MAX_LITERAL;
130 }
131 buf_push(&out, (uint8_t)(lit_len - 1));
132 buf_append(&out, in + i, lit_len);
133 i += lit_len;
134 }
135 return out;
136}
137
138static Buffer decompress_lz77(const uint8_t *in, size_t in_len) {
139 if (in_len < 6 || memcmp(in, STREAM_MAGIC, 4) != 0) {
140 die("bad magic");
141 }
142 if (in[4] != FORMAT_LZ77) {
143 die("format mismatch");
144 }
145 size_t pos = 5;
146 uint64_t expected_len_u64 = 0;
147 if (!read_uvarint(in, in_len, &pos, &expected_len_u64)) {
148 die("bad length varint");
149 }
150 size_t expected_len = (size_t)expected_len_u64;
151
152 Buffer out;
153 buf_init(&out, expected_len + 16);
154 while (pos < in_len) {
155 uint8_t control = in[pos++];
156 if ((control & 0x80) == 0) {
157 size_t lit_len = (size_t)(control & 0x7f) + 1;
158 if (pos + lit_len > in_len) {
159 die("truncated literal");
160 }
161 buf_append(&out, in + pos, lit_len);
162 pos += lit_len;
163 } else {
164 size_t match_len = (size_t)(control & 0x7f) + MIN_MATCH;
165 uint64_t offset_u64 = 0;
166 if (!read_uvarint(in, in_len, &pos, &offset_u64)) {
167 die("bad match offset");
168 }
169 size_t offset = (size_t)offset_u64;
170 if (offset == 0 || offset > out.len) {
171 die("bad offset");
172 }
173 size_t base = out.len - offset;
174 for (size_t k = 0; k < match_len; k++) {
175 buf_push(&out, out.data[base + k]);
176 }
177 }
178 }
179 if (out.len != expected_len) {
180 die("length mismatch");
181 }
182 return out;
183}
184
185static int64_t now_ms(void) {
186 return (int64_t)((double)clock() * 1000.0 / (double)CLOCKS_PER_SEC);
187}
188
189int main(int argc, char **argv) {
190 if (argc < 2) {
191 fprintf(stderr,
192 "usage:\n"
193 " %s bench <input.bin> <iterations>\n"
194 " %s compress <input.bin> <output.bin>\n"
195 " %s decompress <input.bin> <output.bin>\n",
196 argv[0], argv[0], argv[0]);
197 return 1;
198 }
199 if (strcmp(argv[1], "bench") == 0) {
200 if (argc < 4) {
201 fprintf(stderr, "usage: %s bench <input.bin> <iterations>\n", argv[0]);
202 return 1;
203 }
204 int iterations = atoi(argv[3]);
205 if (iterations <= 0) {
206 fprintf(stderr, "iterations must be > 0\n");
207 return 1;
208 }
209 Buffer input = read_all(argv[2]);
210 int64_t start = now_ms();
211 for (int i = 0; i < iterations; i++) {
212 Buffer comp = compress_lz77(input.data, input.len);
213 Buffer decomp = decompress_lz77(comp.data, comp.len);
214 if (decomp.len != input.len || memcmp(decomp.data, input.data, input.len) != 0) {
215 fprintf(stderr, "roundtrip mismatch\n");
216 return 1;
217 }
218 free(comp.data);
219 free(decomp.data);
220 }
221 int64_t elapsed = now_ms() - start;
222 printf("ms=%lld\n", (long long)elapsed);
223 free(input.data);
224 return 0;
225 }
226
227 if (strcmp(argv[1], "compress") == 0) {
228 if (argc < 4) {
229 fprintf(stderr, "usage: %s compress <input.bin> <output.bin>\n", argv[0]);
230 return 1;
231 }
232 Buffer input = read_all(argv[2]);
233 Buffer comp = compress_lz77(input.data, input.len);
234 write_all(argv[3], comp.data, comp.len);
235 free(input.data);
236 free(comp.data);
237 return 0;
238 }
239
240 if (strcmp(argv[1], "decompress") == 0) {
241 if (argc < 4) {
242 fprintf(stderr, "usage: %s decompress <input.bin> <output.bin>\n", argv[0]);
243 return 1;
244 }
245 Buffer input = read_all(argv[2]);
246 Buffer dec = decompress_lz77(input.data, input.len);
247 write_all(argv[3], dec.data, dec.len);
248 free(input.data);
249 free(dec.data);
250 return 0;
251 }
252
253 fprintf(stderr, "unknown mode: %s\n", argv[1]);
254 return 1;
255}
256
257