| 1 | #include <stdint.h> |
| 2 | #include <stdio.h> |
| 3 | #include <stdlib.h> |
| 4 | #include <string.h> |
| 5 | #include <time.h> |
| 6 | |
| 7 | #define MIN_MATCH 3 |
| 8 | #define MAX_LITERAL 128 |
| 9 | |
| 10 | static const uint8_t STREAM_MAGIC[4] = {0x56, 0x4c, 0x5a, 0x31}; |
| 11 | static const uint8_t FORMAT_LZ77 = 0; |
| 12 | |
| 13 | typedef struct { |
| 14 | uint8_t *data; |
| 15 | size_t len; |
| 16 | size_t cap; |
| 17 | } Buffer; |
| 18 | |
| 19 | static void die(const char *msg) { |
| 20 | fprintf(stderr, "%s\n", msg); |
| 21 | exit(1); |
| 22 | } |
| 23 | |
| 24 | static void buf_init(Buffer *b, size_t cap) { |
| 25 | b->data = (uint8_t *)malloc(cap > 0 ? cap : 1); |
| 26 | if (!b->data) { |
| 27 | die("allocation failed"); |
| 28 | } |
| 29 | b->len = 0; |
| 30 | b->cap = cap > 0 ? cap : 1; |
| 31 | } |
| 32 | |
| 33 | static void buf_push(Buffer *b, uint8_t v) { |
| 34 | if (b->len >= b->cap) { |
| 35 | size_t new_cap = b->cap * 2; |
| 36 | uint8_t *n = (uint8_t *)realloc(b->data, new_cap); |
| 37 | if (!n) { |
| 38 | die("reallocation failed"); |
| 39 | } |
| 40 | b->data = n; |
| 41 | b->cap = new_cap; |
| 42 | } |
| 43 | b->data[b->len++] = v; |
| 44 | } |
| 45 | |
| 46 | static void buf_append(Buffer *b, const uint8_t *src, size_t len) { |
| 47 | for (size_t i = 0; i < len; i++) { |
| 48 | buf_push(b, src[i]); |
| 49 | } |
| 50 | } |
| 51 | |
| 52 | static Buffer read_all(const char *path) { |
| 53 | FILE *f = fopen(path, "rb"); |
| 54 | if (!f) { |
| 55 | die("could not open input file"); |
| 56 | } |
| 57 | if (fseek(f, 0, SEEK_END) != 0) { |
| 58 | fclose(f); |
| 59 | die("could not seek input file"); |
| 60 | } |
| 61 | long sz = ftell(f); |
| 62 | if (sz < 0) { |
| 63 | fclose(f); |
| 64 | die("could not read input file size"); |
| 65 | } |
| 66 | if (fseek(f, 0, SEEK_SET) != 0) { |
| 67 | fclose(f); |
| 68 | die("could not rewind input file"); |
| 69 | } |
| 70 | Buffer in; |
| 71 | buf_init(&in, (size_t)sz + 1); |
| 72 | in.len = (size_t)sz; |
| 73 | if (in.len > 0 && fread(in.data, 1, in.len, f) != in.len) { |
| 74 | fclose(f); |
| 75 | free(in.data); |
| 76 | die("could not read input file"); |
| 77 | } |
| 78 | fclose(f); |
| 79 | return in; |
| 80 | } |
| 81 | |
| 82 | static void write_all(const char *path, const uint8_t *data, size_t len) { |
| 83 | FILE *f = fopen(path, "wb"); |
| 84 | if (!f) { |
| 85 | die("could not open output file"); |
| 86 | } |
| 87 | if (len > 0 && fwrite(data, 1, len, f) != len) { |
| 88 | fclose(f); |
| 89 | die("could not write output file"); |
| 90 | } |
| 91 | fclose(f); |
| 92 | } |
| 93 | |
| 94 | static void write_uvarint(Buffer *out, uint64_t value) { |
| 95 | uint64_t v = value; |
| 96 | while (v >= 0x80) { |
| 97 | buf_push(out, (uint8_t)(v & 0x7f) | 0x80); |
| 98 | v >>= 7; |
| 99 | } |
| 100 | buf_push(out, (uint8_t)v); |
| 101 | } |
| 102 | |
| 103 | static int read_uvarint(const uint8_t *data, size_t len, size_t *pos, uint64_t *value) { |
| 104 | uint64_t out = 0; |
| 105 | uint32_t shift = 0; |
| 106 | while (*pos < len && shift <= 63) { |
| 107 | uint8_t b = data[*pos]; |
| 108 | (*pos)++; |
| 109 | out |= ((uint64_t)(b & 0x7f)) << shift; |
| 110 | if ((b & 0x80) == 0) { |
| 111 | *value = out; |
| 112 | return 1; |
| 113 | } |
| 114 | shift += 7; |
| 115 | } |
| 116 | return 0; |
| 117 | } |
| 118 | |
| 119 | static Buffer compress_lz77(const uint8_t *in, size_t in_len) { |
| 120 | Buffer out; |
| 121 | buf_init(&out, in_len + 32); |
| 122 | buf_append(&out, STREAM_MAGIC, 4); |
| 123 | buf_push(&out, FORMAT_LZ77); |
| 124 | write_uvarint(&out, (uint64_t)in_len); |
| 125 | |
| 126 | for (size_t i = 0; i < in_len;) { |
| 127 | size_t lit_len = in_len - i; |
| 128 | if (lit_len > MAX_LITERAL) { |
| 129 | lit_len = MAX_LITERAL; |
| 130 | } |
| 131 | buf_push(&out, (uint8_t)(lit_len - 1)); |
| 132 | buf_append(&out, in + i, lit_len); |
| 133 | i += lit_len; |
| 134 | } |
| 135 | return out; |
| 136 | } |
| 137 | |
| 138 | static Buffer decompress_lz77(const uint8_t *in, size_t in_len) { |
| 139 | if (in_len < 6 || memcmp(in, STREAM_MAGIC, 4) != 0) { |
| 140 | die("bad magic"); |
| 141 | } |
| 142 | if (in[4] != FORMAT_LZ77) { |
| 143 | die("format mismatch"); |
| 144 | } |
| 145 | size_t pos = 5; |
| 146 | uint64_t expected_len_u64 = 0; |
| 147 | if (!read_uvarint(in, in_len, &pos, &expected_len_u64)) { |
| 148 | die("bad length varint"); |
| 149 | } |
| 150 | size_t expected_len = (size_t)expected_len_u64; |
| 151 | |
| 152 | Buffer out; |
| 153 | buf_init(&out, expected_len + 16); |
| 154 | while (pos < in_len) { |
| 155 | uint8_t control = in[pos++]; |
| 156 | if ((control & 0x80) == 0) { |
| 157 | size_t lit_len = (size_t)(control & 0x7f) + 1; |
| 158 | if (pos + lit_len > in_len) { |
| 159 | die("truncated literal"); |
| 160 | } |
| 161 | buf_append(&out, in + pos, lit_len); |
| 162 | pos += lit_len; |
| 163 | } else { |
| 164 | size_t match_len = (size_t)(control & 0x7f) + MIN_MATCH; |
| 165 | uint64_t offset_u64 = 0; |
| 166 | if (!read_uvarint(in, in_len, &pos, &offset_u64)) { |
| 167 | die("bad match offset"); |
| 168 | } |
| 169 | size_t offset = (size_t)offset_u64; |
| 170 | if (offset == 0 || offset > out.len) { |
| 171 | die("bad offset"); |
| 172 | } |
| 173 | size_t base = out.len - offset; |
| 174 | for (size_t k = 0; k < match_len; k++) { |
| 175 | buf_push(&out, out.data[base + k]); |
| 176 | } |
| 177 | } |
| 178 | } |
| 179 | if (out.len != expected_len) { |
| 180 | die("length mismatch"); |
| 181 | } |
| 182 | return out; |
| 183 | } |
| 184 | |
| 185 | static int64_t now_ms(void) { |
| 186 | return (int64_t)((double)clock() * 1000.0 / (double)CLOCKS_PER_SEC); |
| 187 | } |
| 188 | |
| 189 | int main(int argc, char **argv) { |
| 190 | if (argc < 2) { |
| 191 | fprintf(stderr, |
| 192 | "usage:\n" |
| 193 | " %s bench <input.bin> <iterations>\n" |
| 194 | " %s compress <input.bin> <output.bin>\n" |
| 195 | " %s decompress <input.bin> <output.bin>\n", |
| 196 | argv[0], argv[0], argv[0]); |
| 197 | return 1; |
| 198 | } |
| 199 | if (strcmp(argv[1], "bench") == 0) { |
| 200 | if (argc < 4) { |
| 201 | fprintf(stderr, "usage: %s bench <input.bin> <iterations>\n", argv[0]); |
| 202 | return 1; |
| 203 | } |
| 204 | int iterations = atoi(argv[3]); |
| 205 | if (iterations <= 0) { |
| 206 | fprintf(stderr, "iterations must be > 0\n"); |
| 207 | return 1; |
| 208 | } |
| 209 | Buffer input = read_all(argv[2]); |
| 210 | int64_t start = now_ms(); |
| 211 | for (int i = 0; i < iterations; i++) { |
| 212 | Buffer comp = compress_lz77(input.data, input.len); |
| 213 | Buffer decomp = decompress_lz77(comp.data, comp.len); |
| 214 | if (decomp.len != input.len || memcmp(decomp.data, input.data, input.len) != 0) { |
| 215 | fprintf(stderr, "roundtrip mismatch\n"); |
| 216 | return 1; |
| 217 | } |
| 218 | free(comp.data); |
| 219 | free(decomp.data); |
| 220 | } |
| 221 | int64_t elapsed = now_ms() - start; |
| 222 | printf("ms=%lld\n", (long long)elapsed); |
| 223 | free(input.data); |
| 224 | return 0; |
| 225 | } |
| 226 | |
| 227 | if (strcmp(argv[1], "compress") == 0) { |
| 228 | if (argc < 4) { |
| 229 | fprintf(stderr, "usage: %s compress <input.bin> <output.bin>\n", argv[0]); |
| 230 | return 1; |
| 231 | } |
| 232 | Buffer input = read_all(argv[2]); |
| 233 | Buffer comp = compress_lz77(input.data, input.len); |
| 234 | write_all(argv[3], comp.data, comp.len); |
| 235 | free(input.data); |
| 236 | free(comp.data); |
| 237 | return 0; |
| 238 | } |
| 239 | |
| 240 | if (strcmp(argv[1], "decompress") == 0) { |
| 241 | if (argc < 4) { |
| 242 | fprintf(stderr, "usage: %s decompress <input.bin> <output.bin>\n", argv[0]); |
| 243 | return 1; |
| 244 | } |
| 245 | Buffer input = read_all(argv[2]); |
| 246 | Buffer dec = decompress_lz77(input.data, input.len); |
| 247 | write_all(argv[3], dec.data, dec.len); |
| 248 | free(input.data); |
| 249 | free(dec.data); |
| 250 | return 0; |
| 251 | } |
| 252 | |
| 253 | fprintf(stderr, "unknown mode: %s\n", argv[1]); |
| 254 | return 1; |
| 255 | } |
| 256 | |
| 257 | |