v2 / vlib / encoding / cbor / encoder.v
702 lines · 655 sloc · 19.34 KB · 468855eef1db0ff73c62be2d1bf176ffa0e1478e
Raw
1module cbor
2
3import math
4
// EncodeOpts configures the encoder. With every field left at its
// default the output follows RFC 8949 *preferred* serialisation: each
// float is narrowed to the smallest IEEE 754 width that still holds its
// exact value, and every head uses the shortest argument form.
//
// `canonical = true` additionally emits map keys in sorted order
// (RFC 8949 §4.2, deterministic encoding), which is what you want when
// the resulting bytes are going to be hashed or signed.
pub struct EncodeOpts {
pub:
	// initial_cap is the buffer capacity reserved by `new_packer`;
	// non-positive values fall back to 64 there.
	initial_cap   int = 64
	canonical     bool // sort map keys, definite-length only
	self_describe bool // prepend tag 55799 (`d9 d9 f7`)
	// validate_utf8 asks encode[T] to refuse V `string` payloads that
	// are not valid UTF-8. It defaults to off: V code conventionally
	// treats strings as UTF-8 already, and validation costs time on hot
	// paths. Enable it at trust boundaries where strings may have been
	// built from raw bytes (e.g. via `bytestr()`), so that the output
	// still round-trips through the strict-by-default decoder.
	validate_utf8 bool
}
24
// Packer collects encoded CBOR bytes in a growable buffer. Call
// `bytes()` to read the wire output, or `reset()` to start the next
// message in the same allocation — the cheapest way to emit many small
// frames.
//
// The two private fields record which indefinite-length items are
// currently open, so that the encoder can refuse malformed sequences:
// an indefinite string inside another one, an indefinite array/map
// inside an indefinite string (RFC 8949 §3.2.3), or a break code with
// nothing to close.
pub struct Packer {
pub mut:
	buf  []u8
	opts EncodeOpts
mut:
	indef_string_open bool // the innermost open indef item is a text/byte string
	indef_other_depth int  // how many indef arrays/maps are currently open
}
41
// new_packer creates a Packer configured by `opts`. The buffer starts
// with `opts.initial_cap` bytes of capacity (64 when that value is not
// positive); a generous value is harmless and a small one merely causes
// the buffer to grow later. When `opts.self_describe` is set, the
// self-describe prefix is already in the buffer on return.
pub fn new_packer(opts EncodeOpts) Packer {
	mut start_cap := 64
	if opts.initial_cap > 0 {
		start_cap = opts.initial_cap
	}
	mut packer := Packer{
		buf:  []u8{cap: start_cap}
		opts: opts
	}
	if opts.self_describe {
		packer.buf << self_describe_prefix
	}
	return packer
}
56
// bytes hands back the Packer's buffer as-is. The result shares storage
// with the Packer, so clone it if the Packer will be written to again.
//
// No completeness check happens here: if an indefinite-length container
// was opened and never closed, the returned bytes are malformed. Go
// through `pack_to` / `encode[T]` for the validated path, or test
// `is_complete()` before calling this.
@[inline]
pub fn (mut p Packer) bytes() []u8 {
	return p.buf
}
67
// is_complete returns true when no indefinite-length item is pending,
// i.e. everything in the buffer has been closed. It is false while an
// indefinite array, map, text or byte string still awaits `pack_break`.
@[inline]
pub fn (p &Packer) is_complete() bool {
	return !(p.indef_string_open || p.indef_other_depth != 0)
}
75
// reset empties the Packer so it can encode another message. Only the
// length is set back to zero — the allocation is kept — which makes this
// the fast path for senders that emit many messages. Open-container
// tracking is cleared, and the self-describe prefix is written again
// when `opts.self_describe` is set.
@[inline]
pub fn (mut p Packer) reset() {
	p.indef_other_depth = 0
	p.indef_string_open = false
	unsafe {
		p.buf.len = 0
	}
	if p.opts.self_describe {
		p.buf << self_describe_prefix
	}
}
89
// reserve makes sure at least `n` more bytes fit in the buffer without
// another reallocation. Call it before writing a string or byte payload
// of known length to avoid repeated growth. A non-positive `n` is a
// no-op.
//
// Capacity grows by doubling (starting from 64 when the buffer has no
// capacity yet). The size arithmetic is done in 64 bits: with plain
// `int` math, `len + n` or `cap * 2` can wrap once the buffer is large,
// and a wrapped capacity made the doubling loop spin forever. The new
// capacity is clamped to `max_int`; a request that cannot be represented
// at all panics instead of hanging.
@[inline]
pub fn (mut p Packer) reserve(n int) {
	if n <= 0 {
		return
	}
	needed := i64(p.buf.len) + i64(n)
	if needed <= i64(p.buf.cap) {
		return
	}
	limit := i64(max_int)
	if needed > limit {
		panic('cbor: cannot reserve ${n} more bytes: buffer would exceed ${max_int} bytes')
	}
	mut new_cap := if p.buf.cap == 0 { i64(64) } else { i64(p.buf.cap) * 2 }
	for new_cap < needed {
		new_cap *= 2
	}
	// Doubling may overshoot the largest representable array size even
	// though `needed` itself still fits.
	if new_cap > limit {
		new_cap = limit
	}
	mut grown := []u8{cap: int(new_cap)}
	grown << p.buf
	p.buf = grown
}
108
// extend_unchecked bumps the buffer length by `n` without touching the
// capacity; the caller must have called `reserve` for at least `n`
// bytes beforehand. The return value is the offset of the first newly
// exposed byte.
@[direct_array_access; inline]
fn (mut p Packer) extend_unchecked(n int) int {
	start := p.buf.len
	unsafe {
		p.buf.len += n
	}
	return start
}
120
121// --------------------------------------------------------------------
122// Low-level head writer
123// --------------------------------------------------------------------
124
// write_head appends one CBOR head: the initial byte (`major` combined
// with the additional-info bits) followed by `arg` in big-endian order.
// The narrowest form that can hold `arg` is always chosen (RFC 8949
// preferred serialisation): inline for values below 24, otherwise a
// 1-, 2-, 4- or 8-byte argument announced by additional info 24..27.
// Bytes are stored through direct indexing after a single `reserve`,
// which avoids the per-byte capacity check that `<<` would perform.
@[direct_array_access; inline]
fn (mut p Packer) write_head(major u8, arg u64) {
	if arg < 24 {
		// Small arguments live inside the initial byte itself.
		p.reserve(1)
		at := p.extend_unchecked(1)
		unsafe {
			p.buf[at] = major | u8(arg)
		}
		return
	}
	// Pick the argument width; default to the widest and narrow it down.
	mut info := u8(27)
	mut width := 8
	if arg <= 0xff {
		info = 24
		width = 1
	} else if arg <= 0xffff {
		info = 25
		width = 2
	} else if arg <= 0xffffffff {
		info = 26
		width = 4
	}
	p.reserve(width + 1)
	at := p.extend_unchecked(width + 1)
	unsafe {
		p.buf[at] = major | info
		// Most significant byte first.
		for i in 0 .. width {
			shift := (width - 1 - i) * 8
			p.buf[at + 1 + i] = u8(arg >> shift)
		}
	}
}
185
// write_be_u16 appends `v` to the buffer as two bytes, most significant
// byte first.
@[direct_array_access; inline]
fn (mut p Packer) write_be_u16(v u16) {
	hi := u8(v >> 8)
	lo := u8(v)
	p.reserve(2)
	at := p.extend_unchecked(2)
	unsafe {
		p.buf[at] = hi
		p.buf[at + 1] = lo
	}
}
195
// write_be_u32 appends `v` to the buffer as four bytes, most
// significant byte first.
@[direct_array_access; inline]
fn (mut p Packer) write_be_u32(v u32) {
	p.reserve(4)
	at := p.extend_unchecked(4)
	unsafe {
		for i in 0 .. 4 {
			p.buf[at + i] = u8(v >> (24 - 8 * i))
		}
	}
}
207
// write_be_u64 appends `v` to the buffer as eight bytes, most
// significant byte first.
@[direct_array_access; inline]
fn (mut p Packer) write_be_u64(v u64) {
	p.reserve(8)
	at := p.extend_unchecked(8)
	unsafe {
		for i in 0 .. 8 {
			p.buf[at + i] = u8(v >> (56 - 8 * i))
		}
	}
}
223
224// --------------------------------------------------------------------
225// High-level packers — primitives
226// --------------------------------------------------------------------
227
// pack_uint writes `v` as a CBOR unsigned integer (major type 0). The
// whole u64 range is accepted, so values larger than i64.max work too.
@[inline]
pub fn (mut p Packer) pack_uint(v u64) {
	p.write_head(0x00, v)
}
234
// pack_int writes a signed integer, choosing major type 0 for values
// that are zero or positive and major type 1 for negative ones. CBOR
// can express negatives below i64.min (down to -2^64); use
// `pack_negative_arg` to emit those.
@[inline]
pub fn (mut p Packer) pack_int(v i64) {
	if v < 0 {
		// Major type 1 carries the argument -1 - v.
		p.write_head(0x20, u64(-1 - v))
		return
	}
	p.write_head(0x00, u64(v))
}
246
// pack_negative_arg writes a major type 1 item whose encoded argument
// is exactly `arg`; the integer it stands for is `-1 - arg`. This gives
// access to the full CBOR negative range, down to -2^64.
@[inline]
pub fn (mut p Packer) pack_negative_arg(arg u64) {
	p.write_head(0x20, arg)
}
254
// pack_bool writes a boolean: byte 0xf4 (simple value 20) for false,
// byte 0xf5 (simple value 21) for true.
@[direct_array_access; inline]
pub fn (mut p Packer) pack_bool(v bool) {
	encoded := if v { u8(0xf5) } else { u8(0xf4) }
	p.reserve(1)
	at := p.extend_unchecked(1)
	unsafe {
		p.buf[at] = encoded
	}
}
264
// pack_null writes CBOR null: the single byte 0xf6 (simple value 22).
@[direct_array_access; inline]
pub fn (mut p Packer) pack_null() {
	p.reserve(1)
	at := p.extend_unchecked(1)
	unsafe {
		p.buf[at] = 0xf6
	}
}
274
// pack_undefined writes CBOR undefined: the single byte 0xf7 (simple
// value 23).
@[direct_array_access; inline]
pub fn (mut p Packer) pack_undefined() {
	p.reserve(1)
	at := p.extend_unchecked(1)
	unsafe {
		p.buf[at] = 0xf7
	}
}
284
// pack_simple writes a CBOR simple value. 0..19 are encoded inline in
// the initial byte and 32..255 as 0xf8 followed by the value. Two
// ranges are refused with an error:
//   - 20..23 are false/true/null/undefined (RFC 8949 §3.3). Writing them
//     here would yield bytes that decode as Bool/Null/Undefined rather
//     than as a Simple, so callers must use the dedicated packers.
//   - 24..31 are not well-formed per RFC 8949 §3.3.
@[direct_array_access]
pub fn (mut p Packer) pack_simple(v u8) ! {
	match v {
		20...23 {
			return error('cbor: simple values 20..23 must be packed via pack_bool / pack_null / pack_undefined (RFC 8949 §3.3)')
		}
		24...31 {
			return error('cbor: simple values 24..31 are not well-formed (RFC 8949 §3.3)')
		}
		0...19 {
			// Inline form: the value sits in the low bits of the initial byte.
			p.reserve(1)
			at := p.extend_unchecked(1)
			unsafe {
				p.buf[at] = 0xe0 | v
			}
		}
		else {
			// One-byte trailer form for 32..255.
			p.reserve(2)
			at := p.extend_unchecked(2)
			unsafe {
				p.buf[at] = 0xf8
				p.buf[at + 1] = v
			}
		}
	}
}
316
317// --------------------------------------------------------------------
318// High-level packers — strings and bytes
319// --------------------------------------------------------------------
320
// pack_text writes `s` as a text string (major type 3). Strings shorter
// than 24 bytes take a fast path: head and payload are placed with one
// reservation and one memcpy. Longer strings write the head through
// `write_head` and then append the payload in bulk.
@[direct_array_access; inline]
pub fn (mut p Packer) pack_text(s string) {
	n := s.len
	if n >= 24 {
		p.write_head(0x60, u64(n))
		p.reserve(n)
		unsafe { p.buf.push_many(s.str, n) }
		return
	}
	// Short string: the length fits in the initial byte, so the whole
	// item occupies n + 1 bytes.
	p.reserve(n + 1)
	at := p.extend_unchecked(n + 1)
	unsafe {
		p.buf[at] = u8(0x60) | u8(n)
		if n > 0 {
			vmemcpy(&p.buf[at + 1], s.str, n)
		}
	}
}
343
// pack_bytes writes `b` as a byte string (major type 2). Payloads
// shorter than 24 bytes are placed together with their one-byte head in
// a single reservation; longer ones write the head through `write_head`
// and then append the payload in bulk.
@[direct_array_access]
pub fn (mut p Packer) pack_bytes(b []u8) {
	n := b.len
	if n >= 24 {
		p.write_head(0x40, u64(n))
		p.reserve(n)
		unsafe { p.buf.push_many(b.data, n) }
		return
	}
	// Short payload: the length fits in the initial byte.
	p.reserve(n + 1)
	at := p.extend_unchecked(n + 1)
	unsafe {
		p.buf[at] = u8(0x40) | u8(n)
		if n > 0 {
			vmemcpy(&p.buf[at + 1], b.data, n)
		}
	}
}
363
364// --------------------------------------------------------------------
365// High-level packers — arrays, maps, tags
366// --------------------------------------------------------------------
367
// pack_array_header writes the head of a definite-length array
// (major type 4) announcing `n` elements.
@[inline]
pub fn (mut p Packer) pack_array_header(n u64) {
	p.write_head(0x80, n)
}
373
// pack_map_header writes the head of a definite-length map
// (major type 5). `n` counts key/value *pairs*, not individual items.
@[inline]
pub fn (mut p Packer) pack_map_header(n u64) {
	p.write_head(0xa0, n)
}
380
// pack_tag writes a tag head (major type 6) with the given tag number.
// Whatever item is packed next becomes the content of the tag.
@[inline]
pub fn (mut p Packer) pack_tag(number u64) {
	p.write_head(0xc0, number)
}
387
// open_indef_or_error writes the one-byte `head` that starts an
// indefinite-length item and records it in the tracking state.
// It fails while an indefinite text/byte string is open, because
// RFC 8949 §3.2.3 only permits definite-length chunks of the matching
// major type inside such a string. `is_string` selects which tracker is
// updated: the string flag, or the array/map depth counter.
@[direct_array_access; inline]
fn (mut p Packer) open_indef_or_error(head u8, is_string bool) ! {
	if p.indef_string_open {
		return error('cbor: indefinite-length string chunks must be definite-length strings of the same major type')
	}
	p.reserve(1)
	at := p.extend_unchecked(1)
	unsafe {
		p.buf[at] = head
	}
	if !is_string {
		p.indef_other_depth++
		return
	}
	p.indef_string_open = true
}
408
// pack_array_indef starts an indefinite-length array (byte 0x9f).
// Finish it with `pack_break`. Fails if an indefinite string is open.
@[inline]
pub fn (mut p Packer) pack_array_indef() ! {
	p.open_indef_or_error(0x9f, false)!
}
414
// pack_map_indef starts an indefinite-length map (byte 0xbf).
// Finish it with `pack_break`. Fails if an indefinite string is open.
@[inline]
pub fn (mut p Packer) pack_map_indef() ! {
	p.open_indef_or_error(0xbf, false)!
}
420
// pack_text_indef starts an indefinite-length text string (byte 0x7f).
// Every chunk that follows must be a definite-length text string;
// finish with `pack_break`. Fails if an indefinite string is already
// open.
@[inline]
pub fn (mut p Packer) pack_text_indef() ! {
	p.open_indef_or_error(0x7f, true)!
}
427
// pack_bytes_indef starts an indefinite-length byte string (byte 0x5f).
// Every chunk that follows must be a definite-length byte string;
// finish with `pack_break`. Fails if an indefinite string is already
// open.
@[inline]
pub fn (mut p Packer) pack_bytes_indef() ! {
	p.open_indef_or_error(0x5f, true)!
}
434
// pack_break writes the break stop code 0xff, closing the most recently
// opened indefinite-length item. An open string is closed first;
// otherwise one array/map level is popped. With nothing open the call
// returns an error and writes nothing, since a stray 0xff would corrupt
// the stream.
@[direct_array_access; inline]
pub fn (mut p Packer) pack_break() ! {
	if !p.indef_string_open && p.indef_other_depth <= 0 {
		return error('cbor: pack_break called with no open indefinite-length item')
	}
	if p.indef_string_open {
		p.indef_string_open = false
	} else {
		p.indef_other_depth--
	}
	p.reserve(1)
	at := p.extend_unchecked(1)
	unsafe {
		p.buf[at] = 0xff
	}
}
454
455// --------------------------------------------------------------------
456// High-level packers — floats with preferred serialisation
457// --------------------------------------------------------------------
458
// pack_float64 writes `v` as a double-precision float: byte 0xfb
// followed by the eight IEEE 754 bytes in big-endian order. No attempt
// is made to shrink the value to a narrower width.
@[direct_array_access; inline]
pub fn (mut p Packer) pack_float64(v f64) {
	bits := math.f64_bits(v)
	p.reserve(9)
	at := p.extend_unchecked(9)
	unsafe {
		p.buf[at] = 0xfb
		for i in 0 .. 8 {
			p.buf[at + 1 + i] = u8(bits >> (56 - 8 * i))
		}
	}
}
477
// pack_float32 writes `v` as a single-precision float: byte 0xfa
// followed by the four IEEE 754 bytes in big-endian order. No attempt
// is made to shrink the value to a narrower width.
@[direct_array_access; inline]
pub fn (mut p Packer) pack_float32(v f32) {
	bits := math.f32_bits(v)
	p.reserve(5)
	at := p.extend_unchecked(5)
	unsafe {
		p.buf[at] = 0xfa
		for i in 0 .. 4 {
			p.buf[at + 1 + i] = u8(bits >> (24 - 8 * i))
		}
	}
}
492
// pack_float16_bits writes a half-precision float given as its raw
// 16-bit pattern: byte 0xf9 followed by `bits` in big-endian order.
@[direct_array_access; inline]
pub fn (mut p Packer) pack_float16_bits(bits u16) {
	hi := u8(bits >> 8)
	lo := u8(bits)
	p.reserve(3)
	at := p.extend_unchecked(3)
	unsafe {
		p.buf[at] = 0xf9
		p.buf[at + 1] = hi
		p.buf[at + 2] = lo
	}
}
504
// pack_float writes `v` using the narrowest IEEE 754 width that keeps
// its value intact (RFC 8949 §4.2.2). NaN and the two infinities are
// always written as half-precision constants; a NaN's payload is not
// preserved. Other values are tried as f32 (and from there as half)
// before falling back to the full 8-byte form.
@[direct_array_access]
pub fn (mut p Packer) pack_float(v f64) {
	if math.is_nan(v) {
		p.pack_float16_bits(half_qnan_bits)
		return
	} else if math.is_inf(v, 1) {
		p.pack_float16_bits(half_pos_inf_bits)
		return
	} else if math.is_inf(v, -1) {
		p.pack_float16_bits(half_neg_inf_bits)
		return
	}
	narrowed := f32(v)
	if f64(narrowed) != v {
		// The f32 round-trip changed the value: only f64 is exact.
		p.pack_float64(v)
		return
	}
	half_bits, fits_half := f32_to_half(narrowed)
	if fits_half {
		p.pack_float16_bits(half_bits)
	} else {
		p.pack_float32(narrowed)
	}
}
535
536// --------------------------------------------------------------------
537// Value tree encoder
538// --------------------------------------------------------------------
539
// pack_value serialises an arbitrary `Value` tree into the buffer.
//
// Arrays and maps are written in definite-length form. Map pairs are
// emitted in their stored order unless `opts.canonical` is set, in which
// case they are sorted by `pack_map_canonical`.
//
// Floats: in canonical mode the width hint is ignored and `pack_float`
// chooses the shortest exact width. Otherwise the hint in `v.bits` is
// honoured as long as it is lossless — a `.half` or `.single` hint whose
// width cannot hold the value exactly falls back to the 8-byte form
// rather than silently altering the number.
//
// Returns an error for a `Tag` without content (emitting a placeholder
// would corrupt round-trips) and propagates any error from nested
// values or from `pack_simple`.
pub fn (mut p Packer) pack_value(v Value) ! {
	match v {
		IntNum {
			// Major type 1 for negatives, major type 0 otherwise; the
			// magnitude is already the wire argument.
			if v.negative {
				p.write_head(0x20, v.magnitude)
			} else {
				p.write_head(0x00, v.magnitude)
			}
		}
		Bytes {
			p.pack_bytes(v.data)
		}
		Text {
			p.pack_text(v.value)
		}
		Array {
			p.pack_array_header(u64(v.elements.len))
			for el in v.elements {
				p.pack_value(el)!
			}
		}
		Map {
			p.pack_map_header(u64(v.pairs.len))
			if p.opts.canonical {
				p.pack_map_canonical(v.pairs)!
			} else {
				for pair in v.pairs {
					p.pack_value(pair.key)!
					p.pack_value(pair.value)!
				}
			}
		}
		Tag {
			if v.content_box.len == 0 {
				return error('cbor: Tag(${v.number}) has no content — use new_tag() or set content_box')
			}
			p.pack_tag(v.number)
			p.pack_value(v.content_box[0])!
		}
		Bool {
			p.pack_bool(v.value)
		}
		Null {
			p.pack_null()
		}
		Undefined {
			p.pack_undefined()
		}
		FloatNum {
			// Deterministic encoding requires the shortest IEEE 754 form
			// (RFC 8949 §4.2.2) no matter which width the value arrived
			// in, so the hint is dropped in canonical mode.
			if p.opts.canonical {
				p.pack_float(v.value)
			} else {
				match v.bits {
					.half {
						// NaN/±Inf are handled up front: NaN != NaN
						// would defeat a round-trip equality test.
						if math.is_nan(v.value) {
							p.pack_float16_bits(half_qnan_bits)
						} else if math.is_inf(v.value, 1) {
							p.pack_float16_bits(half_pos_inf_bits)
						} else if math.is_inf(v.value, -1) {
							p.pack_float16_bits(half_neg_inf_bits)
						} else {
							bits16, ok := f64_to_half(v.value)
							if ok {
								p.pack_float16_bits(bits16)
							} else {
								p.pack_float64(v.value)
							}
						}
					}
					.single {
						// Keep the 4-byte form only when it is exact.
						// An unconditional f32 conversion would round
						// values such as 0.1 and turn large finite
						// values into ±Inf. NaN never compares equal, so
						// it is let through explicitly.
						narrowed := f32(v.value)
						if math.is_nan(v.value) || f64(narrowed) == v.value {
							p.pack_float32(narrowed)
						} else {
							p.pack_float64(v.value)
						}
					}
					.double {
						p.pack_float64(v.value)
					}
					.@none {
						p.pack_float(v.value)
					}
				}
			}
		}
		Simple {
			p.pack_simple(v.value)!
		}
	}
}
637
// pack_map_canonical writes the pairs of a map in deterministic order.
// Every key is first encoded on its own into a scratch Packer; the pairs
// are then emitted in the order that `sort_canonical_indices` assigns to
// those encoded keys. The map header is NOT written here — the caller
// has already done that.
//
// The scratch Packers run in canonical mode (so nested maps inside a key
// are ordered too) and copy this Packer's `validate_utf8` setting.
fn (mut p Packer) pack_map_canonical(pairs []MapPair) ! {
	if pairs.len == 0 {
		return
	}
	key_opts := EncodeOpts{
		initial_cap:   16
		canonical:     true
		validate_utf8: p.opts.validate_utf8
	}
	// Step 1: encode each key exactly once.
	mut key_bytes := [][]u8{cap: pairs.len}
	for i in 0 .. pairs.len {
		mut scratch := new_packer(key_opts)
		scratch.pack_value(pairs[i].key)!
		key_bytes << scratch.bytes().clone()
	}
	// Step 2: emit the pre-encoded key followed by its value, in sorted
	// order.
	order := sort_canonical_indices(key_bytes)
	for idx in order {
		encoded := key_bytes[idx]
		p.reserve(encoded.len)
		unsafe { p.buf.push_many(encoded.data, encoded.len) }
		p.pack_value(pairs[idx].value)!
	}
}
665
// compare_canonical_keys orders two encoded map keys for deterministic
// encoding. It returns -1, 0 or 1 when `a` sorts before, equal to, or
// after `b`.
//
// RFC 8949 §4.2.1 (Core Deterministic Encoding) requires keys to be
// sorted in the *bytewise lexicographic* order of their encodings: the
// first differing byte decides, regardless of length. For example the
// key 100 (0x18 0x64) sorts before -1 (0x20). Comparing lengths first
// is the older RFC 7049 §3.9 rule (RFC 8949 §4.2.3) and gives a
// different order for keys of mixed major types.
//
// If one slice is a strict prefix of the other, the shorter one sorts
// first. That cannot happen for two well-formed CBOR items, but it keeps
// the function a total order for arbitrary input.
@[direct_array_access]
fn compare_canonical_keys(a []u8, b []u8) int {
	shared := if a.len < b.len { a.len } else { b.len }
	for i in 0 .. shared {
		if a[i] != b[i] {
			return if a[i] < b[i] { -1 } else { 1 }
		}
	}
	if a.len == b.len {
		return 0
	}
	return if a.len < b.len { -1 } else { 1 }
}
680
// sort_canonical_indices returns the positions 0..keys.len-1 arranged
// so that walking them visits `keys` in the order defined by
// `compare_canonical_keys`. `keys` itself is left untouched. All three
// canonical-emit paths (Value Map, generic $map, generic $struct) call
// this, so the comparison closure exists in one place only.
fn sort_canonical_indices(keys [][]u8) []int {
	mut order := []int{cap: keys.len}
	for i in 0 .. keys.len {
		order << i
	}
	order.sort_with_compare(fn [keys] (lhs &int, rhs &int) int {
		return compare_canonical_keys(keys[*lhs], keys[*rhs])
	})
	return order
}
692
693// --------------------------------------------------------------------
694// Module-level convenience wrappers
695// --------------------------------------------------------------------
696
// encode_value serialises a `Value` tree with the given `opts` and
// returns the result as a freshly allocated byte slice that shares no
// storage with the temporary Packer. Any error from `pack_value` is
// passed on to the caller.
pub fn encode_value(v Value, opts EncodeOpts) ![]u8 {
	mut packer := new_packer(opts)
	packer.pack_value(v)!
	encoded := packer.bytes().clone()
	return encoded
}
703