v / vlib / yaml / parser.v
1100 lines · 1056 sloc · 25.83 KB · d96fe54c0a96fd6d651e9cb3c61b137ccc5c7dbe
Raw
1module yaml
2
3import strconv
4import strings
5
// Parser carries the mutable, line-oriented state of one YAML parse run.
struct Parser {
mut:
	// lines holds the source text one entry per line; `skip_ignorable` may
	// rewrite an entry in place when it folds away a `--- ` prefix.
	lines []string
	// idx is the index into `lines` of the next line to be consumed.
	idx int
	// anchors maps an anchor name (`&name`) to the value registered for it,
	// so that a later alias (`*name`) can be resolved.
	anchors map[string]Any
	// directives_done flips to true once the first non-ignorable line is
	// reached; from then on lines starting with `%` are no longer skipped.
	directives_done bool
}
13
// parse is the entry point of the parser. It skips leading ignorable lines
// and parses the node that starts on the first remaining line, using that
// line's own indentation as the base indent. It returns `null` when no
// content line is left, and propagates any error from `line_indent` or
// `parse_node`.
fn (mut p Parser) parse() !Any {
	p.skip_ignorable()
	if p.idx < p.lines.len {
		root_indent := p.line_indent(p.idx)!
		// Defensive guard kept from the original: a negative indent yields null.
		if root_indent >= 0 {
			return p.parse_node(root_indent)
		}
	}
	return null
}
25
// parse_node parses the node that starts on the next non-ignorable line.
// It returns `null` when the input is exhausted or when that line is
// indented less than `indent`. Otherwise it dispatches, in this order, on the
// line's content: block sequence (`-`), block mapping (`key:`), bare alias,
// block scalar (`|` / `>`), flow collection (`[` / `{`), quoted scalar, and
// finally a plain scalar that may continue on following lines. Any anchor
// found in front of the value is registered with the parsed result.
fn (mut p Parser) parse_node(indent int) !Any {
	p.skip_ignorable()
	if p.idx >= p.lines.len {
		return null
	}
	node_indent := p.line_indent(p.idx)!
	if node_indent < indent {
		return null
	}
	text := p.current_content()!
	is_dash_item := text.starts_with('-') && (text.len == 1 || text[1] == ` `)
	if is_dash_item {
		return p.parse_sequence(node_indent)
	}
	if split_mapping_entry(text).ok {
		return p.parse_mapping(node_indent)
	}
	deco := extract_decorators(text)
	body := deco.rest
	// Every remaining branch consumes the current line before doing anything
	// else, so the cursor is advanced once up front.
	p.idx++
	if deco.alias != '' && body == '' {
		return p.resolve_alias(deco.alias)
	}
	if is_block_scalar(body) {
		block := p.parse_block_scalar(node_indent, body)!
		return p.register_anchor(deco.anchor, Any(block))
	}
	if body.starts_with('[') || body.starts_with('{') {
		flow_text := p.collect_flow_continuation(body)!
		return p.register_anchor(deco.anchor, parse_flow_value(flow_text)!)
	}
	if body.len > 0 && (body[0] == `"` || body[0] == `'`) {
		quoted_text := p.gather_quoted_continuation(body)!
		return p.register_anchor(deco.anchor, parse_scalar(quoted_text)!)
	}
	plain_text := p.gather_plain_continuation(body, node_indent)
	return p.register_anchor(deco.anchor, parse_scalar(plain_text)!)
}
64
// gather_plain_continuation appends the lines that continue a plain scalar
// to `initial` and returns the folded text. A continuation line that directly
// follows content is joined with a single space; when blank (or comment-only)
// lines lie in between, one `\n` is written per such line instead. Scanning
// stops, without consuming the line, at a `---` / `...` marker, a line that
// is indented less than `base_indent` (or whose indent cannot be computed),
// a sequence item (`-`), or a mapping entry. Blank lines that are not
// followed by more content are consumed but contribute nothing.
fn (mut p Parser) gather_plain_continuation(initial string, base_indent int) string {
	mut folded := strings.new_builder(initial.len * 2)
	folded.write_string(initial.trim_space())
	mut pending_blanks := 0
	for p.idx < p.lines.len {
		raw := p.lines[p.idx]
		// Whitespace-only lines skip comment stripping entirely.
		text := if raw.trim_space() == '' { '' } else { strip_comments(raw).trim_space() }
		if text == '' {
			pending_blanks++
			p.idx++
			continue
		}
		if text == '---' || text == '...' {
			break
		}
		cont_indent := p.line_indent(p.idx) or { break }
		if cont_indent < base_indent {
			break
		}
		if text.starts_with('- ') || text == '-' || split_mapping_entry(text).ok {
			break
		}
		if pending_blanks == 0 {
			folded.write_u8(` `)
		} else {
			folded.write_string('\n'.repeat(pending_blanks))
			pending_blanks = 0
		}
		folded.write_string(text)
		p.idx++
	}
	return folded.str()
}
113
// parse_mapping reads consecutive `key: value` entries that sit exactly at
// column `indent` and returns them as a map wrapped in `Any`. The mapping
// ends at end of input or at the first line indented less than `indent`.
// A deeper-indented line, or a line at `indent` that is not a mapping entry,
// is reported as an error carrying the 1-based line number.
fn (mut p Parser) parse_mapping(indent int) !Any {
	mut entries := map[string]Any{}
	for {
		p.skip_ignorable()
		if p.idx >= p.lines.len {
			break
		}
		entry_indent := p.line_indent(p.idx)!
		if entry_indent < indent {
			break
		}
		if entry_indent > indent {
			return error('yaml: unexpected indentation on line ${p.idx + 1}')
		}
		text := p.current_content()!
		parsed := split_mapping_entry(text)
		if !parsed.ok {
			return error('yaml: expected a mapping entry on line ${p.idx + 1}')
		}
		// Consume the key line before the value parser looks at what follows.
		p.idx++
		entries[parsed.key] = p.parse_mapping_value(parsed.rest, indent)!
	}
	return Any(entries)
}
138
// parse_mapping_value parses the value part of a mapping entry. `rest_in` is
// the text after the `:` on the key line (already consumed by the caller) and
// `indent` is the column of the key. A bare alias resolves immediately. An
// empty value becomes the nested node on the following lines when those are
// indented deeper than `indent`, and `null` otherwise. Non-empty values are
// parsed as a block scalar, flow collection, quoted scalar or plain scalar.
// An anchor in front of the value is registered with the result.
fn (mut p Parser) parse_mapping_value(rest_in string, indent int) !Any {
	deco := extract_decorators(rest_in)
	body := deco.rest
	if body == '' {
		if deco.alias != '' {
			return p.resolve_alias(deco.alias)
		}
		child_indent := p.peek_next_indent()
		if child_indent <= indent {
			return p.register_anchor(deco.anchor, null)
		}
		nested := p.parse_node(child_indent)!
		return p.register_anchor(deco.anchor, nested)
	}
	if is_block_scalar(body) {
		block := p.parse_block_scalar(indent, body)!
		return p.register_anchor(deco.anchor, Any(block))
	}
	if body.starts_with('[') || body.starts_with('{') {
		flow_text := p.collect_flow_continuation(body)!
		flow_value := parse_flow_value(flow_text)!
		return p.register_anchor(deco.anchor, flow_value)
	}
	if body[0] == `"` || body[0] == `'` {
		quoted_text := p.gather_quoted_continuation(body)!
		quoted_value := parse_scalar(quoted_text)!
		return p.register_anchor(deco.anchor, quoted_value)
	}
	scalar := parse_scalar(body)!
	return p.register_anchor(deco.anchor, scalar)
}
164
// parse_sequence reads consecutive `- item` lines that sit exactly at column
// `indent` and returns the items as an array wrapped in `Any`. The sequence
// ends at end of input, at a line indented less than `indent`, or at a line
// at `indent` that is not a `-` item. A deeper-indented line is an error.
fn (mut p Parser) parse_sequence(indent int) !Any {
	mut items := []Any{}
	for {
		p.skip_ignorable()
		if p.idx >= p.lines.len {
			break
		}
		item_indent := p.line_indent(p.idx)!
		if item_indent < indent {
			break
		}
		if item_indent > indent {
			return error('yaml: unexpected indentation on line ${p.idx + 1}')
		}
		text := p.current_content()!
		// An item is a lone `-` or a `-` followed by a space.
		is_item := text.starts_with('-') && (text.len == 1 || text[1] == ` `)
		if !is_item {
			break
		}
		item_text := if text.len == 1 { '' } else { text[1..].trim_space() }
		p.idx++
		items << p.parse_sequence_item(item_text, indent)!
	}
	return Any(items)
}
189
// parse_sequence_item parses the content of one sequence item. `rest_in` is
// the text after the `-` (the item line is already consumed) and `indent` is
// the column of the `-`. A bare alias resolves immediately; an empty item
// becomes the deeper-indented node that follows, or `null`. Otherwise the
// item is tried as block scalar, flow collection, inline mapping, quoted
// scalar and plain scalar, in that order.
//
// An inline mapping (`- key: value`) collects further entries from following
// lines. Those lines must sit at exactly `indent + 2`; any other indentation
// deeper than `indent` is reported as an error.
fn (mut p Parser) parse_sequence_item(rest_in string, indent int) !Any {
	deco := extract_decorators(rest_in)
	body := deco.rest
	if body == '' {
		if deco.alias != '' {
			return p.resolve_alias(deco.alias)
		}
		nested_indent := p.peek_next_indent()
		if nested_indent <= indent {
			return p.register_anchor(deco.anchor, null)
		}
		nested := p.parse_node(nested_indent)!
		return p.register_anchor(deco.anchor, nested)
	}
	if is_block_scalar(body) {
		block := p.parse_block_scalar(indent, body)!
		return p.register_anchor(deco.anchor, Any(block))
	}
	if body.starts_with('[') || body.starts_with('{') {
		flow_text := p.collect_flow_continuation(body)!
		flow_value := parse_flow_value(flow_text)!
		return p.register_anchor(deco.anchor, flow_value)
	}
	first := split_mapping_entry(body)
	if !first.ok {
		if body[0] == `"` || body[0] == `'` {
			quoted_text := p.gather_quoted_continuation(body)!
			quoted_value := parse_scalar(quoted_text)!
			return p.register_anchor(deco.anchor, quoted_value)
		}
		scalar := parse_scalar(body)!
		return p.register_anchor(deco.anchor, scalar)
	}
	// Inline mapping: the first entry shares the line with the dash.
	mut fields := map[string]Any{}
	child_indent := indent + 2
	fields[first.key] = p.parse_mapping_value(first.rest, child_indent)!
	for {
		p.skip_ignorable()
		if p.idx >= p.lines.len {
			break
		}
		sibling_indent := p.line_indent(p.idx)!
		if sibling_indent <= indent {
			break
		}
		if sibling_indent != child_indent {
			return error('yaml: unexpected indentation on line ${p.idx + 1}')
		}
		text := p.current_content()!
		sibling := split_mapping_entry(text)
		if !sibling.ok {
			break
		}
		p.idx++
		fields[sibling.key] = p.parse_mapping_value(sibling.rest, child_indent)!
	}
	return p.register_anchor(deco.anchor, Any(fields))
}
243
// resolve_alias returns the value previously registered under anchor `name`,
// or `null` when no such anchor has been registered.
fn (p &Parser) resolve_alias(name string) Any {
	if found := p.anchors[name] {
		return found
	}
	return null
}
247
// register_anchor stores `value` under `anchor` unless `anchor` is empty, and
// always hands `value` back so that callers can wrap a result in a single
// `return p.register_anchor(...)` statement.
fn (mut p Parser) register_anchor(anchor string, value Any) Any {
	if anchor == '' {
		return value
	}
	p.anchors[anchor] = value
	return value
}
257
// parse_block_scalar parses the body of a `|` (literal) or `>` (folded) block
// scalar whose header line has already been consumed. `header` supplies the
// style and the optional chomp indicator (`-` strip, `+` keep, none = clip).
// Digits in the header are not interpreted: the body indentation is taken as
// the smallest indentation among the non-blank body lines. The body ends at
// the first non-blank line indented at or below `parent_indent`.
fn (mut p Parser) parse_block_scalar(parent_indent int, header string) !string {
	style, chomp := parse_block_header(header)
	// Pass 1 (look-ahead only, the cursor is not moved): find the body indent.
	mut body_indent := -1
	for probe := p.idx; probe < p.lines.len; probe++ {
		if p.lines[probe].trim_space() == '' {
			continue
		}
		probe_indent := p.line_indent(probe)!
		if probe_indent <= parent_indent {
			break
		}
		if body_indent < 0 || probe_indent < body_indent {
			body_indent = probe_indent
		}
	}
	if body_indent < 0 {
		// No content lines at all. Strip and clip give the empty string; keep
		// (`+`) consumes the blank lines and returns one `\n` per blank line,
		// with a minimum of one.
		if chomp != `+` {
			return ''
		}
		first_blank := p.idx
		for p.idx < p.lines.len && p.lines[p.idx].trim_space() == '' {
			p.idx++
		}
		consumed := p.idx - first_blank
		return '\n'.repeat(if consumed == 0 { 1 } else { consumed })
	}
	// Pass 2: consume the body, dropping `body_indent` leading bytes per line.
	mut collected := []string{}
	for p.idx < p.lines.len {
		raw := p.lines[p.idx]
		mut piece := ''
		if raw.trim_space() != '' {
			raw_indent := p.line_indent(p.idx)!
			if raw_indent <= parent_indent {
				break
			}
			if raw.len > body_indent {
				piece = raw[body_indent..]
			}
		}
		collected << piece
		p.idx++
	}
	// Trailing blank lines are removed here and counted so that the chomp
	// step can decide whether to restore them.
	mut trailing_blanks := 0
	for collected.len > 0 && collected.last() == '' {
		collected.delete_last()
		trailing_blanks++
	}
	body := if style == `|` { collected.join('\n') } else { fold_block_scalar(collected) }
	return apply_chomp(body, chomp, trailing_blanks)
}
326
// FlowBalance tracks, across one or more `scan` calls, whether a flow
// collection's brackets and quoted strings are still open.
struct FlowBalance {
mut:
	// bracket counts `[` minus `]` seen outside quotes.
	bracket int
	// brace counts `{` minus `}` seen outside quotes.
	brace int
	// in_single is true while inside a single-quoted string.
	in_single bool
	// in_double is true while inside a double-quoted string.
	in_double bool
	// escape is true when the previous byte inside a double-quoted string was
	// an unescaped backslash.
	escape bool
}
335
// collect_flow_continuation completes a flow collection that spans several
// lines. `initial` is the text found on the first line; when its brackets and
// quotes are already balanced it is returned unchanged. Otherwise following
// lines are consumed, trimmed and appended (each separated by one space)
// until everything is balanced. Outside quoted strings, comments are removed
// and empty lines are skipped; inside a quoted string the line is taken as
// is. Running out of input while still unbalanced is an error.
fn (mut p Parser) collect_flow_continuation(initial string) !string {
	mut state := FlowBalance{}
	state.scan(initial)
	if !state.unbalanced() {
		return initial
	}
	mut joined := strings.new_builder(initial.len * 2)
	joined.write_string(initial)
	for state.unbalanced() && p.idx < p.lines.len {
		raw := p.lines[p.idx]
		p.idx++
		// Quote state as of the end of the previous line decides how this
		// line is treated.
		inside_quote := state.in_single || state.in_double
		piece := if inside_quote { raw.trim_space() } else { strip_comments(raw).trim_space() }
		if piece == '' && !inside_quote {
			continue
		}
		joined.write_u8(` `)
		joined.write_string(piece)
		state.scan(piece)
	}
	if state.unbalanced() {
		return error('yaml: unterminated flow collection')
	}
	return joined.str()
}
366
// unbalanced reports whether a bracket, a brace or a quoted string is still
// open according to the state accumulated so far.
fn (b &FlowBalance) unbalanced() bool {
	open_collection := b.bracket > 0 || b.brace > 0
	open_quote := b.in_single || b.in_double
	return open_collection || open_quote
}
370
// scan feeds the bytes of `s` into the balance state. Inside double quotes a
// backslash escapes the next byte; inside single quotes a doubled `''` is a
// literal quote. Outside quotes, brackets and braces adjust their counters
// and quote characters open a quoted string. State carries over between
// calls so that a construct may span several scanned segments.
fn (mut b FlowBalance) scan(s string) {
	mut pos := 0
	for pos < s.len {
		ch := s[pos]
		pos++
		if b.in_double {
			if b.escape {
				b.escape = false
			} else if ch == `\\` {
				b.escape = true
			} else if ch == `"` {
				b.in_double = false
			}
			continue
		}
		if b.in_single {
			if ch != `'` {
				continue
			}
			if pos < s.len && s[pos] == `'` {
				// Doubled quote: skip its second half and stay inside.
				pos++
				continue
			}
			b.in_single = false
			continue
		}
		if ch == `"` {
			b.in_double = true
		} else if ch == `'` {
			b.in_single = true
		} else if ch == `[` {
			b.bracket++
		} else if ch == `]` {
			b.bracket--
		} else if ch == `{` {
			b.brace++
		} else if ch == `}` {
			b.brace--
		}
	}
}
405
// parse_block_header splits a block-scalar header such as `|`, `|-`, `|+` or
// `>2-` into its style byte (the first byte) and its chomp indicator. The
// chomp result is the last `+` or `-` found after the style byte, or 0 when
// there is none; other bytes (such as digits) are skipped. An empty header
// yields the literal style `|` with no chomp indicator.
fn parse_block_header(s string) (u8, u8) {
	if s.len == 0 {
		return `|`, 0
	}
	mut chomp := u8(0)
	for c in s[1..] {
		if c == `+` || c == `-` {
			chomp = c
		}
	}
	return s[0], chomp
}
422
// apply_chomp finishes a block scalar body according to its chomp indicator
// (YAML block chomping): `-` (strip) returns `body` as is, with no trailing
// newline; `+` (keep) appends one newline to a non-empty body plus one
// newline for each of the `stripped_trailing` blank lines that were removed
// earlier; any other value (clip) appends a single newline to a non-empty
// body and leaves an empty body empty.
fn apply_chomp(body string, chomp u8, stripped_trailing int) string {
	if chomp == `-` {
		return body
	}
	clipped := if body == '' { '' } else { body + '\n' }
	if chomp == `+` {
		return clipped + '\n'.repeat(stripped_trailing)
	}
	return clipped
}
444
// is_ignorable_line reports whether `trimmed` (a line with comments and
// surrounding whitespace already removed) should be skipped by the parser:
// an empty line, a `---` or `...` document marker, or a line starting with
// `%` as long as `directives_done` has not been set yet.
fn (p &Parser) is_ignorable_line(trimmed string) bool {
	return match trimmed {
		'', '---', '...' { true }
		else { !p.directives_done && trimmed.starts_with('%') }
	}
}
453
// skip_ignorable advances the cursor past every ignorable line and stops on
// the first line that carries content. On that line it sets
// `directives_done`, and if the line starts with `--- ` or `---\t` followed
// by more text it rewrites the stored line without those four leading bytes,
// so the inline content is treated as starting at column 0.
fn (mut p Parser) skip_ignorable() {
	for p.idx < p.lines.len {
		raw := p.lines[p.idx]
		if !p.is_ignorable_line(strip_comments(raw).trim_space()) {
			has_marker_prefix := raw.starts_with('--- ') || raw.starts_with('---\t')
			if has_marker_prefix && raw.len > 4 {
				p.lines[p.idx] = raw[4..]
			}
			p.directives_done = true
			return
		}
		p.idx++
	}
}
471
// peek_next_indent looks ahead, without moving the cursor, for the first
// non-ignorable line at or after the current position and returns its
// indentation. It returns -1 when no such line exists or when the line's
// indentation cannot be computed.
fn (p &Parser) peek_next_indent() int {
	for cursor := p.idx; cursor < p.lines.len; cursor++ {
		candidate := strip_comments(p.lines[cursor]).trim_space()
		if !p.is_ignorable_line(candidate) {
			return p.line_indent(cursor) or { -1 }
		}
	}
	return -1
}
485
// current_content returns the current line without its indentation, trailing
// comment and surrounding whitespace. It returns the empty string when the
// line consists of indentation only, and fails when `line_indent` fails.
fn (p &Parser) current_content() !string {
	indent := p.line_indent(p.idx)!
	line := p.lines[p.idx]
	if indent >= line.len {
		return ''
	}
	return strip_comments(line[indent..]).trim_space()
}
494
// line_indent returns the number of leading spaces of `p.lines[index]`. It
// fails, naming the 1-based line number, when the first byte after those
// spaces is a tab.
fn (p &Parser) line_indent(index int) !int {
	line := p.lines[index]
	mut width := 0
	for ch in line {
		if ch != ` ` {
			if ch == `\t` {
				return error('yaml: tabs are not supported for indentation on line ${index + 1}')
			}
			break
		}
		width++
	}
	return width
}
506
// MappingEntry is the result of splitting a line at its key/value colon.
struct MappingEntry {
	// key is the cleaned mapping key (decorators removed, quotes resolved).
	key string
	// rest is the trimmed text after the colon; empty when nothing follows.
	rest string
	// ok is true only when the line was recognized as a mapping entry; the
	// zero value therefore means "not a mapping entry".
	ok bool
}
512
// split_mapping_entry tries to split `content` into a mapping key and the
// text after it. The separator is the first `:` that is outside quotes,
// outside `[...]` / `{...}`, and followed by whitespace or the end of the
// text. The zero `MappingEntry` (with `ok == false`) is returned when no such
// colon exists, when the key text is empty, or when the key cannot be parsed.
fn split_mapping_entry(content string) MappingEntry {
	mut in_single := false
	mut in_double := false
	mut escaped := false
	mut bracket_depth := 0
	mut brace_depth := 0
	mut pos := 0
	for pos < content.len {
		at := pos
		ch := content[at]
		pos++
		if in_double {
			if escaped {
				escaped = false
			} else if ch == `\\` {
				escaped = true
			} else if ch == `"` {
				in_double = false
			}
			continue
		}
		if in_single {
			if ch == `'` {
				if pos < content.len && content[pos] == `'` {
					// Doubled quote: consume the second half, stay inside.
					pos++
				} else {
					in_single = false
				}
			}
			continue
		}
		if ch == `"` {
			in_double = true
		} else if ch == `'` {
			in_single = true
		} else if ch == `[` {
			bracket_depth++
		} else if ch == `]` {
			if bracket_depth > 0 {
				bracket_depth--
			}
		} else if ch == `{` {
			brace_depth++
		} else if ch == `}` {
			if brace_depth > 0 {
				brace_depth--
			}
		} else if ch == `:` && bracket_depth == 0 && brace_depth == 0
			&& (pos == content.len || content[pos].is_space()) {
			key_text := content[..at].trim_space()
			if key_text == '' {
				return MappingEntry{}
			}
			key := parse_key(key_text) or { return MappingEntry{} }
			rest := if pos < content.len { content[pos..].trim_space() } else { '' }
			return MappingEntry{
				key:  key
				rest: rest
				ok:   true
			}
		}
	}
	return MappingEntry{}
}
592
// strip_node_decorators returns what is left of `s` after
// `extract_decorators` has peeled off its leading decorators — anchors
// (`&id`), aliases (`*id`) and tags (`!Type` / `!!Type`). The anchor and
// alias names found along the way are discarded here; callers that need them
// should call `extract_decorators` directly.
fn strip_node_decorators(s string) string {
	decorators := extract_decorators(s)
	return decorators.rest
}
604
// Decorators is the result of splitting leading decorators off a node's text.
struct Decorators {
	// anchor is the name following `&`, or empty when no anchor was found.
	anchor string
	// alias is the name following `*`, or empty when no alias was found.
	alias string
	// rest is the text that remains once all leading decorators are removed.
	rest string
}
611
// extract_decorators removes leading anchors (`&name`), aliases (`*name`)
// and tags (`!tag` / `!!tag`) from `s`, in whatever order they appear, and
// returns the anchor name, the alias name and the remaining text. When
// several anchors or aliases occur, the last one of each kind wins. Tags are
// dropped without being reported. A `&` or `*` directly followed by a space
// or tab is not a decorator and ends the scan, as does remaining text shorter
// than two bytes.
fn extract_decorators(s string) Decorators {
	mut anchor := ''
	mut alias := ''
	mut rest := s.trim_left(' \t')
	for rest.len >= 2 {
		lead := rest[0]
		is_ref := (lead == `&` || lead == `*`) && rest[1] != ` ` && rest[1] != `\t`
		if !is_ref && lead != `!` {
			break
		}
		// The decorator token runs up to the next space or tab. For tags, a
		// second `!` (as in `!!str`) is stepped over before scanning.
		mut end := 1
		if lead == `!` && rest[1] == `!` {
			end = 2
		}
		for end < rest.len && rest[end] != ` ` && rest[end] != `\t` {
			end++
		}
		if lead == `&` {
			anchor = rest[1..end]
		} else if lead == `*` {
			alias = rest[1..end]
		}
		rest = rest[end..].trim_left(' \t')
	}
	return Decorators{
		anchor: anchor
		alias:  alias
		rest:   rest
	}
}
658
// parse_key turns the raw key text of a mapping entry into the key string.
// Leading decorators are removed first. If what remains starts and ends with
// the same quote character (`"` or `'`) it is unquoted through
// `parse_quoted_string`, whose errors are propagated; otherwise it is
// returned with surrounding whitespace trimmed.
fn parse_key(src string) !string {
	cleaned := strip_node_decorators(src)
	if cleaned.len < 2 {
		return cleaned.trim_space()
	}
	first := cleaned[0]
	last := cleaned[cleaned.len - 1]
	if first == last && (first == `"` || first == `'`) {
		return parse_quoted_string(cleaned)
	}
	return cleaned.trim_space()
}
670
// parse_scalar converts a scalar token into a typed value. After decorators
// and surrounding whitespace are removed:
//   - empty text yields the empty string;
//   - text wrapped in matching `"` or `'` quotes is unquoted;
//   - `~` and `null` yield `null`; `true`/`yes`/`on` yield `true`;
//     `false`/`no`/`off` yield `false` (keywords compare case-insensitively);
//   - integer literals (with `_` separators removed) yield a signed value
//     when they start with `-` and an unsigned value otherwise;
//   - float literals yield an `f64`;
//   - anything else is returned as a plain string.
fn parse_scalar(text string) !Any {
	value := strip_node_decorators(text).trim_space()
	if value == '' {
		return Any('')
	}
	if value.len >= 2 {
		first := value[0]
		last := value[value.len - 1]
		if first == last && (first == `"` || first == `'`) {
			return Any(parse_quoted_string(value)!)
		}
	}
	// No keyword is longer than five bytes, so longer scalars skip the
	// keyword comparisons altogether.
	if value.len <= 5 {
		if (value.len == 1 && value[0] == `~`) || equals_ascii_ci(value, 'null') {
			return null
		}
		is_true := equals_ascii_ci(value, 'true') || equals_ascii_ci(value, 'yes')
			|| equals_ascii_ci(value, 'on')
		if is_true {
			return Any(true)
		}
		is_false := equals_ascii_ci(value, 'false') || equals_ascii_ci(value, 'no')
			|| equals_ascii_ci(value, 'off')
		if is_false {
			return Any(false)
		}
	}
	numeric := strip_underscores(value)
	if is_integer(numeric) {
		if numeric[0] == `-` {
			return Any(numeric.parse_int(0, 64)!)
		}
		// A leading `+` is dropped before the unsigned parse.
		unsigned := if numeric[0] == `+` { numeric[1..] } else { numeric }
		return Any(unsigned.parse_uint(0, 64)!)
	}
	if is_float(numeric) {
		return Any(strconv.atof64(numeric)!)
	}
	return Any(value)
}
720
// parse_quoted_string unquotes `src`, which must include its opening and
// closing quote. The text between the quotes is line-folded first. For a
// single-quoted string the only further step is turning `''` into `'`. For a
// double-quoted string the escapes `\"`, `\\`, `\/`, `\b`, `\f`, `\n`, `\r`,
// `\t` and `\uXXXX` are resolved; a dangling backslash, a truncated or
// non-hex `\u` escape, or any other escape is an error.
fn parse_quoted_string(src string) !string {
	if src.len < 2 {
		return error('yaml: invalid quoted string')
	}
	quote := src[0]
	inner := fold_quoted_inner(src[1..src.len - 1])
	if quote == `'` {
		// Only replace when a doubled quote is actually present.
		return if inner.contains("''") { inner.replace("''", "'") } else { inner }
	}
	// Without a backslash there is nothing to resolve.
	if !inner.contains_u8(`\\`) {
		return inner
	}
	mut out := []u8{cap: inner.len}
	mut pos := 0
	for pos < inner.len {
		ch := inner[pos]
		pos++
		if ch != `\\` {
			out << ch
			continue
		}
		if pos >= inner.len {
			return error('yaml: invalid escape sequence')
		}
		esc := inner[pos]
		pos++
		match esc {
			`"`, `\\`, `/` {
				out << esc
			}
			`b` {
				out << `\b`
			}
			`f` {
				out << `\f`
			}
			`n` {
				out << `\n`
			}
			`r` {
				out << `\r`
			}
			`t` {
				out << `\t`
			}
			`u` {
				// Four hex digits must follow the `u`.
				if pos + 4 > inner.len {
					return error('yaml: invalid unicode escape')
				}
				code := inner[pos..pos + 4]
				r := rune(code.parse_uint(16, 32)!)
				out << r.str().bytes()
				pos += 4
			}
			else {
				return error('yaml: unknown escape sequence \\${rune(esc).str()}')
			}
		}
	}
	return out.bytestr()
}
796
// fold_quoted_inner folds the line breaks inside the body of a quoted scalar
// (`inner` is the text between the quotes). Text without a line break is
// returned unchanged. Otherwise spaces and tabs around each line break are
// removed, two content lines separated by a single break are joined with one
// space, and content lines separated by empty lines are joined with one `\n`
// per empty line. An empty first line contributes a single leading space and
// an empty last line a single trailing space; empty lines adjacent to those
// (before the first or after the last content line) are dropped.
fn fold_quoted_inner(inner string) string {
	if !inner.contains_u8(`\n`) {
		return inner
	}
	raw_lines := inner.split('\n')
	last := raw_lines.len - 1
	// The first line keeps its leading whitespace and the last line keeps its
	// trailing whitespace: those sit next to the quotes, not next to a break.
	mut pieces := []string{cap: raw_lines.len}
	for idx, raw in raw_lines {
		if idx == 0 {
			pieces << raw.trim_right(' \t')
		} else if idx == last {
			pieces << raw.trim_left(' \t')
		} else {
			pieces << raw.trim(' \t')
		}
	}
	leading_empty := pieces[0] == ''
	trailing_empty := pieces[last] == ''
	mut folded := strings.new_builder(inner.len)
	if leading_empty {
		folded.write_u8(` `)
	}
	first := if leading_empty { 1 } else { 0 }
	stop := if trailing_empty { last } else { last + 1 }
	mut gap := 0
	mut have_text := false
	for piece in pieces[first..stop] {
		if piece == '' {
			gap++
			continue
		}
		if have_text {
			if gap > 0 {
				folded.write_string('\n'.repeat(gap))
			} else {
				folded.write_u8(` `)
			}
		}
		gap = 0
		folded.write_string(piece)
		have_text = true
	}
	if trailing_empty {
		folded.write_u8(` `)
	}
	return folded.str()
}
854
// quoted_terminated reports whether `s`, which must start with `quote`, is a
// complete quoted scalar: the first closing quote found must be the very last
// byte of `s`. For `'` a doubled `''` does not close the string; for any
// other `quote` value the closing character searched for is `"`, and a
// backslash escapes the byte that follows it.
fn quoted_terminated(s string, quote u8) bool {
	if s.len < 2 || s[0] != quote {
		return false
	}
	last := s.len - 1
	mut pos := 1
	if quote == `'` {
		for pos <= last {
			if s[pos] != `'` {
				pos++
				continue
			}
			if pos < last && s[pos + 1] == `'` {
				pos += 2
				continue
			}
			return pos == last
		}
		return false
	}
	mut escaped := false
	for pos <= last {
		ch := s[pos]
		if escaped {
			escaped = false
		} else if ch == `\\` {
			escaped = true
		} else if ch == `"` {
			return pos == last
		}
		pos++
	}
	return false
}
896
// gather_quoted_continuation completes a quoted scalar that does not end on
// its first line. `initial` is returned unchanged when it is empty, does not
// start with `"` or `'`, or is already terminated. Otherwise following lines
// are consumed whole and appended, each preceded by `\n`, until the text is
// terminated; the line breaks are left in place for later folding. Reaching
// the end of input first is an error.
fn (mut p Parser) gather_quoted_continuation(initial string) !string {
	if initial == '' {
		return initial
	}
	quote := initial[0]
	if (quote != `"` && quote != `'`) || quoted_terminated(initial, quote) {
		return initial
	}
	mut joined := initial
	for p.idx < p.lines.len {
		joined += '\n' + p.lines[p.idx]
		p.idx++
		if quoted_terminated(joined, quote) {
			return joined
		}
	}
	return error('yaml: unterminated quoted string')
}
923
// strip_comments returns `line` without its trailing comment and without
// trailing spaces or tabs.
//
// A `#` starts a comment only when all of the following hold:
//   - it is outside single- and double-quoted strings;
//   - it is outside `[...]` and `{...}`;
//   - it is the first byte of the line or is preceded by a space or tab.
// The last rule follows YAML, where a comment must be separated from the
// preceding token by whitespace, so that plain scalars such as
// `http://host/#frag` or `a#b` are not truncated.
fn strip_comments(line string) string {
	// Fast path: most lines contain no `#` at all.
	if !line.contains_u8(`#`) {
		return line.trim_right(' \t')
	}
	mut in_single := false
	mut in_double := false
	mut escape := false
	mut bracket_depth := 0
	mut brace_depth := 0
	mut i := 0
	for i < line.len {
		ch := line[i]
		if in_double {
			// Inside double quotes a backslash escapes the next byte.
			if escape {
				escape = false
			} else if ch == `\\` {
				escape = true
			} else if ch == `"` {
				in_double = false
			}
			i++
			continue
		}
		if in_single {
			if ch == `'` {
				// A doubled `''` is a literal quote, not the end of the string.
				if i + 1 < line.len && line[i + 1] == `'` {
					i += 2
					continue
				}
				in_single = false
			}
			i++
			continue
		}
		match ch {
			`"` {
				in_double = true
			}
			`'` {
				in_single = true
			}
			`[` {
				bracket_depth++
			}
			`]` {
				if bracket_depth > 0 {
					bracket_depth--
				}
			}
			`{` {
				brace_depth++
			}
			`}` {
				if brace_depth > 0 {
					brace_depth--
				}
			}
			`#` {
				after_whitespace := i == 0 || line[i - 1] == ` ` || line[i - 1] == `\t`
				if bracket_depth == 0 && brace_depth == 0 && after_whitespace {
					return line[..i].trim_right(' \t')
				}
			}
			else {}
		}

		i++
	}
	return line.trim_right(' \t')
}
994
// fold_block_scalar joins the lines of a folded (`>`) block scalar body.
// Empty strings in `lines` stand for blank lines. Two consecutive non-empty
// lines are joined with a single space; when N blank lines separate (or
// precede, or follow) content, N `\n` characters are written instead. An
// empty `lines` array yields the empty string.
//
// The result is assembled in a `strings.Builder` rather than by repeated
// string concatenation, so the cost stays linear in the size of the body.
fn fold_block_scalar(lines []string) string {
	mut capacity := 0
	for line in lines {
		capacity += line.len + 1
	}
	mut out := strings.new_builder(capacity)
	mut pending_newlines := 0
	mut started := false
	for line in lines {
		if line == '' {
			pending_newlines++
			continue
		}
		if pending_newlines > 0 {
			out.write_string('\n'.repeat(pending_newlines))
			pending_newlines = 0
		} else if started {
			// Adjacent content lines fold into one line.
			out.write_u8(` `)
		}
		out.write_string(line)
		started = true
	}
	// Blank lines after the last content line are kept as newlines.
	if pending_newlines > 0 {
		out.write_string('\n'.repeat(pending_newlines))
	}
	return out.str()
}
1019
// is_block_scalar reports whether `value` is a block-scalar header: a `|` or
// `>` followed only by `+`, `-` or decimal digits (for example `|`, `>`,
// `|-`, `>2`, `|3-`). Anything else after the first byte, such as the space
// in `> text`, makes it an ordinary scalar.
fn is_block_scalar(value string) bool {
	if value == '' {
		return false
	}
	if value[0] != `|` && value[0] != `>` {
		return false
	}
	for c in value[1..] {
		is_indicator := c == `+` || c == `-` || (c >= `0` && c <= `9`)
		if !is_indicator {
			return false
		}
	}
	return true
}
1034
// is_integer reports whether `value` parses as a 64-bit integer literal.
// Text starting with `-` is checked with `parse_int`; text starting with `+`
// has the sign removed and is checked with `parse_uint`; unsigned text is
// checked with `parse_uint` directly. Base 0 is passed in every case, leaving
// base detection to the parse function. The empty string and a lone sign are
// not integers.
fn is_integer(value string) bool {
	if value == '' {
		return false
	}
	sign := value[0]
	if sign != `+` && sign != `-` {
		value.parse_uint(0, 64) or { return false }
		return true
	}
	if value.len == 1 {
		return false
	}
	if sign == `-` {
		value.parse_int(0, 64) or { return false }
		return true
	}
	value[1..].parse_uint(0, 64) or { return false }
	return true
}
1053
// is_float reports whether `value` should be treated as a float literal: it
// must contain a `.`, an `e` or an `E`, and `strconv.atof64` must accept it.
fn is_float(value string) bool {
	if value == '' {
		return false
	}
	looks_like_float := value.contains('.') || value.contains('e') || value.contains('E')
	if !looks_like_float {
		return false
	}
	strconv.atof64(value) or { return false }
	return true
}
1064
// equals_ascii_ci reports whether `s` equals `lower_ref` when the ASCII
// letters `A`-`Z` in `s` are lower-cased. Only `s` is folded, so `lower_ref`
// has to be lowercase already; an uppercase letter in it can never match.
// No lower-cased copy of `s` is allocated.
fn equals_ascii_ci(s string, lower_ref string) bool {
	if s.len != lower_ref.len {
		return false
	}
	for i, raw in s {
		// Setting bit 0x20 maps an ASCII uppercase letter to its lowercase form.
		folded := if raw >= `A` && raw <= `Z` { raw | 0x20 } else { raw }
		if folded != lower_ref[i] {
			return false
		}
	}
	return true
}
1085
// strip_underscores returns `value` with every `_` removed, which lets
// numeric literals use `_` as a digit separator. When `value` contains no
// `_` it is returned as is, without building a new string.
fn strip_underscores(value string) string {
	if !value.contains_u8(`_`) {
		return value
	}
	return value.replace('_', '')
}
1101