From d96fe54c0a96fd6d651e9cb3c61b137ccc5c7dbe Mon Sep 17 00:00:00 2001 From: David Legrand Date: Wed, 29 Apr 2026 19:07:53 +0200 Subject: [PATCH] yaml: split module, raise conformance and performance (#27021) --- vlib/yaml/emit.v | 204 +++++ vlib/yaml/flow.v | 180 +++++ vlib/yaml/parser.v | 1100 +++++++++++++++++++++++++ vlib/yaml/path.v | 139 ++++ vlib/yaml/test_helpers.v | 19 + vlib/yaml/yaml.v | 1110 ++------------------------ vlib/yaml/yaml_conformance_test.v | 145 ++++ vlib/yaml/yaml_edge_cases_test.v | 344 ++++++++ vlib/yaml/yaml_json_roundtrip_test.v | 82 ++ vlib/yaml/yaml_test.v | 4 +- 10 files changed, 2283 insertions(+), 1044 deletions(-) create mode 100644 vlib/yaml/emit.v create mode 100644 vlib/yaml/flow.v create mode 100644 vlib/yaml/parser.v create mode 100644 vlib/yaml/path.v create mode 100644 vlib/yaml/test_helpers.v create mode 100644 vlib/yaml/yaml_conformance_test.v create mode 100644 vlib/yaml/yaml_edge_cases_test.v create mode 100644 vlib/yaml/yaml_json_roundtrip_test.v diff --git a/vlib/yaml/emit.v b/vlib/yaml/emit.v new file mode 100644 index 000000000..3ca835c49 --- /dev/null +++ b/vlib/yaml/emit.v @@ -0,0 +1,204 @@
+module yaml
+
+import strings
+
+// write_spaces pads `sb` with `n` space characters; a zero or negative `n`
+// writes nothing.
+fn write_spaces(mut sb strings.Builder, n int) {
+	mut remaining := n
+	for remaining > 0 {
+		sb.write_u8(` `)
+		remaining--
+	}
+}
+
+// emit_yaml_any writes `value` into `sb` as block-style YAML, dispatching
+// maps and arrays to their dedicated emitters (which receive `indent`) and
+// everything else to the scalar emitter.
+fn emit_yaml_any(mut sb strings.Builder, value Any, indent int) {
+	if value is map[string]Any {
+		emit_yaml_map(mut sb, value, indent)
+		return
+	}
+	if value is []Any {
+		emit_yaml_array(mut sb, value, indent)
+		return
+	}
+	emit_yaml_scalar(mut sb, value)
+}
+
+// emit_yaml_map writes `value` as a block-style YAML mapping. An empty map
+// is emitted as the inline `{}` form; otherwise each key is JSON-quoted (see
+// `write_json_escaped_string`) and nested containers indent one level deeper.
+fn emit_yaml_map(mut sb strings.Builder, value map[string]Any, indent int) {
+	if value.len == 0 {
+		sb.write_string('{}')
+		return
+	}
+	mut first := true
+	for key, item in value {
+		if !first {
+			sb.write_u8(`\n`)
+		}
+		first = false
+		write_spaces(mut sb, indent)
+		write_json_escaped_string(mut sb, key)
+		match item {
+			map[string]Any {
+				if item.len == 0 {
+					// An empty nested map must stay on the key's line: putting
+					// the unindented `{}` on the following line yields YAML
+					// that is not a valid value for this key.
+					sb.write_string(': {}')
+				} else {
+					sb.write_u8(`:`)
+					sb.write_u8(`\n`)
+					emit_yaml_map(mut sb, item, indent + 2)
+				}
+			}
+			[]Any {
+				if item.len == 0 {
+					// Same reasoning as the empty map above.
+					sb.write_string(': []')
+				} else {
+					sb.write_u8(`:`)
+					sb.write_u8(`\n`)
+					emit_yaml_array(mut sb, item, indent + 2)
+				}
+			}
+			else {
+				sb.write_string(': ')
+				emit_yaml_scalar(mut sb, item)
+			}
+		}
+	}
+}
+
+// emit_yaml_array writes `value` as a block-style YAML sequence. An empty
+// array is emitted as the inline `[]` form; otherwise each item is prefixed
+// with `- ` and nested containers indent one level deeper. Empty nested
+// containers are written inline (`- {}` / `- []`) so they remain attached to
+// their `-` marker.
+fn emit_yaml_array(mut sb strings.Builder, value []Any, indent int) {
+	if value.len == 0 {
+		sb.write_string('[]')
+		return
+	}
+	mut first := true
+	for item in value {
+		if !first {
+			sb.write_u8(`\n`)
+		}
+		first = false
+		write_spaces(mut sb, indent)
+		match item {
+			map[string]Any {
+				if item.len == 0 {
+					sb.write_string('- {}')
+				} else {
+					sb.write_u8(`-`)
+					sb.write_u8(`\n`)
+					emit_yaml_map(mut sb, item, indent + 2)
+				}
+			}
+			[]Any {
+				if item.len == 0 {
+					sb.write_string('- []')
+				} else {
+					sb.write_u8(`-`)
+					sb.write_u8(`\n`)
+					emit_yaml_array(mut sb, item, indent + 2)
+				}
+			}
+			else {
+				sb.write_string('- ')
+				emit_yaml_scalar(mut sb, item)
+			}
+		}
+	}
+}
+
+// emit_yaml_scalar writes `value` as a single YAML scalar token: strings go
+// through `write_json_escaped_string`, booleans / numbers / null print their
+// literal form. The container branch is type-required by V's exhaustive
+// `match` over `Any` but is unreachable: `emit_yaml_any` routes maps and
+// arrays to their dedicated emitters before falling back here.
+fn emit_yaml_scalar(mut sb strings.Builder, value Any) {
+	match value {
+		Null {
+			sb.write_string('null')
+		}
+		bool {
+			if value {
+				sb.write_string('true')
+			} else {
+				sb.write_string('false')
+			}
+		}
+		string {
+			write_json_escaped_string(mut sb, value)
+		}
+		int {
+			sb.write_string(value.str())
+		}
+		i64 {
+			sb.write_string(value.str())
+		}
+		u64 {
+			sb.write_string(value.str())
+		}
+		f64 {
+			sb.write_string(value.str())
+		}
+		[]Any, map[string]Any {
+			// Only here to satisfy the exhaustive match; delegate so a
+			// container passed by mistake is still emitted.
+			emit_yaml_any(mut sb, value, 0)
+		}
+	}
+}
+
+// write_json_escaped_string appends `value` to `sb` as a double-quoted JSON
+// string literal. Bytes >= 0x20 other than `"` and `\` are copied through
+// untouched (so UTF-8 sequences stay verbatim) in bulk slices; `"`, `\` and
+// the control characters with a short form (`\n \r \t \b \f`) use that short
+// escape, and every other byte below 0x20 becomes `\u00XX` in lowercase hex.
+fn write_json_escaped_string(mut sb strings.Builder, value string) {
+	hex_digits := '0123456789abcdef'
+	sb.write_u8(`"`)
+	// Index of the first byte not yet copied to `sb`.
+	mut run_start := 0
+	for idx, b in value {
+		needs_escape := b < 0x20 || b == `"` || b == `\\`
+		if !needs_escape {
+			continue
+		}
+		// Flush the pending run of safe bytes before writing the escape.
+		if idx > run_start {
+			sb.write_string(value[run_start..idx])
+		}
+		match b {
+			`"` {
+				sb.write_string('\\"')
+			}
+			`\\` {
+				sb.write_string('\\\\')
+			}
+			`\n` {
+				sb.write_string('\\n')
+			}
+			`\r` {
+				sb.write_string('\\r')
+			}
+			`\t` {
+				sb.write_string('\\t')
+			}
+			`\b` {
+				sb.write_string('\\b')
+			}
+			`\f` {
+				sb.write_string('\\f')
+			}
+			else {
+				sb.write_string('\\u00')
+				sb.write_u8(hex_digits[(b >> 4) & 0xf])
+				sb.write_u8(hex_digits[b & 0xf])
+			}
+		}
+
+		run_start = idx + 1
+	}
+	if run_start < value.len {
+		sb.write_string(value[run_start..])
+	}
+	sb.write_u8(`"`)
+}
+
+// emit_any_as_json writes `a` as a compact JSON document.
+fn emit_any_as_json(mut sb strings.Builder, a Any) {
+	match a {
+		map[string]Any {
+			// Object: `{"key":value,...}` with no whitespace between tokens.
+			sb.write_u8(`{`)
+			mut first := true
+			for key, value in a {
+				if !first {
+					sb.write_u8(`,`)
+				}
+				first = false
+				write_json_escaped_string(mut sb, key)
+				sb.write_u8(`:`)
+				emit_any_as_json(mut sb, value)
+			}
+			sb.write_u8(`}`)
+		}
+		[]Any {
+			// Array: `[v1,v2,...]`.
+			sb.write_u8(`[`)
+			mut first := true
+			for value in a {
+				if !first {
+					sb.write_u8(`,`)
+				}
+				first = false
+				emit_any_as_json(mut sb, value)
+			}
+			sb.write_u8(`]`)
+		}
+		string {
+			write_json_escaped_string(mut sb, a)
+		}
+		bool {
+			sb.write_string(if a { 'true' } else { 'false' })
+		}
+		f64 {
+			sb.write_string(a.str())
+		}
+		i64 {
+			sb.write_string(a.str())
+		}
+		int {
+			sb.write_string(a.str())
+		}
+		u64 {
+			sb.write_string(a.str())
+		}
+		Null {
+			sb.write_string('null')
+		}
+	}
+}
diff --git a/vlib/yaml/flow.v b/vlib/yaml/flow.v
new file mode 100644
index 000000000..59ac30258
--- /dev/null
+++ b/vlib/yaml/flow.v
@@ -0,0 +1,180 @@
+module yaml
+
+// FlowParser is a cursor over the text of a flow-style value (`[...]`,
+// `{...}`, or a scalar). `pos` is the byte offset of the next unread byte.
+struct FlowParser {
+	src string
+mut:
+	pos int
+}
+
+// parse_flow_value parses `src` as exactly one flow value. Anything other
+// than whitespace left after that value is reported as an error.
+fn parse_flow_value(src string) !Any {
+	mut parser := FlowParser{
+		src: src
+	}
+	value := parser.parse_value()!
+	parser.skip_space()
+	if parser.pos != parser.src.len {
+		return error('yaml: unexpected trailing flow content')
+	}
+	return value
+}
+
+// parse_value skips leading whitespace and dispatches on the next byte:
+// `[` starts an array, `{` an object, a quote a quoted string, and anything
+// else is read as a plain token and resolved through `parse_scalar`.
+fn (mut p FlowParser) parse_value() !Any {
+	p.skip_space()
+	if p.pos >= p.src.len {
+		return error('yaml: unexpected end of flow value')
+	}
+	return match p.src[p.pos] {
+		`[` { p.parse_array() }
+		`{` { p.parse_object() }
+		`"`, `'` { Any(parse_quoted_flow_string(mut p)!) }
+		else { parse_scalar(p.parse_plain_token()) }
+	}
+}
+
+// parse_array parses a flow sequence. The cursor must be on the opening `[`.
+// Items are separated by `,`; a `]` directly after a `,` is accepted.
+fn (mut p FlowParser) parse_array() !Any {
+	// Step over the opening `[`.
+	p.pos++
+	mut items := []Any{}
+	for {
+		p.skip_space()
+		if p.pos >= p.src.len {
+			return error('yaml: unterminated flow array')
+		}
+		if p.src[p.pos] == `]` {
+			p.pos++
+			break
+		}
+		items << p.parse_value()!
+		p.skip_space()
+		if p.pos >= p.src.len {
+			return error('yaml: unterminated flow array')
+		}
+		if p.src[p.pos] == `,` {
+			p.pos++
+			continue
+		}
+		if p.src[p.pos] == `]` {
+			p.pos++
+			break
+		}
+		return error('yaml: expected `,` or `]` in flow array')
+	}
+	return Any(items)
+}
+
+// parse_object parses a flow mapping. The cursor must be on the opening `{`.
+// Every entry must be `key: value`; a later entry with the same key
+// overwrites the earlier one.
+fn (mut p FlowParser) parse_object() !Any {
+	// Step over the opening `{`.
+	p.pos++
+	mut result := map[string]Any{}
+	for {
+		p.skip_space()
+		if p.pos >= p.src.len {
+			return error('yaml: unterminated flow object')
+		}
+		if p.src[p.pos] == `}` {
+			p.pos++
+			break
+		}
+		key := p.parse_key()!
+		p.skip_space()
+		if p.pos >= p.src.len || p.src[p.pos] != `:` {
+			return error('yaml: expected `:` in flow object')
+		}
+		p.pos++
+		result[key] = p.parse_value()!
+		p.skip_space()
+		if p.pos >= p.src.len {
+			return error('yaml: unterminated flow object')
+		}
+		if p.src[p.pos] == `,` {
+			p.pos++
+			continue
+		}
+		if p.src[p.pos] == `}` {
+			p.pos++
+			break
+		}
+		return error('yaml: expected `,` or `}` in flow object')
+	}
+	return Any(result)
+}
+
+// parse_key reads a flow-mapping key: a quoted string when the next byte is
+// a quote, otherwise every byte up to (not including) the next `:`, trimmed.
+// The cursor is left on the `:` (or at end of input if none was found).
+fn (mut p FlowParser) parse_key() !string {
+	p.skip_space()
+	if p.pos >= p.src.len {
+		return error('yaml: unexpected end of flow key')
+	}
+	if p.src[p.pos] in [`"`, `'`] {
+		return parse_quoted_flow_string(mut p)
+	}
+	start := p.pos
+	for p.pos < p.src.len {
+		ch := p.src[p.pos]
+		if ch == `:` {
+			break
+		}
+		p.pos++
+	}
+	return p.src[start..p.pos].trim_space()
+}
+
+// parse_plain_token reads an unquoted token and returns it trimmed. The token
+// ends at the first `,`, `]` or `}` that is not nested inside brackets/braces
+// opened within the token itself; the terminator is not consumed.
+fn (mut p FlowParser) parse_plain_token() string {
+	start := p.pos
+	mut bracket_depth := 0
+	mut brace_depth := 0
+	for p.pos < p.src.len {
+		ch := p.src[p.pos]
+		if ch == `[` {
+			bracket_depth++
+		} else if ch == `]` {
+			if bracket_depth == 0 {
+				break
+			}
+			bracket_depth--
+		} else if ch == `{` {
+			brace_depth++
+		} else if ch == `}` {
+			if brace_depth == 0 {
+				break
+			}
+			brace_depth--
+		} else if ch == `,` && bracket_depth == 0 && brace_depth == 0 {
+			break
+		}
+		p.pos++
+	}
+	return p.src[start..p.pos].trim_space()
+}
+
+// skip_space advances the cursor past any whitespace bytes.
+fn (mut p FlowParser) skip_space() {
+	for p.pos < p.src.len && p.src[p.pos].is_space() {
+		p.pos++
+	}
+}
+
+// parse_quoted_flow_string locates the end of the quoted string starting at
+// the cursor and hands the whole quoted slice (quotes included) to
+// `parse_quoted_string`. In double quotes a `\` protects the next byte; in
+// single quotes `''` is an escaped quote and does not end the string.
+fn parse_quoted_flow_string(mut p FlowParser) !string {
+	start := p.pos
+	quote := p.src[p.pos]
+	p.pos++
+	mut escape := false
+	for p.pos < p.src.len {
+		ch := p.src[p.pos]
+		if quote == `"` {
+			if escape {
+				escape = false
+			} else if ch == `\\` {
+				escape = true
+			} else if ch == `"` {
+				p.pos++
+				return parse_quoted_string(p.src[start..p.pos])
+			}
+		} else if ch == `'` {
+			if p.pos + 1 < p.src.len && p.src[p.pos + 1] == `'` {
+				p.pos += 2
+				continue
+			}
+			p.pos++
+			return parse_quoted_string(p.src[start..p.pos])
+		}
+		p.pos++
+	}
+	return error('yaml: unterminated quoted flow string')
+}
diff --git a/vlib/yaml/parser.v b/vlib/yaml/parser.v
new file mode 100644
index 000000000..3ffa55cf8
--- /dev/null
+++ b/vlib/yaml/parser.v
@@ -0,0 +1,1100 @@
+module yaml
+
+import strconv
+import strings
+
+// Parser is a line-oriented block-YAML parser. `lines` holds the document
+// split into lines, `idx` is the index of the next line to consume, `anchors`
+// maps anchor names to the values registered for them, and `directives_done`
+// is set once the first non-ignorable line has been reached.
+struct Parser {
+mut:
+	lines           []string
+	idx             int
+	anchors         map[string]Any
+	directives_done bool
+}
+
+// parse parses the node at the first non-ignorable line and returns it, or
+// `null` when there is no such line.
+fn (mut p Parser) parse() !Any {
+	p.skip_ignorable()
+	if p.idx >= p.lines.len {
+		return null
+	}
+	indent := p.line_indent(p.idx)!
+	if indent < 0 {
+		return null
+	}
+	return p.parse_node(indent)
+}
+
+// parse_node parses the node starting at the current line, provided that line
+// is indented at least `indent` columns (otherwise `null` is returned without
+// consuming anything). The checks run in order: block sequence, block mapping,
+// stand-alone alias, block scalar, flow collection, quoted scalar, and finally
+// a plain scalar that may continue on following lines.
+fn (mut p Parser) parse_node(indent int) !Any {
+	p.skip_ignorable()
+	if p.idx >= p.lines.len {
+		return null
+	}
+	current_indent := p.line_indent(p.idx)!
+	if current_indent < indent {
+		return null
+	}
+	content := p.current_content()!
+	if content.starts_with('-') && (content.len == 1 || content[1] == ` `) {
+		return p.parse_sequence(current_indent)
+	}
+	if split_mapping_entry(content).ok {
+		return p.parse_mapping(current_indent)
+	}
+	d := extract_decorators(content)
+	if d.alias != '' && d.rest == '' {
+		p.idx++
+		return p.resolve_alias(d.alias)
+	}
+	if is_block_scalar(d.rest) {
+		p.idx++
+		return p.register_anchor(d.anchor, Any(p.parse_block_scalar(current_indent, d.rest)!))
+	}
+	if d.rest.starts_with('[') || d.rest.starts_with('{') {
+		p.idx++
+		full := p.collect_flow_continuation(d.rest)!
+		return p.register_anchor(d.anchor, parse_flow_value(full)!)
+	}
+	p.idx++
+	if d.rest.len > 0 && (d.rest[0] == `"` || d.rest[0] == `'`) {
+		quoted := p.gather_quoted_continuation(d.rest)!
+		return p.register_anchor(d.anchor, parse_scalar(quoted)!)
+	}
+	folded := p.gather_plain_continuation(d.rest, current_indent)
+	return p.register_anchor(d.anchor, parse_scalar(folded)!)
+}
+
+// gather_plain_continuation extends a plain scalar with subsequent lines that
+// belong to the same node per YAML 1.2 §6.5.1: adjacent non-blank lines fold
+// to a single space, blank lines contribute literal `\n`s. The scan stops at
+// document markers, structural indicators, or a less-indented line.
+fn (mut p Parser) gather_plain_continuation(initial string, base_indent int) string {
+	mut sb := strings.new_builder(initial.len * 2)
+	sb.write_string(initial.trim_space())
+	// Number of blank / comment-only lines seen since the last content line.
+	mut blanks := 0
+	for p.idx < p.lines.len {
+		line := p.lines[p.idx]
+		trimmed_raw := line.trim_space()
+		if trimmed_raw == '' {
+			blanks++
+			p.idx++
+			continue
+		}
+		// A line holding only a comment is counted like a blank line.
+		trimmed := strip_comments(line).trim_space()
+		if trimmed == '' {
+			blanks++
+			p.idx++
+			continue
+		}
+		if trimmed == '---' || trimmed == '...' {
+			break
+		}
+		// A line whose indentation cannot be computed also ends the scalar.
+		line_indent := p.line_indent(p.idx) or { break }
+		if line_indent < base_indent {
+			break
+		}
+		if trimmed.starts_with('- ') || trimmed == '-' {
+			break
+		}
+		if split_mapping_entry(trimmed).ok {
+			break
+		}
+		if blanks > 0 {
+			for _ in 0 .. blanks {
+				sb.write_u8(`\n`)
+			}
+			blanks = 0
+		} else {
+			sb.write_u8(` `)
+		}
+		sb.write_string(trimmed)
+		p.idx++
+	}
+	return sb.str()
+}
+
+// parse_mapping parses consecutive `key: value` lines that sit exactly at
+// column `indent`. It stops at the first less-indented line and errors on a
+// more-indented line or on a line that is not a mapping entry.
+fn (mut p Parser) parse_mapping(indent int) !Any {
+	mut result := map[string]Any{}
+	for p.idx < p.lines.len {
+		p.skip_ignorable()
+		if p.idx >= p.lines.len {
+			break
+		}
+		current_indent := p.line_indent(p.idx)!
+		if current_indent < indent {
+			break
+		}
+		if current_indent > indent {
+			return error('yaml: unexpected indentation on line ${p.idx + 1}')
+		}
+		content := p.current_content()!
+		entry := split_mapping_entry(content)
+		if !entry.ok {
+			return error('yaml: expected a mapping entry on line ${p.idx + 1}')
+		}
+		p.idx++
+		result[entry.key] = p.parse_mapping_value(entry.rest, indent)!
+	}
+	return Any(result)
+}
+
+// parse_mapping_value resolves the text that followed `key:` on a mapping
+// line. The key's line has already been consumed; `indent` is the column of
+// the key. An empty value means "nested node on the following, deeper lines"
+// (or `null` when the next line is not deeper). Any anchor on the value is
+// registered before returning.
+fn (mut p Parser) parse_mapping_value(rest_in string, indent int) !Any {
+	d := extract_decorators(rest_in)
+	if d.alias != '' && d.rest == '' {
+		return p.resolve_alias(d.alias)
+	}
+	value := if d.rest == '' {
+		next_indent := p.peek_next_indent()
+		if next_indent > indent {
+			p.parse_node(next_indent)!
+		} else {
+			null
+		}
+	} else if is_block_scalar(d.rest) {
+		Any(p.parse_block_scalar(indent, d.rest)!)
+	} else if d.rest.starts_with('[') || d.rest.starts_with('{') {
+		full := p.collect_flow_continuation(d.rest)!
+		parse_flow_value(full)!
+	} else if d.rest.len > 0 && (d.rest[0] == `"` || d.rest[0] == `'`) {
+		quoted := p.gather_quoted_continuation(d.rest)!
+		parse_scalar(quoted)!
+	} else {
+		parse_scalar(d.rest)!
+	}
+	return p.register_anchor(d.anchor, value)
+}
+
+// parse_sequence parses consecutive `- item` lines that sit exactly at column
+// `indent`. It stops at a less-indented line or at a line that is not a
+// sequence entry, and errors on a more-indented line.
+fn (mut p Parser) parse_sequence(indent int) !Any {
+	mut items := []Any{}
+	for p.idx < p.lines.len {
+		p.skip_ignorable()
+		if p.idx >= p.lines.len {
+			break
+		}
+		current_indent := p.line_indent(p.idx)!
+		if current_indent < indent {
+			break
+		}
+		if current_indent > indent {
+			return error('yaml: unexpected indentation on line ${p.idx + 1}')
+		}
+		content := p.current_content()!
+		if !content.starts_with('-') || (content.len > 1 && content[1] != ` `) {
+			break
+		}
+		rest := if content.len == 1 { '' } else { content[1..].trim_space() }
+		p.idx++
+		items << p.parse_sequence_item(rest, indent)!
+	}
+	return Any(items)
+}
+
+// parse_sequence_item resolves the text that followed `- ` on a sequence
+// line. The item's line has already been consumed; `indent` is the column of
+// the `-`. When the text is itself a `key: value` entry the item is a mapping
+// whose remaining keys are expected at column `indent + 2`.
+fn (mut p Parser) parse_sequence_item(rest_in string, indent int) !Any {
+	d := extract_decorators(rest_in)
+	if d.alias != '' && d.rest == '' {
+		return p.resolve_alias(d.alias)
+	}
+	value := if d.rest == '' {
+		next_indent := p.peek_next_indent()
+		if next_indent > indent {
+			p.parse_node(next_indent)!
+		} else {
+			null
+		}
+	} else if is_block_scalar(d.rest) {
+		Any(p.parse_block_scalar(indent, d.rest)!)
+	} else if d.rest.starts_with('[') || d.rest.starts_with('{') {
+		full := p.collect_flow_continuation(d.rest)!
+		parse_flow_value(full)!
+	} else {
+		entry := split_mapping_entry(d.rest)
+		if entry.ok {
+			mut result := map[string]Any{}
+			// NOTE(review): the key column is assumed to be `-` plus one
+			// space; other spacings after `-` are reported as an
+			// indentation error below.
+			child_indent := indent + 2
+			result[entry.key] = p.parse_mapping_value(entry.rest, child_indent)!
+			for p.idx < p.lines.len {
+				p.skip_ignorable()
+				if p.idx >= p.lines.len {
+					break
+				}
+				current_indent := p.line_indent(p.idx)!
+				if current_indent <= indent {
+					break
+				}
+				if current_indent != child_indent {
+					return error('yaml: unexpected indentation on line ${p.idx + 1}')
+				}
+				content := p.current_content()!
+				next_entry := split_mapping_entry(content)
+				if !next_entry.ok {
+					break
+				}
+				p.idx++
+				result[next_entry.key] = p.parse_mapping_value(next_entry.rest, child_indent)!
+			}
+			Any(result)
+		} else if d.rest.len > 0 && (d.rest[0] == `"` || d.rest[0] == `'`) {
+			quoted := p.gather_quoted_continuation(d.rest)!
+			parse_scalar(quoted)!
+		} else {
+			parse_scalar(d.rest)!
+		}
+	}
+	return p.register_anchor(d.anchor, value)
+}
+
+// resolve_alias returns the value registered under anchor `name`, or `null`
+// when no such anchor has been registered.
+fn (p &Parser) resolve_alias(name string) Any {
+	return p.anchors[name] or { null }
+}
+
+// register_anchor associates `value` with `anchor` when the latter is set,
+// then returns `value` so call sites can `return p.register_anchor(...)`
+// in a single statement instead of branching.
+fn (mut p Parser) register_anchor(anchor string, value Any) Any {
+	if anchor != '' {
+		p.anchors[anchor] = value
+	}
+	return value
+}
+
+// parse_block_scalar parses a `|` (literal) or `>` (folded) block, honoring
+// the optional chomp indicator from the header (`-` strip, `+` keep, default
+// clip). Indent indicators (`|2`) are tolerated but ignored — the block's
+// indentation is auto-detected as the smallest indentation found among the
+// body's non-blank lines.
+fn (mut p Parser) parse_block_scalar(parent_indent int, header string) !string {
+	style, chomp := parse_block_header(header)
+	start := p.idx
+	// First pass (read-only): find the smallest indentation among the
+	// non-blank lines that are deeper than `parent_indent`. -1 = none found.
+	mut min_indent := -1
+	for i := start; i < p.lines.len; i++ {
+		line := p.lines[i]
+		if line.trim_space() == '' {
+			continue
+		}
+		line_indent := p.line_indent(i)!
+		if line_indent <= parent_indent {
+			break
+		}
+		if min_indent == -1 || line_indent < min_indent {
+			min_indent = line_indent
+		}
+	}
+	if min_indent == -1 {
+		// Body is entirely empty/blank. Keep chomp (`+`) still preserves the
+		// implicit trailing line breaks; strip and clip yield the empty string.
+		if chomp != `+` {
+			return ''
+		}
+		mut blanks := 0
+		for p.idx < p.lines.len {
+			line := p.lines[p.idx]
+			if line.trim_space() != '' {
+				break
+			}
+			blanks++
+			p.idx++
+		}
+		if blanks == 0 {
+			blanks = 1
+		}
+		return '\n'.repeat(blanks)
+	}
+	// Second pass: consume the body, dropping the first `min_indent` bytes
+	// of every content line. Blank lines are kept as empty strings.
+	mut lines := []string{}
+	for p.idx < p.lines.len {
+		line := p.lines[p.idx]
+		if line.trim_space() == '' {
+			lines << ''
+			p.idx++
+			continue
+		}
+		line_indent := p.line_indent(p.idx)!
+		if line_indent <= parent_indent {
+			break
+		}
+		if line.len <= min_indent {
+			lines << ''
+		} else {
+			lines << line[min_indent..]
+		}
+		p.idx++
+	}
+	// Trailing blank lines are removed from the body and counted, so that
+	// `apply_chomp` can decide how many of them to restore.
+	mut stripped_trailing := 0
+	for lines.len > 0 && lines[lines.len - 1] == '' {
+		stripped_trailing++
+		lines.delete(lines.len - 1)
+	}
+	body := if style == `|` { lines.join('\n') } else { fold_block_scalar(lines) }
+	return apply_chomp(body, chomp, stripped_trailing)
+}
+
+// FlowBalance tracks, across one or more scanned strings, how many `[` / `{`
+// are still open and whether the scan is currently inside a single- or
+// double-quoted string. `escape` is set while the previous byte inside a
+// double-quoted string was an unescaped `\`.
+struct FlowBalance {
+mut:
+	bracket   int
+	brace     int
+	in_single bool
+	in_double bool
+	escape    bool
+}
+
+// collect_flow_continuation gathers subsequent lines into `initial` until the
+// `[` / `{` brackets and the active quoted strings are all balanced. YAML 1.2
+// allows flow collections to span lines; without this the parser would reject
+// anything that wraps. The returned text is what `parse_flow_value` receives.
+fn (mut p Parser) collect_flow_continuation(initial string) !string {
+	mut bal := FlowBalance{}
+	bal.scan(initial)
+	// Fast path: everything already closes on the first line.
+	if !bal.unbalanced() {
+		return initial
+	}
+	mut sb := strings.new_builder(initial.len * 2)
+	sb.write_string(initial)
+	for p.idx < p.lines.len && bal.unbalanced() {
+		line := p.lines[p.idx]
+		// Inside an open quoted string a `#` is content, so comments are
+		// only stripped when no quote is currently open.
+		segment := if bal.in_single || bal.in_double { line } else { strip_comments(line) }
+		trimmed := segment.trim_space()
+		if trimmed == '' && !bal.in_single && !bal.in_double {
+			p.idx++
+			continue
+		}
+		// Continuation lines are joined to the previous text with one space.
+		sb.write_u8(` `)
+		sb.write_string(trimmed)
+		bal.scan(trimmed)
+		p.idx++
+	}
+	if bal.unbalanced() {
+		return error('yaml: unterminated flow collection')
+	}
+	return sb.str()
+}
+
+// unbalanced reports whether a bracket, brace or quoted string is still open.
+fn (b &FlowBalance) unbalanced() bool {
+	return b.bracket > 0 || b.brace > 0 || b.in_single || b.in_double
+}
+
+// scan updates the balance with the bytes of `s`. State carries over from
+// previous calls, so a construct opened in one string can be closed by a
+// later one. Brackets and braces are only counted outside quoted strings.
+fn (mut b FlowBalance) scan(s string) {
+	for i := 0; i < s.len; i++ {
+		ch := s[i]
+		if b.in_double {
+			if b.escape {
+				b.escape = false
+			} else if ch == `\\` {
+				b.escape = true
+			} else if ch == `"` {
+				b.in_double = false
+			}
+			continue
+		}
+		if b.in_single {
+			if ch == `'` {
+				if i + 1 < s.len && s[i + 1] == `'` {
+					// `''` is an escaped quote: skip both bytes (this `i++`
+					// plus the loop's own increment).
+					i++
+					continue
+				}
+				b.in_single = false
+			}
+			continue
+		}
+		match ch {
+			`"` { b.in_double = true }
+			`'` { b.in_single = true }
+			`[` { b.bracket++ }
+			`]` { b.bracket-- }
+			`{` { b.brace++ }
+			`}` { b.brace-- }
+			else {}
+		}
+	}
+}
+
+// parse_block_header reads the `|`/`>` style and the optional `+`/`-` chomp
+// indicator from a block-scalar header like `|`, `|-`, `|+`, `>2-`. The
+// second return value is 0 when the header carries no chomp indicator; an
+// empty header is treated as `|` with no chomp indicator.
+fn parse_block_header(s string) (u8, u8) {
+	if s == '' {
+		return `|`, 0
+	}
+	style := s[0]
+	mut chomp := u8(0)
+	for i := 1; i < s.len; i++ {
+		c := s[i]
+		if c == `+` || c == `-` {
+			chomp = c
+		}
+	}
+	return style, chomp
+}
+
+// apply_chomp rewrites the block body's trailing line breaks according to the
+// block chomping indicator (YAML 1.2 spec §8.1.1.2): strip removes all
+// trailing newlines, clip keeps a single trailing newline when the body is
+// non-empty, keep preserves every original trailing newline.
+fn apply_chomp(body string, chomp u8, stripped_trailing int) string {
+	return match chomp {
+		`-` {
+			body
+		}
+		`+` {
+			// One newline ends the last content line, then one more for
+			// each trailing blank line that was stripped from the body.
+			clipped := if body == '' { '' } else { body + '\n' }
+			clipped + '\n'.repeat(stripped_trailing)
+		}
+		else {
+			if body == '' {
+				''
+			} else {
+				body + '\n'
+			}
+		}
+	}
+}
+
+// is_ignorable_line returns true when `trimmed` is content the parser must
+// skip over: blank lines, document markers, and directive lines that occur
+// before the first body line. Directives (`%YAML`, `%TAG`, …) become plain
+// text once `directives_done` flips, so the skip is conditional on that flag.
+fn (p &Parser) is_ignorable_line(trimmed string) bool {
+	return trimmed == '' || trimmed == '---' || trimmed == '...'
+		|| (trimmed.starts_with('%') && !p.directives_done)
+}
+
+// skip_ignorable advances `idx` to the next line that carries content and
+// marks directives as done once such a line is found. Comment-only lines
+// count as blank because comments are stripped before the check.
+fn (mut p Parser) skip_ignorable() {
+	for p.idx < p.lines.len {
+		line := p.lines[p.idx]
+		trimmed := strip_comments(line).trim_space()
+		if p.is_ignorable_line(trimmed) {
+			p.idx++
+			continue
+		}
+		// `--- ` (or `---\t…`) folds the document marker away and
+		// keeps the inline content as the document body at column 0.
+		if (line.starts_with('--- ') || line.starts_with('---\t')) && line.len > 4 {
+			p.lines[p.idx] = line[4..]
+		}
+		p.directives_done = true
+		break
+	}
+}
+
+// peek_next_indent returns the indentation of the next non-ignorable line at
+// or after `idx` without consuming anything. It returns -1 when there is no
+// such line or when its indentation cannot be computed.
+fn (p &Parser) peek_next_indent() int {
+	mut i := p.idx
+	for i < p.lines.len {
+		line := p.lines[i]
+		trimmed := strip_comments(line).trim_space()
+		if p.is_ignorable_line(trimmed) {
+			i++
+			continue
+		}
+		return p.line_indent(i) or { -1 }
+	}
+	return -1
+}
+
+// current_content returns the current line without its leading indentation,
+// trailing comment and surrounding whitespace.
+fn (p &Parser) current_content() !string {
+	line := p.lines[p.idx]
+	indent := p.line_indent(p.idx)!
+	if line.len <= indent {
+		return ''
+	}
+	return strip_comments(line[indent..]).trim_space()
+}
+
+// line_indent returns the number of leading spaces on line `index`. A tab
+// immediately following those spaces is rejected with an error.
+fn (p &Parser) line_indent(index int) !int {
+	line := p.lines[index]
+	mut indent := 0
+	for indent < line.len && line[indent] == ` ` {
+		indent++
+	}
+	if indent < line.len && line[indent] == `\t` {
+		return error('yaml: tabs are not supported for indentation on line ${index + 1}')
+	}
+	return indent
+}
+
+// MappingEntry is the result of `split_mapping_entry`: `ok` tells whether the
+// text was a `key: value` entry; when it is, `key` is the resolved key and
+// `rest` the trimmed text after the `:`.
+struct MappingEntry {
+	key  string
+	rest string
+	ok   bool
+}
+
+// split_mapping_entry splits `content` at the first `:` that is outside any
+// quoted string and outside any `[...]` / `{...}`, and that is followed by
+// whitespace or ends the text. It returns a zero `MappingEntry` (`ok` false)
+// when no such `:` exists, when the key text is empty, or when the key fails
+// to parse.
+fn split_mapping_entry(content string) MappingEntry {
+	mut in_single := false
+	mut in_double := false
+	mut escape := false
+	mut bracket_depth := 0
+	mut brace_depth := 0
+	mut i := 0
+	for i < content.len {
+		ch := content[i]
+		if in_double {
+			if escape {
+				escape = false
+			} else if ch == `\\` {
+				escape = true
+			} else if ch == `"` {
+				in_double = false
+			}
+			i++
+			continue
+		}
+		if in_single {
+			if ch == `'` {
+				// `''` is an escaped quote inside a single-quoted string.
+				if i + 1 < content.len && content[i + 1] == `'` {
+					i += 2
+					continue
+				}
+				in_single = false
+			}
+			i++
+			continue
+		}
+		match ch {
+			`"` {
+				in_double = true
+			}
+			`'` {
+				in_single = true
+			}
+			`[` {
+				bracket_depth++
+			}
+			`]` {
+				if bracket_depth > 0 {
+					bracket_depth--
+				}
+			}
+			`{` {
+				brace_depth++
+			}
+			`}` {
+				if brace_depth > 0 {
+					brace_depth--
+				}
+			}
+			`:` {
+				if bracket_depth == 0 && brace_depth == 0
+					&& (i + 1 == content.len || content[i + 1].is_space()) {
+					key_text := content[..i].trim_space()
+					if key_text == '' {
+						return MappingEntry{}
+					}
+					return MappingEntry{
+						key:  parse_key(key_text) or { return MappingEntry{} }
+						rest: if i + 1 < content.len {
+							content[i + 1..].trim_space()
+						} else {
+							''
+						}
+						ok:   true
+					}
+				}
+			}
+			else {}
+		}
+
+		i++
+	}
+	return MappingEntry{}
+}
+
+// strip_node_decorators removes the leading decorators — an anchor (`&id`),
+// an alias (`*id`), a tag (`!Type` or `!!Type`), or a sequence of them — from
+// a YAML node's text and returns what is left. The decorator names are
+// discarded here; callers that need the anchor or alias name call
+// `extract_decorators` directly.
+fn strip_node_decorators(s string) string {
+	return extract_decorators(s).rest
+}
+
+// Decorators carries the anchor / alias / remaining content split off a node.
+struct Decorators {
+	anchor string
+	alias  string
+	rest   string
+}
+
+// extract_decorators peels leading anchor / alias / tag decorators off `s` and
+// returns the anchor name, alias name, and remaining content. When several
+// anchors (or aliases) appear, the last one wins; tags are stripped without
+// being returned.
+fn extract_decorators(s string) Decorators {
+	mut anchor := ''
+	mut alias := ''
+	mut out := s.trim_left(' \t')
+	for {
+		// A decorator needs its marker plus at least one more byte.
+		if out.len < 2 {
+			break
+		}
+		c := out[0]
+		if (c == `&` || c == `*`) && out[1] != ` ` && out[1] != `\t` {
+			// The name runs up to the next space or tab.
+			mut i := 1
+			for i < out.len && out[i] != ` ` && out[i] != `\t` {
+				i++
+			}
+			name := out[1..i]
+			if c == `&` {
+				anchor = name
+			} else {
+				alias = name
+			}
+			out = out[i..].trim_left(' \t')
+			continue
+		}
+		if c == `!` {
+			mut i := 1
+			if i < out.len && out[i] == `!` {
+				i++
+			}
+			for i < out.len && out[i] != ` ` && out[i] != `\t` {
+				i++
+			}
+			out = out[i..].trim_left(' \t')
+			continue
+		}
+		break
+	}
+	return Decorators{
+		anchor: anchor
+		alias:  alias
+		rest:   out
+	}
+}
+
+// parse_key resolves a mapping-key token: drops anchor / tag decorators,
+// unquotes the result if surrounded by matching `"` or `'` quotes, and
+// returns the cleaned key string otherwise.
+fn parse_key(src string) !string {
+	cleaned := strip_node_decorators(src)
+	if cleaned.len >= 2 && ((cleaned[0] == `"` && cleaned[cleaned.len - 1] == `"`)
+		|| (cleaned[0] == `'` && cleaned[cleaned.len - 1] == `'`)) {
+		return parse_quoted_string(cleaned)
+	}
+	return cleaned.trim_space()
+}
+
+// parse_scalar resolves a scalar token to a typed value: quoted strings
+// unquote, the case-insensitive keywords `null`/`~`, `true`/`yes`/`on`
+// and `false`/`no`/`off` produce the matching constants, integer and float
+// literals (with `_` digit separators) parse to `i64`/`u64`/`f64`, and any
+// other text falls back to a plain string. An empty token yields the empty
+// string. Leading decorators (anchor / alias / tag) are dropped first.
+fn parse_scalar(text string) !Any {
+	value := strip_node_decorators(text).trim_space()
+	if value == '' {
+		return Any('')
+	}
+	if value.len >= 2 && ((value[0] == `"` && value[value.len - 1] == `"`)
+		|| (value[0] == `'` && value[value.len - 1] == `'`)) {
+		return Any(parse_quoted_string(value)!)
+	}
+	// Keyword check: only strings of length 1..5 can match `~`, `null`, `true`,
+	// `yes`, `on`, `false`, `no`, `off`. Length-bound first to skip the
+	// allocation of `to_lower()` for every plain scalar (the overwhelmingly
+	// common case in real documents).
+	if value.len <= 5 {
+		if value.len == 1 && value[0] == `~` {
+			return null
+		}
+		if equals_ascii_ci(value, 'null') {
+			return null
+		}
+		if equals_ascii_ci(value, 'true') || equals_ascii_ci(value, 'yes')
+			|| equals_ascii_ci(value, 'on') {
+			return Any(true)
+		}
+		if equals_ascii_ci(value, 'false') || equals_ascii_ci(value, 'no')
+			|| equals_ascii_ci(value, 'off') {
+			return Any(false)
+		}
+	}
+	numeric := strip_underscores(value)
+	if is_integer(numeric) {
+		// Negative literals become signed; everything else is unsigned.
+		if numeric.starts_with('-') {
+			return Any(numeric.parse_int(0, 64)!)
+		}
+		if numeric.starts_with('+') {
+			return Any(numeric[1..].parse_uint(0, 64)!)
+		}
+		return Any(numeric.parse_uint(0, 64)!)
+	}
+	if is_float(numeric) {
+		return Any(strconv.atof64(numeric)!)
+	}
+	return Any(value)
+}
+
+// parse_quoted_string unquotes a YAML scalar wrapped in matching `"` or `'`
+// quotes. Single-quoted strings only undouble `''` to `'`. Double-quoted
+// strings honor the YAML 1.2 §5.7 escape set (`\b \f \n \r \t \" \\ \/
+// \uXXXX`); any other backslash sequence is rejected as malformed.
+// Line folding (`fold_quoted_inner`) is applied to the body before escapes
+// are resolved.
+fn parse_quoted_string(src string) !string {
+	if src.len < 2 {
+		return error('yaml: invalid quoted string')
+	}
+	quote := src[0]
+	inner := fold_quoted_inner(src[1..src.len - 1])
+	if quote == `'` {
+		// Single-quoted strings only undouble `''`. When the body has no
+		// doubled quote, return the slice as-is — `inner` already shares the
+		// source buffer, so this avoids a `replace` allocation.
+		if !inner.contains("''") {
+			return inner
+		}
+		return inner.replace("''", "'")
+	}
+	// Double-quoted fast path: if there's no `\` in the body, no escape
+	// resolution is needed — `inner` is the final value verbatim.
+	if !inner.contains_u8(`\\`) {
+		return inner
+	}
+	mut out := []u8{cap: inner.len}
+	mut i := 0
+	for i < inner.len {
+		ch := inner[i]
+		if ch != `\\` {
+			out << ch
+			i++
+			continue
+		}
+		// `ch` is a backslash: move to the escape character itself.
+		i++
+		if i >= inner.len {
+			return error('yaml: invalid escape sequence')
+		}
+		esc := inner[i]
+		match esc {
+			`"`, `\\`, `/` {
+				out << esc
+			}
+			`b` {
+				out << `\b`
+			}
+			`f` {
+				out << `\f`
+			}
+			`n` {
+				out << `\n`
+			}
+			`r` {
+				out << `\r`
+			}
+			`t` {
+				out << `\t`
+			}
+			`u` {
+				// `i` is on the `u`; four hex digits must follow it.
+				if i + 4 >= inner.len {
+					return error('yaml: invalid unicode escape')
+				}
+				code := inner[i + 1..i + 5]
+				r := rune(code.parse_uint(16, 32)!)
+				out << r.str().bytes()
+				i += 4
+			}
+			else {
+				return error('yaml: unknown escape sequence \\${rune(esc).str()}')
+			}
+		}
+
+		i++
+	}
+	return out.bytestr()
+}
+
+// fold_quoted_inner applies YAML 1.2 §7.3 line folding rules to the body of a
+// quoted scalar (the chars between the opening and closing quote): adjacent
+// non-blank content lines fold to a single space, runs of N consecutive line
+// breaks collapse to N-1 literal newlines, and leading whitespace inside
+// continuation lines is stripped. A leading or trailing empty line folds to a
+// single space (§7.3.1).
+fn fold_quoted_inner(inner string) string {
+	// Single-line bodies need no folding.
+	if !inner.contains_u8(`\n`) {
+		return inner
+	}
+	lines := inner.split('\n')
+	n := lines.len
+	mut trimmed := []string{cap: n}
+	for i := 0; i < n; i++ {
+		line := lines[i]
+		// The first line keeps its leading whitespace and the last line its
+		// trailing whitespace (both sit next to a quote); interior lines are
+		// trimmed on both sides.
+		if i == 0 && i != n - 1 {
+			trimmed << line.trim_right(' \t')
+		} else if i == n - 1 && i != 0 {
+			trimmed << line.trim_left(' \t')
+		} else {
+			trimmed << line.trim(' \t')
+		}
+	}
+	has_pre := trimmed[0] == ''
+	has_post := trimmed[n - 1] == ''
+	mut sb := strings.new_builder(inner.len)
+	if has_pre {
+		sb.write_u8(` `)
+	}
+	start := if has_pre { 1 } else { 0 }
+	end := if has_post { n - 1 } else { n }
+	// Empty lines seen since the last content line.
+	mut blanks := 0
+	mut wrote := false
+	for i := start; i < end; i++ {
+		f := trimmed[i]
+		if f == '' {
+			blanks++
+			continue
+		}
+		if wrote {
+			if blanks == 0 {
+				sb.write_u8(` `)
+			} else {
+				for _ in 0 .. blanks {
+					sb.write_u8(`\n`)
+				}
+			}
+		}
+		blanks = 0
+		sb.write_string(f)
+		wrote = true
+	}
+	if has_post {
+		sb.write_u8(` `)
+	}
+	return sb.str()
+}
+
+// quoted_terminated reports whether `s` (which starts with a `quote` byte)
+// ends with the matching closing quote, taking single-quote `''` doubling and
+// double-quote `\` escapes into account.
+fn quoted_terminated(s string, quote u8) bool {
+	if s.len < 2 || s[0] != quote {
+		return false
+	}
+	mut i := 1
+	if quote == `'` {
+		for i < s.len {
+			if s[i] == `'` {
+				// `''` is an escaped quote, not a terminator.
+				if i + 1 < s.len && s[i + 1] == `'` {
+					i += 2
+					continue
+				}
+				// The first real closing quote must be the last byte.
+				return i == s.len - 1
+			}
+			i++
+		}
+		return false
+	}
+	mut esc := false
+	for i < s.len {
+		ch := s[i]
+		if esc {
+			esc = false
+			i++
+			continue
+		}
+		if ch == `\\` {
+			esc = true
+			i++
+			continue
+		}
+		if ch == `"` {
+			return i == s.len - 1
+		}
+		i++
+	}
+	return false
+}
+
+// gather_quoted_continuation accumulates subsequent lines into a quoted scalar
+// that doesn't terminate on its first line. The returned string still wraps
+// the original line breaks; `fold_quoted_inner` collapses them later.
+// Returns an error when the input ends before the closing quote is found.
+fn (mut p Parser) gather_quoted_continuation(initial string) !string {
+	if initial == '' {
+		return initial
+	}
+	quote := initial[0]
+	if quote != `"` && quote != `'` {
+		return initial
+	}
+	if quoted_terminated(initial, quote) {
+		return initial
+	}
+	mut buf := initial.bytes()
+	for p.idx < p.lines.len {
+		buf << `\n`
+		buf << p.lines[p.idx].bytes()
+		p.idx++
+		snap := buf.bytestr()
+		if quoted_terminated(snap, quote) {
+			return snap
+		}
+	}
+	return error('yaml: unterminated quoted string')
+}
+
+// strip_comments returns `line` without its trailing `#` comment and without
+// trailing spaces / tabs. A `#` inside a quoted string or inside `[...]` /
+// `{...}` is treated as content and does not start a comment.
+fn strip_comments(line string) string {
+	// Fast path: the overwhelming majority of YAML lines have no `#`.
+	if !line.contains_u8(`#`) {
+		return line.trim_right(' \t')
+	}
+	mut in_single := false
+	mut in_double := false
+	mut escape := false
+	mut bracket_depth := 0
+	mut brace_depth := 0
+	mut i := 0
+	for i < line.len {
+		ch := line[i]
+		if in_double {
+			if escape {
+				escape = false
+			} else if ch == `\\` {
+				escape = true
+			} else if ch == `"` {
+				in_double = false
+			}
+			i++
+			continue
+		}
+		if in_single {
+			if ch == `'` {
+				if i + 1 < line.len && line[i + 1] == `'` {
+					i += 2
+					continue
+				}
+				in_single = false
+			}
+			i++
+			continue
+		}
+		match ch {
+			`"` {
+				in_double = true
+			}
+			`'` {
+				in_single = true
+			}
+			`[` {
+				bracket_depth++
+			}
+			`]` {
+				if bracket_depth > 0 {
+					bracket_depth--
+				}
+			}
+			`{` {
+				brace_depth++
+			}
+			`}` {
+				if brace_depth > 0 {
+					brace_depth--
+				}
+			}
+			`#` {
+				if bracket_depth == 0 && brace_depth == 0 {
+					return line[..i].trim_right(' \t')
+				}
+			}
+			else {}
+		}
+
+		i++
+	}
+	return line.trim_right(' \t')
+}
+
+// fold_block_scalar joins the lines of a folded (`>`) block scalar: adjacent
+// non-empty lines are separated by one space, and each empty line between (or
+// before, or after) content lines contributes one `\n`. The result is built
+// in a `strings.Builder` so the cost stays linear in the size of the block.
+fn fold_block_scalar(lines []string) string {
+	mut sb := strings.new_builder(lines.len * 16)
+	mut pending_newlines := 0
+	mut started := false
+	for line in lines {
+		if line == '' {
+			pending_newlines++
+			continue
+		}
+		if started && pending_newlines == 0 {
+			// Two content lines directly after one another fold to a space.
+			sb.write_u8(` `)
+		} else {
+			// Empty lines seen since the previous content line (or since the
+			// start of the block) become literal line breaks.
+			for _ in 0 .. pending_newlines {
+				sb.write_u8(`\n`)
+			}
+		}
+		sb.write_string(line)
+		started = true
+		pending_newlines = 0
+	}
+	for _ in 0 .. pending_newlines {
+		sb.write_u8(`\n`)
+	}
+	return sb.str()
+}
+
+// is_block_scalar reports whether `value` is a block-scalar header: `|` or
+// `>` optionally followed only by chomp indicators and digits.
+fn is_block_scalar(value string) bool {
+	if value == '' || (value[0] != `|` && value[0] != `>`) {
+		return false
+	}
+	// Allow `|`, `>`, `|-`, `|+`, `>2`, `|3-`, etc. Any other char rules it out
+	// (e.g. `> something` is just a plain scalar starting with `>`).
+	for i := 1; i < value.len; i++ {
+		c := value[i]
+		if c != `+` && c != `-` && !(c >= `0` && c <= `9`) {
+			return false
+		}
+	}
+	return true
+}
+
+// is_integer reports whether `value` parses as a 64-bit integer literal:
+// negative values through `parse_int`, everything else (with an optional
+// leading `+`) through `parse_uint`, both with base auto-detection.
+fn is_integer(value string) bool {
+	if value == '' {
+		return false
+	}
+	if value[0] in [`+`, `-`] {
+		// A lone sign is not a number.
+		if value.len == 1 {
+			return false
+		}
+		if value[0] == `-` {
+			value.parse_int(0, 64) or { return false }
+			return true
+		}
+		value[1..].parse_uint(0, 64) or { return false }
+		return true
+	}
+	value.parse_uint(0, 64) or { return false }
+	return true
+}
+
+// is_float reports whether `value` contains a `.` or an exponent marker and
+// is accepted by `strconv.atof64`.
+fn is_float(value string) bool {
+	if value == '' {
+		return false
+	}
+	if !value.contains('.') && !value.contains('e') && !value.contains('E') {
+		return false
+	}
+	strconv.atof64(value) or { return false }
+	return true
+}
+
+// equals_ascii_ci reports whether `s` equals `lower_ref` byte-for-byte once
+// ASCII letters in `s` are lower-cased. `lower_ref` MUST already be lowercase
+// ASCII; mixing case in it silently breaks the comparison. Used by
+// `parse_scalar` to recognize boolean / null keywords without allocating a
+// lower-cased copy of every plain scalar in the document.
+fn equals_ascii_ci(s string, lower_ref string) bool {
+	if s.len != lower_ref.len {
+		return false
+	}
+	for i := 0; i < s.len; i++ {
+		mut c := s[i]
+		if c >= `A` && c <= `Z` {
+			// Setting bit 0x20 maps an ASCII uppercase letter to lowercase.
+			c |= 0x20
+		}
+		if c != lower_ref[i] {
+			return false
+		}
+	}
+	return true
+}
+
+// strip_underscores removes `_` digit separators from a numeric literal in a
+// single pass. Returns `value` unchanged when no `_` is present, avoiding an
+// allocation on the common case.
+fn strip_underscores(value string) string { + if !value.contains_u8(`_`) { + return value + } + mut out := []u8{cap: value.len} + for c in value { + if c != `_` { + out << c + } + } + return out.bytestr() +} diff --git a/vlib/yaml/path.v b/vlib/yaml/path.v new file mode 100644 index 000000000..4a655520a --- /dev/null +++ b/vlib/yaml/path.v @@ -0,0 +1,139 @@ +module yaml + +import time +import x.json2 + +fn (a Any) value_(current Any, key []string) ?Any { + if key.len == 0 { + return none + } + k, index := parse_array_key(key[0]) + value := match current { + []Any { + if k != '' { + return none + } + current[index] or { return none } + } + map[string]Any { + v := current[k] or { return none } + if index > -1 { + if v is []Any { + v[index] or { return none } + } else { + return none + } + } else { + v + } + } + else { + return none + } + } + + if key.len <= 1 { + return value + } + return match value { + []Any, map[string]Any { a.value_(value, key[1..]) } + else { none } + } +} + +fn parse_dotted_key(key string) ![]string { + mut out := []string{} + mut buf := '' + mut in_string := false + mut delimiter := u8(` `) + for ch in key { + if ch in [`"`, `'`] { + if !in_string { + delimiter = ch + in_string = true + continue + } + if ch == delimiter { + in_string = false + if buf != '' { + out << buf + } + buf = '' + delimiter = ` ` + continue + } + } + buf += ch.ascii_str() + if !in_string && ch == `.` { + buf = buf[..buf.len - 1] + if buf != '' { + out << buf + } + buf = '' + } + } + if buf != '' { + out << buf + } + if in_string { + return error('yaml: missing closing string delimiter `${delimiter.ascii_str()}`') + } + return out +} + +fn parse_array_key(key string) (string, int) { + mut index := -1 + mut k := key + if k.contains('[') { + index = k.all_after('[').all_before(']').int() + if k.starts_with('[') { + k = '' + } else { + k = k.all_before('[') + } + } + return k, index +} + +fn from_json2(value json2.Any) Any { + return match value { + []json2.Any { + mut arr 
:= []Any{cap: value.len} + for item in value { + arr << from_json2(item) + } + Any(arr) + } + map[string]json2.Any { + mut out := map[string]Any{} + for key, item in value { + out[key] = from_json2(item) + } + Any(out) + } + bool { + Any(value) + } + f32, f64 { + Any(f64(value)) + } + i8, i16, i32, int { + Any(int(value)) + } + i64 { + Any(value) + } + u8, u16, u32, u64 { + Any(u64(value)) + } + string { + Any(value) + } + time.Time { + Any(value.str()) + } + json2.Null { + null + } + } +} diff --git a/vlib/yaml/test_helpers.v b/vlib/yaml/test_helpers.v new file mode 100644 index 000000000..2b6eb5567 --- /dev/null +++ b/vlib/yaml/test_helpers.v @@ -0,0 +1,19 @@ +module yaml + +import x.json2 + +// json_logically_eq compares two JSON strings by decoding and re-encoding +// both sides via json2, normalizing whitespace and treating an empty input +// as the literal "null" document. +// +// For tests only. Lives in a non-_test.v file because V compiles each +// `_test.v` file as its own binary, so a helper cannot otherwise be shared +// across the module's test files. Not exported (lowercase name) so it stays +// invisible to consumers of `yaml`. +fn json_logically_eq(a string, b string) !bool { + a_norm := if a.trim_space() == '' { 'null' } else { a } + b_norm := if b.trim_space() == '' { 'null' } else { b } + pa := json2.decode[json2.Any](a_norm)! + pb := json2.decode[json2.Any](b_norm)! + return json2.encode(pa, json2.EncoderOptions{}) == json2.encode(pb, json2.EncoderOptions{}) +} diff --git a/vlib/yaml/yaml.v b/vlib/yaml/yaml.v index f10593d20..ea1157275 100644 --- a/vlib/yaml/yaml.v +++ b/vlib/yaml/yaml.v @@ -2,8 +2,7 @@ module yaml import json import os -import strconv -import time +import strings import x.json2 // Null is a simple representation of the YAML `null` value. @@ -28,22 +27,35 @@ pub fn parse_file(path string) !Doc { // parse_text parses the YAML document provided in `text`. 
pub fn parse_text(text string) !Doc { - mut normalized := text.replace('\r\n', '\n').replace('\r', '\n') + mut normalized := text + if normalized.contains_u8(`\r`) { + normalized = normalized.replace('\r\n', '\n').replace('\r', '\n') + } if normalized.len >= 3 && normalized[0] == 0xef && normalized[1] == 0xbb && normalized[2] == 0xbf { normalized = normalized[3..] } + // `split('\n')` would otherwise turn the canonical trailing line break into + // a phantom empty last line, which the block-scalar reader then treats as a + // genuine blank line and over-counts during chomping. + if normalized.ends_with('\n') { + normalized = normalized[..normalized.len - 1] + } trimmed := normalized.trim_space() if trimmed == '' { return Doc{ - root: Any(map[string]Any{}) + root: null } } if trimmed.starts_with('{') || trimmed.starts_with('[') { - raw := json2.decode[json2.Any](trimmed) or { json2.null } - if raw !is json2.Null { + // JSON-superset fast path. `parse_flow_value` already consumes the + // flow-style grammar that YAML borrows from JSON, so it builds the + // `yaml.Any` tree directly — no second-pass `from_json2` rebuild. + // Falls through to the block parser if the body is anything other + // than a clean flow document. + if val := parse_flow_value(trimmed) { return Doc{ - root: from_json2(raw) + root: val } } } @@ -71,7 +83,7 @@ pub fn decode_file[T](path string) !T { // The generic encode/decode path uses the main `json` module for field parity. pub fn encode[T](value T) string { json_text := json.encode(value) - raw := json2.decode[json2.Any](json_text) or { panic(err) } + raw := json2.decode[json2.Any](json_text) or { return '' } return from_json2(raw).to_yaml() } @@ -80,8 +92,13 @@ pub fn encode_file[T](path string, value T) ! { os.write_file(path, encode(value))! } -// decode decodes the YAML document into the target type `T`. +// decode decodes the YAML document into the target type `T`. 
An empty +// document (parsed as the YAML 1.2 null node) decodes to a default-initialized +// `T`, matching the common "empty config file = use defaults" idiom. pub fn (d Doc) decode[T]() !T { + if d.root is Null { + return json.decode(T, '{}')! + } return json.decode(T, d.to_json())! } @@ -119,12 +136,8 @@ pub fn (a Any) str() string { // string returns `Any` as a string when possible, or a YAML representation otherwise. pub fn (a Any) string() string { return match a { - string { a.clone() } - bool { a.str() } - f64 { a.str() } - i64 { a.str() } - int { a.str() } - u64 { a.str() } + string { a } + bool, f64, i64, int, u64 { a.str() } Null { 'null' } []Any, map[string]Any { a.to_yaml() } } @@ -284,33 +297,41 @@ pub fn (a Any) bool() bool { // array returns `Any` as an array. pub fn (a Any) array() []Any { - if a is []Any { - return a - } - if a is map[string]Any { - mut arr := []Any{} - for _, value in a { - arr << value + return match a { + []Any { + a + } + map[string]Any { + mut arr := []Any{cap: a.len} + for _, value in a { + arr << value + } + arr + } + else { + [a] } - return arr } - return [a] } // as_map returns `Any` as a map. pub fn (a Any) as_map() map[string]Any { - if a is map[string]Any { - return a - } - if a is []Any { - mut out := map[string]Any{} - for i, value in a { - out['${i}'] = value + return match a { + map[string]Any { + a + } + []Any { + mut out := map[string]Any{} + for i, value in a { + out['${i}'] = value + } + out + } + else { + { + '0': a + } } - return out - } - return { - '0': a } } @@ -324,18 +345,16 @@ pub fn (a Any) default_to(value Any) Any { // value queries a value from the current node using dotted keys and array indices. pub fn (a Any) value(key string) Any { - key_split := parse_dotted_key(key) or { return null } - return a.value_(a, key_split) + return a.value_opt(key) or { null } } // value_opt queries a value from the current node and returns an error when missing. 
+// A YAML key whose value is the explicit `null` literal returns that `Null` +// (it is not treated as missing); only an absent key or a non-traversable +// path raises an error. pub fn (a Any) value_opt(key string) !Any { - key_split := parse_dotted_key(key) or { return error('invalid dotted key') } - value := a.value_(a, key_split) - if value is Null { - return error('no value for key') - } - return value + key_split := parse_dotted_key(key) or { return error('yaml: invalid dotted key `${key}`') } + return a.value_(a, key_split) or { error('yaml: no value for key `${key}`') } } // value queries a value from the map. @@ -368,12 +387,16 @@ pub fn (m map[string]Any) as_strings() map[string]string { // to_json converts `Any` to JSON. pub fn (a Any) to_json() string { - return json2.encode(a.to_json2(), json2.EncoderOptions{}) + mut sb := strings.new_builder(256) + emit_any_as_json(mut sb, a) + return sb.str() } // to_yaml converts `Any` to YAML. pub fn (a Any) to_yaml() string { - return yaml_from_any(a, 0) + mut sb := strings.new_builder(256) + emit_yaml_any(mut sb, a, 0) + return sb.str() } // to_yaml converts a YAML array to YAML text. 
@@ -385,1000 +408,3 @@ pub fn (a []Any) to_yaml() string { pub fn (m map[string]Any) to_yaml() string { return Any(m).to_yaml() } - -fn (a Any) value_(current Any, key []string) Any { - if key.len == 0 { - return null - } - mut value := Any(null) - k, index := parse_array_key(key[0]) - if k == '' { - if current is []Any { - arr := current as []Any - value = arr[index] or { return null } - } else { - return null - } - } - if current is map[string]Any { - value = current[k] or { return null } - if index > -1 { - if value is []Any { - arr := value as []Any - value = arr[index] or { return null } - } else { - return null - } - } - } - if key.len <= 1 { - return value - } - return match value { - []Any, map[string]Any { a.value_(value, key[1..]) } - else { null } - } -} - -fn (a Any) to_json2() json2.Any { - return match a { - []Any { - mut arr := []json2.Any{cap: a.len} - for value in a { - arr << value.to_json2() - } - json2.Any(arr) - } - map[string]Any { - mut out := map[string]json2.Any{} - for key, value in a { - out[key] = value.to_json2() - } - json2.Any(out) - } - string { - json2.Any(a) - } - bool { - json2.Any(a) - } - f64 { - json2.Any(a) - } - i64 { - json2.Any(a) - } - int { - json2.Any(a) - } - u64 { - json2.Any(a) - } - Null { - json2.Any(json2.null) - } - } -} - -fn from_json2(value json2.Any) Any { - return match value { - []json2.Any { - mut arr := []Any{cap: value.len} - for item in value { - arr << from_json2(item) - } - Any(arr) - } - map[string]json2.Any { - mut out := map[string]Any{} - for key, item in value { - out[key] = from_json2(item) - } - Any(out) - } - bool { - Any(value) - } - f32, f64 { - Any(f64(value)) - } - i8, i16, i32, int { - Any(int(value)) - } - i64 { - Any(value) - } - u8, u16, u32, u64 { - Any(u64(value)) - } - string { - Any(value) - } - time.Time { - Any(value.str()) - } - json2.Null { - null - } - } -} - -struct Parser { - lines []string -mut: - idx int -} - -fn (mut p Parser) parse() !Any { - p.skip_ignorable() - if p.idx 
>= p.lines.len { - return Any(map[string]Any{}) - } - indent := p.line_indent(p.idx)! - if indent < 0 { - return Any(map[string]Any{}) - } - return p.parse_node(indent) -} - -fn (mut p Parser) parse_node(indent int) !Any { - p.skip_ignorable() - if p.idx >= p.lines.len { - return null - } - current_indent := p.line_indent(p.idx)! - if current_indent < indent { - return null - } - content := p.current_content()! - if content.starts_with('-') && (content.len == 1 || content[1] == ` `) { - return p.parse_sequence(current_indent) - } - if split_mapping_entry(content).ok { - return p.parse_mapping(current_indent) - } - if content.starts_with('[') || content.starts_with('{') { - p.idx++ - return parse_flow_value(content) - } - p.idx++ - return parse_scalar(content) -} - -fn (mut p Parser) parse_mapping(indent int) !Any { - mut result := map[string]Any{} - for p.idx < p.lines.len { - p.skip_ignorable() - if p.idx >= p.lines.len { - break - } - current_indent := p.line_indent(p.idx)! - if current_indent < indent { - break - } - if current_indent > indent { - return error('yaml: unexpected indentation on line ${p.idx + 1}') - } - content := p.current_content()! - entry := split_mapping_entry(content) - if !entry.ok { - return error('yaml: expected a mapping entry on line ${p.idx + 1}') - } - p.idx++ - result[entry.key] = p.parse_mapping_value(entry.rest, indent)! - } - return Any(result) -} - -fn (mut p Parser) parse_mapping_value(rest string, indent int) !Any { - if rest == '' { - next_indent := p.peek_next_indent() - if next_indent > indent { - return p.parse_node(next_indent) - } - return null - } - if is_block_scalar(rest) { - return Any(p.parse_block_scalar(indent, rest[0])!) 
- } - if rest.starts_with('[') || rest.starts_with('{') { - return parse_flow_value(rest) - } - return parse_scalar(rest) -} - -fn (mut p Parser) parse_sequence(indent int) !Any { - mut items := []Any{} - for p.idx < p.lines.len { - p.skip_ignorable() - if p.idx >= p.lines.len { - break - } - current_indent := p.line_indent(p.idx)! - if current_indent < indent { - break - } - if current_indent > indent { - return error('yaml: unexpected indentation on line ${p.idx + 1}') - } - content := p.current_content()! - if !content.starts_with('-') || (content.len > 1 && content[1] != ` `) { - break - } - rest := if content.len == 1 { '' } else { content[1..].trim_space() } - p.idx++ - items << p.parse_sequence_item(rest, indent)! - } - return Any(items) -} - -fn (mut p Parser) parse_sequence_item(rest string, indent int) !Any { - if rest == '' { - next_indent := p.peek_next_indent() - if next_indent > indent { - return p.parse_node(next_indent) - } - return null - } - if is_block_scalar(rest) { - return Any(p.parse_block_scalar(indent, rest[0])!) - } - if rest.starts_with('[') || rest.starts_with('{') { - return parse_flow_value(rest) - } - entry := split_mapping_entry(rest) - if entry.ok { - mut result := map[string]Any{} - child_indent := indent + 2 - result[entry.key] = p.parse_mapping_value(entry.rest, child_indent)! - for p.idx < p.lines.len { - p.skip_ignorable() - if p.idx >= p.lines.len { - break - } - current_indent := p.line_indent(p.idx)! - if current_indent <= indent { - break - } - if current_indent != child_indent { - return error('yaml: unexpected indentation on line ${p.idx + 1}') - } - content := p.current_content()! - next_entry := split_mapping_entry(content) - if !next_entry.ok { - break - } - p.idx++ - result[next_entry.key] = p.parse_mapping_value(next_entry.rest, child_indent)! 
- } - return Any(result) - } - return parse_scalar(rest) -} - -fn (mut p Parser) parse_block_scalar(parent_indent int, style u8) !string { - start := p.idx - mut min_indent := -1 - for i := start; i < p.lines.len; i++ { - line := p.lines[i] - if line.trim_space() == '' { - continue - } - line_indent := p.line_indent(i)! - if line_indent <= parent_indent { - break - } - if min_indent == -1 || line_indent < min_indent { - min_indent = line_indent - } - } - if min_indent == -1 { - return '' - } - mut lines := []string{} - for p.idx < p.lines.len { - line := p.lines[p.idx] - if line.trim_space() == '' { - lines << '' - p.idx++ - continue - } - line_indent := p.line_indent(p.idx)! - if line_indent <= parent_indent { - break - } - if line.len <= min_indent { - lines << '' - } else { - lines << line[min_indent..] - } - p.idx++ - } - for lines.len > 0 && lines[lines.len - 1] == '' { - lines.delete(lines.len - 1) - } - if style == `|` { - return lines.join('\n') - } - return fold_block_scalar(lines) -} - -fn (mut p Parser) skip_ignorable() { - for p.idx < p.lines.len { - line := p.lines[p.idx] - trimmed := strip_comments(line).trim_space() - if trimmed == '' || trimmed == '---' || trimmed == '...' { - p.idx++ - continue - } - break - } -} - -fn (p &Parser) peek_next_indent() int { - mut i := p.idx - for i < p.lines.len { - line := p.lines[i] - trimmed := strip_comments(line).trim_space() - if trimmed == '' || trimmed == '---' || trimmed == '...' { - i++ - continue - } - return p.line_indent(i) or { -1 } - } - return -1 -} - -fn (p &Parser) current_content() !string { - line := p.lines[p.idx] - indent := p.line_indent(p.idx)! 
- if line.len <= indent { - return '' - } - return strip_comments(line[indent..]).trim_space() -} - -fn (p &Parser) line_indent(index int) !int { - line := p.lines[index] - mut indent := 0 - for indent < line.len && line[indent] == ` ` { - indent++ - } - if indent < line.len && line[indent] == `\t` { - return error('yaml: tabs are not supported for indentation on line ${index + 1}') - } - return indent -} - -struct MappingEntry { - key string - rest string - ok bool -} - -fn split_mapping_entry(content string) MappingEntry { - mut in_single := false - mut in_double := false - mut escape := false - mut bracket_depth := 0 - mut brace_depth := 0 - mut i := 0 - for i < content.len { - ch := content[i] - if in_double { - if escape { - escape = false - } else if ch == `\\` { - escape = true - } else if ch == `"` { - in_double = false - } - i++ - continue - } - if in_single { - if ch == `'` { - if i + 1 < content.len && content[i + 1] == `'` { - i += 2 - continue - } - in_single = false - } - i++ - continue - } - match ch { - `"` { - in_double = true - } - `'` { - in_single = true - } - `[` { - bracket_depth++ - } - `]` { - if bracket_depth > 0 { - bracket_depth-- - } - } - `{` { - brace_depth++ - } - `}` { - if brace_depth > 0 { - brace_depth-- - } - } - `:` { - if bracket_depth == 0 && brace_depth == 0 - && (i + 1 == content.len || content[i + 1].is_space()) { - key_text := content[..i].trim_space() - if key_text == '' { - return MappingEntry{} - } - return MappingEntry{ - key: parse_key(key_text) or { return MappingEntry{} } - rest: if i + 1 < content.len { - content[i + 1..].trim_space() - } else { - '' - } - ok: true - } - } - } - else {} - } - - i++ - } - return MappingEntry{} -} - -fn parse_key(src string) !string { - if src.len >= 2 && ((src[0] == `"` && src[src.len - 1] == `"`) - || (src[0] == `'` && src[src.len - 1] == `'`)) { - return parse_quoted_string(src) - } - return src.trim_space() -} - -fn parse_scalar(text string) !Any { - value := text.trim_space() - 
if value == '' { - return Any('') - } - if value.len >= 2 && ((value[0] == `"` && value[value.len - 1] == `"`) - || (value[0] == `'` && value[value.len - 1] == `'`)) { - return Any(parse_quoted_string(value)!) - } - lower := value.to_lower() - if lower in ['null', '~'] { - return null - } - if lower in ['true', 'yes', 'on'] { - return Any(true) - } - if lower in ['false', 'no', 'off'] { - return Any(false) - } - numeric := value.replace('_', '') - if is_integer(numeric) { - if numeric.starts_with('-') { - return Any(numeric.parse_int(0, 64)!) - } - if numeric.starts_with('+') { - return Any(numeric[1..].parse_uint(0, 64)!) - } - return Any(numeric.parse_uint(0, 64)!) - } - if is_float(numeric) { - return Any(strconv.atof64(numeric)!) - } - return Any(value) -} - -fn parse_quoted_string(src string) !string { - if src.len < 2 { - return error('yaml: invalid quoted string') - } - quote := src[0] - inner := src[1..src.len - 1] - if quote == `'` { - return inner.replace("''", "'") - } - mut out := []u8{} - mut i := 0 - for i < inner.len { - ch := inner[i] - if ch != `\\` { - out << ch - i++ - continue - } - i++ - if i >= inner.len { - return error('yaml: invalid escape sequence') - } - esc := inner[i] - match esc { - `"`, `\\`, `/` { - out << esc - } - `b` { - out << `\b` - } - `f` { - out << `\f` - } - `n` { - out << `\n` - } - `r` { - out << `\r` - } - `t` { - out << `\t` - } - `u` { - if i + 4 >= inner.len { - return error('yaml: invalid unicode escape') - } - code := inner[i + 1..i + 5] - r := rune(code.parse_uint(16, 32)!) 
- out << r.str().bytes() - i += 4 - } - else { - out << esc - } - } - - i++ - } - return out.bytestr() -} - -fn strip_comments(line string) string { - mut in_single := false - mut in_double := false - mut escape := false - mut bracket_depth := 0 - mut brace_depth := 0 - mut i := 0 - for i < line.len { - ch := line[i] - if in_double { - if escape { - escape = false - } else if ch == `\\` { - escape = true - } else if ch == `"` { - in_double = false - } - i++ - continue - } - if in_single { - if ch == `'` { - if i + 1 < line.len && line[i + 1] == `'` { - i += 2 - continue - } - in_single = false - } - i++ - continue - } - match ch { - `"` { - in_double = true - } - `'` { - in_single = true - } - `[` { - bracket_depth++ - } - `]` { - if bracket_depth > 0 { - bracket_depth-- - } - } - `{` { - brace_depth++ - } - `}` { - if brace_depth > 0 { - brace_depth-- - } - } - `#` { - if bracket_depth == 0 && brace_depth == 0 { - return line[..i].trim_right(' \t') - } - } - else {} - } - - i++ - } - return line.trim_right(' \t') -} - -struct FlowParser { - src string -mut: - pos int -} - -fn parse_flow_value(src string) !Any { - mut parser := FlowParser{ - src: src - } - value := parser.parse_value()! - parser.skip_space() - if parser.pos != parser.src.len { - return error('yaml: unexpected trailing flow content') - } - return value -} - -fn (mut p FlowParser) parse_value() !Any { - p.skip_space() - if p.pos >= p.src.len { - return error('yaml: unexpected end of flow value') - } - return match p.src[p.pos] { - `[` { p.parse_array() } - `{` { p.parse_object() } - `"`, `'` { Any(parse_quoted_flow_string(mut p)!) } - else { parse_scalar(p.parse_plain_token()) } - } -} - -fn (mut p FlowParser) parse_array() !Any { - p.pos++ - mut items := []Any{} - for { - p.skip_space() - if p.pos >= p.src.len { - return error('yaml: unterminated flow array') - } - if p.src[p.pos] == `]` { - p.pos++ - break - } - items << p.parse_value()! 
- p.skip_space() - if p.pos >= p.src.len { - return error('yaml: unterminated flow array') - } - if p.src[p.pos] == `,` { - p.pos++ - continue - } - if p.src[p.pos] == `]` { - p.pos++ - break - } - return error('yaml: expected `,` or `]` in flow array') - } - return Any(items) -} - -fn (mut p FlowParser) parse_object() !Any { - p.pos++ - mut result := map[string]Any{} - for { - p.skip_space() - if p.pos >= p.src.len { - return error('yaml: unterminated flow object') - } - if p.src[p.pos] == `}` { - p.pos++ - break - } - key := p.parse_key()! - p.skip_space() - if p.pos >= p.src.len || p.src[p.pos] != `:` { - return error('yaml: expected `:` in flow object') - } - p.pos++ - result[key] = p.parse_value()! - p.skip_space() - if p.pos >= p.src.len { - return error('yaml: unterminated flow object') - } - if p.src[p.pos] == `,` { - p.pos++ - continue - } - if p.src[p.pos] == `}` { - p.pos++ - break - } - return error('yaml: expected `,` or `}` in flow object') - } - return Any(result) -} - -fn (mut p FlowParser) parse_key() !string { - p.skip_space() - if p.pos >= p.src.len { - return error('yaml: unexpected end of flow key') - } - if p.src[p.pos] in [`"`, `'`] { - return parse_quoted_flow_string(mut p) - } - start := p.pos - for p.pos < p.src.len { - ch := p.src[p.pos] - if ch == `:` { - break - } - p.pos++ - } - return p.src[start..p.pos].trim_space() -} - -fn (mut p FlowParser) parse_plain_token() string { - start := p.pos - mut bracket_depth := 0 - mut brace_depth := 0 - for p.pos < p.src.len { - ch := p.src[p.pos] - if ch == `[` { - bracket_depth++ - } else if ch == `]` { - if bracket_depth == 0 { - break - } - bracket_depth-- - } else if ch == `{` { - brace_depth++ - } else if ch == `}` { - if brace_depth == 0 { - break - } - brace_depth-- - } else if ch == `,` && bracket_depth == 0 && brace_depth == 0 { - break - } - p.pos++ - } - return p.src[start..p.pos].trim_space() -} - -fn (mut p FlowParser) skip_space() { - for p.pos < p.src.len && p.src[p.pos].is_space() { 
- p.pos++ - } -} - -fn parse_quoted_flow_string(mut p FlowParser) !string { - start := p.pos - quote := p.src[p.pos] - p.pos++ - mut escape := false - for p.pos < p.src.len { - ch := p.src[p.pos] - if quote == `"` { - if escape { - escape = false - } else if ch == `\\` { - escape = true - } else if ch == `"` { - p.pos++ - return parse_quoted_string(p.src[start..p.pos]) - } - } else if ch == `'` { - if p.pos + 1 < p.src.len && p.src[p.pos + 1] == `'` { - p.pos += 2 - continue - } - p.pos++ - return parse_quoted_string(p.src[start..p.pos]) - } - p.pos++ - } - return error('yaml: unterminated quoted flow string') -} - -fn parse_dotted_key(key string) ![]string { - mut out := []string{} - mut buf := '' - mut in_string := false - mut delimiter := u8(` `) - for ch in key { - if ch in [`"`, `'`] { - if !in_string { - delimiter = ch - in_string = true - continue - } - if ch == delimiter { - in_string = false - if buf != '' { - out << buf - } - buf = '' - delimiter = ` ` - continue - } - } - buf += ch.ascii_str() - if !in_string && ch == `.` { - buf = buf[..buf.len - 1] - if buf != '' { - out << buf - } - buf = '' - } - } - if buf != '' { - out << buf - } - if in_string { - return error('yaml: missing closing string delimiter `${delimiter.ascii_str()}`') - } - return out -} - -fn parse_array_key(key string) (string, int) { - mut index := -1 - mut k := key - if k.contains('[') { - index = k.all_after('[').all_before(']').int() - if k.starts_with('[') { - k = '' - } else { - k = k.all_before('[') - } - } - return k, index -} - -fn yaml_from_any(value Any, indent int) string { - return match value { - map[string]Any { yaml_from_map(value, indent) } - []Any { yaml_from_array(value, indent) } - else { yaml_scalar(value) } - } -} - -fn yaml_from_map(value map[string]Any, indent int) string { - if value.len == 0 { - return '{}' - } - mut lines := []string{cap: value.len} - padding := ' '.repeat(indent) - for key, item in value { - quoted_key := yaml_quote_string(key) - match item 
{ - map[string]Any, []Any { - lines << '${padding}${quoted_key}:' - lines << yaml_from_any(item, indent + 2) - } - else { - lines << '${padding}${quoted_key}: ${yaml_scalar(item)}' - } - } - } - return lines.join('\n') -} - -fn yaml_from_array(value []Any, indent int) string { - if value.len == 0 { - return '[]' - } - mut lines := []string{cap: value.len * 2} - padding := ' '.repeat(indent) - for item in value { - match item { - map[string]Any, []Any { - lines << '${padding}-' - lines << yaml_from_any(item, indent + 2) - } - else { - lines << '${padding}- ${yaml_scalar(item)}' - } - } - } - return lines.join('\n') -} - -fn yaml_scalar(value Any) string { - return match value { - string { - yaml_quote_string(value) - } - bool { - if value { - 'true' - } else { - 'false' - } - } - f64 { - value.str() - } - i64 { - value.str() - } - int { - value.str() - } - u64 { - value.str() - } - Null { - 'null' - } - []Any, map[string]Any { - yaml_from_any(value, 0) - } - } -} - -fn yaml_quote_string(value string) string { - return json.encode(value) -} - -fn fold_block_scalar(lines []string) string { - mut out := '' - mut pending_newlines := 0 - for line in lines { - if line == '' { - pending_newlines++ - continue - } - if out.len == 0 { - out = line - } else if pending_newlines > 0 { - out += '\n'.repeat(pending_newlines + 1) + line - } else { - out += ' ' + line - } - pending_newlines = 0 - } - if pending_newlines > 0 { - out += '\n'.repeat(pending_newlines) - } - return out -} - -fn is_block_scalar(value string) bool { - return value.len > 0 && value[0] in [`|`, `>`] -} - -fn is_integer(value string) bool { - if value.len == 0 { - return false - } - if value[0] in [`+`, `-`] { - if value.len == 1 { - return false - } - if value[0] == `-` { - _ := value.parse_int(0, 64) or { return false } - return true - } - _ := value[1..].parse_uint(0, 64) or { return false } - return true - } - _ := value.parse_uint(0, 64) or { return false } - return true -} - -fn is_float(value string) 
bool { - if value.len == 0 { - return false - } - if !value.contains('.') && !value.contains('e') && !value.contains('E') { - return false - } - _ := strconv.atof64(value) or { return false } - return true -} diff --git a/vlib/yaml/yaml_conformance_test.v b/vlib/yaml/yaml_conformance_test.v new file mode 100644 index 000000000..09d885aa7 --- /dev/null +++ b/vlib/yaml/yaml_conformance_test.v @@ -0,0 +1,145 @@ +module yaml + +// Conformance-style coverage. Each case mirrors a pattern from the YAML 1.2 +// spec (or the public yaml-test-suite) that's within the subset V's parser +// implements: plain mappings, sequences, scalars, flow style, JSON-superset +// documents. Tags, merge keys, multi-document streams and block-scalar +// chomp/indent indicators are intentionally NOT covered (V's parser does not +// support them today); anchors and aliases are tested in `yaml_edge_cases_test.v`. +// +// Each case is `name | yaml_input | expected_json` and is verified by parsing +// `yaml_input`, serializing to JSON, and comparing against the expected JSON +// via `json_logically_eq` (see `test_helpers.v`). +// +// Scalar variants (booleans, nulls, numeric underscores), quoted-string +// escapes, block scalars, `[]`/`{}`, and the empty document are exercised by +// `yaml_edge_cases_test.v` with stricter, typed assertions; not duplicated here. 
+ +struct ConformanceCase { + name string + src string + expected string +} + +const cases = [ + ConformanceCase{ + name: 'single scalar mapping' + src: 'key: value' + expected: '{"key":"value"}' + }, + ConformanceCase{ + name: 'integer values' + src: 'a: 1\nb: -2\nc: 0\nd: 9223372036854775807' + expected: '{"a":1,"b":-2,"c":0,"d":9223372036854775807}' + }, + ConformanceCase{ + name: 'float values' + src: 'a: 1.5\nb: -0.001\nc: 1.0e10\nd: -1.5e-3' + expected: '{"a":1.5,"b":-0.001,"c":10000000000,"d":-0.0015}' + }, + ConformanceCase{ + name: 'simple sequence' + src: 'items:\n - one\n - two\n - three' + expected: '{"items":["one","two","three"]}' + }, + ConformanceCase{ + name: 'nested mapping under sequence item' + src: 'servers:\n - host: a\n port: 1\n - host: b\n port: 2' + expected: '{"servers":[{"host":"a","port":1},{"host":"b","port":2}]}' + }, + ConformanceCase{ + name: 'flow sequence inline' + src: 'tags: [web, api, db]' + expected: '{"tags":["web","api","db"]}' + }, + ConformanceCase{ + name: 'flow mapping inline' + src: 'config: {host: a, port: 8080}' + expected: '{"config":{"host":"a","port":8080}}' + }, + ConformanceCase{ + name: 'mixed flow and block' + src: 'top:\n inner:\n - {k: v, n: 1}\n - [a, b, c]' + expected: '{"top":{"inner":[{"k":"v","n":1},["a","b","c"]]}}' + }, + ConformanceCase{ + name: 'deeply nested mapping' + src: 'a:\n b:\n c:\n d:\n e: end' + expected: '{"a":{"b":{"c":{"d":{"e":"end"}}}}}' + }, + ConformanceCase{ + name: 'sequence of sequences' + src: 'matrix:\n - [1, 2, 3]\n - [4, 5, 6]\n - [7, 8, 9]' + expected: '{"matrix":[[1,2,3],[4,5,6],[7,8,9]]}' + }, + ConformanceCase{ + name: 'unicode in string values' + src: 'a: "café"\nb: "日本語"\nc: plain café' + expected: '{"a":"café","b":"日本語","c":"plain café"}' + }, + ConformanceCase{ + name: 'comment is stripped' + src: '# leading comment\nkey: value # trailing comment' + expected: '{"key":"value"}' + }, + ConformanceCase{ + name: 'document separator markers are tolerated' + src: '---\na: 
1\n...' + expected: '{"a":1}' + }, + ConformanceCase{ + name: 'JSON-superset input (object)' + src: '{"a": 1, "b": [2, 3]}' + expected: '{"a":1,"b":[2,3]}' + }, + ConformanceCase{ + name: 'JSON-superset input (array root)' + src: '[1, 2, {"three": 3}]' + expected: '[1,2,{"three":3}]' + }, + ConformanceCase{ + name: 'sequence containing null' + src: 'a:\n - one\n - ~\n - three' + expected: '{"a":["one",null,"three"]}' + }, + ConformanceCase{ + name: 'mapping with quoted keys containing dots' + src: 'plain: 1\n"a.b": 2\n"c.d.e": 3' + expected: '{"plain":1,"a.b":2,"c.d.e":3}' + }, + ConformanceCase{ + name: 'mixed types in flow sequence' + src: 'mix: [1, "two", true, null, 3.14]' + expected: '{"mix":[1,"two",true,null,3.14]}' + }, + ConformanceCase{ + name: 'nested flow inside block sequence' + src: 'items:\n - {a: 1, b: [x, y]}\n - {a: 2, b: [z]}' + expected: '{"items":[{"a":1,"b":["x","y"]},{"a":2,"b":["z"]}]}' + }, +]! + +fn test_yaml_conformance_cases() ! { + mut failed := []string{} + for c in cases { + doc := parse_text(c.src) or { + failed << '${c.name}: parse error: ${err}' + continue + } + got := doc.to_json() + matched := json_logically_eq(got, c.expected) or { + failed << '${c.name}: invalid JSON produced: ${got} (vs expected ${c.expected}): ${err}' + continue + } + if !matched { + failed << '${c.name}: got ${got}, want ${c.expected}' + } + } + if failed.len > 0 { + eprintln('YAML conformance failures (${failed.len}/${cases.len}):') + for f in failed { + eprintln(' - ${f}') + } + assert false, '${failed.len} conformance case(s) failed' + } +} diff --git a/vlib/yaml/yaml_edge_cases_test.v b/vlib/yaml/yaml_edge_cases_test.v new file mode 100644 index 000000000..8d9008cb3 --- /dev/null +++ b/vlib/yaml/yaml_edge_cases_test.v @@ -0,0 +1,344 @@ +module yaml + +import os +import x.json2 + +// Edge-case coverage for parse_text + scalar parsing + serialization. +// Complements yaml_test.v which covers the happy path. 
Each fn targets one +// specific gap that a previous regression in the module would have silently +// passed. + +fn test_parse_strips_utf8_bom() ! { + src := '\xef\xbb\xbfname: app\nport: 8080\n' + doc := parse_text(src)! + assert doc.value('name').string() == 'app' + assert doc.value('port').int() == 8080 +} + +fn test_parse_normalizes_crlf_and_cr() ! { + a := parse_text('a: 1\r\nb: 2\r\n')! + assert a.value('a').int() == 1 + assert a.value('b').int() == 2 + b := parse_text('a: 1\rb: 2\r')! + assert b.value('a').int() == 1 + assert b.value('b').int() == 2 +} + +fn test_parse_empty_and_whitespace_only_documents() ! { + // Per YAML 1.2, a document with no content is the null node. + for src in ['', ' ', '\n\n', ' \n \n'] { + doc := parse_text(src)! + assert doc.root is Null + } +} + +struct EmptyDecodeTarget { + name string + port int +} + +fn test_decode_empty_document_yields_default_struct() ! { + // An empty config file should decode to a zero-initialized struct, not + // raise an error — even though the YAML root is Null per spec. + for src in ['', ' ', '\n\n'] { + got := decode[EmptyDecodeTarget](src)! + assert got.name == '' + assert got.port == 0 + } +} + +fn test_parse_null_variants() ! { + doc := parse_text(' +a: ~ +b: null +c: Null +d: NULL +e: +')! + for k in ['a', 'b', 'c', 'd', 'e'] { + v := doc.value(k) + assert v is Null, '${k} should be Null, got ${typeof(v).name}' + } +} + +fn test_parse_bool_yaml11_variants() ! { + doc := parse_text(' +t1: true +t2: True +t3: TRUE +t4: yes +t5: YES +t6: on +t7: On +f1: false +f2: False +f3: no +f4: NO +f5: off +f6: Off +')! + for k in ['t1', 't2', 't3', 't4', 't5', 't6', 't7'] { + assert doc.value(k).bool(), '${k} should be true' + } + for k in ['f1', 'f2', 'f3', 'f4', 'f5', 'f6'] { + assert !doc.value(k).bool(), '${k} should be false' + } +} + +fn test_parse_numeric_underscores_and_signs() ! { + doc := parse_text(' +a: 1_000_000 +b: -42 +c: +17 +d: 1.5e10 +e: -1.0e-5 +')! 
+ assert doc.value('a').i64() == 1_000_000 + assert doc.value('b').int() == -42 + assert doc.value('c').u64() == 17 + assert doc.value('d').f64() == 1.5e10 + assert doc.value('e').f64() == -1.0e-5 +} + +fn test_parse_quoted_string_escapes() ! { + doc := parse_text(' +a: "line1\\nline2" +b: "tab\\there" +c: "quote: \\"" +d: "unicode: \\u00e9" +e: \'sing\'\'le\' +')! + assert doc.value('a').string() == 'line1\nline2' + assert doc.value('b').string() == 'tab\there' + assert doc.value('c').string() == 'quote: "' + assert doc.value('d').string() == 'unicode: é' + assert doc.value('e').string() == "sing'le" +} + +fn test_parse_comment_inside_quoted_string_is_preserved() ! { + doc := parse_text('a: "value with # not a comment"\nb: real # comment trimmed\n')! + assert doc.value('a').string() == 'value with # not a comment' + assert doc.value('b').string() == 'real' +} + +fn test_parse_nested_flow_style() ! { + doc := parse_text('root: {a: [1, [2, 3], {b: c, d: [e, f]}], g: 4}\n')! + assert doc.value('root.a[0]').int() == 1 + assert doc.value('root.a[1]').array().len == 2 + assert doc.value('root.a[2].b').string() == 'c' + assert doc.value('root.a[2].d[1]').string() == 'f' + assert doc.value('root.g').int() == 4 +} + +fn test_parse_block_scalar_literal_and_folded() ! { + doc := parse_text(' +literal: | + line1 + line2 + + line4 +folded: > + hello + world + + + next paragraph +')! + assert doc.value('literal').string() == 'line1\nline2\n\nline4\n' + assert doc.value('folded').string() == 'hello world\n\nnext paragraph\n' +} + +fn test_parse_rejects_tabs_in_indentation() { + if _ := parse_text('a:\n\tb: 1\n') { + assert false, 'tabs in indentation should error' + } else { + msg := err.msg() + assert msg.contains('tabs are not supported') + // Error must point at the offending line so the caller can locate it. + // The tab is on line 2 of the input. 
+ assert msg.contains('line 2'), 'error should report line number, got: ${msg}' + } +} + +fn test_parse_rejects_unexpected_indentation_in_mapping() { + if _ := parse_text('a: 1\n b: 2\n') { + assert false, 'over-indented mapping entry should error' + } else { + assert err.msg().contains('unexpected indentation') + } +} + +fn test_parse_json_superset_path() ! { + // JSON-shaped input takes the json2 fast path in parse_text. + doc := parse_text('{"a": [1, 2, {"b": "c"}], "d": null}')! + assert doc.value('a[0]').int() == 1 + assert doc.value('a[2].b').string() == 'c' + assert doc.value('d') is Null +} + +fn test_parse_empty_inline_collections() ! { + doc := parse_text('a: []\nb: {}\nc: [[]]\nd: [{}]\n')! + assert doc.value('a').array().len == 0 + assert doc.value('b').as_map().len == 0 + assert doc.value('c').array().len == 1 + assert doc.value('d').array().len == 1 +} + +fn test_parse_deeply_nested_structure() ! { + mut src := 'root:\n' + mut indent := ' ' + for i in 0 .. 30 { + src += '${indent}level${i}:\n' + indent += ' ' + } + src += '${indent}leaf: 42\n' + doc := parse_text(src)! + mut node := doc.value('root') + for i in 0 .. 30 { + node = node.value('level${i}') + } + assert node.value('leaf').int() == 42 +} + +fn test_to_yaml_roundtrip_preserves_structure() ! { + src := 'name: app +servers: + - host: a + port: 1 + - host: b + port: 2 +' + doc := parse_text(src)! + yaml_text := doc.to_yaml() + doc2 := parse_text(yaml_text)! + assert doc2.value('name').string() == 'app' + assert doc2.value('servers[0].host').string() == 'a' + assert doc2.value('servers[1].port').int() == 2 +} + +fn test_to_yaml_is_stable_across_many_calls() ! { + // Anti-regression for a real crash that used to surface only after many + // repeated `to_yaml` calls on the same Doc (sumtype recursion through the + // json2.Any rebuild path under -prod -gc boehm). 1000 iterations are + // enough to flush the original failure mode without bloating CI runtime. 
+ doc := parse_text(' +name: my-app +version: 1.2.3 +servers: + - host: a + port: 1 + - host: b + port: 2 +features: + enable_cache: true + enable_metrics: true +')! + first := doc.to_yaml() + for _ in 0 .. 1000 { + assert doc.to_yaml() == first + } +} + +fn test_to_json_emits_valid_json_for_unicode() ! { + doc := parse_text('a: "café"\nb: "中文"\n')! + out := doc.to_json() + // Re-parse the output instead of asserting on a substring: this catches + // real corruption of the strings, while staying agnostic to whitespace + // and key ordering choices in the emitter. + parsed := json2.decode[json2.Any](out)! + mapped := parsed as map[string]json2.Any + a := mapped['a'] or { return error('missing key a in re-parsed output') } + b := mapped['b'] or { return error('missing key b in re-parsed output') } + assert a.str() == 'café' + assert b.str() == '中文' +} + +fn test_to_json_escapes_special_chars() ! { + doc := parse_text('a: "tab\there"\nb: "quote: \\""\n')! + out := doc.to_json() + parsed := json2.decode[json2.Any](out)! + mapped := parsed as map[string]json2.Any + a := mapped['a'] or { return error('missing key a in re-parsed output') } + b := mapped['b'] or { return error('missing key b in re-parsed output') } + assert a.str() == 'tab\there' + assert b.str() == 'quote: "' +} + +fn test_to_yaml_quotes_keys_consistently() ! { + doc := parse_text('plain: 1\n"a.b": 2\n')! + out := doc.to_yaml() + // Both keys go through `write_json_escaped_string` in `emit_yaml_map`, so + // both end up quoted. This guards against a future change that would silently + // switch to plain-style and break round-tripping for keys containing dots. + assert out.contains('"plain":') + assert out.contains('"a.b":') +} + +fn test_value_returns_null_for_missing_path() ! { + doc := parse_text('a: 1\nb:\n c: 2\n')! + assert doc.value('z') is Null + assert doc.value('a.does.not.exist') is Null + assert doc.value('b.c.d') is Null +} + +fn test_value_opt_errors_on_missing() ! { + doc := parse_text('a: 1\n')! 
+ if _ := doc.value_opt('z') { + assert false, 'expected error for missing key' + } +} + +fn test_value_returns_null_on_array_out_of_bounds() ! { + doc := parse_text('a: [1, 2, 3]\n')! + assert doc.value('a[99]') is Null +} + +fn test_parse_skips_yaml_directives() ! { + // `%YAML`, `%TAG`, and any other `%`-prefixed directive line is consumed + // without becoming part of the document. + doc := parse_text('%YAML 1.2\n%TAG !e! tag:example.com,2000:app/\n---\nname: app\n')! + assert doc.value('name').string() == 'app' +} + +fn test_parse_anchor_and_alias_resolution() ! { + // `&id` registers the value, `*id` returns the same value at use sites. + doc := parse_text('a: &x hello\nb: *x\nlist:\n - &y 42\n - *y\n')! + assert doc.value('a').string() == 'hello' + assert doc.value('b').string() == 'hello' + assert doc.value('list[0]').int() == 42 + assert doc.value('list[1]').int() == 42 +} + +fn test_parse_unknown_alias_returns_null() ! { + doc := parse_text('a: *missing\n')! + assert doc.value('a') is Null +} + +fn test_parse_file_happy_path() ! { + path := os.join_path(os.vtmp_dir(), 'yaml_pf_${os.getpid()}.yml') + defer { + os.rm(path) or {} + } + os.write_file(path, 'name: app\nport: 8080\n')! + doc := parse_file(path)! + assert doc.value('name').string() == 'app' + assert doc.value('port').int() == 8080 +} + +fn test_parse_file_returns_error_on_missing_path() { + missing := os.join_path(os.vtmp_dir(), 'yaml_does_not_exist_${os.getpid()}.yml') + if _ := parse_file(missing) { + assert false, 'parse_file on missing path should error' + } +} + +fn test_parse_flow_collection_spanning_multiple_lines() ! { + // Flow `[ ]` and `{ }` may wrap across lines; the parser must accumulate + // until brackets balance. + doc := parse_text('arr: [\n 1,\n 2,\n 3\n]\n')! + assert doc.value('arr').array().len == 3 + assert doc.value('arr[2]').int() == 3 + doc2 := parse_text('obj: {\n a: 1,\n b: 2\n}\n')! 
+ assert doc2.value('obj.a').int() == 1 + assert doc2.value('obj.b').int() == 2 +} diff --git a/vlib/yaml/yaml_json_roundtrip_test.v b/vlib/yaml/yaml_json_roundtrip_test.v new file mode 100644 index 000000000..ce7fda239 --- /dev/null +++ b/vlib/yaml/yaml_json_roundtrip_test.v @@ -0,0 +1,82 @@ +module yaml + +// JSON round-trip: parse_text() should accept any JSON document (YAML 1.2 is +// a JSON superset), and to_json() should produce a JSON document that +// re-parses to the same logical tree. This guards against regressions in +// either the parser's JSON-superset fast path or the serializer's escaping. + +const json_corpus = [ + // scalars + 'true', + 'false', + 'null', + '0', + '-1', + '1234567890', + '3.14159', + '1.5e-10', + '""', + '"hello"', + '"with\\"quote"', + '"with\\nnewline"', + '"unicode: \\u00e9 \\u4e2d"', + '"verbatim utf8: café 中"', + // arrays + '[]', + '[1]', + '[1,2,3]', + '["a","b","c"]', + '[null,true,false,1,1.5,"s"]', + '[[1,2],[3,4],[]]', + '[{"k":"v"},{"k":"w"}]', + // objects + '{}', + '{"a":1}', + '{"a":1,"b":2,"c":3}', + '{"nested":{"deep":{"deeper":42}}}', + '{"mixed":[1,{"k":"v"},null,true]}', + '{"empty_arr":[],"empty_obj":{}}', + '{"unicode_key_café":"value","quoted.key":"v2"}', + // edge sizes + '[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19]', + '{"a":1,"b":2,"c":3,"d":4,"e":5,"f":6,"g":7,"h":8,"i":9,"j":10}', +]! + +fn test_json_corpus_roundtrip() ! 
{ + mut failed := []string{} + for input in json_corpus { + doc := parse_text(input) or { + failed << 'parse error on ${input}: ${err}' + continue + } + out := doc.to_json() + eq := json_logically_eq(input, out) or { + failed << '${input} -> ${out}: invalid JSON produced (${err})' + continue + } + if !eq { + failed << '${input} -> ${out}: not logically equal' + } + } + if failed.len > 0 { + eprintln('JSON round-trip failures (${failed.len}/${json_corpus.len}):') + for f in failed { + eprintln(' - ${f}') + } + assert false, '${failed.len} round-trip case(s) failed' + } +} + +// Anti-regression for the "first call works, repeated call breaks" family +// that used to surface in the json2.Any rebuild path: parse + to_json must +// be byte-stable across iterations on the same input. 500 iterations are +// enough to flush the original failure mode without bloating CI runtime. +fn test_json_roundtrip_is_idempotent_under_repetition() ! { + src := '{"users":[{"id":1,"name":"alice","tags":["x","y"]},{"id":2,"name":"bob","tags":[]}],"meta":{"count":2,"page":null,"flags":{"a":true,"b":false}}}' + mut prev := parse_text(src)!.to_json() + for _ in 0 .. 500 { + curr := parse_text(prev)!.to_json() + assert curr == prev, 'round-trip drift detected' + prev = curr + } +} diff --git a/vlib/yaml/yaml_test.v b/vlib/yaml/yaml_test.v index 56f0a3633..4be439c1e 100644 --- a/vlib/yaml/yaml_test.v +++ b/vlib/yaml/yaml_test.v @@ -44,8 +44,8 @@ folded: > assert doc.value('servers[0].host').string() == 'api.local' assert doc.value('servers[0].ports[1]').int() == 443 assert doc.value('quoted."a.b"').int() == 7 - assert doc.value('notes').string() == 'first line\nsecond line' - assert doc.value('folded').string() == 'hello world' + assert doc.value('notes').string() == 'first line\nsecond line\n' + assert doc.value('folded').string() == 'hello world\n' } fn test_generic_encode_decode_with_json_attrs() ! { -- 2.39.5