| 1 | // Copyright (c) 2021 Lars Pontoppidan. All rights reserved. |
| 2 | // Use of this source code is governed by an MIT license |
| 3 | // that can be found in the LICENSE file. |
| 4 | module scanner |
| 5 | |
| 6 | import toml.input |
| 7 | import toml.token |
| 8 | import toml.util |
| 9 | |
// digit_extras lists the non-digit characters that are accepted inside a number literal.
// `extract_number` keeps consuming input while it sees a hex digit or one of these characters.
pub const digit_extras = [`_`, `.`, `x`, `o`, `b`, `e`, `E`]

// end_of_text is the sentinel character code returned by `next`, `at` and `peek`
// when there is no character available at the requested position.
pub const end_of_text = u32(~0)
| 12 | |
// Scanner holds the complete state of a scan ("lexing" / "tokenizing") pass over a TOML text.
// Its methods are largely modelled on `vlib/strings/textscanner`.
pub struct Scanner {
pub:
	config Config // the configuration the scanner was created with
	text   string // the input TOML text
mut:
	col        int     // current column number (x coordinate)
	line_nr    int = 1 // current line number (y coordinate)
	pos        int     // current flat/index position in the `text` field
	header_len int     // how many bytes of header (byte order mark) were found and skipped
	// Quirks
	is_left_of_assign bool = true // true while the scanner is on the *left* side of an assignment
}
| 28 | |
// State is an immutable snapshot of a scanner's position.
// Instances are produced by `Scanner.state()`.
pub struct State {
pub:
	col     int     // column number (x coordinate) at the time of the snapshot
	line_nr int = 1 // line number (y coordinate) at the time of the snapshot
	pos     int     // flat/index position in the scanner's `text` field at the time of the snapshot
}
| 37 | |
// Config is used to configure a Scanner instance.
// The TOML source is described by `input`; only one of `input.text` and `input.file_path`
// is allowed to be set at the time of configuration.
pub struct Config {
pub:
	input               input.Config // where the TOML text comes from
	tokenize_formatting bool = true // if true, generate tokens for `\n`, ` `, `\t`, `\r` etc.
}
| 45 | |
// new_scanner returns a reference to a new *heap* allocated `Scanner`.
// The text to scan is obtained from `config.input.read_input()`, i.e. from the file in
// `config.input.file_path` or from the text in `config.input.text`.
// Any error from reading the input is propagated to the caller.
pub fn new_scanner(config Config) !&Scanner {
	source := config.input.read_input()!
	return &Scanner{
		config: config
		text:   source
	}
}
| 55 | |
// new_simple returns a new *stack* allocated `Scanner` instance configured by `config`.
// Any error from reading the configured input is propagated to the caller.
pub fn new_simple(config Config) !Scanner {
	source := config.input.read_input()!
	return Scanner{
		config: config
		text:   source
	}
}
| 63 | |
// new_simple_text returns a new *stack* allocated `Scanner` instance
// that is ready to scan the TOML found in `text`.
pub fn new_simple_text(text string) !Scanner {
	config := Config{
		input: input.Config{
			text: text
		}
	}
	source := config.input.read_input()!
	return Scanner{
		config: config
		text:   source
	}
}
| 78 | |
// new_simple_file returns a new *stack* allocated `Scanner` instance
// that is ready to scan the TOML read from the file at `path`.
pub fn new_simple_file(path string) !Scanner {
	config := Config{
		input: input.Config{
			file_path: path
		}
	}
	source := config.input.read_input()!
	return Scanner{
		config: config
		text:   source
	}
}
| 93 | |
// scan returns the next token from the input.
//
// Every call first runs `validate_and_skip_headers()`, then consumes characters until a
// token can be produced:
// - at the end of the input an `.eof` token is returned,
// - a NULL byte resets the scanner and yields an error,
// - on the right side of an assignment `(+/-)nan` and `(+/-)inf` become `.number` tokens,
// - digits and signed digits become `.number` tokens,
// - key characters become `.bare` tokens, or `.boolean` tokens for `true`/`false`,
// - formatting characters become tokens only if `config.tokenize_formatting` is set,
//   with the exception of a space between two digits which is always returned,
// - the remaining punctuation is mapped to its own token kind.
// An error is returned for any character that can not be scanned.
@[direct_array_access]
pub fn (mut s Scanner) scan() !token.Token {
	s.validate_and_skip_headers()!

	for {
		c := s.next()
		byte_c := u8(c)
		if c == end_of_text {
			s.inc_line_number()
			util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'reached EOF')
			return s.new_token(.eof, '', 1)
		}

		ascii := byte_c.ascii_str()
		util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'current char "${ascii}"')

		if byte_c == u8(0x0) {
			// Build the message *before* resetting the scanner, otherwise the reported
			// line/column and excerpt would always refer to the start of the text.
			msg := @MOD + '.' + @STRUCT + '.' + @FN +
				' NULL control character `${c.hex()}` is not allowed at (${s.line_nr},${s.col}) "${ascii}" near ...${s.excerpt(s.pos, 5)}...'
			s.reset()
			return error(msg)
		}

		is_sign := c == `+` || c == `-`

		// (+/-)nan & (+/-)inf
		// `c` is the character just consumed, `s.at()` the one after it,
		// `peek_1` and `peek_2` the two following that.
		peek_1 := s.peek(1)
		peek_2 := s.peek(2)
		is_nan := c == `n` && s.at() == `a` && peek_1 == `n`
		is_inf := !is_nan && c == `i` && s.at() == `n` && peek_1 == `f`
		is_signed_nan := is_sign && s.at() == `n` && peek_1 == `a` && peek_2 == `n`
		is_signed_inf := !is_signed_nan && is_sign && s.at() == `i` && peek_1 == `n`
			&& peek_2 == `f`
		// Only values (right side of `=`) can be special numbers; on the left they are keys.
		if !s.is_left_of_assign && (is_nan || is_inf || is_signed_nan || is_signed_inf) {
			num := s.extract_nan_or_inf_number()!
			util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
				'identified a special number "${num}" (${num.len})')
			return s.new_token(.number, num, num.len)
		}

		// A sign only starts a number if it is followed by a digit and not preceded by one.
		is_signed_number := is_sign && u8(s.at()).is_digit() && !u8(s.peek(-1)).is_digit()
		is_digit := byte_c.is_digit()
		if is_digit || is_signed_number {
			num := s.extract_number()!
			util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
				'identified a number "${num}" (${num.len})')
			return s.new_token(.number, num, num.len)
		}

		if util.is_key_char(byte_c) {
			key := s.extract_key()
			if u8(s.peek(1)) != `=` && (key == 'true' || key == 'false') {
				util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
					'identified a boolean "${key}" (${key.len})')
				return s.new_token(.boolean, key, key.len)
			}
			util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
				'identified a bare key "${key}" (${key.len})')
			return s.new_token(.bare, key, key.len)
		}

		match rune(c) {
			` `, `\t`, `\n`, `\r` {
				if c == `\n` {
					s.inc_line_number()
					util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
						'incremented line nr to ${s.line_nr}')
				} else if c == `\r` {
					// CR should always be followed by a `\n`
					if s.at() != `\n` {
						return error(@MOD + '.' + @STRUCT + '.' + @FN +
							' missing newline/linefeed character after "\\r" carriage return at (${s.line_nr},${s.col}) "${ascii}" near ...${s.excerpt(s.pos, 5)}...')
					}
				}
				// Date-Time in RFC 3339 is allowed to have a space between the date and time in supplement to the 'T'
				// so we allow space characters to slip through to the parser if the space is between two digits...
				if c == ` ` && u8(s.peek(-1)).is_digit() && u8(s.at()).is_digit() {
					util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
						'identified, what could be, a space between a RFC 3339 date and time ("${ascii}") (${ascii.len})')
					return s.new_token(token.Kind.whitespace, ascii, ascii.len)
				}
				if s.config.tokenize_formatting {
					mut kind := token.Kind.whitespace
					if c == `\t` {
						kind = token.Kind.tab
					} else if c == `\r` {
						kind = token.Kind.cr
					} else if c == `\n` {
						kind = token.Kind.nl
					}
					util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
						'identified formatting character ("${ascii}") (${ascii.len})')
					return s.new_token(kind, ascii, ascii.len)
				} else {
					util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
						'skipping " ", "\\t" or "\\n" ("${ascii}") (${ascii.len})')
				}
				continue
			}
			`-` {
				util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
					'identified minus "${ascii}" (${ascii.len})')
				return s.new_token(.minus, ascii, ascii.len)
			}
			`_` {
				util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
					'identified underscore "${ascii}" (${ascii.len})')
				return s.new_token(.underscore, ascii, ascii.len)
			}
			`+` {
				util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
					'identified plus "${ascii}" (${ascii.len})')
				return s.new_token(.plus, ascii, ascii.len)
			}
			`=` {
				// From here until the next line the scanner is on the value side.
				s.is_left_of_assign = false
				util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
					'identified assignment "${ascii}" (${ascii.len})')
				return s.new_token(.assign, ascii, ascii.len)
			}
			`"`, `'` { // ... some string "/'
				ident_string := s.extract_string()!
				util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
					'identified quoted string `${ident_string}`')
				return s.new_token(.quoted, ident_string, ident_string.len)
			}
			`#` {
				hash := s.ignore_line()!
				util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
					'identified comment hash "${hash}" (${hash.len})')
				// `hash` does not include the `#` itself, hence the `+ 1`.
				return s.new_token(.hash, hash, hash.len + 1)
			}
			`{` {
				util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
					'identified left curly bracket "${ascii}" (${ascii.len})')
				return s.new_token(.lcbr, ascii, ascii.len)
			}
			`}` {
				util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
					'identified right curly bracket "${ascii}" (${ascii.len})')
				return s.new_token(.rcbr, ascii, ascii.len)
			}
			`[` {
				util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
					'identified left square bracket "${ascii}" (${ascii.len})')
				return s.new_token(.lsbr, ascii, ascii.len)
			}
			`]` {
				util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
					'identified right square bracket "${ascii}" (${ascii.len})')
				return s.new_token(.rsbr, ascii, ascii.len)
			}
			`:` {
				util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
					'identified colon "${ascii}" (${ascii.len})')
				return s.new_token(.colon, ascii, ascii.len)
			}
			`,` {
				util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
					'identified comma "${ascii}" (${ascii.len})')
				return s.new_token(.comma, ascii, ascii.len)
			}
			`.` {
				util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
					'identified period "${ascii}" (${ascii.len})')
				return s.new_token(.period, ascii, ascii.len)
			}
			else {
				return error(@MOD + '.' + @STRUCT + '.' + @FN +
					' could not scan character `${ascii}` / ${c} at ${s.pos} (${s.line_nr},${s.col}) near ...${s.excerpt(s.pos, 5)}...')
			}
		}
	}
	// Every path through the loop above either returns or continues.
	util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'unknown character code at ${s.pos} (${s.line_nr},${s.col}) near ...${s.excerpt(s.pos,
		5)}...')
	return s.new_token(.unknown, '', 0)
}
| 272 | |
// free releases the memory held by the scanner's `text` field.
// The scanner must not be used to read `text` after this call.
@[unsafe]
pub fn (mut s Scanner) free() {
	unsafe { s.text.free() }
}
| 280 | |
// remaining returns the number of characters between the current
// scanner position and the end of the text input.
@[inline]
pub fn (s &Scanner) remaining() int {
	total := s.text.len
	return total - s.pos
}
| 286 | |
// next consumes and returns the character code at the current position,
// advancing both the position and the column by one.
// next returns `end_of_text`, without changing any state, if the position is at or past the end of the text.
@[direct_array_access; inline]
pub fn (mut s Scanner) next() u32 {
	if s.pos >= s.text.len {
		return end_of_text
	}
	ch := s.text[s.pos]
	s.pos++
	s.col++
	return ch
}
| 300 | |
// skip moves the scanner one character ahead.
// Nothing happens if the scanner is already at the last character of the text (or beyond it).
@[inline]
pub fn (mut s Scanner) skip() {
	if s.pos + 1 >= s.text.len {
		return
	}
	s.pos++
	s.col++
}
| 309 | |
// skip_n moves the scanner `n` characters ahead.
// If that would move the position past the length of `Scanner.text`,
// the position is set to `Scanner.text.len` instead.
// Note that the column is set to the resulting flat position.
@[inline]
pub fn (mut s Scanner) skip_n(n int) {
	target := s.pos + n
	s.pos = if target > s.text.len { s.text.len } else { target }
	s.col = s.pos
}
| 321 | |
// at returns the character code at the *current* position of the input text.
// at returns `end_of_text` if the position is at or past the end of the text.
// In contrast to `next()`, `at()` leaves the state of the scanner untouched.
@[direct_array_access; inline]
pub fn (s &Scanner) at() u32 {
	if s.pos >= s.text.len {
		return end_of_text
	}
	return s.text[s.pos]
}
| 332 | |
// at_crlf reports whether the character at the current position is a `\r`
// that is immediately followed by a `\n`.
fn (s &Scanner) at_crlf() bool {
	if s.at() != `\r` {
		return false
	}
	return s.peek(1) == `\n`
}
| 338 | |
// peek returns the character code from the input text at position + `n`, for `n >= 1`.
// peek returns `end_of_text` if it can't peek `n` characters ahead.
//
// Peeking back is supported for `n <= 0`: in that case the character at
// position + `n` - 1 is returned whenever that index is valid, so right after
// a call to `next()`, `peek(-1)` yields the character *before* the one just consumed.
// If that index is not valid the character at position + `n` is returned, or
// `end_of_text` if that index would be negative too.
@[direct_array_access; inline]
pub fn (s &Scanner) peek(n int) u32 {
	if s.pos + n < s.text.len {
		// Allow peeking back - needed for spaces between date and time in RFC 3339 format :/
		if n - 1 < 0 && s.pos + n - 1 >= 0 {
			return s.text[s.pos + n - 1]
		}
		idx := s.pos + n
		// Array access is unchecked in this function, so a negative index
		// must never reach the text lookup below.
		if idx < 0 {
			return end_of_text
		}
		return s.text[idx]
	}
	return end_of_text
}
| 353 | |
// reset rewinds the scanner to the start of the text: the position, column and
// detected header length are zeroed and the line number is set back to 1.
// The `is_left_of_assign` flag is left as it is.
pub fn (mut s Scanner) reset() {
	s.header_len = 0
	s.line_nr = 1
	s.col = 0
	s.pos = 0
}
| 361 | |
// new_token returns a new `token.Token` of `kind` with the literal `lit`, located `len`
// characters before the current scanner position.
// The column is never reported as less than 1, and on the first line it is
// adjusted for the length of any header that was skipped.
@[inline]
fn (mut s Scanner) new_token(kind token.Kind, lit string, len int) token.Token {
	// The header length only shifts columns on the first line.
	header_cols := if s.line_nr == 1 { s.header_len } else { 0 }
	raw_col := s.col - len + 1 - header_cols
	return token.Token{
		kind:    kind
		lit:     lit
		col:     if raw_col < 1 { 1 } else { raw_col }
		line_nr: s.line_nr + 1
		pos:     s.pos - s.header_len - len + 1
		len:     len
	}
}
| 379 | |
// ignore_line forwards the scanner to the end of the current line and returns the text it passed.
// The scanner stops *at* (does not consume) the terminating `\n`, or the `\r` of a `\r\n` pair,
// or at the end of the text.
@[direct_array_access; inline]
fn (mut s Scanner) ignore_line() !string {
	util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, ' ignoring until EOL...')
	begin := s.pos
	for {
		ch := s.at()
		if ch == end_of_text || ch == `\n` {
			break
		}
		util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'skipping "${u8(ch).ascii_str()} / ${ch}"')
		if s.at_crlf() {
			util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'letting `\\r\\n` slip through')
			break
		}
		s.next()
	}
	return s.text[begin..s.pos]
}
| 395 | |
// inc_line_number moves the scanner's bookkeeping to the start of the next line:
// the line number is increased, the column is zeroed and the scanner is
// considered to be on the left side of an assignment again.
@[inline]
fn (mut s Scanner) inc_line_number() {
	s.line_nr++
	s.col = 0
	s.is_left_of_assign = true
}
| 403 | |
// extract_key collects and returns a TOML key as a string.
// It is called after the first character of the key has been consumed, so the
// scanner is rewound by one character first. Input is then consumed for as long as
// it consists of key characters, digits, `_` or `-`.
@[direct_array_access; inline]
fn (mut s Scanner) extract_key() string {
	// Step back onto the first character of the key.
	s.pos--
	s.col--
	begin := s.pos
	for s.pos < s.text.len {
		ch := u8(s.at())
		is_part_of_key := util.is_key_char(ch) || ch.is_digit() || ch in [`_`, `-`]
		if !is_part_of_key {
			break
		}
		s.pos++
		s.col++
	}
	return s.text[begin..s.pos]
}
| 421 | |
// extract_string collects and returns a string containing
// any bytes recognized as a TOML string.
// TOML strings are everything found between two double or single quotation marks (`"`/`'`).
// The returned literal includes the surrounding quotation marks.
// If the string starts with three quotation marks the work is delegated to
// `extract_multiline_string`.
// An error is returned if the string is not terminated on the same line, or if it
// contains a control character that `util.is_illegal_ascii_control_character` rejects.
@[direct_array_access; inline]
fn (mut s Scanner) extract_string() !string {
	// extract_string is called when the scanner has already reached
	// a byte that is the start of a string so we rewind it to start at the correct
	s.pos--
	s.col--
	quote := u8(s.at())
	start := s.pos
	mut lit := quote.ascii_str()

	// Use the bounds-checked `peek` for the look-ahead: array access is unchecked in this
	// function and the opening quote may be (next to) the last byte of the text.
	is_multiline := s.peek(1) == quote && s.peek(2) == quote
	// Check for escaped multiline quote
	if is_multiline {
		mls := s.extract_multiline_string()!
		return mls
	}

	for {
		s.pos++
		s.col++

		if s.pos >= s.text.len {
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' unfinished single-line string literal `${quote.ascii_str()}` started at ${start} (${s.line_nr},${s.col}) "${u8(s.at()).ascii_str()}" near ...${s.excerpt(s.pos, 5)}...')
		}

		c := u8(s.at())
		util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
			'c: `${c.ascii_str()}` / ${c} (quote type: ${quote}/${quote.ascii_str()})')

		// Check for escaped chars (92 is the backslash)
		if c == u8(92) {
			esc, skip := s.handle_escapes(quote, is_multiline)
			lit += esc
			if skip > 0 {
				// The escape sequence was consumed as a whole.
				s.pos += skip
				s.col += skip
				continue
			}
		}
		// Check for control characters (allow TAB)
		if util.is_illegal_ascii_control_character(c) {
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' control character `${c.hex()}` is not allowed at ${start} (${s.line_nr},${s.col}) "${u8(s.at()).ascii_str()}" near ...${s.excerpt(s.pos, 5)}...')
		}

		if c == quote {
			// Closing quote: step past it and return the complete literal.
			s.pos++
			s.col++
			return lit + quote.ascii_str()
		}

		lit += c.ascii_str()

		// Don't eat multiple lines in single-line mode
		if lit.contains('\n') {
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' unfinished single-line string literal `${quote.ascii_str()}` started at ${start} (${s.line_nr},${s.col}) "${u8(s.at()).ascii_str()}" near ...${s.excerpt(s.pos, 5)}...')
		}
	}
	return lit
}
| 487 | |
// extract_multiline_string collects and returns a multi-line TOML string, i.e. everything
// found between two sets of three double or single quotation marks (`"""`/`'''`).
// The returned literal includes the opening and closing quotation marks.
// A `\r` that is directly followed by `\n` is dropped from the literal.
// An error is returned if the string is not terminated before the end of the text, or if it
// contains a control character that `util.is_illegal_ascii_control_character` rejects.
@[direct_array_access; inline]
fn (mut s Scanner) extract_multiline_string() !string {
	// extract_multiline_string is called from extract_string so we know the 3 first
	// characters is the quotes
	quote := u8(s.at())
	start := s.pos
	triple_quote := quote.ascii_str() + quote.ascii_str() + quote.ascii_str()
	mut lit := triple_quote

	util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'multi-line `${quote.ascii_str()}${s.text[s.pos + 1].ascii_str()}${s.text[s.pos + 2].ascii_str()}` string started at pos ${start} (${s.line_nr},${s.col}) (quote type: ${quote.ascii_str()} / ${quote})')

	// Move onto the last of the three opening quotes.
	s.pos += 2
	s.col += 2

	for {
		s.pos++
		s.col++

		if s.pos >= s.text.len {
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' unfinished multi-line string literal (${quote.ascii_str()}${quote.ascii_str()}${quote.ascii_str()}) started at ${start} (${s.line_nr},${s.col}) "${u8(s.at()).ascii_str()}" near ...${s.excerpt(s.pos, 5)}...')
		}

		c := u8(s.at())
		util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
			'c: `${c.ascii_str()}` / ${c} (quote type: ${quote}/${quote.ascii_str()})')

		// Drop the `\r` of a `\r\n` pair; the `\n` is handled in the next iteration.
		if c == `\r` && s.peek(1) == `\n` {
			continue
		}
		if c == `\n` {
			s.inc_line_number()
			lit += c.ascii_str()
			util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'c: `\\n` / ${c}')
			continue
		}
		// Check for escaped chars (92 is the backslash)
		if c == u8(92) {
			esc, skip := s.handle_escapes(quote, true)
			lit += esc
			if skip > 0 {
				s.pos += skip
				s.col += skip
				continue
			}
		}
		// Check for control characters (allow TAB)
		if util.is_illegal_ascii_control_character(c) {
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' control character `${c.hex()}` is not allowed at ${start} (${s.line_nr},${s.col}) "${u8(s.at()).ascii_str()}" near ...${s.excerpt(s.pos, 5)}...')
		}

		// The string ends at three quotes that are not followed by a fourth quote.
		// `end_of_text` can never equal `quote`, so reaching the end of the text after
		// the three quotes also terminates the string.
		is_closing := c == quote && s.peek(1) == quote && s.peek(2) == quote
			&& s.peek(3) != quote
		if is_closing {
			s.pos += 3
			s.col += 3
			lit += triple_quote
			util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
				'returning at ${c.ascii_str()} `${lit}`')
			return lit
		}
		lit += c.ascii_str()
	}
	return lit
}
| 568 | |
// handle_escapes inspects the escape sequence starting at the current position and returns
// the text to append to the string literal together with the number of *extra* characters
// the caller should skip. A result of `'', 0` means that the current character
// should be handled by the caller as an ordinary character.
// `quote` is the quotation mark the string was opened with, `is_multiline` tells if the
// string is a multi-line string.
// For escape sequence validation see `Checker.check_quoted_escapes`.
fn (mut s Scanner) handle_escapes(quote u8, is_multiline bool) (string, int) {
	current := u8(s.at())
	mut lit := current.ascii_str()
	follower := s.peek(1)

	if quote == `'` {
		// Literal string: an escape in front of the quote has no special meaning.
		if follower == quote {
			util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
				'ignore escape `${lit}${u8(s.peek(1)).ascii_str()}` in literal string')
			return '', 0
		}
	} else {
		is_unicode := follower == `u` && u8(s.peek(2)).is_hex_digit()
			&& u8(s.peek(3)).is_hex_digit() && u8(s.peek(4)).is_hex_digit()
			&& u8(s.peek(5)).is_hex_digit()
		if is_unicode {
			// Take the `u` and the four hex digits in one go.
			lit += s.text[s.pos + 1..s.pos + 6]
			util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'gulp escaped unicode `${lit}`')
			return lit, 5
		}
		if follower == quote {
			closes_string := if is_multiline {
				s.peek(2) == quote && s.peek(3) == quote && s.peek(4) == `\n`
			} else {
				s.peek(2) == `\n`
			}
			if closes_string {
				util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
					'ignore special case escaped `${lit}` at end of string')
				return '', 0
			}
			lit += quote.ascii_str()
			util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'gulp escaped `${lit}`')
			return lit, 1
		}
	}

	// Any other sequence: take the escape character and the character following it.
	lit += u8(s.peek(1)).ascii_str()
	util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'gulp escaped `${lit}`')
	return lit, 1
}
| 605 | |
// extract_number collects and returns a string containing
// any bytes recognized as a TOML number except for "(+/-)nan" and "(+/-)inf".
// Besides the leading sign, the collected text consists of hex digits, the characters in
// `digit_extras` and the sign of a signed exponent.
// A `.` ends the number if the scanner is on the left side of an assignment.
// An error is returned if the first character is neither a digit nor a sign.
@[direct_array_access; inline]
fn (mut s Scanner) extract_number() !string {
	// extract_number is called when the scanner has already reached
	// a byte that is a number or +/- - so we rewind it to start at the correct
	// position to get the complete number. Even if it's only one digit
	s.pos--
	s.col--
	begin := s.pos

	mut c := s.at()
	starts_number := u8(c).is_digit() || c in [`+`, `-`]
	if !starts_number {
		return error(@MOD + '.' + @STRUCT + '.' + @FN +
			' ${u8(c).ascii_str()} is not a number at ${s.excerpt(s.pos, 10)}')
	}
	s.pos++
	s.col++
	for s.pos < s.text.len {
		c = s.at()
		// Adjust scanner position to floating point numbers
		mut fraction_len := 0
		if c == `.` {
			// Count what follows the `.` up to the end of the line. The count is only
			// kept if everything up to the newline is a digit or a `,`.
			mut offset := 1
			for {
				ahead := u8(s.peek(offset))
				if ahead == end_of_text || ahead == `\n` {
					break
				}
				if !ahead.is_digit() && ahead != `,` {
					fraction_len = 0
					break
				}
				fraction_len++
				offset++
			}
		}
		s.pos += fraction_len
		s.col += fraction_len
		// Handle signed exponent notation. I.e.: 3e2, 3E2, 3e-2, 3E+2, 3e0, 3.1e2, 3.1E2, -1E-1
		has_signed_exponent := c in [`e`, `E`] && s.peek(1) in [`+`, `-`]
			&& u8(s.peek(2)).is_digit()
		if has_signed_exponent {
			s.pos += 2
			s.col += 2
		}
		c = s.at()
		is_numeric := u8(c).is_hex_digit() || c in digit_extras
		if !is_numeric || (c == `.` && s.is_left_of_assign) {
			break
		}
		s.pos++
		s.col++
	}
	number := s.text[begin..s.pos]
	util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
		'identified number "${number}" in range [${begin} .. ${s.pos}]')
	return number
}
| 660 | |
// extract_nan_or_inf_number collects and returns a string containing
// any bytes recognized as infinity or not-a-number TOML numbers.
// After the first character (`+`, `-`, `n` or `i`) it consumes every following
// `n`, `a`, `i` and `f` character.
// An error is returned if the first character is none of `+`, `-`, `n` and `i`.
@[direct_array_access; inline]
fn (mut s Scanner) extract_nan_or_inf_number() !string {
	// extract_nan_or_inf_number is called when the scanner has already identified that
	// +/- or 'nan'/'inf' bytes is up but we rewind it to start at the correct position
	s.pos--
	s.col--
	begin := s.pos

	first := s.at()
	if first !in [`+`, `-`, `n`, `i`] {
		return error(@MOD + '.' + @STRUCT + '.' + @FN +
			' ${u8(first).ascii_str()} is not a number at ${s.excerpt(s.pos, 10)}')
	}
	s.pos++
	s.col++
	for s.pos < s.text.len && s.at() in [`n`, `a`, `i`, `f`] {
		s.pos++
		s.col++
	}
	special := s.text[begin..s.pos]
	util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN,
		'identified special number "${special}" in range [${begin} .. ${s.pos}]')
	return special
}
| 691 | |
// excerpt returns a string excerpt of the input text centered
// at `pos`. The `margin` argument defines how many characters
// on each side of `pos` are returned.
// Newlines in the excerpt are replaced with the two characters `\n`.
// The excerpt is clamped to the bounds of the text, so an out-of-range `pos`
// or `margin` yields a shorter (possibly empty) excerpt instead of a panic.
pub fn (s &Scanner) excerpt(pos int, margin int) string {
	mut start := if pos > 0 && pos >= margin { pos - margin } else { 0 }
	mut end := if pos + margin < s.text.len { pos + margin } else { s.text.len }
	// Keep `0 <= start <= end <= s.text.len`, otherwise the slice below would panic.
	if start > s.text.len {
		start = s.text.len
	}
	if end < start {
		end = start
	}
	return s.text[start..end].replace('\n', r'\n')
}
| 700 | |
// state returns a read-only snapshot of the scanner's current column,
// line number and flat position.
pub fn (s &Scanner) state() State {
	snapshot := State{
		col:     s.col
		line_nr: s.line_nr
		pos:     s.pos
	}
	return snapshot
}
| 709 | |
// validate_and_skip_headers rejects UTF-16 and UTF-32 byte order marks found at the
// current position and skips an optional UTF-8 byte order mark.
// The length of a skipped UTF-8 byte order mark is stored in `header_len`.
fn (mut s Scanner) validate_and_skip_headers() ! {
	// UTF-16 / UTF-32 headers (BE/LE)
	s.check_utf16_or_32_bom()!

	// NICE-TO-HAVE-TODO Check other types of (UTF-?) headers and yield an error. TOML is UTF-8 only.

	// Skip optional UTF-8 header, if any.
	has_utf8_bom := s.at() == 0xEF && s.peek(1) == 0xBB && s.peek(2) == 0xBF
	if has_utf8_bom {
		util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'skipping UTF-8 byte order mark (BOM)')
		s.header_len = 3
		s.skip_n(s.header_len)
	}

	// Check after we've skipped UTF-8 BOM
	s.check_utf16_or_32_bom()!
}
| 726 | |
// check_utf16_or_32_bom returns an error if a UTF-32 or UTF-16 byte order mark
// (little or big endian) is found at the current position.
// Before the error is returned the length of the byte order mark is stored in
// `header_len` and the scanner is moved past it.
fn (mut s Scanner) check_utf16_or_32_bom() ! {
	// UTF-32 is checked first since the UTF-32 LE mark starts with the UTF-16 LE mark.
	is_utf32_le := s.at() == 0xFF && s.peek(1) == 0xFE && s.peek(2) == 0x00 && s.peek(3) == 0x00
	is_utf32_be := s.at() == 0x00 && s.peek(1) == 0x00 && s.peek(2) == 0xFE && s.peek(3) == 0xFF
	if is_utf32_le || is_utf32_be {
		s.header_len = 4
		s.skip_n(s.header_len)
		return error(@MOD + '.' + @STRUCT + '.' + @FN +
			' UTF-32 is not a valid TOML encoding at ${s.pos} (${s.line_nr},${s.col}) near ...${s.excerpt(s.pos, 5)}...')
	}
	is_utf16_be := s.at() == 0xFE && s.peek(1) == 0xFF
	is_utf16_le := s.at() == 0xFF && s.peek(1) == 0xFE
	if is_utf16_be || is_utf16_le {
		s.header_len = 2
		s.skip_n(s.header_len)
		return error(@MOD + '.' + @STRUCT + '.' + @FN +
			' UTF-16 is not a valid TOML encoding at ${s.pos} (${s.line_nr},${s.col}) near ...${s.excerpt(s.pos, 5)}...')
	}
}
| 742 | |