From 76530de56d9a48118baeca24b4fbc50af92c0858 Mon Sep 17 00:00:00 2001 From: igrekus Date: Fri, 17 Nov 2023 18:11:30 +0300 Subject: [PATCH] scanner: implement support for UTF-32 escape codes in string literals (#19911) --- .../checker/tests/string_escape_u16_err_a.out | 5 + ..._u_err_a.vv => string_escape_u16_err_a.vv} | 0 .../checker/tests/string_escape_u16_err_b.out | 5 + ..._u_err_b.vv => string_escape_u16_err_b.vv} | 0 .../checker/tests/string_escape_u32_err_a.out | 5 + .../checker/tests/string_escape_u32_err_a.vv | 3 + .../checker/tests/string_escape_u32_err_b.out | 5 + .../checker/tests/string_escape_u32_err_b.vv | 3 + .../v/checker/tests/string_escape_u_err_a.out | 5 - .../v/checker/tests/string_escape_u_err_b.out | 5 - vlib/v/gen/native/tests/string.vv | 7 +- vlib/v/gen/native/tests/string.vv.out | 3 +- vlib/v/scanner/scanner.v | 98 ++++++++++++++----- vlib/v/scanner/scanner_test.v | 55 ++++++++++- .../scanner/tests/invalid_unicode_16_err.out | 4 + ...icode_err.vv => invalid_unicode_16_err.vv} | 0 .../scanner/tests/invalid_unicode_32_err.out | 4 + .../v/scanner/tests/invalid_unicode_32_err.vv | 2 + vlib/v/scanner/tests/invalid_unicode_err.out | 4 - 19 files changed, 169 insertions(+), 44 deletions(-) create mode 100644 vlib/v/checker/tests/string_escape_u16_err_a.out rename vlib/v/checker/tests/{string_escape_u_err_a.vv => string_escape_u16_err_a.vv} (100%) create mode 100644 vlib/v/checker/tests/string_escape_u16_err_b.out rename vlib/v/checker/tests/{string_escape_u_err_b.vv => string_escape_u16_err_b.vv} (100%) create mode 100644 vlib/v/checker/tests/string_escape_u32_err_a.out create mode 100644 vlib/v/checker/tests/string_escape_u32_err_a.vv create mode 100644 vlib/v/checker/tests/string_escape_u32_err_b.out create mode 100644 vlib/v/checker/tests/string_escape_u32_err_b.vv delete mode 100644 vlib/v/checker/tests/string_escape_u_err_a.out delete mode 100644 vlib/v/checker/tests/string_escape_u_err_b.out create mode 100644 
vlib/v/scanner/tests/invalid_unicode_16_err.out rename vlib/v/scanner/tests/{invalid_unicode_err.vv => invalid_unicode_16_err.vv} (100%) create mode 100644 vlib/v/scanner/tests/invalid_unicode_32_err.out create mode 100644 vlib/v/scanner/tests/invalid_unicode_32_err.vv delete mode 100644 vlib/v/scanner/tests/invalid_unicode_err.out diff --git a/vlib/v/checker/tests/string_escape_u16_err_a.out b/vlib/v/checker/tests/string_escape_u16_err_a.out new file mode 100644 index 000000000..adb3692ab --- /dev/null +++ b/vlib/v/checker/tests/string_escape_u16_err_a.out @@ -0,0 +1,5 @@ +vlib/v/checker/tests/string_escape_u16_err_a.vv:2:15: error: `\u` incomplete 16 bit unicode character value + 1 | fn main() { + 2 | println('\u') + | ^ + 3 | } diff --git a/vlib/v/checker/tests/string_escape_u_err_a.vv b/vlib/v/checker/tests/string_escape_u16_err_a.vv similarity index 100% rename from vlib/v/checker/tests/string_escape_u_err_a.vv rename to vlib/v/checker/tests/string_escape_u16_err_a.vv diff --git a/vlib/v/checker/tests/string_escape_u16_err_b.out b/vlib/v/checker/tests/string_escape_u16_err_b.out new file mode 100644 index 000000000..dfbc7fba3 --- /dev/null +++ b/vlib/v/checker/tests/string_escape_u16_err_b.out @@ -0,0 +1,5 @@ +vlib/v/checker/tests/string_escape_u16_err_b.vv:2:15: error: `\u` incomplete 16 bit unicode character value + 1 | fn main() { + 2 | println('\u345') + | ^ + 3 | } diff --git a/vlib/v/checker/tests/string_escape_u_err_b.vv b/vlib/v/checker/tests/string_escape_u16_err_b.vv similarity index 100% rename from vlib/v/checker/tests/string_escape_u_err_b.vv rename to vlib/v/checker/tests/string_escape_u16_err_b.vv diff --git a/vlib/v/checker/tests/string_escape_u32_err_a.out b/vlib/v/checker/tests/string_escape_u32_err_a.out new file mode 100644 index 000000000..7218b0562 --- /dev/null +++ b/vlib/v/checker/tests/string_escape_u32_err_a.out @@ -0,0 +1,5 @@ +vlib/v/checker/tests/string_escape_u32_err_a.vv:2:15: error: `\U` incomplete 32 bit unicode character value 
+ 1 | fn main() { + 2 | println('\U') + | ^ + 3 | } diff --git a/vlib/v/checker/tests/string_escape_u32_err_a.vv b/vlib/v/checker/tests/string_escape_u32_err_a.vv new file mode 100644 index 000000000..9c87c8a79 --- /dev/null +++ b/vlib/v/checker/tests/string_escape_u32_err_a.vv @@ -0,0 +1,3 @@ +fn main() { + println('\U') +} diff --git a/vlib/v/checker/tests/string_escape_u32_err_b.out b/vlib/v/checker/tests/string_escape_u32_err_b.out new file mode 100644 index 000000000..03e0cccf3 --- /dev/null +++ b/vlib/v/checker/tests/string_escape_u32_err_b.out @@ -0,0 +1,5 @@ +vlib/v/checker/tests/string_escape_u32_err_b.vv:2:15: error: `\U` incomplete 32 bit unicode character value + 1 | fn main() { + 2 | println('\U345') + | ^ + 3 | } diff --git a/vlib/v/checker/tests/string_escape_u32_err_b.vv b/vlib/v/checker/tests/string_escape_u32_err_b.vv new file mode 100644 index 000000000..8293830fb --- /dev/null +++ b/vlib/v/checker/tests/string_escape_u32_err_b.vv @@ -0,0 +1,3 @@ +fn main() { + println('\U345') +} diff --git a/vlib/v/checker/tests/string_escape_u_err_a.out b/vlib/v/checker/tests/string_escape_u_err_a.out deleted file mode 100644 index 607c240e5..000000000 --- a/vlib/v/checker/tests/string_escape_u_err_a.out +++ /dev/null @@ -1,5 +0,0 @@ -vlib/v/checker/tests/string_escape_u_err_a.vv:2:15: error: `\u` incomplete unicode character value - 1 | fn main() { - 2 | println('\u') - | ^ - 3 | } diff --git a/vlib/v/checker/tests/string_escape_u_err_b.out b/vlib/v/checker/tests/string_escape_u_err_b.out deleted file mode 100644 index d81d688d1..000000000 --- a/vlib/v/checker/tests/string_escape_u_err_b.out +++ /dev/null @@ -1,5 +0,0 @@ -vlib/v/checker/tests/string_escape_u_err_b.vv:2:15: error: `\u` incomplete unicode character value - 1 | fn main() { - 2 | println('\u345') - | ^ - 3 | } diff --git a/vlib/v/gen/native/tests/string.vv b/vlib/v/gen/native/tests/string.vv index 7cbcd3536..1743d9b7a 100644 --- a/vlib/v/gen/native/tests/string.vv +++ 
b/vlib/v/gen/native/tests/string.vv @@ -8,6 +8,8 @@ fn test_escape_codes() { println(star1) star2 := '\u2605' println(star2) + star3 := '\U00002605' + println(star3) aaa := '\x61\141a' println(aaa) @@ -33,8 +35,9 @@ fn test_runes() { // should all print `★` print(`\u2605`) + print(`\U00002605`) print(`\xe2\x98\x85`) - println(`\xe2\x98\x85`) + println(`\xe2\x98\x85`) } fn main() { @@ -42,4 +45,4 @@ fn main() { test_escape_codes() test_raw_string() test_runes() -} \ No newline at end of file +} diff --git a/vlib/v/gen/native/tests/string.vv.out b/vlib/v/gen/native/tests/string.vv.out index c6d8227ab..b8f7159f4 100644 --- a/vlib/v/gen/native/tests/string.vv.out +++ b/vlib/v/gen/native/tests/string.vv.out @@ -1,6 +1,7 @@ 😀😆😎💻🌎 ★ ★ +★ aaa ## # ### # @@ -8,4 +9,4 @@ hello\tworld\n V 😀 🚀 -★★★ \ No newline at end of file +★★★★ \ No newline at end of file diff --git a/vlib/v/scanner/scanner.v b/vlib/v/scanner/scanner.v index 9997657ea..e90d790f1 100644 --- a/vlib/v/scanner/scanner.v +++ b/vlib/v/scanner/scanner.v @@ -1216,7 +1216,8 @@ fn (mut s Scanner) ident_string() string { s.inc_line_number() } s.is_inside_string = false - mut u_escapes_pos := []int{} // pos list of \uXXXX + mut u16_escapes_pos := []int{} // pos list of \uXXXX + mut u32_escapes_pos := []int{} // pos list of \UXXXXXXXX mut h_escapes_pos := []int{} // pos list of \xXX mut backslash_count := if start_char == scanner.backslash { 1 } else { 0 } for { @@ -1247,7 +1248,7 @@ fn (mut s Scanner) ident_string() string { if c == scanner.b_lf { s.inc_line_number() } - // Escape `\x` `\u` + // Escape `\x` `\u` `\U` if backslash_count % 2 == 1 && !is_raw && !is_cstr { // Escape `\x` if c == `x` { @@ -1263,9 +1264,23 @@ fn (mut s Scanner) ident_string() string { || s.text[s.pos + 3] == s.quote || s.text[s.pos + 4] == s.quote || !s.text[s.pos + 1].is_hex_digit() || !s.text[s.pos + 2].is_hex_digit() || !s.text[s.pos + 3].is_hex_digit() || !s.text[s.pos + 4].is_hex_digit() { - s.error(r'`\u` incomplete unicode character 
value') + s.error(r'`\u` incomplete 16 bit unicode character value') } - u_escapes_pos << s.pos - 1 + u16_escapes_pos << s.pos - 1 + } + // Escape `\U` + if c == `U` { + if s.text[s.pos + 1] == s.quote || s.text[s.pos + 2] == s.quote + || s.text[s.pos + 3] == s.quote || s.text[s.pos + 4] == s.quote + || s.text[s.pos + 5] == s.quote || s.text[s.pos + 6] == s.quote + || s.text[s.pos + 7] == s.quote || s.text[s.pos + 8] == s.quote + || !s.text[s.pos + 1].is_hex_digit() || !s.text[s.pos + 2].is_hex_digit() + || !s.text[s.pos + 3].is_hex_digit() || !s.text[s.pos + 4].is_hex_digit() + || !s.text[s.pos + 5].is_hex_digit() || !s.text[s.pos + 6].is_hex_digit() + || !s.text[s.pos + 7].is_hex_digit() || !s.text[s.pos + 8].is_hex_digit() { + s.error(r'`\U` incomplete 32 bit unicode character value') + } + u32_escapes_pos << s.pos - 1 } // Unknown escape sequence if !is_escape_sequence(c) && !c.is_digit() { @@ -1307,19 +1322,26 @@ fn (mut s Scanner) ident_string() string { if !s.is_fmt { mut segment_idx := 0 mut str_segments := []string{} - if u_escapes_pos.len + h_escapes_pos.len > 0 { + if u16_escapes_pos.len + h_escapes_pos.len + u32_escapes_pos.len > 0 { mut all_pos := []int{} - all_pos << u_escapes_pos + all_pos << u16_escapes_pos + all_pos << u32_escapes_pos all_pos << h_escapes_pos - if u_escapes_pos.len != 0 && h_escapes_pos.len != 0 { - all_pos.sort() - } + all_pos.sort() + for pos in all_pos { str_segments << string_so_far[segment_idx..(pos - start)] segment_idx = pos - start - if pos in u_escapes_pos { - end_idx, segment := s.decode_u_escape_single(string_so_far, segment_idx) + if pos in u16_escapes_pos { + end_idx, segment := s.decode_u16_escape_single(string_so_far, + segment_idx) + str_segments << segment + segment_idx = end_idx + } + if pos in u32_escapes_pos { + end_idx, segment := s.decode_u32_escape_single(string_so_far, + segment_idx) str_segments << segment segment_idx = end_idx } @@ -1407,7 +1429,7 @@ fn (mut s Scanner) decode_o_escapes(sinput string, start 
int, escapes_pos []int) return ss.join('') } -fn (mut s Scanner) decode_u_escape_single(str string, idx int) (int, string) { +fn (mut s Scanner) decode_u16_escape_single(str string, idx int) (int, string) { end_idx := idx + 6 // "\uXXXX".len == 6 escaped_code_point := strconv.parse_uint(str[idx + 2..end_idx], 16, 32) or { 0 } // Check if Escaped Code Point is invalid or not @@ -1418,9 +1440,32 @@ fn (mut s Scanner) decode_u_escape_single(str string, idx int) (int, string) { return end_idx, utf32_to_str(u32(escaped_code_point)) } -// decode a single unicode escaped rune into its utf-8 bytes -fn (mut s Scanner) decode_uerune(str string) string { - end_idx, segment := s.decode_u_escape_single(str, 0) +// decode a single 16 bit unicode escaped rune into its utf-8 bytes +fn (mut s Scanner) decode_u16erune(str string) string { + end_idx, segment := s.decode_u16_escape_single(str, 0) + if str.len == end_idx { + return segment + } + mut ss := []string{cap: 2} + ss << segment + ss << str[end_idx..] 
+ return ss.join('') +} + +fn (mut s Scanner) decode_u32_escape_single(str string, idx int) (int, string) { + end_idx := idx + 10 // "\UXXXXXXXX".len == 10 + escaped_code_point := strconv.parse_uint(str[idx + 2..end_idx], 16, 32) or { 0 } + // Check if Escaped Code Point is invalid or not + if rune(escaped_code_point).length_in_bytes() == -1 { + s.error('invalid unicode point `${str}`') + } + + return end_idx, utf32_to_str(u32(escaped_code_point)) +} + +// decode a single 32 bit unicode escaped rune into its utf-8 bytes +fn (mut s Scanner) decode_u32erune(str string) string { + end_idx, segment := s.decode_u32_escape_single(str, 0) if str.len == end_idx { return segment } @@ -1448,7 +1493,7 @@ fn trim_slash_line_break(s string) string { @[inline] fn is_escape_sequence(c u8) bool { return c in [`x`, `u`, `e`, `n`, `r`, `t`, `v`, `a`, `f`, `b`, `\\`, `\``, `$`, `@`, `?`, `{`, - `}`, `'`, `"`] + `}`, `'`, `"`, `U`] } /// ident_char is called when a backtick "single-char" is parsed from the code @@ -1460,6 +1505,7 @@ fn is_escape_sequence(c u8) bool { /// escaped single chars like `\\`, `\``, `\n` => '\\', '`', '\n' /// escaped single hex bytes like `\x01`, `\x61` => '\x01', 'a' /// escaped unicode literals like `\u2605` +/// escaped 32 bit unicode literals like `\U00002605` /// escaped utf8 runes in hex like `\xe2\x98\x85` => (★) /// escaped utf8 runes in octal like `\342\230\205` => (★) fn (mut s Scanner) ident_char() string { @@ -1475,8 +1521,10 @@ fn (mut s Scanner) ident_char() string { // set flags for advanced escapes first escaped_hex := s.expect('\\x', start + 1) - escaped_unicode := s.expect('\\u', start + 1) - escaped_octal := !escaped_hex && !escaped_unicode && s.expect('\\', start + 1) + escaped_unicode_16 := s.expect('\\u', start + 1) + escaped_unicode_32 := s.expect('\\U', start + 1) + escaped_octal := !escaped_hex && !escaped_unicode_16 && !escaped_unicode_32 + && s.expect('\\', start + 1) // walk the string to get characters up to the next backtick for { 
@@ -1505,13 +1553,17 @@ fn (mut s Scanner) ident_char() string { // the string inside the backticks is longer than one character // but we might only have one rune... attempt to decode escapes // if the content expresses an escape code, it will have an even number of characters - // e.g. (octal) \141 (hex) \x61 or (unicode) \u2605 + // e.g. (octal) \141 (hex) \x61 or (unicode) \u2605 or (32 bit unicode) \U00002605 // we don't handle binary escape codes in rune literals orig := c - if c.len % 2 == 0 && (escaped_hex || escaped_unicode || escaped_octal) { - if escaped_unicode { + if c.len % 2 == 0 + && (escaped_hex || escaped_unicode_16 || escaped_unicode_32 || escaped_octal) { + if escaped_unicode_16 { + // there can only be one, so attempt to decode it now + c = s.decode_u16erune(c) + } else if escaped_unicode_32 { // there can only be one, so attempt to decode it now - c = s.decode_uerune(c) + c = s.decode_u32erune(c) } else { // find escape sequence start positions mut escapes_pos := []int{} @@ -1530,7 +1582,7 @@ fn (mut s Scanner) ident_char() string { u := c.runes() if u.len != 1 { - if escaped_hex || escaped_unicode { + if escaped_hex || escaped_unicode_16 || escaped_unicode_32 { s.error_with_pos('invalid character literal `${orig}` => `${c}` (${u}) (escape sequence did not refer to a singular rune)', lspos) } else if u.len == 0 { diff --git a/vlib/v/scanner/scanner_test.v b/vlib/v/scanner/scanner_test.v index cad72982c..b640cf63e 100644 --- a/vlib/v/scanner/scanner_test.v +++ b/vlib/v/scanner/scanner_test.v @@ -155,6 +155,7 @@ fn test_escape_rune() { // will not work until v compiler on github is updated // assert `\x61` == `a` // assert `\u0061` == `a` + // assert `\U00000061` == `a` // will not work until PR is accepted // assert `\141` == `a` @@ -180,11 +181,16 @@ fn test_escape_rune() { assert result[0].kind == .chartoken assert result[0].lit == r'\\' - // SINGLE CHAR UNICODE ESCAPE + // SINGLE CHAR 16-bit UNICODE ESCAPE result = scan_tokens(r'`\u2605`') 
assert result[0].kind == .chartoken assert result[0].lit == r'★' + // SINGLE CHAR 32-bit UNICODE ESCAPE + result = scan_tokens(r'`\U00002605`') + assert result[0].kind == .chartoken + assert result[0].lit == r'★' + // SINGLE CHAR ESCAPED ASCII result = scan_tokens(r'`\x61`') assert result[0].kind == .chartoken @@ -207,6 +213,7 @@ fn test_escape_string() { assert '\x61' == 'a' assert '\x62' == 'b' assert '\u0061' == 'a' + assert '\U00000061' == 'a' assert '\141' == 'a' assert '\xe2\x98\x85' == '★' assert '\342\230\205' == '★' @@ -230,7 +237,7 @@ fn test_escape_string() { assert result[0].kind == .string assert result[0].lit == r'\\' - // STRING UNICODE ESCAPE + // STRING 16-bit UNICODE ESCAPE result = scan_tokens(r"'\u2605'") assert result[0].kind == .string assert result[0].lit == r'★' @@ -238,6 +245,14 @@ fn test_escape_string() { assert result[0].kind == .string assert result[0].lit == r'H★H' + // STRING 32-bit UNICODE ESCAPE + result = scan_tokens(r"'\U00002605'") + assert result[0].kind == .string + assert result[0].lit == r'★' + result = scan_tokens(r"'H\U00002605H'") + assert result[0].kind == .string + assert result[0].lit == r'H★H' + // STRING ESCAPED ASCII result = scan_tokens(r"'\x61'") assert result[0].kind == .string @@ -249,7 +264,7 @@ fn test_escape_string() { assert result[0].kind == .string assert result[0].lit.bytes() == [u8(0xe2), `9`, `8`, `8`, `5`] - // MIX STRING ESCAPES + // MIX STRING ESCAPES with UTF-16 escapes result = scan_tokens(r"'\x61\u2605'") assert result[0].kind == .string assert result[0].lit == r'a★' @@ -257,7 +272,7 @@ fn test_escape_string() { assert result[0].kind == .string assert result[0].lit == r'★a' - // MIX STRING ESCAPES with offset + // MIX STRING ESCAPES with UTF-16 escapes with offset result = scan_tokens(r"'x \x61\u2605\x61'") assert result[0].kind == .string assert result[0].lit == r'x a★a' @@ -265,6 +280,38 @@ fn test_escape_string() { assert result[0].kind == .string assert result[0].lit == r'x ★a★' + // MIX STRING 
ESCAPES with UTF-32 escapes + result = scan_tokens(r"'\x61\U00002605'") + assert result[0].kind == .string + assert result[0].lit == r'a★' + result = scan_tokens(r"'\U00002605\x61'") + assert result[0].kind == .string + assert result[0].lit == r'★a' + + // MIX STRING ESCAPES with UTF-32 escapes with offset + result = scan_tokens(r"'x \x61\U00002605\x61'") + assert result[0].kind == .string + assert result[0].lit == r'x a★a' + result = scan_tokens(r"'x \U00002605\x61\U00002605'") + assert result[0].kind == .string + assert result[0].lit == r'x ★a★' + + // MIX STRING ESCAPES with UTF-16 and UTF-32 escapes + result = scan_tokens(r"'\u2605\x61\U00002605'") + assert result[0].kind == .string + assert result[0].lit == r'★a★' + result = scan_tokens(r"'\U00002605\x61\u2605'") + assert result[0].kind == .string + assert result[0].lit == r'★a★' + + // MIX STRING ESCAPES with UTF-16 and UTF-32 escapes with offset + result = scan_tokens(r"'x \x61\U00002605\x61\u2605'") + assert result[0].kind == .string + assert result[0].lit == r'x a★a★' + result = scan_tokens(r"'x \x61\u2605\x61\U00002605'") + assert result[0].kind == .string + assert result[0].lit == r'x a★a★' + // SHOULD RESULT IN ERRORS // result = scan_tokens(r'`\x61\x61`') // should always result in an error // result = scan_tokens(r"'\x'") // should always result in an error diff --git a/vlib/v/scanner/tests/invalid_unicode_16_err.out b/vlib/v/scanner/tests/invalid_unicode_16_err.out new file mode 100644 index 000000000..7f9d15e5c --- /dev/null +++ b/vlib/v/scanner/tests/invalid_unicode_16_err.out @@ -0,0 +1,4 @@ +vlib/v/scanner/tests/invalid_unicode_16_err.vv:1:13: error: invalid unicode point `\uD8FF` + 1 | a := '\uD8FF' + | ^ + 2 | println(a) diff --git a/vlib/v/scanner/tests/invalid_unicode_err.vv b/vlib/v/scanner/tests/invalid_unicode_16_err.vv similarity index 100% rename from vlib/v/scanner/tests/invalid_unicode_err.vv rename to vlib/v/scanner/tests/invalid_unicode_16_err.vv diff --git 
a/vlib/v/scanner/tests/invalid_unicode_32_err.out b/vlib/v/scanner/tests/invalid_unicode_32_err.out new file mode 100644 index 000000000..128d130ea --- /dev/null +++ b/vlib/v/scanner/tests/invalid_unicode_32_err.out @@ -0,0 +1,4 @@ +vlib/v/scanner/tests/invalid_unicode_32_err.vv:1:17: error: invalid unicode point `\U0000D8FF` + 1 | a := '\U0000D8FF' + | ^ + 2 | println(a) diff --git a/vlib/v/scanner/tests/invalid_unicode_32_err.vv b/vlib/v/scanner/tests/invalid_unicode_32_err.vv new file mode 100644 index 000000000..cb7a24cd1 --- /dev/null +++ b/vlib/v/scanner/tests/invalid_unicode_32_err.vv @@ -0,0 +1,2 @@ +a := '\U0000D8FF' +println(a) diff --git a/vlib/v/scanner/tests/invalid_unicode_err.out b/vlib/v/scanner/tests/invalid_unicode_err.out deleted file mode 100644 index 95ab20e1b..000000000 --- a/vlib/v/scanner/tests/invalid_unicode_err.out +++ /dev/null @@ -1,4 +0,0 @@ -vlib/v/scanner/tests/invalid_unicode_err.vv:1:13: error: invalid unicode point `\uD8FF` - 1 | a := '\uD8FF' - | ^ - 2 | println(a) -- 2.39.5