From 3642cd580846f35344ea0c2248e74adc47ad4698 Mon Sep 17 00:00:00 2001 From: Alexander Medvednikov Date: Wed, 25 Mar 2026 16:42:23 +0300 Subject: [PATCH] builtin: add support for splitting strings into unicode grapheme clusters (fixes #22117) --- vlib/builtin/README.md | 2 + vlib/builtin/grapheme.v | 2350 +++++++++++++++++ vlib/builtin/string.v | 5 + vlib/builtin/string_test.v | 10 + vlib/builtin/utf8.v | 75 +- vlib/builtin/utf8_test.v | 5 + .../string_interpolation_test.v | 8 + 7 files changed, 2381 insertions(+), 74 deletions(-) create mode 100644 vlib/builtin/grapheme.v diff --git a/vlib/builtin/README.md b/vlib/builtin/README.md index 52ddbddca..444d501ad 100644 --- a/vlib/builtin/README.md +++ b/vlib/builtin/README.md @@ -3,6 +3,8 @@ `builtin` is a module that is implicitly imported by every V program. It implements the builtin V types `array`, `string`, `map`. +It also includes Unicode-aware string helpers such as `string.graphemes()` +for splitting text into grapheme clusters. It also implements builtin functions like `println`, `eprintln`, `malloc`, `panic`, `print_backtrace`. diff --git a/vlib/builtin/grapheme.v b/vlib/builtin/grapheme.v new file mode 100644 index 000000000..793a3670c --- /dev/null +++ b/vlib/builtin/grapheme.v @@ -0,0 +1,2350 @@ +module builtin + +// This file contains Unicode grapheme cluster property tables used by string.graphemes() +// and utf8_str_visible_length(). The data is derived from Unicode 13.0.0 +// GraphemeBreakProperty.txt and emoji-data.txt. 
+ +enum GraphemeBreakProperty { + other + cr + lf + control + extend + regional_indicator + prepend + spacing_mark + l + v + t + lv + lvt + zwj +} + +const grapheme_control_ranges = [ + u32(0x0000), + 0x0009, + 0x000B, + 0x000C, + 0x000E, + 0x001F, + 0x007F, + 0x009F, + 0x00AD, + 0x00AD, + 0x061C, + 0x061C, + 0x180E, + 0x180E, + 0x200B, + 0x200B, + 0x200E, + 0x200F, + 0x2028, + 0x2028, + 0x2029, + 0x2029, + 0x202A, + 0x202E, + 0x2060, + 0x2064, + 0x2065, + 0x2065, + 0x2066, + 0x206F, + 0xFEFF, + 0xFEFF, + 0xFFF0, + 0xFFF8, + 0xFFF9, + 0xFFFB, + 0x13430, + 0x13438, + 0x1BCA0, + 0x1BCA3, + 0x1D173, + 0x1D17A, + 0xE0000, + 0xE0000, + 0xE0001, + 0xE0001, + 0xE0002, + 0xE001F, + 0xE0080, + 0xE00FF, + 0xE01F0, + 0xE0FFF, +]! + +const grapheme_extend_ranges = [ + u32(0x0300), + 0x036F, + 0x0483, + 0x0487, + 0x0488, + 0x0489, + 0x0591, + 0x05BD, + 0x05BF, + 0x05BF, + 0x05C1, + 0x05C2, + 0x05C4, + 0x05C5, + 0x05C7, + 0x05C7, + 0x0610, + 0x061A, + 0x064B, + 0x065F, + 0x0670, + 0x0670, + 0x06D6, + 0x06DC, + 0x06DF, + 0x06E4, + 0x06E7, + 0x06E8, + 0x06EA, + 0x06ED, + 0x0711, + 0x0711, + 0x0730, + 0x074A, + 0x07A6, + 0x07B0, + 0x07EB, + 0x07F3, + 0x07FD, + 0x07FD, + 0x0816, + 0x0819, + 0x081B, + 0x0823, + 0x0825, + 0x0827, + 0x0829, + 0x082D, + 0x0859, + 0x085B, + 0x08D3, + 0x08E1, + 0x08E3, + 0x0902, + 0x093A, + 0x093A, + 0x093C, + 0x093C, + 0x0941, + 0x0948, + 0x094D, + 0x094D, + 0x0951, + 0x0957, + 0x0962, + 0x0963, + 0x0981, + 0x0981, + 0x09BC, + 0x09BC, + 0x09BE, + 0x09BE, + 0x09C1, + 0x09C4, + 0x09CD, + 0x09CD, + 0x09D7, + 0x09D7, + 0x09E2, + 0x09E3, + 0x09FE, + 0x09FE, + 0x0A01, + 0x0A02, + 0x0A3C, + 0x0A3C, + 0x0A41, + 0x0A42, + 0x0A47, + 0x0A48, + 0x0A4B, + 0x0A4D, + 0x0A51, + 0x0A51, + 0x0A70, + 0x0A71, + 0x0A75, + 0x0A75, + 0x0A81, + 0x0A82, + 0x0ABC, + 0x0ABC, + 0x0AC1, + 0x0AC5, + 0x0AC7, + 0x0AC8, + 0x0ACD, + 0x0ACD, + 0x0AE2, + 0x0AE3, + 0x0AFA, + 0x0AFF, + 0x0B01, + 0x0B01, + 0x0B3C, + 0x0B3C, + 0x0B3E, + 0x0B3E, + 0x0B3F, + 0x0B3F, + 0x0B41, + 0x0B44, + 0x0B4D, 
+ 0x0B4D, + 0x0B55, + 0x0B56, + 0x0B57, + 0x0B57, + 0x0B62, + 0x0B63, + 0x0B82, + 0x0B82, + 0x0BBE, + 0x0BBE, + 0x0BC0, + 0x0BC0, + 0x0BCD, + 0x0BCD, + 0x0BD7, + 0x0BD7, + 0x0C00, + 0x0C00, + 0x0C04, + 0x0C04, + 0x0C3E, + 0x0C40, + 0x0C46, + 0x0C48, + 0x0C4A, + 0x0C4D, + 0x0C55, + 0x0C56, + 0x0C62, + 0x0C63, + 0x0C81, + 0x0C81, + 0x0CBC, + 0x0CBC, + 0x0CBF, + 0x0CBF, + 0x0CC2, + 0x0CC2, + 0x0CC6, + 0x0CC6, + 0x0CCC, + 0x0CCD, + 0x0CD5, + 0x0CD6, + 0x0CE2, + 0x0CE3, + 0x0D00, + 0x0D01, + 0x0D3B, + 0x0D3C, + 0x0D3E, + 0x0D3E, + 0x0D41, + 0x0D44, + 0x0D4D, + 0x0D4D, + 0x0D57, + 0x0D57, + 0x0D62, + 0x0D63, + 0x0D81, + 0x0D81, + 0x0DCA, + 0x0DCA, + 0x0DCF, + 0x0DCF, + 0x0DD2, + 0x0DD4, + 0x0DD6, + 0x0DD6, + 0x0DDF, + 0x0DDF, + 0x0E31, + 0x0E31, + 0x0E34, + 0x0E3A, + 0x0E47, + 0x0E4E, + 0x0EB1, + 0x0EB1, + 0x0EB4, + 0x0EBC, + 0x0EC8, + 0x0ECD, + 0x0F18, + 0x0F19, + 0x0F35, + 0x0F35, + 0x0F37, + 0x0F37, + 0x0F39, + 0x0F39, + 0x0F71, + 0x0F7E, + 0x0F80, + 0x0F84, + 0x0F86, + 0x0F87, + 0x0F8D, + 0x0F97, + 0x0F99, + 0x0FBC, + 0x0FC6, + 0x0FC6, + 0x102D, + 0x1030, + 0x1032, + 0x1037, + 0x1039, + 0x103A, + 0x103D, + 0x103E, + 0x1058, + 0x1059, + 0x105E, + 0x1060, + 0x1071, + 0x1074, + 0x1082, + 0x1082, + 0x1085, + 0x1086, + 0x108D, + 0x108D, + 0x109D, + 0x109D, + 0x135D, + 0x135F, + 0x1712, + 0x1714, + 0x1732, + 0x1734, + 0x1752, + 0x1753, + 0x1772, + 0x1773, + 0x17B4, + 0x17B5, + 0x17B7, + 0x17BD, + 0x17C6, + 0x17C6, + 0x17C9, + 0x17D3, + 0x17DD, + 0x17DD, + 0x180B, + 0x180D, + 0x1885, + 0x1886, + 0x18A9, + 0x18A9, + 0x1920, + 0x1922, + 0x1927, + 0x1928, + 0x1932, + 0x1932, + 0x1939, + 0x193B, + 0x1A17, + 0x1A18, + 0x1A1B, + 0x1A1B, + 0x1A56, + 0x1A56, + 0x1A58, + 0x1A5E, + 0x1A60, + 0x1A60, + 0x1A62, + 0x1A62, + 0x1A65, + 0x1A6C, + 0x1A73, + 0x1A7C, + 0x1A7F, + 0x1A7F, + 0x1AB0, + 0x1ABD, + 0x1ABE, + 0x1ABE, + 0x1ABF, + 0x1AC0, + 0x1B00, + 0x1B03, + 0x1B34, + 0x1B34, + 0x1B35, + 0x1B35, + 0x1B36, + 0x1B3A, + 0x1B3C, + 0x1B3C, + 0x1B42, + 0x1B42, + 0x1B6B, + 0x1B73, + 0x1B80, 
+ 0x1B81, + 0x1BA2, + 0x1BA5, + 0x1BA8, + 0x1BA9, + 0x1BAB, + 0x1BAD, + 0x1BE6, + 0x1BE6, + 0x1BE8, + 0x1BE9, + 0x1BED, + 0x1BED, + 0x1BEF, + 0x1BF1, + 0x1C2C, + 0x1C33, + 0x1C36, + 0x1C37, + 0x1CD0, + 0x1CD2, + 0x1CD4, + 0x1CE0, + 0x1CE2, + 0x1CE8, + 0x1CED, + 0x1CED, + 0x1CF4, + 0x1CF4, + 0x1CF8, + 0x1CF9, + 0x1DC0, + 0x1DF9, + 0x1DFB, + 0x1DFF, + 0x200C, + 0x200C, + 0x20D0, + 0x20DC, + 0x20DD, + 0x20E0, + 0x20E1, + 0x20E1, + 0x20E2, + 0x20E4, + 0x20E5, + 0x20F0, + 0x2CEF, + 0x2CF1, + 0x2D7F, + 0x2D7F, + 0x2DE0, + 0x2DFF, + 0x302A, + 0x302D, + 0x302E, + 0x302F, + 0x3099, + 0x309A, + 0xA66F, + 0xA66F, + 0xA670, + 0xA672, + 0xA674, + 0xA67D, + 0xA69E, + 0xA69F, + 0xA6F0, + 0xA6F1, + 0xA802, + 0xA802, + 0xA806, + 0xA806, + 0xA80B, + 0xA80B, + 0xA825, + 0xA826, + 0xA82C, + 0xA82C, + 0xA8C4, + 0xA8C5, + 0xA8E0, + 0xA8F1, + 0xA8FF, + 0xA8FF, + 0xA926, + 0xA92D, + 0xA947, + 0xA951, + 0xA980, + 0xA982, + 0xA9B3, + 0xA9B3, + 0xA9B6, + 0xA9B9, + 0xA9BC, + 0xA9BD, + 0xA9E5, + 0xA9E5, + 0xAA29, + 0xAA2E, + 0xAA31, + 0xAA32, + 0xAA35, + 0xAA36, + 0xAA43, + 0xAA43, + 0xAA4C, + 0xAA4C, + 0xAA7C, + 0xAA7C, + 0xAAB0, + 0xAAB0, + 0xAAB2, + 0xAAB4, + 0xAAB7, + 0xAAB8, + 0xAABE, + 0xAABF, + 0xAAC1, + 0xAAC1, + 0xAAEC, + 0xAAED, + 0xAAF6, + 0xAAF6, + 0xABE5, + 0xABE5, + 0xABE8, + 0xABE8, + 0xABED, + 0xABED, + 0xFB1E, + 0xFB1E, + 0xFE00, + 0xFE0F, + 0xFE20, + 0xFE2F, + 0xFF9E, + 0xFF9F, + 0x101FD, + 0x101FD, + 0x102E0, + 0x102E0, + 0x10376, + 0x1037A, + 0x10A01, + 0x10A03, + 0x10A05, + 0x10A06, + 0x10A0C, + 0x10A0F, + 0x10A38, + 0x10A3A, + 0x10A3F, + 0x10A3F, + 0x10AE5, + 0x10AE6, + 0x10D24, + 0x10D27, + 0x10EAB, + 0x10EAC, + 0x10F46, + 0x10F50, + 0x11001, + 0x11001, + 0x11038, + 0x11046, + 0x1107F, + 0x11081, + 0x110B3, + 0x110B6, + 0x110B9, + 0x110BA, + 0x11100, + 0x11102, + 0x11127, + 0x1112B, + 0x1112D, + 0x11134, + 0x11173, + 0x11173, + 0x11180, + 0x11181, + 0x111B6, + 0x111BE, + 0x111C9, + 0x111CC, + 0x111CF, + 0x111CF, + 0x1122F, + 0x11231, + 0x11234, + 0x11234, + 0x11236, + 
0x11237, + 0x1123E, + 0x1123E, + 0x112DF, + 0x112DF, + 0x112E3, + 0x112EA, + 0x11300, + 0x11301, + 0x1133B, + 0x1133C, + 0x1133E, + 0x1133E, + 0x11340, + 0x11340, + 0x11357, + 0x11357, + 0x11366, + 0x1136C, + 0x11370, + 0x11374, + 0x11438, + 0x1143F, + 0x11442, + 0x11444, + 0x11446, + 0x11446, + 0x1145E, + 0x1145E, + 0x114B0, + 0x114B0, + 0x114B3, + 0x114B8, + 0x114BA, + 0x114BA, + 0x114BD, + 0x114BD, + 0x114BF, + 0x114C0, + 0x114C2, + 0x114C3, + 0x115AF, + 0x115AF, + 0x115B2, + 0x115B5, + 0x115BC, + 0x115BD, + 0x115BF, + 0x115C0, + 0x115DC, + 0x115DD, + 0x11633, + 0x1163A, + 0x1163D, + 0x1163D, + 0x1163F, + 0x11640, + 0x116AB, + 0x116AB, + 0x116AD, + 0x116AD, + 0x116B0, + 0x116B5, + 0x116B7, + 0x116B7, + 0x1171D, + 0x1171F, + 0x11722, + 0x11725, + 0x11727, + 0x1172B, + 0x1182F, + 0x11837, + 0x11839, + 0x1183A, + 0x11930, + 0x11930, + 0x1193B, + 0x1193C, + 0x1193E, + 0x1193E, + 0x11943, + 0x11943, + 0x119D4, + 0x119D7, + 0x119DA, + 0x119DB, + 0x119E0, + 0x119E0, + 0x11A01, + 0x11A0A, + 0x11A33, + 0x11A38, + 0x11A3B, + 0x11A3E, + 0x11A47, + 0x11A47, + 0x11A51, + 0x11A56, + 0x11A59, + 0x11A5B, + 0x11A8A, + 0x11A96, + 0x11A98, + 0x11A99, + 0x11C30, + 0x11C36, + 0x11C38, + 0x11C3D, + 0x11C3F, + 0x11C3F, + 0x11C92, + 0x11CA7, + 0x11CAA, + 0x11CB0, + 0x11CB2, + 0x11CB3, + 0x11CB5, + 0x11CB6, + 0x11D31, + 0x11D36, + 0x11D3A, + 0x11D3A, + 0x11D3C, + 0x11D3D, + 0x11D3F, + 0x11D45, + 0x11D47, + 0x11D47, + 0x11D90, + 0x11D91, + 0x11D95, + 0x11D95, + 0x11D97, + 0x11D97, + 0x11EF3, + 0x11EF4, + 0x16AF0, + 0x16AF4, + 0x16B30, + 0x16B36, + 0x16F4F, + 0x16F4F, + 0x16F8F, + 0x16F92, + 0x16FE4, + 0x16FE4, + 0x1BC9D, + 0x1BC9E, + 0x1D165, + 0x1D165, + 0x1D167, + 0x1D169, + 0x1D16E, + 0x1D172, + 0x1D17B, + 0x1D182, + 0x1D185, + 0x1D18B, + 0x1D1AA, + 0x1D1AD, + 0x1D242, + 0x1D244, + 0x1DA00, + 0x1DA36, + 0x1DA3B, + 0x1DA6C, + 0x1DA75, + 0x1DA75, + 0x1DA84, + 0x1DA84, + 0x1DA9B, + 0x1DA9F, + 0x1DAA1, + 0x1DAAF, + 0x1E000, + 0x1E006, + 0x1E008, + 0x1E018, + 0x1E01B, + 0x1E021, + 0x1E023, 
+ 0x1E024, + 0x1E026, + 0x1E02A, + 0x1E130, + 0x1E136, + 0x1E2EC, + 0x1E2EF, + 0x1E8D0, + 0x1E8D6, + 0x1E944, + 0x1E94A, + 0x1F3FB, + 0x1F3FF, + 0xE0020, + 0xE007F, + 0xE0100, + 0xE01EF, +]! + +const grapheme_spacing_mark_ranges = [ + u32(0x0903), + 0x0903, + 0x093B, + 0x093B, + 0x093E, + 0x0940, + 0x0949, + 0x094C, + 0x094E, + 0x094F, + 0x0982, + 0x0983, + 0x09BF, + 0x09C0, + 0x09C7, + 0x09C8, + 0x09CB, + 0x09CC, + 0x0A03, + 0x0A03, + 0x0A3E, + 0x0A40, + 0x0A83, + 0x0A83, + 0x0ABE, + 0x0AC0, + 0x0AC9, + 0x0AC9, + 0x0ACB, + 0x0ACC, + 0x0B02, + 0x0B03, + 0x0B40, + 0x0B40, + 0x0B47, + 0x0B48, + 0x0B4B, + 0x0B4C, + 0x0BBF, + 0x0BBF, + 0x0BC1, + 0x0BC2, + 0x0BC6, + 0x0BC8, + 0x0BCA, + 0x0BCC, + 0x0C01, + 0x0C03, + 0x0C41, + 0x0C44, + 0x0C82, + 0x0C83, + 0x0CBE, + 0x0CBE, + 0x0CC0, + 0x0CC1, + 0x0CC3, + 0x0CC4, + 0x0CC7, + 0x0CC8, + 0x0CCA, + 0x0CCB, + 0x0D02, + 0x0D03, + 0x0D3F, + 0x0D40, + 0x0D46, + 0x0D48, + 0x0D4A, + 0x0D4C, + 0x0D82, + 0x0D83, + 0x0DD0, + 0x0DD1, + 0x0DD8, + 0x0DDE, + 0x0DF2, + 0x0DF3, + 0x0E33, + 0x0E33, + 0x0EB3, + 0x0EB3, + 0x0F3E, + 0x0F3F, + 0x0F7F, + 0x0F7F, + 0x1031, + 0x1031, + 0x103B, + 0x103C, + 0x1056, + 0x1057, + 0x1084, + 0x1084, + 0x17B6, + 0x17B6, + 0x17BE, + 0x17C5, + 0x17C7, + 0x17C8, + 0x1923, + 0x1926, + 0x1929, + 0x192B, + 0x1930, + 0x1931, + 0x1933, + 0x1938, + 0x1A19, + 0x1A1A, + 0x1A55, + 0x1A55, + 0x1A57, + 0x1A57, + 0x1A6D, + 0x1A72, + 0x1B04, + 0x1B04, + 0x1B3B, + 0x1B3B, + 0x1B3D, + 0x1B41, + 0x1B43, + 0x1B44, + 0x1B82, + 0x1B82, + 0x1BA1, + 0x1BA1, + 0x1BA6, + 0x1BA7, + 0x1BAA, + 0x1BAA, + 0x1BE7, + 0x1BE7, + 0x1BEA, + 0x1BEC, + 0x1BEE, + 0x1BEE, + 0x1BF2, + 0x1BF3, + 0x1C24, + 0x1C2B, + 0x1C34, + 0x1C35, + 0x1CE1, + 0x1CE1, + 0x1CF7, + 0x1CF7, + 0xA823, + 0xA824, + 0xA827, + 0xA827, + 0xA880, + 0xA881, + 0xA8B4, + 0xA8C3, + 0xA952, + 0xA953, + 0xA983, + 0xA983, + 0xA9B4, + 0xA9B5, + 0xA9BA, + 0xA9BB, + 0xA9BE, + 0xA9C0, + 0xAA2F, + 0xAA30, + 0xAA33, + 0xAA34, + 0xAA4D, + 0xAA4D, + 0xAAEB, + 0xAAEB, + 0xAAEE, + 0xAAEF, + 
0xAAF5, + 0xAAF5, + 0xABE3, + 0xABE4, + 0xABE6, + 0xABE7, + 0xABE9, + 0xABEA, + 0xABEC, + 0xABEC, + 0x11000, + 0x11000, + 0x11002, + 0x11002, + 0x11082, + 0x11082, + 0x110B0, + 0x110B2, + 0x110B7, + 0x110B8, + 0x1112C, + 0x1112C, + 0x11145, + 0x11146, + 0x11182, + 0x11182, + 0x111B3, + 0x111B5, + 0x111BF, + 0x111C0, + 0x111CE, + 0x111CE, + 0x1122C, + 0x1122E, + 0x11232, + 0x11233, + 0x11235, + 0x11235, + 0x112E0, + 0x112E2, + 0x11302, + 0x11303, + 0x1133F, + 0x1133F, + 0x11341, + 0x11344, + 0x11347, + 0x11348, + 0x1134B, + 0x1134D, + 0x11362, + 0x11363, + 0x11435, + 0x11437, + 0x11440, + 0x11441, + 0x11445, + 0x11445, + 0x114B1, + 0x114B2, + 0x114B9, + 0x114B9, + 0x114BB, + 0x114BC, + 0x114BE, + 0x114BE, + 0x114C1, + 0x114C1, + 0x115B0, + 0x115B1, + 0x115B8, + 0x115BB, + 0x115BE, + 0x115BE, + 0x11630, + 0x11632, + 0x1163B, + 0x1163C, + 0x1163E, + 0x1163E, + 0x116AC, + 0x116AC, + 0x116AE, + 0x116AF, + 0x116B6, + 0x116B6, + 0x11720, + 0x11721, + 0x11726, + 0x11726, + 0x1182C, + 0x1182E, + 0x11838, + 0x11838, + 0x11931, + 0x11935, + 0x11937, + 0x11938, + 0x1193D, + 0x1193D, + 0x11940, + 0x11940, + 0x11942, + 0x11942, + 0x119D1, + 0x119D3, + 0x119DC, + 0x119DF, + 0x119E4, + 0x119E4, + 0x11A39, + 0x11A39, + 0x11A57, + 0x11A58, + 0x11A97, + 0x11A97, + 0x11C2F, + 0x11C2F, + 0x11C3E, + 0x11C3E, + 0x11CA9, + 0x11CA9, + 0x11CB1, + 0x11CB1, + 0x11CB4, + 0x11CB4, + 0x11D8A, + 0x11D8E, + 0x11D93, + 0x11D94, + 0x11D96, + 0x11D96, + 0x11EF5, + 0x11EF6, + 0x16F51, + 0x16F87, + 0x16FF0, + 0x16FF1, + 0x1D166, + 0x1D166, + 0x1D16D, + 0x1D16D, +]! + +const grapheme_prepend_ranges = [ + u32(0x0600), + 0x0605, + 0x06DD, + 0x06DD, + 0x070F, + 0x070F, + 0x08E2, + 0x08E2, + 0x0D4E, + 0x0D4E, + 0x110BD, + 0x110BD, + 0x110CD, + 0x110CD, + 0x111C2, + 0x111C3, + 0x1193F, + 0x1193F, + 0x11941, + 0x11941, + 0x11A3A, + 0x11A3A, + 0x11A84, + 0x11A89, + 0x11D46, + 0x11D46, +]! 
+ +const grapheme_extended_pictographic_ranges = [ + u32(0x00A9), + 0x00A9, + 0x00AE, + 0x00AE, + 0x203C, + 0x203C, + 0x2049, + 0x2049, + 0x2122, + 0x2122, + 0x2139, + 0x2139, + 0x2194, + 0x2199, + 0x21A9, + 0x21AA, + 0x231A, + 0x231B, + 0x2328, + 0x2328, + 0x2388, + 0x2388, + 0x23CF, + 0x23CF, + 0x23E9, + 0x23EC, + 0x23ED, + 0x23EE, + 0x23EF, + 0x23EF, + 0x23F0, + 0x23F0, + 0x23F1, + 0x23F2, + 0x23F3, + 0x23F3, + 0x23F8, + 0x23FA, + 0x24C2, + 0x24C2, + 0x25AA, + 0x25AB, + 0x25B6, + 0x25B6, + 0x25C0, + 0x25C0, + 0x25FB, + 0x25FE, + 0x2600, + 0x2601, + 0x2602, + 0x2603, + 0x2604, + 0x2604, + 0x2605, + 0x2605, + 0x2607, + 0x260D, + 0x260E, + 0x260E, + 0x260F, + 0x2610, + 0x2611, + 0x2611, + 0x2612, + 0x2612, + 0x2614, + 0x2615, + 0x2616, + 0x2617, + 0x2618, + 0x2618, + 0x2619, + 0x261C, + 0x261D, + 0x261D, + 0x261E, + 0x261F, + 0x2620, + 0x2620, + 0x2621, + 0x2621, + 0x2622, + 0x2623, + 0x2624, + 0x2625, + 0x2626, + 0x2626, + 0x2627, + 0x2629, + 0x262A, + 0x262A, + 0x262B, + 0x262D, + 0x262E, + 0x262E, + 0x262F, + 0x262F, + 0x2630, + 0x2637, + 0x2638, + 0x2639, + 0x263A, + 0x263A, + 0x263B, + 0x263F, + 0x2640, + 0x2640, + 0x2641, + 0x2641, + 0x2642, + 0x2642, + 0x2643, + 0x2647, + 0x2648, + 0x2653, + 0x2654, + 0x265E, + 0x265F, + 0x265F, + 0x2660, + 0x2660, + 0x2661, + 0x2662, + 0x2663, + 0x2663, + 0x2664, + 0x2664, + 0x2665, + 0x2666, + 0x2667, + 0x2667, + 0x2668, + 0x2668, + 0x2669, + 0x267A, + 0x267B, + 0x267B, + 0x267C, + 0x267D, + 0x267E, + 0x267E, + 0x267F, + 0x267F, + 0x2680, + 0x2685, + 0x2690, + 0x2691, + 0x2692, + 0x2692, + 0x2693, + 0x2693, + 0x2694, + 0x2694, + 0x2695, + 0x2695, + 0x2696, + 0x2697, + 0x2698, + 0x2698, + 0x2699, + 0x2699, + 0x269A, + 0x269A, + 0x269B, + 0x269C, + 0x269D, + 0x269F, + 0x26A0, + 0x26A1, + 0x26A2, + 0x26A6, + 0x26A7, + 0x26A7, + 0x26A8, + 0x26A9, + 0x26AA, + 0x26AB, + 0x26AC, + 0x26AF, + 0x26B0, + 0x26B1, + 0x26B2, + 0x26BC, + 0x26BD, + 0x26BE, + 0x26BF, + 0x26C3, + 0x26C4, + 0x26C5, + 0x26C6, + 0x26C7, + 0x26C8, + 0x26C8, + 
0x26C9, + 0x26CD, + 0x26CE, + 0x26CE, + 0x26CF, + 0x26CF, + 0x26D0, + 0x26D0, + 0x26D1, + 0x26D1, + 0x26D2, + 0x26D2, + 0x26D3, + 0x26D3, + 0x26D4, + 0x26D4, + 0x26D5, + 0x26E8, + 0x26E9, + 0x26E9, + 0x26EA, + 0x26EA, + 0x26EB, + 0x26EF, + 0x26F0, + 0x26F1, + 0x26F2, + 0x26F3, + 0x26F4, + 0x26F4, + 0x26F5, + 0x26F5, + 0x26F6, + 0x26F6, + 0x26F7, + 0x26F9, + 0x26FA, + 0x26FA, + 0x26FB, + 0x26FC, + 0x26FD, + 0x26FD, + 0x26FE, + 0x2701, + 0x2702, + 0x2702, + 0x2703, + 0x2704, + 0x2705, + 0x2705, + 0x2708, + 0x270C, + 0x270D, + 0x270D, + 0x270E, + 0x270E, + 0x270F, + 0x270F, + 0x2710, + 0x2711, + 0x2712, + 0x2712, + 0x2714, + 0x2714, + 0x2716, + 0x2716, + 0x271D, + 0x271D, + 0x2721, + 0x2721, + 0x2728, + 0x2728, + 0x2733, + 0x2734, + 0x2744, + 0x2744, + 0x2747, + 0x2747, + 0x274C, + 0x274C, + 0x274E, + 0x274E, + 0x2753, + 0x2755, + 0x2757, + 0x2757, + 0x2763, + 0x2763, + 0x2764, + 0x2764, + 0x2765, + 0x2767, + 0x2795, + 0x2797, + 0x27A1, + 0x27A1, + 0x27B0, + 0x27B0, + 0x27BF, + 0x27BF, + 0x2934, + 0x2935, + 0x2B05, + 0x2B07, + 0x2B1B, + 0x2B1C, + 0x2B50, + 0x2B50, + 0x2B55, + 0x2B55, + 0x3030, + 0x3030, + 0x303D, + 0x303D, + 0x3297, + 0x3297, + 0x3299, + 0x3299, + 0x1F000, + 0x1F003, + 0x1F004, + 0x1F004, + 0x1F005, + 0x1F0CE, + 0x1F0CF, + 0x1F0CF, + 0x1F0D0, + 0x1F0FF, + 0x1F10D, + 0x1F10F, + 0x1F12F, + 0x1F12F, + 0x1F16C, + 0x1F16F, + 0x1F170, + 0x1F171, + 0x1F17E, + 0x1F17F, + 0x1F18E, + 0x1F18E, + 0x1F191, + 0x1F19A, + 0x1F1AD, + 0x1F1E5, + 0x1F201, + 0x1F202, + 0x1F203, + 0x1F20F, + 0x1F21A, + 0x1F21A, + 0x1F22F, + 0x1F22F, + 0x1F232, + 0x1F23A, + 0x1F23C, + 0x1F23F, + 0x1F249, + 0x1F24F, + 0x1F250, + 0x1F251, + 0x1F252, + 0x1F2FF, + 0x1F300, + 0x1F30C, + 0x1F30D, + 0x1F30E, + 0x1F30F, + 0x1F30F, + 0x1F310, + 0x1F310, + 0x1F311, + 0x1F311, + 0x1F312, + 0x1F312, + 0x1F313, + 0x1F315, + 0x1F316, + 0x1F318, + 0x1F319, + 0x1F319, + 0x1F31A, + 0x1F31A, + 0x1F31B, + 0x1F31B, + 0x1F31C, + 0x1F31C, + 0x1F31D, + 0x1F31E, + 0x1F31F, + 0x1F320, + 0x1F321, + 0x1F321, + 
0x1F322, + 0x1F323, + 0x1F324, + 0x1F32C, + 0x1F32D, + 0x1F32F, + 0x1F330, + 0x1F331, + 0x1F332, + 0x1F333, + 0x1F334, + 0x1F335, + 0x1F336, + 0x1F336, + 0x1F337, + 0x1F34A, + 0x1F34B, + 0x1F34B, + 0x1F34C, + 0x1F34F, + 0x1F350, + 0x1F350, + 0x1F351, + 0x1F37B, + 0x1F37C, + 0x1F37C, + 0x1F37D, + 0x1F37D, + 0x1F37E, + 0x1F37F, + 0x1F380, + 0x1F393, + 0x1F394, + 0x1F395, + 0x1F396, + 0x1F397, + 0x1F398, + 0x1F398, + 0x1F399, + 0x1F39B, + 0x1F39C, + 0x1F39D, + 0x1F39E, + 0x1F39F, + 0x1F3A0, + 0x1F3C4, + 0x1F3C5, + 0x1F3C5, + 0x1F3C6, + 0x1F3C6, + 0x1F3C7, + 0x1F3C7, + 0x1F3C8, + 0x1F3C8, + 0x1F3C9, + 0x1F3C9, + 0x1F3CA, + 0x1F3CA, + 0x1F3CB, + 0x1F3CE, + 0x1F3CF, + 0x1F3D3, + 0x1F3D4, + 0x1F3DF, + 0x1F3E0, + 0x1F3E3, + 0x1F3E4, + 0x1F3E4, + 0x1F3E5, + 0x1F3F0, + 0x1F3F1, + 0x1F3F2, + 0x1F3F3, + 0x1F3F3, + 0x1F3F4, + 0x1F3F4, + 0x1F3F5, + 0x1F3F5, + 0x1F3F6, + 0x1F3F6, + 0x1F3F7, + 0x1F3F7, + 0x1F3F8, + 0x1F3FA, + 0x1F400, + 0x1F407, + 0x1F408, + 0x1F408, + 0x1F409, + 0x1F40B, + 0x1F40C, + 0x1F40E, + 0x1F40F, + 0x1F410, + 0x1F411, + 0x1F412, + 0x1F413, + 0x1F413, + 0x1F414, + 0x1F414, + 0x1F415, + 0x1F415, + 0x1F416, + 0x1F416, + 0x1F417, + 0x1F429, + 0x1F42A, + 0x1F42A, + 0x1F42B, + 0x1F43E, + 0x1F43F, + 0x1F43F, + 0x1F440, + 0x1F440, + 0x1F441, + 0x1F441, + 0x1F442, + 0x1F464, + 0x1F465, + 0x1F465, + 0x1F466, + 0x1F46B, + 0x1F46C, + 0x1F46D, + 0x1F46E, + 0x1F4AC, + 0x1F4AD, + 0x1F4AD, + 0x1F4AE, + 0x1F4B5, + 0x1F4B6, + 0x1F4B7, + 0x1F4B8, + 0x1F4EB, + 0x1F4EC, + 0x1F4ED, + 0x1F4EE, + 0x1F4EE, + 0x1F4EF, + 0x1F4EF, + 0x1F4F0, + 0x1F4F4, + 0x1F4F5, + 0x1F4F5, + 0x1F4F6, + 0x1F4F7, + 0x1F4F8, + 0x1F4F8, + 0x1F4F9, + 0x1F4FC, + 0x1F4FD, + 0x1F4FD, + 0x1F4FE, + 0x1F4FE, + 0x1F4FF, + 0x1F502, + 0x1F503, + 0x1F503, + 0x1F504, + 0x1F507, + 0x1F508, + 0x1F508, + 0x1F509, + 0x1F509, + 0x1F50A, + 0x1F514, + 0x1F515, + 0x1F515, + 0x1F516, + 0x1F52B, + 0x1F52C, + 0x1F52D, + 0x1F52E, + 0x1F53D, + 0x1F546, + 0x1F548, + 0x1F549, + 0x1F54A, + 0x1F54B, + 0x1F54E, + 0x1F54F, + 0x1F54F, 
+ 0x1F550, + 0x1F55B, + 0x1F55C, + 0x1F567, + 0x1F568, + 0x1F56E, + 0x1F56F, + 0x1F570, + 0x1F571, + 0x1F572, + 0x1F573, + 0x1F579, + 0x1F57A, + 0x1F57A, + 0x1F57B, + 0x1F586, + 0x1F587, + 0x1F587, + 0x1F588, + 0x1F589, + 0x1F58A, + 0x1F58D, + 0x1F58E, + 0x1F58F, + 0x1F590, + 0x1F590, + 0x1F591, + 0x1F594, + 0x1F595, + 0x1F596, + 0x1F597, + 0x1F5A3, + 0x1F5A4, + 0x1F5A4, + 0x1F5A5, + 0x1F5A5, + 0x1F5A6, + 0x1F5A7, + 0x1F5A8, + 0x1F5A8, + 0x1F5A9, + 0x1F5B0, + 0x1F5B1, + 0x1F5B2, + 0x1F5B3, + 0x1F5BB, + 0x1F5BC, + 0x1F5BC, + 0x1F5BD, + 0x1F5C1, + 0x1F5C2, + 0x1F5C4, + 0x1F5C5, + 0x1F5D0, + 0x1F5D1, + 0x1F5D3, + 0x1F5D4, + 0x1F5DB, + 0x1F5DC, + 0x1F5DE, + 0x1F5DF, + 0x1F5E0, + 0x1F5E1, + 0x1F5E1, + 0x1F5E2, + 0x1F5E2, + 0x1F5E3, + 0x1F5E3, + 0x1F5E4, + 0x1F5E7, + 0x1F5E8, + 0x1F5E8, + 0x1F5E9, + 0x1F5EE, + 0x1F5EF, + 0x1F5EF, + 0x1F5F0, + 0x1F5F2, + 0x1F5F3, + 0x1F5F3, + 0x1F5F4, + 0x1F5F9, + 0x1F5FA, + 0x1F5FA, + 0x1F5FB, + 0x1F5FF, + 0x1F600, + 0x1F600, + 0x1F601, + 0x1F606, + 0x1F607, + 0x1F608, + 0x1F609, + 0x1F60D, + 0x1F60E, + 0x1F60E, + 0x1F60F, + 0x1F60F, + 0x1F610, + 0x1F610, + 0x1F611, + 0x1F611, + 0x1F612, + 0x1F614, + 0x1F615, + 0x1F615, + 0x1F616, + 0x1F616, + 0x1F617, + 0x1F617, + 0x1F618, + 0x1F618, + 0x1F619, + 0x1F619, + 0x1F61A, + 0x1F61A, + 0x1F61B, + 0x1F61B, + 0x1F61C, + 0x1F61E, + 0x1F61F, + 0x1F61F, + 0x1F620, + 0x1F625, + 0x1F626, + 0x1F627, + 0x1F628, + 0x1F62B, + 0x1F62C, + 0x1F62C, + 0x1F62D, + 0x1F62D, + 0x1F62E, + 0x1F62F, + 0x1F630, + 0x1F633, + 0x1F634, + 0x1F634, + 0x1F635, + 0x1F635, + 0x1F636, + 0x1F636, + 0x1F637, + 0x1F640, + 0x1F641, + 0x1F644, + 0x1F645, + 0x1F64F, + 0x1F680, + 0x1F680, + 0x1F681, + 0x1F682, + 0x1F683, + 0x1F685, + 0x1F686, + 0x1F686, + 0x1F687, + 0x1F687, + 0x1F688, + 0x1F688, + 0x1F689, + 0x1F689, + 0x1F68A, + 0x1F68B, + 0x1F68C, + 0x1F68C, + 0x1F68D, + 0x1F68D, + 0x1F68E, + 0x1F68E, + 0x1F68F, + 0x1F68F, + 0x1F690, + 0x1F690, + 0x1F691, + 0x1F693, + 0x1F694, + 0x1F694, + 0x1F695, + 0x1F695, + 0x1F696, + 
0x1F696, + 0x1F697, + 0x1F697, + 0x1F698, + 0x1F698, + 0x1F699, + 0x1F69A, + 0x1F69B, + 0x1F6A1, + 0x1F6A2, + 0x1F6A2, + 0x1F6A3, + 0x1F6A3, + 0x1F6A4, + 0x1F6A5, + 0x1F6A6, + 0x1F6A6, + 0x1F6A7, + 0x1F6AD, + 0x1F6AE, + 0x1F6B1, + 0x1F6B2, + 0x1F6B2, + 0x1F6B3, + 0x1F6B5, + 0x1F6B6, + 0x1F6B6, + 0x1F6B7, + 0x1F6B8, + 0x1F6B9, + 0x1F6BE, + 0x1F6BF, + 0x1F6BF, + 0x1F6C0, + 0x1F6C0, + 0x1F6C1, + 0x1F6C5, + 0x1F6C6, + 0x1F6CA, + 0x1F6CB, + 0x1F6CB, + 0x1F6CC, + 0x1F6CC, + 0x1F6CD, + 0x1F6CF, + 0x1F6D0, + 0x1F6D0, + 0x1F6D1, + 0x1F6D2, + 0x1F6D3, + 0x1F6D4, + 0x1F6D5, + 0x1F6D5, + 0x1F6D6, + 0x1F6D7, + 0x1F6D8, + 0x1F6DF, + 0x1F6E0, + 0x1F6E5, + 0x1F6E6, + 0x1F6E8, + 0x1F6E9, + 0x1F6E9, + 0x1F6EA, + 0x1F6EA, + 0x1F6EB, + 0x1F6EC, + 0x1F6ED, + 0x1F6EF, + 0x1F6F0, + 0x1F6F0, + 0x1F6F1, + 0x1F6F2, + 0x1F6F3, + 0x1F6F3, + 0x1F6F4, + 0x1F6F6, + 0x1F6F7, + 0x1F6F8, + 0x1F6F9, + 0x1F6F9, + 0x1F6FA, + 0x1F6FA, + 0x1F6FB, + 0x1F6FC, + 0x1F6FD, + 0x1F6FF, + 0x1F774, + 0x1F77F, + 0x1F7D5, + 0x1F7DF, + 0x1F7E0, + 0x1F7EB, + 0x1F7EC, + 0x1F7FF, + 0x1F80C, + 0x1F80F, + 0x1F848, + 0x1F84F, + 0x1F85A, + 0x1F85F, + 0x1F888, + 0x1F88F, + 0x1F8AE, + 0x1F8FF, + 0x1F90C, + 0x1F90C, + 0x1F90D, + 0x1F90F, + 0x1F910, + 0x1F918, + 0x1F919, + 0x1F91E, + 0x1F91F, + 0x1F91F, + 0x1F920, + 0x1F927, + 0x1F928, + 0x1F92F, + 0x1F930, + 0x1F930, + 0x1F931, + 0x1F932, + 0x1F933, + 0x1F93A, + 0x1F93C, + 0x1F93E, + 0x1F93F, + 0x1F93F, + 0x1F940, + 0x1F945, + 0x1F947, + 0x1F94B, + 0x1F94C, + 0x1F94C, + 0x1F94D, + 0x1F94F, + 0x1F950, + 0x1F95E, + 0x1F95F, + 0x1F96B, + 0x1F96C, + 0x1F970, + 0x1F971, + 0x1F971, + 0x1F972, + 0x1F972, + 0x1F973, + 0x1F976, + 0x1F977, + 0x1F978, + 0x1F979, + 0x1F979, + 0x1F97A, + 0x1F97A, + 0x1F97B, + 0x1F97B, + 0x1F97C, + 0x1F97F, + 0x1F980, + 0x1F984, + 0x1F985, + 0x1F991, + 0x1F992, + 0x1F997, + 0x1F998, + 0x1F9A2, + 0x1F9A3, + 0x1F9A4, + 0x1F9A5, + 0x1F9AA, + 0x1F9AB, + 0x1F9AD, + 0x1F9AE, + 0x1F9AF, + 0x1F9B0, + 0x1F9B9, + 0x1F9BA, + 0x1F9BF, + 0x1F9C0, + 0x1F9C0, + 0x1F9C1, 
+ 0x1F9C2, + 0x1F9C3, + 0x1F9CA, + 0x1F9CB, + 0x1F9CB, + 0x1F9CC, + 0x1F9CC, + 0x1F9CD, + 0x1F9CF, + 0x1F9D0, + 0x1F9E6, + 0x1F9E7, + 0x1F9FF, + 0x1FA00, + 0x1FA6F, + 0x1FA70, + 0x1FA73, + 0x1FA74, + 0x1FA74, + 0x1FA75, + 0x1FA77, + 0x1FA78, + 0x1FA7A, + 0x1FA7B, + 0x1FA7F, + 0x1FA80, + 0x1FA82, + 0x1FA83, + 0x1FA86, + 0x1FA87, + 0x1FA8F, + 0x1FA90, + 0x1FA95, + 0x1FA96, + 0x1FAA8, + 0x1FAA9, + 0x1FAAF, + 0x1FAB0, + 0x1FAB6, + 0x1FAB7, + 0x1FABF, + 0x1FAC0, + 0x1FAC2, + 0x1FAC3, + 0x1FACF, + 0x1FAD0, + 0x1FAD6, + 0x1FAD7, + 0x1FAFF, + 0x1FC00, + 0x1FFFD, +]! + +@[inline] +fn in_grapheme_ranges(r rune, ranges []u32) bool { + target := u32(r) + mut low := 0 + mut high := ranges.len / 2 + for low < high { + mid := low + (high - low) / 2 + lo := ranges[mid * 2] + hi := ranges[mid * 2 + 1] + if target < lo { + high = mid + } else if target > hi { + low = mid + 1 + } else { + return true + } + } + return false +} + +@[inline] +fn grapheme_break_property(r rune) GraphemeBreakProperty { + if r == `\r` { + return .cr + } + if r == `\n` { + return .lf + } + if r == 0x200d { + return .zwj + } + if r >= 0x1f1e6 && r <= 0x1f1ff { + return .regional_indicator + } + if r >= 0xac00 && r <= 0xd7a3 { + return if (u32(r) - 0xac00) % 28 == 0 { .lv } else { .lvt } + } + if (r >= 0x1100 && r <= 0x115f) || (r >= 0xa960 && r <= 0xa97c) { + return .l + } + if (r >= 0x1160 && r <= 0x11a7) || (r >= 0xd7b0 && r <= 0xd7c6) { + return .v + } + if (r >= 0x11a8 && r <= 0x11ff) || (r >= 0xd7cb && r <= 0xd7fb) { + return .t + } + if in_grapheme_ranges(r, grapheme_control_ranges[..]) { + return .control + } + if in_grapheme_ranges(r, grapheme_extend_ranges[..]) { + return .extend + } + if in_grapheme_ranges(r, grapheme_spacing_mark_ranges[..]) { + return .spacing_mark + } + if in_grapheme_ranges(r, grapheme_prepend_ranges[..]) { + return .prepend + } + return .other +} + +@[inline] +fn is_extended_pictographic(r rune) bool { + return in_grapheme_ranges(r, grapheme_extended_pictographic_ranges[..]) 
+} + +struct GraphemeState { +mut: + prev_prop GraphemeBreakProperty + ri_count int + extended_pictographic_state u8 +} + +@[inline] +fn grapheme_state_from_rune(r rune, prop GraphemeBreakProperty) GraphemeState { + return GraphemeState{ + prev_prop: prop + ri_count: if prop == .regional_indicator { 1 } else { 0 } + extended_pictographic_state: if is_extended_pictographic(r) { u8(1) } else { u8(0) } + } +} + +@[inline] +fn (mut gs GraphemeState) push(r rune, prop GraphemeBreakProperty) { + gs.prev_prop = prop + gs.ri_count = if prop == .regional_indicator { gs.ri_count + 1 } else { 0 } + if is_extended_pictographic(r) { + gs.extended_pictographic_state = 1 + } else if prop == .extend && gs.extended_pictographic_state == 1 { + // Keep the `Extended_Pictographic Extend*` tail alive for GB11. + } else if prop == .zwj && gs.extended_pictographic_state == 1 { + gs.extended_pictographic_state = 2 + } else { + gs.extended_pictographic_state = 0 + } +} + +@[inline] +fn should_break_grapheme(gs GraphemeState, r rune, prop GraphemeBreakProperty) bool { + match gs.prev_prop { + .cr { + if prop == .lf { + return false + } + return true + } + .lf, .control { + return true + } + .l { + if prop in [.l, .v, .lv, .lvt] { + return false + } + } + .lv, .v { + if prop in [.v, .t] { + return false + } + } + .lvt, .t { + if prop == .t { + return false + } + } + .prepend { + return prop in [.cr, .lf, .control] // GB5 outranks GB9b: still break before CR, LF and Control + } + .regional_indicator { + if prop == .regional_indicator && gs.ri_count % 2 == 1 { + return false + } + } + else {} + } + if prop in [.cr, .lf, .control] { + return true + } + if prop in [.extend, .zwj, .spacing_mark] { + return false + } + if gs.extended_pictographic_state == 2 && is_extended_pictographic(r) { + return false + } + return true +} + +@[inline] +fn utf8_rune_visible_width(r rune, prop GraphemeBreakProperty) int { + if prop in [.extend, .zwj, .spacing_mark, .prepend] { + return 0 + } + // Keep the historical formatting behavior for common East Asian wide runes and emoji, + // but apply
it per grapheme cluster instead of per code point. + if r >= 0x1100 + && (r <= 0x115f || r == 0x2329 || r == 0x232a || (r >= 0x2e80 && r <= 0xa4cf && r != 0x303f) + || (r >= 0xac00 && r <= 0xd7a3) || (r >= 0xf900 && r <= 0xfaff) + || (r >= 0xfe10 && r <= 0xfe19) || (r >= 0xfe30 && r <= 0xfe6f) + || (r >= 0xff00 && r <= 0xff60) || (r >= 0xffe0 && r <= 0xffe6) + || (r >= 0x1f300 && r <= 0x1f64f) || (r >= 0x1f680 && r <= 0x1f6ff) + || (r >= 0x1f900 && r <= 0x1f9ff) || (r >= 0x1fa70 && r <= 0x1faff) + || (r >= 0x20000 && r <= 0x3fffd)) { + return 2 + } + return 1 +} + +fn string_graphemes_impl(s string) []string { + runes := s.runes() + if runes.len == 0 { + return []string{} + } + mut res := []string{cap: runes.len} + mut cluster := []rune{cap: 4} + first_prop := grapheme_break_property(runes[0]) + mut state := grapheme_state_from_rune(runes[0], first_prop) + cluster << runes[0] + for r in runes[1..] { + prop := grapheme_break_property(r) + if should_break_grapheme(state, r, prop) { + res << cluster.string() + cluster = []rune{cap: 4} + cluster << r + state = grapheme_state_from_rune(r, prop) + continue + } + cluster << r + state.push(r, prop) + } + res << cluster.string() + return res +} + +@[inline] +fn utf8_grapheme_visible_length(s string) int { + runes := s.runes() + if runes.len == 0 { + return 0 + } + first_prop := grapheme_break_property(runes[0]) + mut state := grapheme_state_from_rune(runes[0], first_prop) + mut total := 0 + mut cluster_width := utf8_rune_visible_width(runes[0], first_prop) + for r in runes[1..] 
{ + prop := grapheme_break_property(r) + if should_break_grapheme(state, r, prop) { + total += cluster_width + cluster_width = utf8_rune_visible_width(r, prop) + state = grapheme_state_from_rune(r, prop) + continue + } + rune_width := utf8_rune_visible_width(r, prop) + if rune_width > cluster_width { + cluster_width = rune_width + } + state.push(r, prop) + } + return total + cluster_width +} diff --git a/vlib/builtin/string.v b/vlib/builtin/string.v index cb1c08c14..0a337a7d9 100644 --- a/vlib/builtin/string.v +++ b/vlib/builtin/string.v @@ -75,6 +75,11 @@ pub fn (s string) runes() []rune { return runes } +// graphemes returns the string split into Unicode grapheme clusters. +pub fn (s string) graphemes() []string { + return string_graphemes_impl(s) +} + // cstring_to_vstring creates a new V string copy of the C style string, // pointed by `s`. This function is most likely what you want to use when // working with C style pointers to 0 terminated strings (i.e. `char*`). diff --git a/vlib/builtin/string_test.v b/vlib/builtin/string_test.v index 2ddde1567..a77d027ca 100644 --- a/vlib/builtin/string_test.v +++ b/vlib/builtin/string_test.v @@ -1574,6 +1574,16 @@ fn test_emoji_to_runes() { assert x.runes()[0] == `👋` } +fn test_graphemes() { + assert '\u006E\u0303'.graphemes() == ['ñ'] + assert '\U0001F3F3\uFE0F\u200D\U0001F308'.graphemes() == ['🏳️‍🌈'] + assert 'ห์'.graphemes() == ['ห์'] + assert 'ปีเตอร์'.graphemes() == ['ปี', 'เ', 'ต', 'อ', 'ร์'] + assert '🇺🇳'.graphemes() == ['🇺🇳'] + assert '👩🏽‍💻'.graphemes() == ['👩🏽‍💻'] + assert 'a\r\nb'.graphemes() == ['a', '\r\n', 'b'] +} + fn test_string_to_rune() { x := 'Hello World 👋' assert x.runes().len == 13 diff --git a/vlib/builtin/utf8.v b/vlib/builtin/utf8.v index 4f4ca0e33..d41683d70 100644 --- a/vlib/builtin/utf8.v +++ b/vlib/builtin/utf8.v @@ -132,80 +132,7 @@ fn impl_utf8_to_utf32(_bytes &u8, _bytes_len int) rune { // This is simplified implementation. 
if you need specification compliant width, // use utf8.east_asian.display_width. pub fn utf8_str_visible_length(s string) int { - mut l := 0 - mut ul := 1 - for i := 0; i < s.len; i += ul { - c := unsafe { s.str[i] } - ul = ((0xe5000000 >> ((unsafe { s.str[i] } >> 3) & 0x1e)) & 3) + 1 - if i + ul > s.len { // incomplete UTF-8 sequence - return l - } - l++ - // avoid the match if not needed - if ul == 1 { - continue - } - // recognize combining characters and wide characters - match ul { - 2 { - r := u64((u16(c) << 8) | unsafe { s.str[i + 1] }) - if r >= 0xcc80 && r < 0xcdb0 { - // diacritical marks - l-- - } - } - 3 { - r := u64((u32(c) << 16) | unsafe { (u32(s.str[i + 1]) << 8) | s.str[i + 2] }) - // diacritical marks extended - // diacritical marks supplement - // diacritical marks for symbols - // TODO: remove this workaround for v2's parser - // vfmt off - if (r >= 0xe1aab0 && r <= 0xe1ac7f) || - (r >= 0xe1b780 && r <= 0xe1b87f) || - (r >= 0xe28390 && r <= 0xe2847f) || - (r >= 0xefb8a0 && r <= 0xefb8af) { - // diacritical marks - l-- - } - // Hangru - // CJK Unified Ideographics - // Hangru - // CJK - else if (r >= 0xe18480 && r <= 0xe1859f) || - (r >= 0xe2ba80 && r <= 0xe2bf95) || - (r >= 0xe38080 && r <= 0xe4b77f) || - (r >= 0xe4b880 && r <= 0xea807f) || - (r >= 0xeaa5a0 && r <= 0xeaa79f) || - (r >= 0xeab080 && r <= 0xed9eaf) || - (r >= 0xefa480 && r <= 0xefac7f) || - (r >= 0xefb8b8 && r <= 0xefb9af) { - // half marks - l++ - } - // vfmt on - } - 4 { - r := u64((u32(c) << 24) | unsafe { - (u32(s.str[i + 1]) << 16) | (u32(s.str[i + 2]) << 8) | s.str[i + 3] - }) - // Enclosed Ideographic Supplement - // Emoji - // CJK Unified Ideographs Extension B-G - // TODO: remove this workaround for v2's parser - // vfmt off - if (r >= 0xf09f8880 && r <= 0xf09f8a8f) || - (r >= 0xf09f8c80 && r <= 0xf09f9c90) || - (r >= 0xf09fa490 && r <= 0xf09fa7af) || - (r >= 0xf0a08080 && r <= 0xf180807f) { - l++ - } - // vfmt on - } - else {} - } - } - return l + return 
utf8_grapheme_visible_length(s) } // string_to_ansi_not_null_terminated returns an ANSI version of the string `_str`. diff --git a/vlib/builtin/utf8_test.v b/vlib/builtin/utf8_test.v index b0e249132..bba8963ef 100644 --- a/vlib/builtin/utf8_test.v +++ b/vlib/builtin/utf8_test.v @@ -100,6 +100,11 @@ fn test_string_to_ansi_not_null_terminated() { fn test_utf8_str_visible_length() { assert utf8_str_visible_length('𝐀𝐁𝐂') == 3 + assert utf8_str_visible_length('\u006E\u0303') == 1 + assert utf8_str_visible_length('\U0001F3F3\uFE0F\u200D\U0001F308') == 2 + assert utf8_str_visible_length('ห์') == 1 + assert utf8_str_visible_length('ปีเตอร์') == 5 + assert utf8_str_visible_length('👩🏽‍💻') == 2 } fn test_utf8_to_utf32_cases() { diff --git a/vlib/v/tests/builtin_strings_and_interpolation/string_interpolation_test.v b/vlib/v/tests/builtin_strings_and_interpolation/string_interpolation_test.v index 387baaf60..4cf24142f 100644 --- a/vlib/v/tests/builtin_strings_and_interpolation/string_interpolation_test.v +++ b/vlib/v/tests/builtin_strings_and_interpolation/string_interpolation_test.v @@ -187,6 +187,14 @@ fn test_utf8_string_interpolation() { assert '>${g:-13}<' == '>Πελοπόννησος <' } +fn test_utf8_string_interpolation_uses_grapheme_clusters() { + assert '>${'\u006E\u0303':10}<' == '> ñ<' + assert '>${'\U0001F3F3\uFE0F\u200D\U0001F308':10}<' == '> 🏳️‍🌈<' + assert '>${'ห์':10}<' == '> ห์<' + assert '>${'ปีเตอร์':10}<' == '> ปีเตอร์<' + assert '>${'👩🏽‍💻':10}<' == '> 👩🏽‍💻<' +} + struct Sss { v1 int v2 f64 -- 2.39.5