v2 / vlib / builtin / utf8.v
148 lines · 138 sloc · 4.18 KB · e2e5cf8db56f3562c7baa735061690be936bdf3e
Raw
1// Copyright (c) 2019-2024 Alexander Medvednikov. All rights reserved.
2// Use of this source code is governed by an MIT license
3// that can be found in the LICENSE file.
4module builtin
5
// utf8_char_len returns the number of bytes in the UTF-8 sequence whose first byte is `b`.
// The constant 0xe5000000 acts as a packed table of 2-bit entries, each storing `length - 1`.
// The top four bits of `b`, doubled, give the shift that selects the entry.
pub fn utf8_char_len(b u8) int {
	packed_lengths := u32(0xe5000000)
	// (b >> 3) & 0x1e == 2 * (b >> 4): an even shift in the range 0..30
	shift := (b >> 3) & 0x1e
	extra_bytes := (packed_lengths >> shift) & 3
	return int(extra_bytes + 1)
}
10
// utf32_to_str encodes the codepoint `code` (a UTF-32 value) as UTF-8
// and returns the result as a string.
// An empty string is returned when the encoder produces no bytes for `code`.
pub fn utf32_to_str(code u32) string {
	unsafe {
		// room for up to 4 encoded bytes plus the terminating zero
		mut storage := malloc_noscan(5)
		encoded := utf32_to_str_no_malloc(code, mut storage)
		if encoded.len != 0 {
			return encoded
		}
		// nothing was written, so the buffer is not used by the result
		free(storage)
		return encoded
	}
}
24
// utf32_to_str_no_malloc encodes the codepoint `code` as UTF-8 into the caller supplied `buf`,
// appends a terminating zero byte, and returns a string created with `tos(buf, len)`.
// `buf` must have room for the encoded bytes plus the terminator (5 bytes is always enough).
// When nothing could be encoded, `buf` is left as is and an empty string is returned.
@[manualfree; unsafe]
pub fn utf32_to_str_no_malloc(code u32, mut buf &u8) string {
	unsafe {
		written := utf32_decode_to_buffer(code, mut buf)
		if written != 0 {
			buf[written] = 0
			return tos(buf, written)
		}
		return ''
	}
}
36
// utf32_decode_to_buffer writes the UTF-8 encoding of the codepoint `code` into `buf`
// and returns the number of bytes written (1 to 4).
// It returns 0 and leaves `buf` untouched when `code` is above 0x10FFFF.
// `buf` must have room for at least 4 bytes; no terminating zero is written here.
// NOTE: codepoints in the surrogate range (0xD800-0xDFFF) are not rejected,
// they are encoded as 3 bytes.
@[manualfree; unsafe]
pub fn utf32_decode_to_buffer(code u32, mut buf &u8) int {
	unsafe {
		// Reject out-of-range values while `code` is still unsigned: anything >= 0x80000000
		// becomes negative once cast to int, and would otherwise match the 1-byte branch
		// below and be written as a single truncated byte.
		if code > 0x10FFFF {
			return 0
		}
		icode := int(code) // Prevents doing casts everywhere
		mut buffer := &u8(buf)
		if icode <= 0x7F {
			// 1 byte: 0xxxxxxx
			buffer[0] = u8(icode)
			return 1
		} else if icode <= 0x7FF {
			// 2 bytes: 110xxxxx 10xxxxxx
			buffer[0] = 192 | u8(icode >> 6) // 0xC0 - 110xxxxx
			buffer[1] = 128 | u8(icode & 63) // 0x80 - 0x3F - 10xxxxxx
			return 2
		} else if icode <= 0xFFFF {
			// 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
			buffer[0] = 224 | u8(icode >> 12) // 0xE0 - 1110xxxx
			buffer[1] = 128 | (u8(icode >> 6) & 63) // 0x80 - 0x3F - 10xxxxxx
			buffer[2] = 128 | u8(icode & 63) // 0x80 - 0x3F - 10xxxxxx
			return 3
		} else if icode <= 0x10FFFF {
			// 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
			buffer[0] = 240 | u8(icode >> 18) // 0xF0 - 11110xxx
			buffer[1] = 128 | (u8(icode >> 12) & 63) // 0x80 - 0x3F - 10xxxxxx
			buffer[2] = 128 | (u8(icode >> 6) & 63) // 0x80 - 0x3F - 10xxxxxx
			buffer[3] = 128 | u8(icode & 63) // 0x80 - 0x3F - 10xxxxxx
			return 4
		}
	}
	return 0
}
69
// utf32_code decodes the bytes of `_rune` as a single UTF-8 sequence and returns the
// resulting UTF-32 value. It returns 0 when `_rune` is empty or longer than 4 bytes.
//
// The bytes are not checked for being valid UTF-8, so the result can be a value
// greater than what the UTF-32 spec allows. `utf8_to_utf32` is the replacement,
// and it returns a result type.
//
// This function is kept for backward compatibility;
// it is used in vlib/builtin/string.v,
// and also in vlib/v/gen/c/cgen.v
pub fn (_rune string) utf32_code() int {
	return if _rune.len <= 4 { int(impl_utf8_to_utf32(_rune.str, _rune.len)) } else { 0 }
}
86
// utf8_to_utf32 decodes the UTF-8 bytes in `_bytes` into a single UTF-32 value.
// It returns an error when more than 4 bytes are passed.
pub fn (_bytes []u8) utf8_to_utf32() !rune {
	if _bytes.len <= 4 {
		return impl_utf8_to_utf32(_bytes.data, _bytes.len)
	}
	return error('attempted to decode too many bytes, utf-8 is limited to four bytes maximum')
}
95
// impl_utf8_to_utf32 decodes the `_bytes_len` bytes starting at `_bytes` as one UTF-8
// sequence and returns the resulting value.
// Any length other than 1, 2, 3 or 4 yields 0.
// The payload bits are extracted by masking only; the lead and continuation
// byte markers are not validated.
@[direct_array_access]
fn impl_utf8_to_utf32(_bytes &u8, _bytes_len int) rune {
	match _bytes_len {
		1 {
			// a single byte is returned unchanged
			return rune(unsafe { _bytes[0] })
		}
		2 {
			// 110xxxxx 10xxxxxx
			lead := rune(unsafe { _bytes[0] }) & 0x1F
			tail := rune(unsafe { _bytes[1] }) & 0x3F
			return (lead << 6) | tail
		}
		3 {
			// 1110xxxx 10xxxxxx 10xxxxxx
			lead := rune(unsafe { _bytes[0] }) & 0x0F
			mid := rune(unsafe { _bytes[1] }) & 0x3F
			tail := rune(unsafe { _bytes[2] }) & 0x3F
			return (lead << 12) | (mid << 6) | tail
		}
		4 {
			// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
			lead := rune(unsafe { _bytes[0] }) & 0x07
			upper := rune(unsafe { _bytes[1] }) & 0x3F
			lower := rune(unsafe { _bytes[2] }) & 0x3F
			tail := rune(unsafe { _bytes[3] }) & 0x3F
			return (lead << 18) | (upper << 12) | (lower << 6) | tail
		}
		else {
			return 0
		}
	}
}
130
// utf8_str_visible_length returns the length of `s` for formatting purposes,
// i.e. its number of "characters".
// This is a simplified implementation that delegates to `utf8_grapheme_visible_length`.
// If you need a specification compliant width, use utf8.east_asian.display_width.
pub fn utf8_str_visible_length(s string) int {
	visible_len := utf8_grapheme_visible_length(s)
	return visible_len
}
137
// string_to_ansi_not_null_terminated returns an ANSI version of the string `_str`.
// NOTE: This is most useful for converting a vstring to an ANSI string under Windows.
// NOTE: The returned ANSI bytes are not null-terminated, so they can be passed
// directly to `os.write_file_array` to write an ANSI file.
pub fn string_to_ansi_not_null_terminated(_str string) []u8 {
	mut ansi_bytes := wide_to_ansi(_str.to_wide())
	if ansi_bytes.len <= 0 {
		return ansi_bytes
	}
	// drop the trailing zero by shrinking the length by one
	unsafe { ansi_bytes.len-- }
	return ansi_bytes
}
149