Gitly


1 // Copyright (c) 2019-2024 Alexander Medvednikov. All rights reserved.
2 // Use of this source code is governed by an MIT license
3 // that can be found in the LICENSE file.
4 module builtin
5 
// utf8_char_len reports how many bytes the UTF-8 sequence that begins with the byte `b` occupies.
// The upper 4 bits of `b` select a 2-bit field inside the packed constant 0xe5000000;
// every field stores `length - 1`. Bytes below 0xC0 (ASCII and continuation bytes) yield 1.
pub fn utf8_char_len(b u8) int {
    // (b >> 3) & 0x1e == 2 * (b >> 4): the bit offset of the field for this lead byte
    shift := (b >> 3) & 0x1e
    // number of bytes that follow the lead byte (0..3)
    extra := (u32(0xe5000000) >> shift) & 3
    return int(extra + 1)
}
10 
// utf32_to_str converts the codepoint `code` (a UTF-32 value) to a UTF-8 encoded string.
// A 5 byte buffer is allocated for the encoding. When the conversion yields an empty
// string, the buffer is freed again before returning.
pub fn utf32_to_str(code u32) string {
    unsafe {
        mut mem := malloc_noscan(5)
        encoded := utf32_to_str_no_malloc(code, mut mem)
        if encoded.len != 0 {
            return encoded
        }
        // the empty result does not use `mem`, so release it here
        free(mem)
        return encoded
    }
}
24 
// utf32_to_str_no_malloc encodes the codepoint `code` as UTF-8 into the caller supplied `buf`
// and returns a string created with `tos(buf, len)`.
// A 0 byte is stored right after the encoded bytes, so `buf` needs room for one extra byte.
// An empty string is returned when `utf32_decode_to_buffer` reports that it wrote 0 bytes.
@[manualfree; unsafe]
pub fn utf32_to_str_no_malloc(code u32, mut buf &u8) string {
    unsafe {
        written := utf32_decode_to_buffer(code, mut buf)
        if written != 0 {
            buf[written] = 0
            return tos(buf, written)
        }
        return ''
    }
}
36 
// utf32_decode_to_buffer writes the UTF-8 encoding of the codepoint `code` into `buf`
// and returns the number of bytes written (1 to 4).
// It returns 0 and writes nothing when `code` is greater than 0x10FFFF.
// No terminating 0 byte is written. `buf` must have room for at least 4 bytes.
// NOTE: surrogate values (0xD800..0xDFFF) are not rejected; they are encoded as 3 bytes.
@[manualfree; unsafe]
pub fn utf32_decode_to_buffer(code u32, mut buf &u8) int {
    unsafe {
        // Compare the unsigned value directly. Casting `code` to `int` first would make
        // values >= 0x80000000 negative, and they would then be emitted as one truncated byte.
        mut buffer := &u8(buf)
        if code <= 0x7F {
            buffer[0] = u8(code)
            return 1
        } else if code <= 0x7FF {
            buffer[0] = 0xC0 | u8(code >> 6) // 110xxxxx
            buffer[1] = 0x80 | u8(code & 0x3F) // 10xxxxxx
            return 2
        } else if code <= 0xFFFF {
            buffer[0] = 0xE0 | u8(code >> 12) // 1110xxxx
            buffer[1] = 0x80 | u8((code >> 6) & 0x3F) // 10xxxxxx
            buffer[2] = 0x80 | u8(code & 0x3F) // 10xxxxxx
            return 3
        } else if code <= 0x10FFFF {
            buffer[0] = 0xF0 | u8(code >> 18) // 11110xxx
            buffer[1] = 0x80 | u8((code >> 12) & 0x3F) // 10xxxxxx
            buffer[2] = 0x80 | u8((code >> 6) & 0x3F) // 10xxxxxx
            buffer[3] = 0x80 | u8(code & 0x3F) // 10xxxxxx
            return 4
        }
    }
    return 0
}
69 
// utf32_code converts the UTF-8 encoded codepoint stored in `_rune` to its UTF-32 value.
// It returns 0 when `_rune` is longer than 4 bytes.
// The original implementation did not check that the string holds valid UTF-8,
// and could produce values greater than the UTF-32 spec allows.
// It has been replaced by `utf8_to_utf32`, which has a result return type.
//
// This function is kept for backward compatibility;
// it is used in vlib/builtin/string.v and also in vlib/v/gen/c/cgen.v
pub fn (_rune string) utf32_code() int {
    return if _rune.len > 4 { 0 } else { int(impl_utf8_to_utf32(_rune.str, _rune.len)) }
}
86 
// utf8_to_utf32 converts an array of at most 4 UTF-8 bytes to a single UTF-32 value.
// It returns an error when more than 4 bytes are passed.
pub fn (_bytes []u8) utf8_to_utf32() !rune {
    if _bytes.len <= 4 {
        return impl_utf8_to_utf32(_bytes.data, _bytes.len)
    }
    return error('attempted to decode too many bytes, utf-8 is limited to four bytes maximum')
}
95 
// impl_utf8_to_utf32 combines the first `_bytes_len` bytes at `_bytes` into one UTF-32 value.
// Lengths 1 to 4 are supported; any other length returns 0.
// A single byte is returned unchanged. For longer sequences the payload bits of the
// lead byte and of each following byte are masked out and concatenated.
// The bytes are not checked for being a well formed UTF-8 sequence.
@[direct_array_access]
fn impl_utf8_to_utf32(_bytes &u8, _bytes_len int) rune {
    match _bytes_len {
        1 {
            // one byte: passed through as is
            return rune(unsafe { _bytes[0] })
        }
        2 {
            lead := rune(unsafe { _bytes[0] }) & 0x1F
            tail := rune(unsafe { _bytes[1] }) & 0x3F
            return (lead << 6) | tail
        }
        3 {
            lead := rune(unsafe { _bytes[0] }) & 0x0F
            mid := rune(unsafe { _bytes[1] }) & 0x3F
            tail := rune(unsafe { _bytes[2] }) & 0x3F
            return (lead << 12) | (mid << 6) | tail
        }
        4 {
            lead := rune(unsafe { _bytes[0] }) & 0x07
            hi := rune(unsafe { _bytes[1] }) & 0x3F
            lo := rune(unsafe { _bytes[2] }) & 0x3F
            tail := rune(unsafe { _bytes[3] }) & 0x3F
            return (lead << 18) | (hi << 12) | (lo << 6) | tail
        }
        else {
            // empty input, negative lengths and anything above 4 bytes
            return 0
        }
    }
}
130 
// utf8_str_visible_length calculates the length of `s` for formatting purposes,
// i.e. its number of "characters". It forwards to `utf8_grapheme_visible_length`.
// This is a simplified implementation. If you need a specification compliant width,
// use utf8.east_asian.display_width instead.
pub fn utf8_str_visible_length(s string) int {
    visible := utf8_grapheme_visible_length(s)
    return visible
}
137 
// string_to_ansi_not_null_terminated returns an ANSI version of the string `_str`.
// NOTE: This is most useful for converting a V string to an ANSI string under Windows.
// NOTE: The returned ANSI bytes are not null-terminated, so they can be written to an
// ANSI file directly, for example with `os.write_file_array`.
pub fn string_to_ansi_not_null_terminated(_str string) []u8 {
    mut bytes := wide_to_ansi(_str.to_wide())
    if bytes.len <= 0 {
        return bytes
    }
    // shrink the array by one element to drop the trailing zero
    unsafe { bytes.len-- }
    return bytes
}
149

1	// Copyright (c) 2019-2024 Alexander Medvednikov. All rights reserved.
2	// Use of this source code is governed by an MIT license
3	// that can be found in the LICENSE file.
4	module builtin
5
// utf8_char_len reports how many bytes the UTF-8 sequence that begins with the byte `b` occupies.
// The upper 4 bits of `b` select a 2-bit field inside the packed constant 0xe5000000;
// every field stores `length - 1`. Bytes below 0xC0 (ASCII and continuation bytes) yield 1.
pub fn utf8_char_len(b u8) int {
    // (b >> 3) & 0x1e == 2 * (b >> 4): the bit offset of the field for this lead byte
    shift := (b >> 3) & 0x1e
    // number of bytes that follow the lead byte (0..3)
    extra := (u32(0xe5000000) >> shift) & 3
    return int(extra + 1)
}
10
// utf32_to_str converts the codepoint `code` (a UTF-32 value) to a UTF-8 encoded string.
// A 5 byte buffer is allocated for the encoding. When the conversion yields an empty
// string, the buffer is freed again before returning.
pub fn utf32_to_str(code u32) string {
    unsafe {
        mut mem := malloc_noscan(5)
        encoded := utf32_to_str_no_malloc(code, mut mem)
        if encoded.len != 0 {
            return encoded
        }
        // the empty result does not use `mem`, so release it here
        free(mem)
        return encoded
    }
}
24
// utf32_to_str_no_malloc encodes the codepoint `code` as UTF-8 into the caller supplied `buf`
// and returns a string created with `tos(buf, len)`.
// A 0 byte is stored right after the encoded bytes, so `buf` needs room for one extra byte.
// An empty string is returned when `utf32_decode_to_buffer` reports that it wrote 0 bytes.
@[manualfree; unsafe]
pub fn utf32_to_str_no_malloc(code u32, mut buf &u8) string {
    unsafe {
        written := utf32_decode_to_buffer(code, mut buf)
        if written != 0 {
            buf[written] = 0
            return tos(buf, written)
        }
        return ''
    }
}
36
// utf32_decode_to_buffer writes the UTF-8 encoding of the codepoint `code` into `buf`
// and returns the number of bytes written (1 to 4).
// It returns 0 and writes nothing when `code` is greater than 0x10FFFF.
// No terminating 0 byte is written. `buf` must have room for at least 4 bytes.
// NOTE: surrogate values (0xD800..0xDFFF) are not rejected; they are encoded as 3 bytes.
@[manualfree; unsafe]
pub fn utf32_decode_to_buffer(code u32, mut buf &u8) int {
    unsafe {
        // Compare the unsigned value directly. Casting `code` to `int` first would make
        // values >= 0x80000000 negative, and they would then be emitted as one truncated byte.
        mut buffer := &u8(buf)
        if code <= 0x7F {
            buffer[0] = u8(code)
            return 1
        } else if code <= 0x7FF {
            buffer[0] = 0xC0 | u8(code >> 6) // 110xxxxx
            buffer[1] = 0x80 | u8(code & 0x3F) // 10xxxxxx
            return 2
        } else if code <= 0xFFFF {
            buffer[0] = 0xE0 | u8(code >> 12) // 1110xxxx
            buffer[1] = 0x80 | u8((code >> 6) & 0x3F) // 10xxxxxx
            buffer[2] = 0x80 | u8(code & 0x3F) // 10xxxxxx
            return 3
        } else if code <= 0x10FFFF {
            buffer[0] = 0xF0 | u8(code >> 18) // 11110xxx
            buffer[1] = 0x80 | u8((code >> 12) & 0x3F) // 10xxxxxx
            buffer[2] = 0x80 | u8((code >> 6) & 0x3F) // 10xxxxxx
            buffer[3] = 0x80 | u8(code & 0x3F) // 10xxxxxx
            return 4
        }
    }
    return 0
}
69
// utf32_code converts the UTF-8 encoded codepoint stored in `_rune` to its UTF-32 value.
// It returns 0 when `_rune` is longer than 4 bytes.
// The original implementation did not check that the string holds valid UTF-8,
// and could produce values greater than the UTF-32 spec allows.
// It has been replaced by `utf8_to_utf32`, which has a result return type.
//
// This function is kept for backward compatibility;
// it is used in vlib/builtin/string.v and also in vlib/v/gen/c/cgen.v
pub fn (_rune string) utf32_code() int {
    return if _rune.len > 4 { 0 } else { int(impl_utf8_to_utf32(_rune.str, _rune.len)) }
}
86
// utf8_to_utf32 converts an array of at most 4 UTF-8 bytes to a single UTF-32 value.
// It returns an error when more than 4 bytes are passed.
pub fn (_bytes []u8) utf8_to_utf32() !rune {
    if _bytes.len <= 4 {
        return impl_utf8_to_utf32(_bytes.data, _bytes.len)
    }
    return error('attempted to decode too many bytes, utf-8 is limited to four bytes maximum')
}
95
// impl_utf8_to_utf32 combines the first `_bytes_len` bytes at `_bytes` into one UTF-32 value.
// Lengths 1 to 4 are supported; any other length returns 0.
// A single byte is returned unchanged. For longer sequences the payload bits of the
// lead byte and of each following byte are masked out and concatenated.
// The bytes are not checked for being a well formed UTF-8 sequence.
@[direct_array_access]
fn impl_utf8_to_utf32(_bytes &u8, _bytes_len int) rune {
    if _bytes_len == 0 || _bytes_len > 4 {
        return 0
    }
    // a single byte is returned unchanged
    if _bytes_len == 1 {
        return rune(unsafe { _bytes[0] })
    }

    match _bytes_len {
        2 {
            b0 := rune(unsafe { _bytes[0] })
            b1 := rune(unsafe { _bytes[1] })
            return ((b0 & 0x1F) << 6) | (b1 & 0x3F)
        }
        3 {
            b0 := rune(unsafe { _bytes[0] })
            b1 := rune(unsafe { _bytes[1] })
            b2 := rune(unsafe { _bytes[2] })
            return ((b0 & 0x0F) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F)
        }
        4 {
            b0 := rune(unsafe { _bytes[0] })
            b1 := rune(unsafe { _bytes[1] })
            b2 := rune(unsafe { _bytes[2] })
            b3 := rune(unsafe { _bytes[3] })
            return ((b0 & 0x07) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F)
        }
        else {
            // only reachable for negative lengths
            return 0
        }
    }
}
130
// utf8_str_visible_length calculates the length of `s` for formatting purposes,
// i.e. its number of "characters". It forwards to `utf8_grapheme_visible_length`.
// This is a simplified implementation. If you need a specification compliant width,
// use utf8.east_asian.display_width instead.
pub fn utf8_str_visible_length(s string) int {
    visible := utf8_grapheme_visible_length(s)
    return visible
}
137
// string_to_ansi_not_null_terminated returns an ANSI version of the string `_str`.
// NOTE: This is most useful for converting a V string to an ANSI string under Windows.
// NOTE: The returned ANSI bytes are not null-terminated, so they can be written to an
// ANSI file directly, for example with `os.write_file_array`.
pub fn string_to_ansi_not_null_terminated(_str string) []u8 {
    mut bytes := wide_to_ansi(_str.to_wide())
    if bytes.len <= 0 {
        return bytes
    }
    // shrink the array by one element to drop the trailing zero
    unsafe { bytes.len-- }
    return bytes
}
149