| 1 | // Copyright (c) 2019-2024 Alexander Medvednikov. All rights reserved. |
| 2 | // Use of this source code is governed by an MIT license |
| 3 | // that can be found in the LICENSE file. |
| 4 | module builtin |
| 5 | |
// utf8_char_len returns the length in bytes of a UTF-8 encoded codepoint that starts with the byte `b`.
// The constant 0xe5000000 acts as a packed lookup table of 2-bit entries
// (each holding "length - 1"), indexed by the top four bits of `b`.
pub fn utf8_char_len(b u8) int {
	// (b >> 3) & 0x1e == 2 * (b >> 4): the bit offset of the table entry
	shift := (b >> 3) & 0x1e
	extra := (u32(0xe5000000) >> shift) & 3
	return int(extra + 1)
}
| 10 | |
// utf32_to_str converts the UTF-32 value (Unicode code point) `code`
// to a UTF-8 encoded string.
// It allocates a 5 byte buffer for the result; when the conversion yields
// an empty string, the buffer is released again before returning.
pub fn utf32_to_str(code u32) string {
	unsafe {
		mut storage := malloc_noscan(5)
		encoded := utf32_to_str_no_malloc(code, mut storage)
		if encoded.len != 0 {
			return encoded
		}
		// the buffer was not used at all, so it can be freed right away
		free(storage)
		return encoded
	}
}
| 24 | |
// utf32_to_str_no_malloc converts the code point `code` to a UTF-8 string,
// using the caller supplied `buf` as storage instead of allocating.
// The encoded bytes are followed by a terminating zero byte written to `buf`.
// It returns an empty string when nothing was encoded.
@[manualfree; unsafe]
pub fn utf32_to_str_no_malloc(code u32, mut buf &u8) string {
	unsafe {
		written := utf32_decode_to_buffer(code, mut buf)
		if written != 0 {
			// zero terminate, then wrap the buffer as a string without copying
			buf[written] = 0
			return tos(buf, written)
		}
		return ''
	}
}
| 36 | |
// utf32_decode_to_buffer writes the UTF-8 encoding of the code point `code`
// into `buf`, and returns the number of bytes written (1 to 4).
// It returns 0, without writing anything, when `code` is above 0x10FFFF.
// No terminating zero is written; `buf` must have room for the encoded bytes.
// NOTE: values in the surrogate range are not rejected, they are encoded
// as 3 bytes like any other value up to 0xFFFF.
@[manualfree; unsafe]
pub fn utf32_decode_to_buffer(code u32, mut buf &u8) int {
	unsafe {
		mut buffer := &u8(buf)
		// Work on the unsigned value directly: casting `code` to int first
		// turned values above 0x7FFFFFFF into negative numbers, which then
		// matched the 1 byte branch, instead of being rejected.
		if code <= 0x7F {
			buffer[0] = u8(code)
			return 1
		} else if code <= 0x7FF {
			buffer[0] = 0xC0 | u8(code >> 6) // 110xxxxx
			buffer[1] = 0x80 | u8(code & 0x3F) // 10xxxxxx
			return 2
		} else if code <= 0xFFFF {
			buffer[0] = 0xE0 | u8(code >> 12) // 1110xxxx
			buffer[1] = 0x80 | (u8(code >> 6) & 0x3F) // 10xxxxxx
			buffer[2] = 0x80 | u8(code & 0x3F) // 10xxxxxx
			return 3
		} else if code <= 0x10FFFF {
			buffer[0] = 0xF0 | u8(code >> 18) // 11110xxx
			buffer[1] = 0x80 | (u8(code >> 12) & 0x3F) // 10xxxxxx
			buffer[2] = 0x80 | (u8(code >> 6) & 0x3F) // 10xxxxxx
			buffer[3] = 0x80 | u8(code & 0x3F) // 10xxxxxx
			return 4
		}
	}
	// not a valid code point, nothing was written
	return 0
}
| 69 | |
// utf32_code converts the UTF-8 encoded string `_rune` to its UTF-32 value.
// It returns 0 when the string is longer than 4 bytes.
//
// The original implementation did not check for valid UTF-8 in the string,
// and could result in values greater than the UTF-32 spec allows.
// It has been replaced by `utf8_to_utf32`, which has an option return type.
//
// This function is left for backward compatibility;
// it is used in vlib/builtin/string.v, and also in vlib/v/gen/c/cgen.v
pub fn (_rune string) utf32_code() int {
	if _rune.len <= 4 {
		decoded := impl_utf8_to_utf32(_rune.str, _rune.len)
		return int(decoded)
	}
	return 0
}
| 86 | |
// utf8_to_utf32 converts an array of UTF-8 bytes to a single UTF-32 value.
// It returns an error when more than 4 bytes are submitted.
pub fn (_bytes []u8) utf8_to_utf32() !rune {
	if _bytes.len <= 4 {
		return impl_utf8_to_utf32(_bytes.data, _bytes.len)
	}
	return error('attempted to decode too many bytes, utf-8 is limited to four bytes maximum')
}
| 95 | |
// impl_utf8_to_utf32 combines the payload bits of the `_bytes_len` bytes
// starting at `_bytes` into a single rune.
// A single byte is returned unchanged. For 2, 3 and 4 bytes, the lead byte
// is masked with 0x1F, 0x0F and 0x07 respectively, and every following byte
// with 0x3F. Any other length yields 0.
// The bytes are not checked for being a valid UTF-8 sequence.
@[direct_array_access]
fn impl_utf8_to_utf32(_bytes &u8, _bytes_len int) rune {
	match _bytes_len {
		1 {
			// return ASCII unchanged
			return rune(unsafe { _bytes[0] })
		}
		2 {
			lead := rune(unsafe { _bytes[0] })
			c1 := rune(unsafe { _bytes[1] })
			return ((lead & 0x1F) << 6) | (c1 & 0x3F)
		}
		3 {
			lead := rune(unsafe { _bytes[0] })
			c1 := rune(unsafe { _bytes[1] })
			c2 := rune(unsafe { _bytes[2] })
			return ((lead & 0x0F) << 12) | ((c1 & 0x3F) << 6) | (c2 & 0x3F)
		}
		4 {
			lead := rune(unsafe { _bytes[0] })
			c1 := rune(unsafe { _bytes[1] })
			c2 := rune(unsafe { _bytes[2] })
			c3 := rune(unsafe { _bytes[3] })
			return ((lead & 0x07) << 18) | ((c1 & 0x3F) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F)
		}
		else {
			// empty input, or more than 4 bytes
			return 0
		}
	}
}
| 130 | |
// utf8_str_visible_length calculates the length of the string `s` for
// formatting purposes, i.e. its number of "characters".
// It delegates to `utf8_grapheme_visible_length`.
// This is a simplified implementation. If you need a specification compliant
// width, use utf8.east_asian.display_width.
pub fn utf8_str_visible_length(s string) int {
	visible := utf8_grapheme_visible_length(s)
	return visible
}
| 137 | |
// string_to_ansi_not_null_terminated returns an ANSI version of the string `_str`.
// NOTE: This is most useful for converting a vstring to an ANSI string under Windows.
// NOTE: The returned ANSI string is not null-terminated, so it can be written
// as is to an ANSI file, for example with `os.write_file_array`.
pub fn string_to_ansi_not_null_terminated(_str string) []u8 {
	mut bytes := wide_to_ansi(_str.to_wide())
	if bytes.len <= 0 {
		return bytes
	}
	// remove the trailing zero
	unsafe {
		bytes.len--
	}
	return bytes
}
| 149 | |