| 1 | module iconv |
| 2 | |
| 3 | // Module iconv provides functions to convert between vstring(UTF8) and other encodings. |
| 4 | import os |
| 5 | |
// reverse_u16 returns `src` with its two bytes exchanged
// (the low byte becomes the high byte and vice versa).
@[inline]
fn reverse_u16(src u16) u16 {
	return u16((src << 8) | (src >> 8))
}
| 10 | |
// reverse_u32 returns `src` with the order of its four bytes reversed
// (byte 0 <-> byte 3, byte 1 <-> byte 2).
@[inline]
fn reverse_u32(src u32) u32 {
	return u32((src << 24) | ((src & 0x0000_FF00) << 8) | ((src & 0x00FF_0000) >> 8) | (src >> 24))
}
| 15 | |
// vstring_to_encoding converts the V string `str` (UTF-8) to bytes in the `tocode` encoding.
// `tocode` is compared case-insensitively (it is upper-cased first).
// The bare names UTF16/UTF-16/UTF32/UTF-32 are rejected with an error; an explicit
// byte order (UTF-16LE, UTF-16BE, UTF-32LE or UTF-32BE) must be given instead.
// The special name `LOCAL` is mapped to `ANSI` on Windows and to `UTF-8` elsewhere.
// Any error reported by the underlying `conv` call is returned to the caller.
// tips: use `iconv --list` to check for supported encodings
pub fn vstring_to_encoding(str string, tocode string) ![]u8 {
	mut encoding_name := tocode.to_upper()
	if encoding_name in ['UTF16', 'UTF32', 'UTF-16', 'UTF-32']! {
		return error('please use UTF-16LE/UTF-16BE/UTF-32LE/UTF-32BE instead')
	}
	if encoding_name == 'LOCAL' {
		$if windows {
			encoding_name = 'ANSI'
		} $else {
			encoding_name = 'UTF-8'
		}
	}
	// The source side is always UTF-8, since that is how V strings are stored.
	return conv(encoding_name, 'UTF-8', str.str, str.len)
}
| 32 | |
// encoding_to_vstring converts `bytes`, interpreted in the `fromcode` encoding, to a V string (UTF-8).
// `fromcode` is compared case-insensitively (it is upper-cased first).
// The bare names UTF16/UTF-16/UTF32/UTF-32 are rejected with an error; an explicit
// byte order (UTF-16LE, UTF-16BE, UTF-32LE or UTF-32BE) must be given instead.
// The special name `LOCAL` is mapped to `ANSI` on Windows and to `UTF-8` elsewhere.
// Any error reported by the underlying `conv` call is returned to the caller.
// tips: use `iconv --list` to check for supported encodings
pub fn encoding_to_vstring(bytes []u8, fromcode string) !string {
	mut encoding_name := fromcode.to_upper()
	if encoding_name in ['UTF16', 'UTF32', 'UTF-16', 'UTF-32']! {
		return error('please use UTF-16LE/UTF-16BE/UTF-32LE/UTF-32BE instead')
	}
	if encoding_name == 'LOCAL' {
		$if windows {
			encoding_name = 'ANSI'
		} $else {
			encoding_name = 'UTF-8'
		}
	}
	mut dst := conv('UTF-8', encoding_name, bytes.data, bytes.len)!
	dst << 0 // add a tail zero, to build a vstring
	// NOTE(review): the result is built as a C string, so it presumably ends at the
	// first zero byte in the converted output - confirm if embedded NULs must survive.
	return unsafe { cstring_to_vstring(dst.data) }
}
| 51 | |
// create_utf_string_with_bom returns a copy of `src` with the byte order mark (BOM)
// of `utf_type` placed in front of it. `utf_type` is matched case-insensitively:
// UTF8 / UTF-8       -> EF BB BF
// UTF16LE / UTF-16LE -> FF FE
// UTF16BE / UTF-16BE -> FE FF
// UTF32LE / UTF-32LE -> FF FE 00 00
// UTF32BE / UTF-32BE -> 00 00 FE FF
// `LOCAL` is treated as `ANSI` on Windows and as `UTF-8` elsewhere.
// For any other encoding name, an unmodified copy of `src` is returned.
pub fn create_utf_string_with_bom(src []u8, utf_type string) []u8 {
	mut encoding_name := utf_type.to_upper()
	if encoding_name == 'LOCAL' {
		$if windows {
			encoding_name = 'ANSI'
		} $else {
			encoding_name = 'UTF-8'
		}
	}
	// Pick the marker bytes first; an empty marker means "no BOM for this encoding".
	bom := match encoding_name {
		'UTF8', 'UTF-8' { [u8(0xEF), 0xBB, 0xBF] }
		'UTF16LE', 'UTF-16LE' { [u8(0xFF), 0xFE] }
		'UTF16BE', 'UTF-16BE' { [u8(0xFE), 0xFF] }
		'UTF32LE', 'UTF-32LE' { [u8(0xFF), 0xFE, 0, 0] }
		'UTF32BE', 'UTF-32BE' { [u8(0), 0, 0xFE, 0xFF] }
		else { []u8{} }
	}
	mut result := []u8{cap: bom.len + src.len}
	result << bom
	result << src
	return result
}
| 89 | |
// remove_utf_string_with_bom returns a copy of `src` with the leading byte order mark (BOM)
// of `utf_type` removed, if it is present. `utf_type` is matched case-insensitively:
// UTF8 / UTF-8       -> EF BB BF
// UTF16LE / UTF-16LE -> FF FE
// UTF16BE / UTF-16BE -> FE FF
// UTF32LE / UTF-32LE -> FF FE 00 00
// UTF32BE / UTF-32BE -> 00 00 FE FF
// `LOCAL` is treated as `ANSI` on Windows and as `UTF-8` elsewhere.
// When `src` does not start with the expected BOM, or `utf_type` is any other
// encoding name, an unmodified copy of `src` is returned.
// An input consisting of nothing but the BOM yields an empty array.
pub fn remove_utf_string_with_bom(src []u8, utf_type string) []u8 {
	mut encoding_name := utf_type.to_upper()
	if encoding_name == 'LOCAL' {
		$if windows {
			encoding_name = 'ANSI'
		} $else {
			encoding_name = 'UTF-8'
		}
	}
	// Pick the marker bytes first; an empty marker means "no BOM for this encoding".
	bom := match encoding_name {
		'UTF8', 'UTF-8' { [u8(0xEF), 0xBB, 0xBF] }
		'UTF16LE', 'UTF-16LE' { [u8(0xFF), 0xFE] }
		'UTF16BE', 'UTF-16BE' { [u8(0xFE), 0xFF] }
		'UTF32LE', 'UTF-32LE' { [u8(0xFF), 0xFE, 0, 0] }
		'UTF32BE', 'UTF-32BE' { [u8(0), 0, 0xFE, 0xFF] }
		else { []u8{} }
	}
	// `>=` (not `>`): input that is exactly the BOM must be stripped too.
	if bom.len > 0 && src.len >= bom.len && src[..bom.len] == bom {
		return src[bom.len..].clone()
	}
	return src.clone()
}
| 150 | |
// write_file_encoding converts `text` to `encoding` and writes the result to the file at `path`.
// If `path` already exists, it will be overwritten.
// When `bom` is true and the upper-cased `encoding` name starts with `UTF`,
// the matching BOM header is written in front of the converted bytes.
// Conversion and write errors are returned to the caller.
pub fn write_file_encoding(path string, text string, encoding string, bom bool) ! {
	converted := vstring_to_encoding(text, encoding)!
	wants_bom := bom && encoding.to_upper().starts_with('UTF')
	payload := if wants_bom {
		create_utf_string_with_bom(converted, encoding)
	} else {
		converted
	}
	os.write_file_array(path, payload)!
}
| 162 | |
// read_file_encoding reads the file at `path`, strips a leading BOM matching `encoding`
// (if there is one), and returns the contents converted from `encoding` to a V string.
// An error is returned when the file cannot be read, or when the conversion fails.
pub fn read_file_encoding(path string, encoding string) !string {
	// os.read_bytes reports open/read failures instead of silently yielding an empty array.
	raw_bytes := os.read_bytes(path)!
	payload := remove_utf_string_with_bom(raw_bytes, encoding)
	return encoding_to_vstring(payload, encoding)!
}
| 169 | |