| 1 | module iconv |
| 2 | |
// Module iconv provides functions to convert V strings (UTF-8) to and from other encodings.
// This iconv implementation uses the Win32 API to do the conversions.
// Idea from https://github.com/win-iconv/win-iconv
| 6 | |
// Prototypes of the Win32 functions called from this file.

// C.GetACP: its result is returned by name_to_codepage for '', 'CP_ACP' and 'ANSI'.
fn C.GetACP() i32
// C.GetOEMCP: its result is returned by name_to_codepage for 'CP_OEMCP'.
fn C.GetOEMCP() i32
// C.WideCharToMultiByte: used by conv for the "Unicode => dst codepage" step.
// conv calls it twice: once with a null dst and dst_len 0 to obtain the required
// size, then again with the real buffer.
fn C.WideCharToMultiByte(codepage u32, dwflags u32, src &u8, src_len i32, dst &u8, dst_len i32, default_char &u8, used_default_char &bool) i32
// C.MultiByteToWideChar: used by conv for the "src codepage => Unicode" step,
// with the same size-then-fill calling pattern.
fn C.MultiByteToWideChar(codepage u32, dwflags u32, src &u8, src_len i32, dst &u8, dst_len i32) i32
| 11 | |
// Codepage_Alias is one row of the codepage_alias table: an encoding name together
// with the code page number that name stands for.
// The rows are written positionally (`Codepage_Alias{65001, 'UTF-8'}`), so the
// field order must not change.
struct Codepage_Alias {
	// code page number returned by name_to_codepage when `name` matches
	codepage int
	// encoding name; name_to_codepage compares it with the upper-cased lookup key
	name string
}
| 16 | |
// codepage_alias maps encoding names to Windows code page numbers.
// name_to_codepage scans the table from top to bottom and returns the first entry
// whose name matches. When a name appears more than once, only its first
// occurrence has any effect, so a name must never be listed with two different
// code pages.
const codepage_alias = [
	// NOTE! The name field string MUST be in uppercase!
	// vfmt off
	Codepage_Alias{65001, 'CP65001'},
	Codepage_Alias{65001, 'UTF8'},
	Codepage_Alias{65001, 'UTF-8'},

	Codepage_Alias{1200, 'CP1200'},
	Codepage_Alias{1200, 'UTF16LE'},
	Codepage_Alias{1200, 'UTF-16LE'},
	Codepage_Alias{1200, 'UCS2LE'},
	Codepage_Alias{1200, 'UCS-2LE'},
	Codepage_Alias{1200, 'UCS-2-INTERNAL'},
	Codepage_Alias{1200, 'UNICODE'}, // for iconv

	Codepage_Alias{1201, 'CP1201'},
	Codepage_Alias{1201, 'UTF16BE'},
	Codepage_Alias{1201, 'UTF-16BE'},
	Codepage_Alias{1201, 'UCS2BE'},
	Codepage_Alias{1201, 'UCS-2BE'},
	Codepage_Alias{1201, 'UNICODEFFFE'},

	Codepage_Alias{12000, 'CP12000'},
	Codepage_Alias{12000, 'UTF32LE'},
	Codepage_Alias{12000, 'UTF-32LE'},
	Codepage_Alias{12000, 'UCS4LE'},
	Codepage_Alias{12000, 'UCS-4LE'},

	Codepage_Alias{12001, 'CP12001'},
	Codepage_Alias{12001, 'UTF32BE'},
	Codepage_Alias{12001, 'UTF-32BE'},
	Codepage_Alias{12001, 'UCS4BE'},
	Codepage_Alias{12001, 'UCS-4BE'},

	// Byte order of the names that carry no LE/BE suffix.
	// In the C code this table was taken from, the choice was guarded by
	// `#ifndef GLIB_COMPILATION`. The big endian variant (see rfc2781 4.3,
	// "Interpreting text labelled as UTF-16") is kept for reference only:
	//   Codepage_Alias{1201, 'UTF16'},
	//   Codepage_Alias{1201, 'UTF-16'},
	//   Codepage_Alias{1201, 'UCS2'},
	//   Codepage_Alias{1201, 'UCS-2'},
	//   Codepage_Alias{12001, 'UTF32'},
	//   Codepage_Alias{12001, 'UTF-32'},
	//   Codepage_Alias{12001, 'UCS-4'},
	//   Codepage_Alias{12001, 'UCS4'},
	// The `#else` variant is the one in use:
	// Default is little endian, because the platform is little endian.
	Codepage_Alias{1200, 'UTF16'},
	Codepage_Alias{1200, 'UTF-16'},
	Codepage_Alias{1200, 'UCS2'},
	Codepage_Alias{1200, 'UCS-2'},
	Codepage_Alias{12000, 'UTF32'},
	Codepage_Alias{12000, 'UTF-32'},
	Codepage_Alias{12000, 'UCS4'},
	Codepage_Alias{12000, 'UCS-4'},

	// copy from libiconv `iconv -l`
	// !IsValidCodePage(367)
	Codepage_Alias{20127, 'ANSI_X3.4-1968'},
	Codepage_Alias{20127, 'ANSI_X3.4-1986'},
	Codepage_Alias{20127, 'ASCII'},
	Codepage_Alias{20127, 'CP367'},
	Codepage_Alias{20127, 'IBM367'},
	Codepage_Alias{20127, 'ISO-IR-6'},
	Codepage_Alias{20127, 'ISO646-US'},
	Codepage_Alias{20127, 'ISO_646.IRV:1991'},
	Codepage_Alias{20127, 'US'},
	Codepage_Alias{20127, 'US-ASCII'},
	Codepage_Alias{20127, 'CSASCII'},

	// !IsValidCodePage(819)
	Codepage_Alias{1252, 'CP819'},
	Codepage_Alias{1252, 'IBM819'},
	Codepage_Alias{28591, 'ISO-8859-1'},
	Codepage_Alias{28591, 'ISO-IR-100'},
	Codepage_Alias{28591, 'ISO8859-1'},
	Codepage_Alias{28591, 'ISO_8859-1'},
	Codepage_Alias{28591, 'ISO_8859-1:1987'},
	Codepage_Alias{28591, 'L1'},
	Codepage_Alias{28591, 'LATIN1'},
	Codepage_Alias{28591, 'CSISOLATIN1'},

	Codepage_Alias{1250, 'CP1250'},
	Codepage_Alias{1250, 'MS-EE'},
	Codepage_Alias{1250, 'WINDOWS-1250'},

	Codepage_Alias{1251, 'CP1251'},
	Codepage_Alias{1251, 'MS-CYRL'},
	Codepage_Alias{1251, 'WINDOWS-1251'},

	Codepage_Alias{1252, 'CP1252'},
	Codepage_Alias{1252, 'MS-ANSI'},
	Codepage_Alias{1252, 'WINDOWS-1252'},

	Codepage_Alias{1253, 'CP1253'},
	Codepage_Alias{1253, 'MS-GREEK'},
	Codepage_Alias{1253, 'WINDOWS-1253'},

	Codepage_Alias{1254, 'CP1254'},
	Codepage_Alias{1254, 'MS-TURK'},
	Codepage_Alias{1254, 'WINDOWS-1254'},

	Codepage_Alias{1255, 'CP1255'},
	Codepage_Alias{1255, 'MS-HEBR'},
	Codepage_Alias{1255, 'WINDOWS-1255'},

	Codepage_Alias{1256, 'CP1256'},
	Codepage_Alias{1256, 'MS-ARAB'},
	Codepage_Alias{1256, 'WINDOWS-1256'},

	Codepage_Alias{1257, 'CP1257'},
	Codepage_Alias{1257, 'WINBALTRIM'},
	Codepage_Alias{1257, 'WINDOWS-1257'},

	Codepage_Alias{1258, 'CP1258'},
	Codepage_Alias{1258, 'WINDOWS-1258'},

	Codepage_Alias{850, '850'},
	Codepage_Alias{850, 'CP850'},
	Codepage_Alias{850, 'IBM850'},
	Codepage_Alias{850, 'CSPC850MULTILINGUAL'},

	// !IsValidCodePage(862)
	Codepage_Alias{862, '862'},
	Codepage_Alias{862, 'CP862'},
	Codepage_Alias{862, 'IBM862'},
	Codepage_Alias{862, 'CSPC862LATINHEBREW'},

	Codepage_Alias{866, '866'},
	Codepage_Alias{866, 'CP866'},
	Codepage_Alias{866, 'IBM866'},
	Codepage_Alias{866, 'CSIBM866'},

	// !IsValidCodePage(154)
	Codepage_Alias{154, 'CP154'},
	Codepage_Alias{154, 'CYRILLIC-ASIAN'},
	Codepage_Alias{154, 'PT154'},
	Codepage_Alias{154, 'PTCP154'},
	Codepage_Alias{154, 'CSPTCP154'},

	// !IsValidCodePage(1133)
	Codepage_Alias{1133, 'CP1133'},
	Codepage_Alias{1133, 'IBM-CP1133'},

	Codepage_Alias{874, 'CP874'},
	Codepage_Alias{874, 'WINDOWS-874'},

	// !IsValidCodePage(51932)
	Codepage_Alias{51932, 'CP51932'},
	Codepage_Alias{51932, 'MS51932'},
	Codepage_Alias{51932, 'WINDOWS-51932'},
	Codepage_Alias{51932, 'EUC-JP'},

	Codepage_Alias{932, 'CP932'},
	Codepage_Alias{932, 'MS932'},
	// The two 'SHIFFT_...' spellings are kept so that existing callers keep working;
	// 'SHIFT_JIS' and 'SHIFT-JIS' are listed further down.
	Codepage_Alias{932, 'SHIFFT_JIS'},
	Codepage_Alias{932, 'SHIFFT_JIS-MS'},
	Codepage_Alias{932, 'SHIFT_JIS-MS'},
	Codepage_Alias{932, 'SJIS'},
	Codepage_Alias{932, 'SJIS-MS'},
	Codepage_Alias{932, 'SJIS-OPEN'},
	Codepage_Alias{932, 'SJIS-WIN'},
	Codepage_Alias{932, 'WINDOWS-31J'},
	Codepage_Alias{932, 'WINDOWS-932'},
	Codepage_Alias{932, 'CSWINDOWS31J'},

	Codepage_Alias{50221, 'CP50221'},
	Codepage_Alias{50221, 'ISO-2022-JP'},
	Codepage_Alias{50221, 'ISO-2022-JP-MS'},
	Codepage_Alias{50221, 'ISO2022-JP'},
	Codepage_Alias{50221, 'ISO2022-JP-MS'},
	Codepage_Alias{50221, 'MS50221'},
	Codepage_Alias{50221, 'WINDOWS-50221'},

	Codepage_Alias{936, 'CP936'},
	Codepage_Alias{936, 'GBK'},
	Codepage_Alias{936, 'MS936'},
	Codepage_Alias{936, 'WINDOWS-936'},

	Codepage_Alias{950, 'CP950'},
	Codepage_Alias{950, 'BIG5'},
	Codepage_Alias{950, 'BIG5HKSCS'},
	Codepage_Alias{950, 'BIG5-HKSCS'},

	Codepage_Alias{949, 'CP949'},
	Codepage_Alias{949, 'UHC'},
	Codepage_Alias{949, 'EUC-KR'},

	Codepage_Alias{1361, 'CP1361'},
	Codepage_Alias{1361, 'JOHAB'},

	Codepage_Alias{437, '437'},
	Codepage_Alias{437, 'CP437'},
	Codepage_Alias{437, 'IBM437'},
	Codepage_Alias{437, 'CSPC8CODEPAGE437'},

	Codepage_Alias{737, 'CP737'},

	Codepage_Alias{775, 'CP775'},
	Codepage_Alias{775, 'IBM775'},
	Codepage_Alias{775, 'CSPC775BALTIC'},

	Codepage_Alias{852, '852'},
	Codepage_Alias{852, 'CP852'},
	Codepage_Alias{852, 'IBM852'},
	Codepage_Alias{852, 'CSPCP852'},

	// !IsValidCodePage(853)
	Codepage_Alias{853, 'CP853'},

	Codepage_Alias{855, '855'},
	Codepage_Alias{855, 'CP855'},
	Codepage_Alias{855, 'IBM855'},
	Codepage_Alias{855, 'CSIBM855'},

	Codepage_Alias{857, '857'},
	Codepage_Alias{857, 'CP857'},
	Codepage_Alias{857, 'IBM857'},
	Codepage_Alias{857, 'CSIBM857'},

	// !IsValidCodePage(858)
	Codepage_Alias{858, 'CP858'},

	Codepage_Alias{860, '860'},
	Codepage_Alias{860, 'CP860'},
	Codepage_Alias{860, 'IBM860'},
	Codepage_Alias{860, 'CSIBM860'},

	Codepage_Alias{861, '861'},
	Codepage_Alias{861, 'CP-IS'},
	Codepage_Alias{861, 'CP861'},
	Codepage_Alias{861, 'IBM861'},
	Codepage_Alias{861, 'CSIBM861'},

	Codepage_Alias{863, '863'},
	Codepage_Alias{863, 'CP863'},
	Codepage_Alias{863, 'IBM863'},
	Codepage_Alias{863, 'CSIBM863'},

	Codepage_Alias{864, 'CP864'},
	Codepage_Alias{864, 'IBM864'},
	Codepage_Alias{864, 'CSIBM864'},

	Codepage_Alias{865, '865'},
	Codepage_Alias{865, 'CP865'},
	Codepage_Alias{865, 'IBM865'},
	Codepage_Alias{865, 'CSIBM865'},

	Codepage_Alias{869, '869'},
	Codepage_Alias{869, 'CP-GR'},
	Codepage_Alias{869, 'CP869'},
	Codepage_Alias{869, 'IBM869'},
	Codepage_Alias{869, 'CSIBM869'},

	// !IsValidCodePage(1125)
	Codepage_Alias{1125, 'CP1125'},

	//
	// * Code Page Identifiers
	// * https://learn.microsoft.com/en-us/windows/win32/intl/code-page-identifiers
	//
	// Names in this part that already appear above are redundant, because the
	// lookup stops at the first match.

	Codepage_Alias{37, 'IBM037'}, // IBM EBCDIC US-Canada
	Codepage_Alias{437, 'IBM437'}, // OEM United States
	Codepage_Alias{500, 'IBM500'}, // IBM EBCDIC International
	Codepage_Alias{708, 'ASMO-708'}, // Arabic (ASMO 708)
	// 709 Arabic (ASMO-449+, BCON V4)
	// 710 Arabic - Transparent Arabic
	Codepage_Alias{720, 'DOS-720'}, // Arabic (Transparent ASMO); Arabic (DOS)
	Codepage_Alias{737, 'IBM737'}, // OEM Greek (formerly 437G); Greek (DOS)
	Codepage_Alias{775, 'IBM775'}, // OEM Baltic; Baltic (DOS)
	Codepage_Alias{850, 'IBM850'}, // OEM Multilingual Latin 1; Western European (DOS)
	Codepage_Alias{852, 'IBM852'}, // OEM Latin 2; Central European (DOS)
	Codepage_Alias{855, 'IBM855'}, // OEM Cyrillic (primarily Russian)
	Codepage_Alias{857, 'IBM857'}, // OEM Turkish; Turkish (DOS)
	Codepage_Alias{858, 'IBM00858'}, // OEM Multilingual Latin 1 + Euro symbol
	Codepage_Alias{860, 'IBM860'}, // OEM Portuguese; Portuguese (DOS)
	Codepage_Alias{861, 'IBM861'}, // OEM Icelandic; Icelandic (DOS)
	Codepage_Alias{862, 'DOS-862'}, // OEM Hebrew; Hebrew (DOS)
	Codepage_Alias{863, 'IBM863'}, // OEM French Canadian; French Canadian (DOS)
	Codepage_Alias{864, 'IBM864'}, // OEM Arabic; Arabic (864)
	Codepage_Alias{865, 'IBM865'}, // OEM Nordic; Nordic (DOS)
	Codepage_Alias{866, 'CP866'}, // OEM Russian; Cyrillic (DOS)
	Codepage_Alias{869, 'IBM869'}, // OEM Modern Greek; Greek, Modern (DOS)
	Codepage_Alias{870, 'IBM870'}, // IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2
	Codepage_Alias{874, 'WINDOWS-874'}, // ANSI/OEM Thai (same as 28605, ISO 8859-15); Thai (Windows)
	Codepage_Alias{875, 'CP875'}, // IBM EBCDIC Greek Modern
	Codepage_Alias{932, 'SHIFT_JIS'}, // ANSI/OEM Japanese; Japanese (Shift-JIS)
	Codepage_Alias{932, 'SHIFT-JIS'}, // alternative name for it
	Codepage_Alias{936, 'GB2312'}, // ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312)
	Codepage_Alias{949, 'KS_C_5601-1987'}, // ANSI/OEM Korean (Unified Hangul Code)
	Codepage_Alias{950, 'BIG5'}, // ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5)
	Codepage_Alias{950, 'BIG5HKSCS'}, // ANSI/OEM Traditional Chinese (Hong Kong SAR); Chinese Traditional (Big5-HKSCS)
	Codepage_Alias{950, 'BIG5-HKSCS'}, // alternative name for it
	Codepage_Alias{1026, 'IBM1026'}, // IBM EBCDIC Turkish (Latin 5)
	Codepage_Alias{1047, 'IBM01047'}, // IBM EBCDIC Latin 1/Open System
	Codepage_Alias{1140, 'IBM01140'}, // IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro)
	Codepage_Alias{1141, 'IBM01141'}, // IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro)
	Codepage_Alias{1142, 'IBM01142'}, // IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro)
	Codepage_Alias{1143, 'IBM01143'}, // IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro)
	Codepage_Alias{1144, 'IBM01144'}, // IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro)
	Codepage_Alias{1145, 'IBM01145'}, // IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro)
	Codepage_Alias{1146, 'IBM01146'}, // IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro)
	Codepage_Alias{1147, 'IBM01147'}, // IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro)
	Codepage_Alias{1148, 'IBM01148'}, // IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro)
	Codepage_Alias{1149, 'IBM01149'}, // IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro)
	Codepage_Alias{1200, 'UTF-16'}, // Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications
	Codepage_Alias{1201, 'UNICODEFFFE'}, // Unicode UTF-16, big endian byte order; available only to managed applications
	Codepage_Alias{1250, 'WINDOWS-1250'}, // ANSI Central European; Central European (Windows)
	Codepage_Alias{1251, 'WINDOWS-1251'}, // ANSI Cyrillic; Cyrillic (Windows)
	Codepage_Alias{1252, 'WINDOWS-1252'}, // ANSI Latin 1; Western European (Windows)
	Codepage_Alias{1253, 'WINDOWS-1253'}, // ANSI Greek; Greek (Windows)
	Codepage_Alias{1254, 'WINDOWS-1254'}, // ANSI Turkish; Turkish (Windows)
	Codepage_Alias{1255, 'WINDOWS-1255'}, // ANSI Hebrew; Hebrew (Windows)
	Codepage_Alias{1256, 'WINDOWS-1256'}, // ANSI Arabic; Arabic (Windows)
	Codepage_Alias{1257, 'WINDOWS-1257'}, // ANSI Baltic; Baltic (Windows)
	Codepage_Alias{1258, 'WINDOWS-1258'}, // ANSI/OEM Vietnamese; Vietnamese (Windows)
	Codepage_Alias{1361, 'JOHAB'}, // Korean (Johab)
	Codepage_Alias{10000, 'MACINTOSH'}, // MAC Roman; Western European (Mac)
	Codepage_Alias{10001, 'X-MAC-JAPANESE'}, // Japanese (Mac)
	Codepage_Alias{10002, 'X-MAC-CHINESETRAD'}, // MAC Traditional Chinese (Big5); Chinese Traditional (Mac)
	Codepage_Alias{10003, 'X-MAC-KOREAN'}, // Korean (Mac)
	Codepage_Alias{10004, 'X-MAC-ARABIC'}, // Arabic (Mac)
	Codepage_Alias{10005, 'X-MAC-HEBREW'}, // Hebrew (Mac)
	Codepage_Alias{10006, 'X-MAC-GREEK'}, // Greek (Mac)
	Codepage_Alias{10007, 'X-MAC-CYRILLIC'}, // Cyrillic (Mac)
	Codepage_Alias{10008, 'X-MAC-CHINESESIMP'}, // MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac)
	Codepage_Alias{10010, 'X-MAC-ROMANIAN'}, // Romanian (Mac)
	Codepage_Alias{10017, 'X-MAC-UKRAINIAN'}, // Ukrainian (Mac)
	Codepage_Alias{10021, 'X-MAC-THAI'}, // Thai (Mac)
	Codepage_Alias{10029, 'X-MAC-CE'}, // MAC Latin 2; Central European (Mac)
	Codepage_Alias{10079, 'X-MAC-ICELANDIC'}, // Icelandic (Mac)
	Codepage_Alias{10081, 'X-MAC-TURKISH'}, // Turkish (Mac)
	Codepage_Alias{10082, 'X-MAC-CROATIAN'}, // Croatian (Mac)
	Codepage_Alias{12000, 'UTF-32'}, // Unicode UTF-32, little endian byte order; available only to managed applications
	Codepage_Alias{12001, 'UTF-32BE'}, // Unicode UTF-32, big endian byte order; available only to managed applications
	Codepage_Alias{20000, 'X-CHINESE_CNS'}, // CNS Taiwan; Chinese Traditional (CNS)
	Codepage_Alias{20001, 'X-CP20001'}, // TCA Taiwan
	Codepage_Alias{20002, 'X_CHINESE-ETEN'}, // Eten Taiwan; Chinese Traditional (Eten)
	Codepage_Alias{20003, 'X-CP20003'}, // IBM5550 Taiwan
	Codepage_Alias{20004, 'X-CP20004'}, // TeleText Taiwan
	Codepage_Alias{20005, 'X-CP20005'}, // Wang Taiwan
	Codepage_Alias{20105, 'X-IA5'}, // IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5)
	Codepage_Alias{20106, 'X-IA5-GERMAN'}, // IA5 German (7-bit)
	Codepage_Alias{20107, 'X-IA5-SWEDISH'}, // IA5 Swedish (7-bit)
	Codepage_Alias{20108, 'X-IA5-NORWEGIAN'}, // IA5 Norwegian (7-bit)
	Codepage_Alias{20127, 'US-ASCII'}, // US-ASCII (7-bit)
	Codepage_Alias{20261, 'X-CP20261'}, // T.61
	Codepage_Alias{20269, 'X-CP20269'}, // ISO 6937 Non-Spacing Accent
	Codepage_Alias{20273, 'IBM273'}, // IBM EBCDIC Germany
	Codepage_Alias{20277, 'IBM277'}, // IBM EBCDIC Denmark-Norway
	Codepage_Alias{20278, 'IBM278'}, // IBM EBCDIC Finland-Sweden
	Codepage_Alias{20280, 'IBM280'}, // IBM EBCDIC Italy
	Codepage_Alias{20284, 'IBM284'}, // IBM EBCDIC Latin America-Spain
	Codepage_Alias{20285, 'IBM285'}, // IBM EBCDIC United Kingdom
	Codepage_Alias{20290, 'IBM290'}, // IBM EBCDIC Japanese Katakana Extended
	Codepage_Alias{20297, 'IBM297'}, // IBM EBCDIC France
	Codepage_Alias{20420, 'IBM420'}, // IBM EBCDIC Arabic
	Codepage_Alias{20423, 'IBM423'}, // IBM EBCDIC Greek
	Codepage_Alias{20424, 'IBM424'}, // IBM EBCDIC Hebrew
	Codepage_Alias{20833, 'X-EBCDIC-KOREANEXTENDED'}, // IBM EBCDIC Korean Extended
	Codepage_Alias{20838, 'IBM-THAI'}, // IBM EBCDIC Thai
	Codepage_Alias{20866, 'KOI8-R'}, // Russian (KOI8-R); Cyrillic (KOI8-R)
	Codepage_Alias{20871, 'IBM871'}, // IBM EBCDIC Icelandic
	Codepage_Alias{20880, 'IBM880'}, // IBM EBCDIC Cyrillic Russian
	Codepage_Alias{20905, 'IBM905'}, // IBM EBCDIC Turkish
	Codepage_Alias{20924, 'IBM00924'}, // IBM EBCDIC Latin 1/Open System (1047 + Euro symbol)
	// 20932 'EUC-JP': Japanese (JIS 0208-1990 and 0121-1990).
	// Not listed as an alias: 'EUC-JP' already resolves to 51932 above, so an entry
	// here could never be returned. Use 'CP20932' to select this code page.
	Codepage_Alias{20936, 'X-CP20936'}, // Simplified Chinese (GB2312); Chinese Simplified (GB2312-80)
	Codepage_Alias{20949, 'X-CP20949'}, // Korean Wansung
	Codepage_Alias{21025, 'CP1025'}, // IBM EBCDIC Cyrillic Serbian-Bulgarian
	// 21027 (deprecated)
	Codepage_Alias{21866, 'KOI8-U'}, // Ukrainian (KOI8-U); Cyrillic (KOI8-U)
	Codepage_Alias{28591, 'ISO-8859-1'}, // ISO 8859-1 Latin 1; Western European (ISO)
	Codepage_Alias{28591, 'ISO8859-1'}, // ISO 8859-1 Latin 1; Western European (ISO)
	Codepage_Alias{28591, 'ISO_8859-1'},
	Codepage_Alias{28591, 'ISO_8859_1'},
	Codepage_Alias{28592, 'ISO-8859-2'}, // ISO 8859-2 Central European; Central European (ISO)
	Codepage_Alias{28592, 'ISO8859-2'}, // ISO 8859-2 Central European; Central European (ISO)
	Codepage_Alias{28592, 'ISO_8859-2'},
	Codepage_Alias{28592, 'ISO_8859_2'},
	Codepage_Alias{28593, 'ISO-8859-3'}, // ISO 8859-3 Latin 3
	Codepage_Alias{28593, 'ISO8859-3'}, // ISO 8859-3 Latin 3
	Codepage_Alias{28593, 'ISO_8859-3'},
	Codepage_Alias{28593, 'ISO_8859_3'},
	Codepage_Alias{28594, 'ISO-8859-4'}, // ISO 8859-4 Baltic
	Codepage_Alias{28594, 'ISO8859-4'}, // ISO 8859-4 Baltic
	Codepage_Alias{28594, 'ISO_8859-4'},
	Codepage_Alias{28594, 'ISO_8859_4'},
	Codepage_Alias{28595, 'ISO-8859-5'}, // ISO 8859-5 Cyrillic
	Codepage_Alias{28595, 'ISO8859-5'}, // ISO 8859-5 Cyrillic
	Codepage_Alias{28595, 'ISO_8859-5'},
	Codepage_Alias{28595, 'ISO_8859_5'},
	Codepage_Alias{28596, 'ISO-8859-6'}, // ISO 8859-6 Arabic
	Codepage_Alias{28596, 'ISO8859-6'}, // ISO 8859-6 Arabic
	Codepage_Alias{28596, 'ISO_8859-6'},
	Codepage_Alias{28596, 'ISO_8859_6'},
	Codepage_Alias{28597, 'ISO-8859-7'}, // ISO 8859-7 Greek
	Codepage_Alias{28597, 'ISO8859-7'}, // ISO 8859-7 Greek
	Codepage_Alias{28597, 'ISO_8859-7'},
	Codepage_Alias{28597, 'ISO_8859_7'},
	Codepage_Alias{28598, 'ISO-8859-8'}, // ISO 8859-8 Hebrew; Hebrew (ISO-Visual)
	Codepage_Alias{28598, 'ISO8859-8'}, // ISO 8859-8 Hebrew; Hebrew (ISO-Visual)
	Codepage_Alias{28598, 'ISO_8859-8'},
	Codepage_Alias{28598, 'ISO_8859_8'},
	Codepage_Alias{28599, 'ISO-8859-9'}, // ISO 8859-9 Turkish
	Codepage_Alias{28599, 'ISO8859-9'}, // ISO 8859-9 Turkish
	Codepage_Alias{28599, 'ISO_8859-9'},
	Codepage_Alias{28599, 'ISO_8859_9'},
	Codepage_Alias{28603, 'ISO-8859-13'}, // ISO 8859-13 Estonian
	Codepage_Alias{28603, 'ISO8859-13'}, // ISO 8859-13 Estonian
	Codepage_Alias{28603, 'ISO_8859-13'},
	Codepage_Alias{28603, 'ISO_8859_13'},
	Codepage_Alias{28605, 'ISO-8859-15'}, // ISO 8859-15 Latin 9
	Codepage_Alias{28605, 'ISO8859-15'}, // ISO 8859-15 Latin 9
	Codepage_Alias{28605, 'ISO_8859-15'},
	Codepage_Alias{28605, 'ISO_8859_15'},
	Codepage_Alias{29001, 'X-EUROPA'}, // Europa 3
	Codepage_Alias{38598, 'ISO-8859-8-I'}, // ISO 8859-8 Hebrew; Hebrew (ISO-Logical)
	Codepage_Alias{38598, 'ISO8859-8-I'}, // ISO 8859-8 Hebrew; Hebrew (ISO-Logical)
	Codepage_Alias{38598, 'ISO_8859-8-I'},
	Codepage_Alias{38598, 'ISO_8859_8-I'},
	// 50220 'ISO-2022-JP': ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS).
	// Not listed as an alias: 'ISO-2022-JP' already resolves to 50221 above.
	// Use 'CP50220' to select this code page.
	Codepage_Alias{50221, 'CSISO2022JP'}, // ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana)
	// 50222 'ISO-2022-JP': ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI).
	// Not listed as an alias for the same reason. Use 'CP50222' to select this code page.
	Codepage_Alias{50225, 'ISO-2022-KR'}, // ISO 2022 Korean
	Codepage_Alias{50225, 'ISO2022-KR'}, // ISO 2022 Korean
	Codepage_Alias{50227, 'X-CP50227'}, // ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022)
	// 50229 ISO 2022 Traditional Chinese
	// 50930 EBCDIC Japanese (Katakana) Extended
	// 50931 EBCDIC US-Canada and Japanese
	// 50933 EBCDIC Korean Extended and Korean
	// 50935 EBCDIC Simplified Chinese Extended and Simplified Chinese
	// 50936 EBCDIC Simplified Chinese
	// 50937 EBCDIC US-Canada and Traditional Chinese
	// 50939 EBCDIC Japanese (Latin) Extended and Japanese
	Codepage_Alias{51932, 'EUC-JP'}, // EUC Japanese
	Codepage_Alias{51936, 'EUC-CN'}, // EUC Simplified Chinese; Chinese Simplified (EUC)
	// 51949 'EUC-KR': EUC Korean.
	// Not listed as an alias: 'EUC-KR' already resolves to 949 above.
	// Use 'CP51949' to select this code page.
	// 51950 EUC Traditional Chinese
	Codepage_Alias{52936, 'HZ-GB-2312'}, // HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ)
	Codepage_Alias{54936, 'GB18030'}, // Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030)
	Codepage_Alias{57002, 'X-ISCII-DE'}, // ISCII Devanagari
	Codepage_Alias{57003, 'X-ISCII-BE'}, // ISCII Bengali
	Codepage_Alias{57004, 'X-ISCII-TA'}, // ISCII Tamil
	Codepage_Alias{57005, 'X-ISCII-TE'}, // ISCII Telugu
	Codepage_Alias{57006, 'X-ISCII-AS'}, // ISCII Assamese
	Codepage_Alias{57007, 'X-ISCII-OR'}, // ISCII Oriya
	Codepage_Alias{57008, 'X-ISCII-KA'}, // ISCII Kannada
	Codepage_Alias{57009, 'X-ISCII-MA'}, // ISCII Malayalam
	Codepage_Alias{57010, 'X-ISCII-GU'}, // ISCII Gujarati
	Codepage_Alias{57011, 'X-ISCII-PA'}, // ISCII Punjabi
	Codepage_Alias{65000, 'UTF-7'}, // Unicode (UTF-7)
	Codepage_Alias{65001, 'UTF-8'}, // Unicode (UTF-8)
	// vfmt on
]
| 474 | |
// name_to_codepage resolves the encoding name `name` to a Windows code page number.
// The comparison is case-insensitive (the name is upper-cased first).
// Resolution order:
//   1. 'UTF-8' (exact spelling) => 65001, without any further work
//   2. '', 'CP_ACP', 'ANSI'      => result of C.GetACP()
//   3. 'CP_OEMCP'                => result of C.GetOEMCP()
//   4. 'WCHAR_T'                 => 1200
//   5. 'CP<number>' or 'XX<number>' (the latter for debugging) => <number>
//   6. a plain integer           => that integer
//   7. a name from the codepage_alias table => its code page
// It returns -1 when the name can not be resolved. Note that forms 5 and 6 return
// the number unchecked, so the result may also be 0 or negative.
fn name_to_codepage(name string) int {
	// performance hack: the most frequent name skips normalisation and the table scan
	if name == 'UTF-8' {
		return 65001
	}

	name_upper := name.to_upper()
	if name_upper == '' || name_upper == 'CP_ACP' || name_upper == 'ANSI' {
		return C.GetACP()
	}
	if name_upper == 'CP_OEMCP' {
		return C.GetOEMCP()
	}
	if name_upper.len < 2 {
		return -1
	}
	if name_upper == 'WCHAR_T' {
		return 1200
	}
	// CP123, or XX123 for debug.
	// The prefix is compared in uppercase, because name_upper never contains
	// lowercase letters. It is only honoured when a number follows it, so table
	// aliases such as 'CP-IS' and 'CP-GR' still reach the table lookup below
	// instead of being parsed as the number 0.
	if name_upper.starts_with('CP') || name_upper.starts_with('XX') {
		number := name_upper[2..]
		if number.is_int() {
			return number.int()
		}
	}
	if name_upper.is_int() {
		return name_upper.int()
	}

	// first match wins
	for x in codepage_alias {
		if x.name == name_upper {
			return x.codepage
		}
	}
	return -1
}
| 513 | |
// utf32_to_utf16 converts the `src_len` bytes of UTF-32 text at `src` to UTF-16.
// `is_src_little_endian` tells the byte order of the input, `is_dst_little_endian`
// the byte order wanted for the output; when a flag is false, the code units on
// that side are passed through reverse_u32 / reverse_u16.
// Only complete 4-byte units are converted; trailing bytes of `src` are ignored.
// It returns an error when a value is above U+10FFFF, the last Unicode code point.
// NOTE(review): values in the surrogate range U+D800..U+DFFF are copied through
// unchanged, not rejected - confirm whether callers rely on that.
// Algorithm: https://www.cnblogs.com/findumars/p/6376034.html
@[direct_array_access]
fn utf32_to_utf16(src &u8, src_len int, is_src_little_endian bool, is_dst_little_endian bool) ![]u8 {
	// Worst case: every 4-byte input unit becomes a surrogate pair (2 x 2 bytes),
	// so the output never needs more bytes than the input.
	mut dst := []u8{len: src_len}
	mut sptr := unsafe { &u32(src) }
	mut dptr := &u16(dst.data)
	mut src_idx := 0
	mut dst_idx := 0
	mut c := u32(0)
	mut t := u16(0)
	for src_idx < src_len / 4 {
		unsafe {
			c = sptr[src_idx]
		}
		if !is_src_little_endian {
			c = reverse_u32(c)
		}
		src_idx++
		if c <= 0xFFFF {
			// Basic Multilingual Plane: one UTF-16 code unit
			t = u16(c)
			if !is_dst_little_endian {
				t = reverse_u16(t)
			}
			unsafe {
				dptr[dst_idx] = t
			}
			dst_idx++
		} else if c <= 0x10FFFF {
			// Supplementary planes 1..16: surrogate pair.
			// 0xD800 + (c >> 10) - 0x40 is the same as 0xD800 + ((c - 0x10000) >> 10).
			t = u16((0xD800 + (c >> 10) - 0x40)) // high
			if !is_dst_little_endian {
				t = reverse_u16(t)
			}
			unsafe {
				dptr[dst_idx] = t
			}
			dst_idx++
			t = u16(0xDC00 + (c & 0x03FF)) // low
			if !is_dst_little_endian {
				t = reverse_u16(t)
			}
			unsafe {
				dptr[dst_idx] = t
			}
			dst_idx++
		} else {
			return error('invalid UTF-32LE encoding')
		}
	}
	// dst_idx counts 16 bit units, trim() wants bytes
	dst.trim(dst_idx * 2)
	return dst
}
| 569 | |
// utf16_to_utf32 converts the `src_len` bytes of UTF-16 text at `src` to UTF-32.
// `is_src_little_endian` tells the byte order of the input, `is_dst_little_endian`
// the byte order wanted for the output; when a flag is false, the code units on
// that side are passed through reverse_u16 / reverse_u32.
// Only complete 2-byte units are converted; a trailing odd byte of `src` is ignored.
// It returns an error when the input contains a high surrogate that is not followed
// by a low surrogate, or a low surrogate that is not preceded by a high surrogate.
// Algorithm: https://www.cnblogs.com/findumars/p/6376034.html
@[direct_array_access]
fn utf16_to_utf32(src &u8, src_len int, is_src_little_endian bool, is_dst_little_endian bool) ![]u8 {
	// Worst case: every 2-byte input unit becomes one 4-byte output unit.
	mut dst := []u8{len: src_len * 2}
	mut sptr := unsafe { &u16(src) }
	mut dptr := &u32(dst.data)
	mut w1 := u16(0)
	mut w2 := u16(0)
	mut t := u32(0)
	mut src_idx := 0
	mut dst_idx := 0
	for src_idx < src_len / 2 {
		unsafe {
			w1 = sptr[src_idx]
		}
		if !is_src_little_endian {
			w1 = reverse_u16(w1)
		}
		src_idx++
		if w1 >= 0xD800 && w1 <= 0xDFFF {
			if w1 >= 0xDC00 {
				// a low surrogate without a preceding high surrogate
				return error('invalid UTF-16LE encoding')
			}
			// w1 is a high surrogate: a low surrogate has to follow
			if src_idx == src_len / 2 {
				return error('invalid UTF-16LE encoding')
			}
			unsafe {
				w2 = sptr[src_idx]
			}
			if !is_src_little_endian {
				w2 = reverse_u16(w2)
			}
			if w2 < 0xDC00 || w2 > 0xDFFF {
				// unpaired high surrogate
				return error('invalid UTF-16LE encoding')
			}
			// the low surrogate is consumed too, it must not be seen again as w1
			src_idx++
			// Widen to u32 before shifting: the result needs up to 21 bits.
			// (hi + 0x40) << 10 is the same as (hi << 10) + 0x10000.
			t = u32(w2 & 0x03FF) + ((u32(w1 & 0x03FF) + 0x40) << 10)
		} else {
			t = w1
		}
		if !is_dst_little_endian {
			t = reverse_u32(t)
		}
		unsafe {
			dptr[dst_idx] = t
		}
		dst_idx++
	}
	// dst_idx counts 32 bit units, trim() wants bytes
	dst.trim(dst_idx * 4)
	return dst
}
| 630 | |
// conv converts the `src_len` bytes at `src` from the `fromcode` encoding to the
// `tocode` encoding and returns the converted bytes in a new array.
// Both names are resolved with name_to_codepage. When they resolve to the same code
// page, the result is a plain copy of the input. Otherwise the input is first
// converted to UTF-16LE (code page 1200) and then from UTF-16LE to the target.
// UTF-16BE (1201) and UTF-32 (12000/12001) are converted by this module itself;
// every other code page goes through MultiByteToWideChar / WideCharToMultiByte.
// It returns an error when:
//   - `src_len` is negative
//   - `fromcode` or `tocode` does not resolve to a positive code page number
//   - the UTF-16 <=> UTF-32 helpers reject the input
//   - one of the Win32 calls returns 0 (this includes an empty input)
@[direct_array_access]
fn conv(tocode string, fromcode string, src &u8, src_len int) ![]u8 {
	if src_len < 0 {
		return error('src length error')
	}
	src_codepage := name_to_codepage(fromcode)
	dst_codepage := name_to_codepage(tocode)
	if src_codepage <= 0 {
		return error('fromcode ${fromcode} does not exist')
	}
	if dst_codepage <= 0 {
		return error('tocode ${tocode} does not exist')
	}

	if src_codepage == dst_codepage {
		// clone src
		mut dst_buf := []u8{len: src_len}
		unsafe { vmemcpy(dst_buf.data, src, src_len) }
		return dst_buf
	}

	mut unicode := []u8{}
	// src codepage => Unicode
	match src_codepage {
		1200 {
			// src already in Unicode(UTF-16LE) encoding, just clone src
			unsafe {
				unicode.grow_len(src_len)
				vmemcpy(unicode.data, src, src_len)
			}
		}
		1201 {
			// Windows does not support UTF-16BE
			// byte swap each 16 bit character element
			unsafe {
				unicode.grow_len(src_len)
				vmemcpy(unicode.data, src, src_len)
			}
			mut eptr := &u16(unicode.data)
			for i in 0 .. src_len / 2 {
				unsafe {
					eptr[i] = reverse_u16(eptr[i])
				}
			}
		}
		12000 {
			// Windows does not support UTF-32LE
			unicode = utf32_to_utf16(src, src_len, true, true)!
		}
		12001 {
			// Windows does not support UTF-32BE
			unicode = utf32_to_utf16(src, src_len, false, true)!
		}
		else {
			// first call: dst_len 0 asks for the number of wide chars needed
			char_num := C.MultiByteToWideChar(src_codepage, 0, src, src_len, 0, 0)
			if char_num == 0 {
				return error('MultiByteToWideChar fail: src contain zero ${fromcode} character')
			}
			unsafe { unicode.grow_len(char_num * 2) } // every char take 2 bytes
			// second call: the destination size is counted in wide chars, not in bytes
			if C.MultiByteToWideChar(src_codepage, 0, src, src_len, unicode.data, char_num) == 0 {
				return error('MultiByteToWideChar fail: can not convert src from ${fromcode}')
			}
		}
	}

	mut dst := []u8{}
	// Unicode => dst codepage
	match dst_codepage {
		1200 {
			// dst codepage is Unicode, just return unicode
			return unicode
		}
		1201 {
			// Windows does not support UTF-16BE
			// byte swap each 16 bit character element
			mut eptr := &u16(unicode.data)
			for i in 0 .. unicode.len / 2 {
				unsafe {
					eptr[i] = reverse_u16(eptr[i])
				}
			}
			return unicode
		}
		12000 {
			// Windows does not support UTF-32LE
			dst = utf16_to_utf32(unicode.data, unicode.len, true, true)!
		}
		12001 {
			// Windows does not support UTF-32BE
			dst = utf16_to_utf32(unicode.data, unicode.len, true, false)!
		}
		else {
			// The source length is counted in wide chars. Both calls have to get the
			// same value, a byte count would make the API read past the buffer.
			wchar_num := unicode.len / 2
			// first call: dst_len 0 asks for the number of bytes needed
			dst_len := C.WideCharToMultiByte(dst_codepage, 0, unicode.data, wchar_num, 0,
				0, 0, 0)
			if dst_len == 0 {
				return error('WideCharToMultiByte fail: src contain zero unicode character')
			}
			unsafe { dst.grow_len(dst_len) }
			if C.WideCharToMultiByte(dst_codepage, 0, unicode.data, wchar_num, dst.data,
				dst.len, 0, 0) == 0 {
				return error('WideCharToMultiByte fail: can not convert src to ${tocode}')
			}
		}
	}

	return dst
}
| 735 | |