v2 / vlib / encoding / iconv / iconv_windows.c.v
734 lines · 673 sloc · 27.22 KB · 8e35f4d9848f7ad35d857a187dddbfd2eca5e19d
Raw
1module iconv
2
// Module iconv provides functions to convert between V strings (UTF-8) and other encodings.
// This is the iconv implementation for Windows; it converts using the Win32 API.
// Idea from https://github.com/win-iconv/win-iconv
6
7fn C.GetACP() i32
8fn C.GetOEMCP() i32
9fn C.WideCharToMultiByte(codepage u32, dwflags u32, src &u8, src_len i32, dst &u8, dst_len i32, default_char &u8, used_default_char &bool) i32
10fn C.MultiByteToWideChar(codepage u32, dwflags u32, src &u8, src_len i32, dst &u8, dst_len i32) i32
11
// Codepage_Alias associates one encoding name with a Windows code page identifier.
// Entries are written positionally (`Codepage_Alias{65001, 'UTF-8'}`), so the
// order of the two fields below must stay as it is.
struct Codepage_Alias {
	// Windows code page identifier, e.g. 65001
	codepage int
	// encoding name that selects this code page
	name string
}
16
// codepage_alias maps encoding names to Windows code page identifiers.
// name_to_codepage() scans it from the top and returns the first entry whose
// name matches, so when a name is listed more than once only its first entry
// is ever used.
const codepage_alias = [
	// NOTE: every name MUST be in upper case, because the requested name is
	// upper-cased before it is compared with the entries.
	// vfmt off
	Codepage_Alias{65001, 'CP65001'},
	Codepage_Alias{65001, 'UTF8'},
	Codepage_Alias{65001, 'UTF-8'},

	Codepage_Alias{1200, 'CP1200'},
	Codepage_Alias{1200, 'UTF16LE'},
	Codepage_Alias{1200, 'UTF-16LE'},
	Codepage_Alias{1200, 'UCS2LE'},
	Codepage_Alias{1200, 'UCS-2LE'},
	Codepage_Alias{1200, 'UCS-2-INTERNAL'},
	Codepage_Alias{1200, 'UNICODE'}, // for iconv

	Codepage_Alias{1201, 'CP1201'},
	Codepage_Alias{1201, 'UTF16BE'},
	Codepage_Alias{1201, 'UTF-16BE'},
	Codepage_Alias{1201, 'UCS2BE'},
	Codepage_Alias{1201, 'UCS-2BE'},
	Codepage_Alias{1201, 'UNICODEFFFE'},

	Codepage_Alias{12000, 'CP12000'},
	Codepage_Alias{12000, 'UTF32LE'},
	Codepage_Alias{12000, 'UTF-32LE'},
	Codepage_Alias{12000, 'UCS4LE'},
	Codepage_Alias{12000, 'UCS-4LE'},

	Codepage_Alias{12001, 'CP12001'},
	Codepage_Alias{12001, 'UTF32BE'},
	Codepage_Alias{12001, 'UTF-32BE'},
	Codepage_Alias{12001, 'UCS4BE'},
	Codepage_Alias{12001, 'UCS-4BE'},

	// Names without an explicit byte order.
	// The original C table selects their default at build time. Without
	// GLIB_COMPILATION the default is big endian
	// (see rfc2781 4.3 Interpreting text labelled as UTF-16):
	//   Codepage_Alias{1201, 'UTF16'},
	//   Codepage_Alias{1201, 'UTF-16'},
	//   Codepage_Alias{1201, 'UCS2'},
	//   Codepage_Alias{1201, 'UCS-2'},
	//   Codepage_Alias{12001, 'UTF32'},
	//   Codepage_Alias{12001, 'UTF-32'},
	//   Codepage_Alias{12001, 'UCS-4'},
	//   Codepage_Alias{12001, 'UCS4'},
	// This table uses the other variant: the default is little endian,
	// because the platform is little endian.
	Codepage_Alias{1200, 'UTF16'},
	Codepage_Alias{1200, 'UTF-16'},
	Codepage_Alias{1200, 'UCS2'},
	Codepage_Alias{1200, 'UCS-2'},
	Codepage_Alias{12000, 'UTF32'},
	Codepage_Alias{12000, 'UTF-32'},
	Codepage_Alias{12000, 'UCS4'},
	Codepage_Alias{12000, 'UCS-4'},

	// copy from libiconv `iconv -l`
	// !IsValidCodePage(367)
	Codepage_Alias{20127, 'ANSI_X3.4-1968'},
	Codepage_Alias{20127, 'ANSI_X3.4-1986'},
	Codepage_Alias{20127, 'ASCII'},
	Codepage_Alias{20127, 'CP367'},
	Codepage_Alias{20127, 'IBM367'},
	Codepage_Alias{20127, 'ISO-IR-6'},
	Codepage_Alias{20127, 'ISO646-US'},
	Codepage_Alias{20127, 'ISO_646.IRV:1991'},
	Codepage_Alias{20127, 'US'},
	Codepage_Alias{20127, 'US-ASCII'},
	Codepage_Alias{20127, 'CSASCII'},

	// !IsValidCodePage(819)
	Codepage_Alias{1252, 'CP819'},
	Codepage_Alias{1252, 'IBM819'},
	Codepage_Alias{28591, 'ISO-8859-1'},
	Codepage_Alias{28591, 'ISO-IR-100'},
	Codepage_Alias{28591, 'ISO8859-1'},
	Codepage_Alias{28591, 'ISO_8859-1'},
	Codepage_Alias{28591, 'ISO_8859-1:1987'},
	Codepage_Alias{28591, 'L1'},
	Codepage_Alias{28591, 'LATIN1'},
	Codepage_Alias{28591, 'CSISOLATIN1'},

	Codepage_Alias{1250, 'CP1250'},
	Codepage_Alias{1250, 'MS-EE'},
	Codepage_Alias{1250, 'WINDOWS-1250'},

	Codepage_Alias{1251, 'CP1251'},
	Codepage_Alias{1251, 'MS-CYRL'},
	Codepage_Alias{1251, 'WINDOWS-1251'},

	Codepage_Alias{1252, 'CP1252'},
	Codepage_Alias{1252, 'MS-ANSI'},
	Codepage_Alias{1252, 'WINDOWS-1252'},

	Codepage_Alias{1253, 'CP1253'},
	Codepage_Alias{1253, 'MS-GREEK'},
	Codepage_Alias{1253, 'WINDOWS-1253'},

	Codepage_Alias{1254, 'CP1254'},
	Codepage_Alias{1254, 'MS-TURK'},
	Codepage_Alias{1254, 'WINDOWS-1254'},

	Codepage_Alias{1255, 'CP1255'},
	Codepage_Alias{1255, 'MS-HEBR'},
	Codepage_Alias{1255, 'WINDOWS-1255'},

	Codepage_Alias{1256, 'CP1256'},
	Codepage_Alias{1256, 'MS-ARAB'},
	Codepage_Alias{1256, 'WINDOWS-1256'},

	Codepage_Alias{1257, 'CP1257'},
	Codepage_Alias{1257, 'WINBALTRIM'},
	Codepage_Alias{1257, 'WINDOWS-1257'},

	Codepage_Alias{1258, 'CP1258'},
	Codepage_Alias{1258, 'WINDOWS-1258'},

	Codepage_Alias{850, '850'},
	Codepage_Alias{850, 'CP850'},
	Codepage_Alias{850, 'IBM850'},
	Codepage_Alias{850, 'CSPC850MULTILINGUAL'},

	// !IsValidCodePage(862)
	Codepage_Alias{862, '862'},
	Codepage_Alias{862, 'CP862'},
	Codepage_Alias{862, 'IBM862'},
	Codepage_Alias{862, 'CSPC862LATINHEBREW'},

	Codepage_Alias{866, '866'},
	Codepage_Alias{866, 'CP866'},
	Codepage_Alias{866, 'IBM866'},
	Codepage_Alias{866, 'CSIBM866'},

	// !IsValidCodePage(154)
	Codepage_Alias{154, 'CP154'},
	Codepage_Alias{154, 'CYRILLIC-ASIAN'},
	Codepage_Alias{154, 'PT154'},
	Codepage_Alias{154, 'PTCP154'},
	Codepage_Alias{154, 'CSPTCP154'},

	// !IsValidCodePage(1133)
	Codepage_Alias{1133, 'CP1133'},
	Codepage_Alias{1133, 'IBM-CP1133'},

	Codepage_Alias{874, 'CP874'},
	Codepage_Alias{874, 'WINDOWS-874'},

	// !IsValidCodePage(51932)
	Codepage_Alias{51932, 'CP51932'},
	Codepage_Alias{51932, 'MS51932'},
	Codepage_Alias{51932, 'WINDOWS-51932'},
	Codepage_Alias{51932, 'EUC-JP'},

	Codepage_Alias{932, 'CP932'},
	Codepage_Alias{932, 'MS932'},
	Codepage_Alias{932, 'SHIFT_JIS'},
	Codepage_Alias{932, 'SHIFT_JIS-MS'},
	Codepage_Alias{932, 'SJIS'},
	Codepage_Alias{932, 'SJIS-MS'},
	Codepage_Alias{932, 'SJIS-OPEN'},
	Codepage_Alias{932, 'SJIS-WIN'},
	Codepage_Alias{932, 'WINDOWS-31J'},
	Codepage_Alias{932, 'WINDOWS-932'},
	Codepage_Alias{932, 'CSWINDOWS31J'},

	Codepage_Alias{50221, 'CP50221'},
	Codepage_Alias{50221, 'ISO-2022-JP'},
	Codepage_Alias{50221, 'ISO-2022-JP-MS'},
	Codepage_Alias{50221, 'ISO2022-JP'},
	Codepage_Alias{50221, 'ISO2022-JP-MS'},
	Codepage_Alias{50221, 'MS50221'},
	Codepage_Alias{50221, 'WINDOWS-50221'},

	Codepage_Alias{936, 'CP936'},
	Codepage_Alias{936, 'GBK'},
	Codepage_Alias{936, 'MS936'},
	Codepage_Alias{936, 'WINDOWS-936'},

	Codepage_Alias{950, 'CP950'},
	Codepage_Alias{950, 'BIG5'},
	Codepage_Alias{950, 'BIG5HKSCS'},
	Codepage_Alias{950, 'BIG5-HKSCS'},

	Codepage_Alias{949, 'CP949'},
	Codepage_Alias{949, 'UHC'},
	Codepage_Alias{949, 'EUC-KR'},

	Codepage_Alias{1361, 'CP1361'},
	Codepage_Alias{1361, 'JOHAB'},

	Codepage_Alias{437, '437'},
	Codepage_Alias{437, 'CP437'},
	Codepage_Alias{437, 'IBM437'},
	Codepage_Alias{437, 'CSPC8CODEPAGE437'},

	Codepage_Alias{737, 'CP737'},

	Codepage_Alias{775, 'CP775'},
	Codepage_Alias{775, 'IBM775'},
	Codepage_Alias{775, 'CSPC775BALTIC'},

	Codepage_Alias{852, '852'},
	Codepage_Alias{852, 'CP852'},
	Codepage_Alias{852, 'IBM852'},
	Codepage_Alias{852, 'CSPCP852'},

	// !IsValidCodePage(853)
	Codepage_Alias{853, 'CP853'},

	Codepage_Alias{855, '855'},
	Codepage_Alias{855, 'CP855'},
	Codepage_Alias{855, 'IBM855'},
	Codepage_Alias{855, 'CSIBM855'},

	Codepage_Alias{857, '857'},
	Codepage_Alias{857, 'CP857'},
	Codepage_Alias{857, 'IBM857'},
	Codepage_Alias{857, 'CSIBM857'},

	// !IsValidCodePage(858)
	Codepage_Alias{858, 'CP858'},

	Codepage_Alias{860, '860'},
	Codepage_Alias{860, 'CP860'},
	Codepage_Alias{860, 'IBM860'},
	Codepage_Alias{860, 'CSIBM860'},

	Codepage_Alias{861, '861'},
	Codepage_Alias{861, 'CP-IS'},
	Codepage_Alias{861, 'CP861'},
	Codepage_Alias{861, 'IBM861'},
	Codepage_Alias{861, 'CSIBM861'},

	Codepage_Alias{863, '863'},
	Codepage_Alias{863, 'CP863'},
	Codepage_Alias{863, 'IBM863'},
	Codepage_Alias{863, 'CSIBM863'},

	Codepage_Alias{864, 'CP864'},
	Codepage_Alias{864, 'IBM864'},
	Codepage_Alias{864, 'CSIBM864'},

	Codepage_Alias{865, '865'},
	Codepage_Alias{865, 'CP865'},
	Codepage_Alias{865, 'IBM865'},
	Codepage_Alias{865, 'CSIBM865'},

	Codepage_Alias{869, '869'},
	Codepage_Alias{869, 'CP-GR'},
	Codepage_Alias{869, 'CP869'},
	Codepage_Alias{869, 'IBM869'},
	Codepage_Alias{869, 'CSIBM869'},

	// !IsValidCodePage(1125)
	Codepage_Alias{1125, 'CP1125'},

	//
	// * Code Page Identifiers
	// * https://learn.microsoft.com/en-us/windows/win32/intl/code-page-identifiers

	Codepage_Alias{37, 'IBM037'}, // IBM EBCDIC US-Canada
	Codepage_Alias{437, 'IBM437'}, // OEM United States
	Codepage_Alias{500, 'IBM500'}, // IBM EBCDIC International
	Codepage_Alias{708, 'ASMO-708'}, // Arabic (ASMO 708)
	// 709 Arabic (ASMO-449+, BCON V4)
	// 710 Arabic - Transparent Arabic
	Codepage_Alias{720, 'DOS-720'}, // Arabic (Transparent ASMO); Arabic (DOS)
	Codepage_Alias{737, 'IBM737'}, // OEM Greek (formerly 437G); Greek (DOS)
	Codepage_Alias{775, 'IBM775'}, // OEM Baltic; Baltic (DOS)
	Codepage_Alias{850, 'IBM850'}, // OEM Multilingual Latin 1; Western European (DOS)
	Codepage_Alias{852, 'IBM852'}, // OEM Latin 2; Central European (DOS)
	Codepage_Alias{855, 'IBM855'}, // OEM Cyrillic (primarily Russian)
	Codepage_Alias{857, 'IBM857'}, // OEM Turkish; Turkish (DOS)
	Codepage_Alias{858, 'IBM00858'}, // OEM Multilingual Latin 1 + Euro symbol
	Codepage_Alias{860, 'IBM860'}, // OEM Portuguese; Portuguese (DOS)
	Codepage_Alias{861, 'IBM861'}, // OEM Icelandic; Icelandic (DOS)
	Codepage_Alias{862, 'DOS-862'}, // OEM Hebrew; Hebrew (DOS)
	Codepage_Alias{863, 'IBM863'}, // OEM French Canadian; French Canadian (DOS)
	Codepage_Alias{864, 'IBM864'}, // OEM Arabic; Arabic (864)
	Codepage_Alias{865, 'IBM865'}, // OEM Nordic; Nordic (DOS)
	Codepage_Alias{866, 'CP866'}, // OEM Russian; Cyrillic (DOS)
	Codepage_Alias{869, 'IBM869'}, // OEM Modern Greek; Greek, Modern (DOS)
	Codepage_Alias{870, 'IBM870'}, // IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2
	Codepage_Alias{874, 'WINDOWS-874'}, // ANSI/OEM Thai (same as 28605, ISO 8859-15); Thai (Windows)
	Codepage_Alias{875, 'CP875'}, // IBM EBCDIC Greek Modern
	Codepage_Alias{932, 'SHIFT_JIS'}, // ANSI/OEM Japanese; Japanese (Shift-JIS)
	Codepage_Alias{932, 'SHIFT-JIS'}, // alternative name for it
	Codepage_Alias{936, 'GB2312'}, // ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312)
	Codepage_Alias{949, 'KS_C_5601-1987'}, // ANSI/OEM Korean (Unified Hangul Code)
	Codepage_Alias{950, 'BIG5'}, // ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5)
	Codepage_Alias{950, 'BIG5HKSCS'}, // ANSI/OEM Traditional Chinese (Hong Kong SAR); Chinese Traditional (Big5-HKSCS)
	Codepage_Alias{950, 'BIG5-HKSCS'}, // alternative name for it
	Codepage_Alias{1026, 'IBM1026'}, // IBM EBCDIC Turkish (Latin 5)
	Codepage_Alias{1047, 'IBM01047'}, // IBM EBCDIC Latin 1/Open System
	Codepage_Alias{1140, 'IBM01140'}, // IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro)
	Codepage_Alias{1141, 'IBM01141'}, // IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro)
	Codepage_Alias{1142, 'IBM01142'}, // IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro)
	Codepage_Alias{1143, 'IBM01143'}, // IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro)
	Codepage_Alias{1144, 'IBM01144'}, // IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro)
	Codepage_Alias{1145, 'IBM01145'}, // IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro)
	Codepage_Alias{1146, 'IBM01146'}, // IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro)
	Codepage_Alias{1147, 'IBM01147'}, // IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro)
	Codepage_Alias{1148, 'IBM01148'}, // IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro)
	Codepage_Alias{1149, 'IBM01149'}, // IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro)
	Codepage_Alias{1200, 'UTF-16'}, // Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications
	Codepage_Alias{1201, 'UNICODEFFFE'}, // Unicode UTF-16, big endian byte order; available only to managed applications
	Codepage_Alias{1250, 'WINDOWS-1250'}, // ANSI Central European; Central European (Windows)
	Codepage_Alias{1251, 'WINDOWS-1251'}, // ANSI Cyrillic; Cyrillic (Windows)
	Codepage_Alias{1252, 'WINDOWS-1252'}, // ANSI Latin 1; Western European (Windows)
	Codepage_Alias{1253, 'WINDOWS-1253'}, // ANSI Greek; Greek (Windows)
	Codepage_Alias{1254, 'WINDOWS-1254'}, // ANSI Turkish; Turkish (Windows)
	Codepage_Alias{1255, 'WINDOWS-1255'}, // ANSI Hebrew; Hebrew (Windows)
	Codepage_Alias{1256, 'WINDOWS-1256'}, // ANSI Arabic; Arabic (Windows)
	Codepage_Alias{1257, 'WINDOWS-1257'}, // ANSI Baltic; Baltic (Windows)
	Codepage_Alias{1258, 'WINDOWS-1258'}, // ANSI/OEM Vietnamese; Vietnamese (Windows)
	Codepage_Alias{1361, 'JOHAB'}, // Korean (Johab)
	Codepage_Alias{10000, 'MACINTOSH'}, // MAC Roman; Western European (Mac)
	Codepage_Alias{10001, 'X-MAC-JAPANESE'}, // Japanese (Mac)
	Codepage_Alias{10002, 'X-MAC-CHINESETRAD'}, // MAC Traditional Chinese (Big5); Chinese Traditional (Mac)
	Codepage_Alias{10003, 'X-MAC-KOREAN'}, // Korean (Mac)
	Codepage_Alias{10004, 'X-MAC-ARABIC'}, // Arabic (Mac)
	Codepage_Alias{10005, 'X-MAC-HEBREW'}, // Hebrew (Mac)
	Codepage_Alias{10006, 'X-MAC-GREEK'}, // Greek (Mac)
	Codepage_Alias{10007, 'X-MAC-CYRILLIC'}, // Cyrillic (Mac)
	Codepage_Alias{10008, 'X-MAC-CHINESESIMP'}, // MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac)
	Codepage_Alias{10010, 'X-MAC-ROMANIAN'}, // Romanian (Mac)
	Codepage_Alias{10017, 'X-MAC-UKRAINIAN'}, // Ukrainian (Mac)
	Codepage_Alias{10021, 'X-MAC-THAI'}, // Thai (Mac)
	Codepage_Alias{10029, 'X-MAC-CE'}, // MAC Latin 2; Central European (Mac)
	Codepage_Alias{10079, 'X-MAC-ICELANDIC'}, // Icelandic (Mac)
	Codepage_Alias{10081, 'X-MAC-TURKISH'}, // Turkish (Mac)
	Codepage_Alias{10082, 'X-MAC-CROATIAN'}, // Croatian (Mac)
	Codepage_Alias{12000, 'UTF-32'}, // Unicode UTF-32, little endian byte order; available only to managed applications
	Codepage_Alias{12001, 'UTF-32BE'}, // Unicode UTF-32, big endian byte order; available only to managed applications
	Codepage_Alias{20000, 'X-CHINESE_CNS'}, // CNS Taiwan; Chinese Traditional (CNS)
	Codepage_Alias{20001, 'X-CP20001'}, // TCA Taiwan
	Codepage_Alias{20002, 'X_CHINESE-ETEN'}, // Eten Taiwan; Chinese Traditional (Eten)
	Codepage_Alias{20003, 'X-CP20003'}, // IBM5550 Taiwan
	Codepage_Alias{20004, 'X-CP20004'}, // TeleText Taiwan
	Codepage_Alias{20005, 'X-CP20005'}, // Wang Taiwan
	Codepage_Alias{20105, 'X-IA5'}, // IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5)
	Codepage_Alias{20106, 'X-IA5-GERMAN'}, // IA5 German (7-bit)
	Codepage_Alias{20107, 'X-IA5-SWEDISH'}, // IA5 Swedish (7-bit)
	Codepage_Alias{20108, 'X-IA5-NORWEGIAN'}, // IA5 Norwegian (7-bit)
	Codepage_Alias{20127, 'US-ASCII'}, // US-ASCII (7-bit)
	Codepage_Alias{20261, 'X-CP20261'}, // T.61
	Codepage_Alias{20269, 'X-CP20269'}, // ISO 6937 Non-Spacing Accent
	Codepage_Alias{20273, 'IBM273'}, // IBM EBCDIC Germany
	Codepage_Alias{20277, 'IBM277'}, // IBM EBCDIC Denmark-Norway
	Codepage_Alias{20278, 'IBM278'}, // IBM EBCDIC Finland-Sweden
	Codepage_Alias{20280, 'IBM280'}, // IBM EBCDIC Italy
	Codepage_Alias{20284, 'IBM284'}, // IBM EBCDIC Latin America-Spain
	Codepage_Alias{20285, 'IBM285'}, // IBM EBCDIC United Kingdom
	Codepage_Alias{20290, 'IBM290'}, // IBM EBCDIC Japanese Katakana Extended
	Codepage_Alias{20297, 'IBM297'}, // IBM EBCDIC France
	Codepage_Alias{20420, 'IBM420'}, // IBM EBCDIC Arabic
	Codepage_Alias{20423, 'IBM423'}, // IBM EBCDIC Greek
	Codepage_Alias{20424, 'IBM424'}, // IBM EBCDIC Hebrew
	Codepage_Alias{20833, 'X-EBCDIC-KOREANEXTENDED'}, // IBM EBCDIC Korean Extended
	Codepage_Alias{20838, 'IBM-THAI'}, // IBM EBCDIC Thai
	Codepage_Alias{20866, 'KOI8-R'}, // Russian (KOI8-R); Cyrillic (KOI8-R)
	Codepage_Alias{20871, 'IBM871'}, // IBM EBCDIC Icelandic
	Codepage_Alias{20880, 'IBM880'}, // IBM EBCDIC Cyrillic Russian
	Codepage_Alias{20905, 'IBM905'}, // IBM EBCDIC Turkish
	Codepage_Alias{20924, 'IBM00924'}, // IBM EBCDIC Latin 1/Open System (1047 + Euro symbol)
	Codepage_Alias{20932, 'EUC-JP'}, // Japanese (JIS 0208-1990 and 0121-1990); never matched: 'EUC-JP' is listed earlier as 51932
	Codepage_Alias{20936, 'X-CP20936'}, // Simplified Chinese (GB2312); Chinese Simplified (GB2312-80)
	Codepage_Alias{20949, 'X-CP20949'}, // Korean Wansung
	Codepage_Alias{21025, 'CP1025'}, // IBM EBCDIC Cyrillic Serbian-Bulgarian
	// 21027 (deprecated)
	Codepage_Alias{21866, 'KOI8-U'}, // Ukrainian (KOI8-U); Cyrillic (KOI8-U)
	Codepage_Alias{28591, 'ISO-8859-1'}, // ISO 8859-1 Latin 1; Western European (ISO)
	Codepage_Alias{28591, 'ISO8859-1'}, // ISO 8859-1 Latin 1; Western European (ISO)
	Codepage_Alias{28591, 'ISO_8859-1'},
	Codepage_Alias{28591, 'ISO_8859_1'},
	Codepage_Alias{28592, 'ISO-8859-2'}, // ISO 8859-2 Central European; Central European (ISO)
	Codepage_Alias{28592, 'ISO8859-2'}, // ISO 8859-2 Central European; Central European (ISO)
	Codepage_Alias{28592, 'ISO_8859-2'},
	Codepage_Alias{28592, 'ISO_8859_2'},
	Codepage_Alias{28593, 'ISO-8859-3'}, // ISO 8859-3 Latin 3
	Codepage_Alias{28593, 'ISO8859-3'}, // ISO 8859-3 Latin 3
	Codepage_Alias{28593, 'ISO_8859-3'},
	Codepage_Alias{28593, 'ISO_8859_3'},
	Codepage_Alias{28594, 'ISO-8859-4'}, // ISO 8859-4 Baltic
	Codepage_Alias{28594, 'ISO8859-4'}, // ISO 8859-4 Baltic
	Codepage_Alias{28594, 'ISO_8859-4'},
	Codepage_Alias{28594, 'ISO_8859_4'},
	Codepage_Alias{28595, 'ISO-8859-5'}, // ISO 8859-5 Cyrillic
	Codepage_Alias{28595, 'ISO8859-5'}, // ISO 8859-5 Cyrillic
	Codepage_Alias{28595, 'ISO_8859-5'},
	Codepage_Alias{28595, 'ISO_8859_5'},
	Codepage_Alias{28596, 'ISO-8859-6'}, // ISO 8859-6 Arabic
	Codepage_Alias{28596, 'ISO8859-6'}, // ISO 8859-6 Arabic
	Codepage_Alias{28596, 'ISO_8859-6'},
	Codepage_Alias{28596, 'ISO_8859_6'},
	Codepage_Alias{28597, 'ISO-8859-7'}, // ISO 8859-7 Greek
	Codepage_Alias{28597, 'ISO8859-7'}, // ISO 8859-7 Greek
	Codepage_Alias{28597, 'ISO_8859-7'},
	Codepage_Alias{28597, 'ISO_8859_7'},
	Codepage_Alias{28598, 'ISO-8859-8'}, // ISO 8859-8 Hebrew; Hebrew (ISO-Visual)
	Codepage_Alias{28598, 'ISO8859-8'}, // ISO 8859-8 Hebrew; Hebrew (ISO-Visual)
	Codepage_Alias{28598, 'ISO_8859-8'},
	Codepage_Alias{28598, 'ISO_8859_8'},
	Codepage_Alias{28599, 'ISO-8859-9'}, // ISO 8859-9 Turkish
	Codepage_Alias{28599, 'ISO8859-9'}, // ISO 8859-9 Turkish
	Codepage_Alias{28599, 'ISO_8859-9'},
	Codepage_Alias{28599, 'ISO_8859_9'},
	Codepage_Alias{28603, 'ISO-8859-13'}, // ISO 8859-13 Estonian
	Codepage_Alias{28603, 'ISO8859-13'}, // ISO 8859-13 Estonian
	Codepage_Alias{28603, 'ISO_8859-13'},
	Codepage_Alias{28603, 'ISO_8859_13'},
	Codepage_Alias{28605, 'ISO-8859-15'}, // ISO 8859-15 Latin 9
	Codepage_Alias{28605, 'ISO8859-15'}, // ISO 8859-15 Latin 9
	Codepage_Alias{28605, 'ISO_8859-15'},
	Codepage_Alias{28605, 'ISO_8859_15'},
	Codepage_Alias{29001, 'X-EUROPA'}, // Europa 3
	Codepage_Alias{38598, 'ISO-8859-8-I'}, // ISO 8859-8 Hebrew; Hebrew (ISO-Logical)
	Codepage_Alias{38598, 'ISO8859-8-I'}, // ISO 8859-8 Hebrew; Hebrew (ISO-Logical)
	Codepage_Alias{38598, 'ISO_8859-8-I'},
	Codepage_Alias{38598, 'ISO_8859_8-I'},
	Codepage_Alias{50220, 'ISO-2022-JP'}, // ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS); never matched: 'ISO-2022-JP' is listed earlier as 50221
	Codepage_Alias{50221, 'CSISO2022JP'}, // ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana)
	Codepage_Alias{50222, 'ISO-2022-JP'}, // ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI); never matched: 'ISO-2022-JP' is listed earlier as 50221
	Codepage_Alias{50225, 'ISO-2022-KR'}, // ISO 2022 Korean
	Codepage_Alias{50225, 'ISO2022-KR'}, // ISO 2022 Korean
	Codepage_Alias{50227, 'X-CP50227'}, // ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022)
	// 50229 ISO 2022 Traditional Chinese
	// 50930 EBCDIC Japanese (Katakana) Extended
	// 50931 EBCDIC US-Canada and Japanese
	// 50933 EBCDIC Korean Extended and Korean
	// 50935 EBCDIC Simplified Chinese Extended and Simplified Chinese
	// 50936 EBCDIC Simplified Chinese
	// 50937 EBCDIC US-Canada and Traditional Chinese
	// 50939 EBCDIC Japanese (Latin) Extended and Japanese
	Codepage_Alias{51932, 'EUC-JP'}, // EUC Japanese
	Codepage_Alias{51936, 'EUC-CN'}, // EUC Simplified Chinese; Chinese Simplified (EUC)
	Codepage_Alias{51949, 'EUC-KR'}, // EUC Korean; never matched: 'EUC-KR' is listed earlier as 949
	// 51950 EUC Traditional Chinese
	Codepage_Alias{52936, 'HZ-GB-2312'}, // HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ)
	Codepage_Alias{54936, 'GB18030'}, // Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030)
	Codepage_Alias{57002, 'X-ISCII-DE'}, // ISCII Devanagari
	Codepage_Alias{57003, 'X-ISCII-BE'}, // ISCII Bengali
	Codepage_Alias{57004, 'X-ISCII-TA'}, // ISCII Tamil
	Codepage_Alias{57005, 'X-ISCII-TE'}, // ISCII Telugu
	Codepage_Alias{57006, 'X-ISCII-AS'}, // ISCII Assamese
	Codepage_Alias{57007, 'X-ISCII-OR'}, // ISCII Oriya
	Codepage_Alias{57008, 'X-ISCII-KA'}, // ISCII Kannada
	Codepage_Alias{57009, 'X-ISCII-MA'}, // ISCII Malayalam
	Codepage_Alias{57010, 'X-ISCII-GU'}, // ISCII Gujarati
	Codepage_Alias{57011, 'X-ISCII-PA'}, // ISCII Punjabi
	Codepage_Alias{65000, 'UTF-7'}, // Unicode (UTF-7)
	Codepage_Alias{65001, 'UTF-8'}, // Unicode (UTF-8)
	// vfmt on
]
474
// name_to_codepage resolves an encoding name to a Windows code page identifier.
// Apart from the exact-match 'UTF-8' fast path, the name is compared case-insensitively.
// Resolution order:
//   1. '', 'CP_ACP' and 'ANSI' give the result of GetACP(); 'CP_OEMCP' gives GetOEMCP().
//   2. 'WCHAR_T' gives 1200 (UTF-16LE).
//   3. 'CP<number>', 'XX<number>' (for debug) and a bare '<number>' give that number
//      as it is; nothing here checks that Windows knows such a code page.
//   4. Every other name is searched in `codepage_alias`; the first match wins.
// Returns -1 when the name can not be resolved. Callers should treat every
// result <= 0 as "unknown encoding".
fn name_to_codepage(name string) int {
	// performance hack: the most common name needs neither upper-casing nor a table scan
	if name == 'UTF-8' {
		return 65001
	}

	name_upper := name.to_upper()
	if name_upper == '' || name_upper == 'CP_ACP' || name_upper == 'ANSI' {
		return C.GetACP()
	}
	if name_upper == 'CP_OEMCP' {
		return C.GetOEMCP()
	}
	if name_upper.len < 2 {
		return -1
	}
	if name_upper == 'WCHAR_T' {
		return 1200
	}
	// CP123, or XX123 for debug.
	// The prefix is compared in upper case because the name was upper-cased above.
	// The shortcut is taken only for a numeric suffix, so that aliases such as
	// 'CP-IS' and 'CP-GR' still reach the table lookup below.
	if name_upper.starts_with('CP') || name_upper.starts_with('XX') {
		number := name_upper[2..]
		if number.is_int() {
			return number.int()
		}
	}
	if name_upper.is_int() {
		return name_upper.int()
	}

	for alias in codepage_alias {
		if alias.name == name_upper {
			return alias.codepage
		}
	}
	return -1
}
513
// utf32_to_utf16 re-encodes the UTF-32 data in `src` (`src_len` bytes) as UTF-16
// and returns the new buffer.
// `is_src_little_endian` tells the byte order of the input units,
// `is_dst_little_endian` selects the byte order of the output units.
// Values up to 0xFFFF are copied as one 16 bit unit (this includes values in the
// surrogate range, they are not rejected); values up to 0x10FFFF become a
// surrogate pair. Trailing bytes that do not fill a whole 4 byte unit are ignored.
// Returns an error for a value above 0x10FFFF, the last Unicode code point.
// Algorithm: https://www.cnblogs.com/findumars/p/6376034.html
@[direct_array_access]
fn utf32_to_utf16(src &u8, src_len int, is_src_little_endian bool, is_dst_little_endian bool) ![]u8 {
	// worst case: every 4 byte input unit becomes a 4 byte surrogate pair
	mut dst := []u8{len: src_len}
	mut sptr := unsafe { &u32(src) }
	mut dptr := unsafe { &u16(dst.data) }
	unit_count := src_len / 4
	mut dst_idx := 0
	for src_idx in 0 .. unit_count {
		mut c := unsafe { sptr[src_idx] }
		if !is_src_little_endian {
			c = reverse_u32(c)
		}
		if c <= 0xFFFF {
			mut t := u16(c)
			if !is_dst_little_endian {
				t = reverse_u16(t)
			}
			unsafe {
				dptr[dst_idx] = t
			}
			dst_idx++
		} else if c <= 0x10FFFF {
			// (c >> 10) - 0x40 is the same as (c - 0x10000) >> 10
			mut high := u16(0xD800 + (c >> 10) - 0x40)
			mut low := u16(0xDC00 + (c & 0x03FF))
			if !is_dst_little_endian {
				high = reverse_u16(high)
				low = reverse_u16(low)
			}
			unsafe {
				dptr[dst_idx] = high
				dptr[dst_idx + 1] = low
			}
			dst_idx += 2
		} else {
			return error('invalid UTF-32LE encoding')
		}
	}
	// dst_idx counts 16 bit units, trim() expects a length in bytes
	dst.trim(dst_idx * 2)
	return dst
}
569
// utf16_to_utf32 re-encodes the UTF-16 data in `src` (`src_len` bytes) as UTF-32
// and returns the new buffer.
// `is_src_little_endian` tells the byte order of the input units,
// `is_dst_little_endian` selects the byte order of the output units.
// A trailing byte that does not fill a whole 2 byte unit is ignored.
// Returns an error when a surrogate is not part of a well formed pair:
// a low surrogate without a preceding high one, or a high surrogate that is
// the last unit or is followed by something other than a low surrogate.
// Algorithm: https://www.cnblogs.com/findumars/p/6376034.html
@[direct_array_access]
fn utf16_to_utf32(src &u8, src_len int, is_src_little_endian bool, is_dst_little_endian bool) ![]u8 {
	// worst case: every 2 byte input unit becomes one 4 byte output unit
	mut dst := []u8{len: src_len * 2}
	mut sptr := unsafe { &u16(src) }
	mut dptr := unsafe { &u32(dst.data) }
	unit_count := src_len / 2
	mut src_idx := 0
	mut dst_idx := 0
	for src_idx < unit_count {
		mut w1 := unsafe { sptr[src_idx] }
		if !is_src_little_endian {
			w1 = reverse_u16(w1)
		}
		src_idx++
		mut t := u32(w1)
		if w1 >= 0xD800 && w1 <= 0xDFFF {
			// a pair has to start with a high surrogate, and one more unit has to follow
			if w1 >= 0xDC00 || src_idx == unit_count {
				return error('invalid UTF-16LE encoding')
			}
			mut w2 := unsafe { sptr[src_idx] }
			if !is_src_little_endian {
				w2 = reverse_u16(w2)
			}
			if w2 < 0xDC00 || w2 > 0xDFFF {
				// the high surrogate is not followed by a low surrogate
				return error('invalid UTF-16LE encoding')
			}
			// the low surrogate belongs to this code point, do not read it again
			src_idx++
			// done in u32, the result does not fit into 16 bits
			t = u32(w2 & 0x03FF) + ((u32(w1 & 0x03FF) + 0x40) << 10)
		}
		if !is_dst_little_endian {
			t = reverse_u32(t)
		}
		unsafe {
			dptr[dst_idx] = t
		}
		dst_idx++
	}
	// dst_idx counts 32 bit units, trim() expects a length in bytes
	dst.trim(dst_idx * 4)
	return dst
}
630
// conv converts the `src_len` bytes at `src` from the `fromcode` encoding to the
// `tocode` encoding and returns the converted bytes in a new buffer.
// Both names are resolved with name_to_codepage(). When they resolve to the same
// code page the input is copied unchanged. Otherwise the input is first turned
// into UTF-16LE and that intermediate buffer is then turned into the target encoding.
// UTF-16LE/BE and UTF-32LE/BE are handled here; every other code page goes through
// the Win32 MultiByteToWideChar/WideCharToMultiByte calls.
// Returns an error when `src_len` is negative, when one of the names can not be
// resolved, when the UTF-16/UTF-32 data is invalid, or when a Win32 call reports
// a failure (returns 0).
@[direct_array_access]
fn conv(tocode string, fromcode string, src &u8, src_len int) ![]u8 {
	if src_len < 0 {
		return error('src length error')
	}
	src_codepage := name_to_codepage(fromcode)
	dst_codepage := name_to_codepage(tocode)
	if src_codepage <= 0 {
		return error('fromcode ${fromcode} does not exist')
	}
	if dst_codepage <= 0 {
		return error('tocode ${tocode} does not exist')
	}

	if src_codepage == dst_codepage {
		// nothing to convert, return a copy of src
		mut dst_buf := []u8{len: src_len}
		unsafe { vmemcpy(dst_buf.data, src, src_len) }
		return dst_buf
	}

	mut unicode := []u8{}
	// src codepage => Unicode (UTF-16LE)
	match src_codepage {
		1200 {
			// src is already in Unicode(UTF-16LE) encoding, just clone src
			unsafe {
				unicode.grow_len(src_len)
				vmemcpy(unicode.data, src, src_len)
			}
		}
		1201 {
			// Windows does not support UTF-16BE
			// byte swap each 16 bit character element
			unsafe {
				unicode.grow_len(src_len)
				vmemcpy(unicode.data, src, src_len)
			}
			mut eptr := &u16(unicode.data)
			for i in 0 .. src_len / 2 {
				unsafe {
					eptr[i] = reverse_u16(eptr[i])
				}
			}
		}
		12000 {
			// Windows does not support UTF-32LE
			unicode = utf32_to_utf16(src, src_len, true, true)!
		}
		12001 {
			// Windows does not support UTF-32BE
			unicode = utf32_to_utf16(src, src_len, false, true)!
		}
		else {
			// first call: no output buffer, only ask how many wide chars are needed
			char_num := C.MultiByteToWideChar(src_codepage, 0, src, src_len, 0, 0)
			if char_num == 0 {
				return error('MultiByteToWideChar fail: src contain zero ${fromcode} character')
			}
			unsafe { unicode.grow_len(char_num * 2) } // every wide char takes 2 bytes
			// the output capacity is counted in wide chars, not in bytes
			written := C.MultiByteToWideChar(src_codepage, 0, src, src_len, unicode.data,
				char_num)
			if written == 0 {
				return error('MultiByteToWideChar fail: can not convert src from ${fromcode}')
			}
		}
	}

	mut dst := []u8{}
	// Unicode (UTF-16LE) => dst codepage
	match dst_codepage {
		1200 {
			// dst codepage is Unicode, just return unicode
			return unicode
		}
		1201 {
			// Windows does not support UTF-16BE
			// byte swap each 16 bit character element
			mut eptr := &u16(unicode.data)
			for i in 0 .. unicode.len / 2 {
				unsafe {
					eptr[i] = reverse_u16(eptr[i])
				}
			}
			return unicode
		}
		12000 {
			// Windows does not support UTF-32LE
			dst = utf16_to_utf32(unicode.data, unicode.len, true, true)!
		}
		12001 {
			// Windows does not support UTF-32BE
			dst = utf16_to_utf32(unicode.data, unicode.len, true, false)!
		}
		else {
			// the input length is counted in wide chars; unicode.len is in bytes
			wchar_num := unicode.len / 2
			// first call: no output buffer, only ask how many bytes are needed
			dst_len := C.WideCharToMultiByte(dst_codepage, 0, unicode.data, wchar_num, 0,
				0, 0, 0)
			if dst_len == 0 {
				return error('WideCharToMultiByte fail: src contain zero unicode character')
			}
			unsafe { dst.grow_len(dst_len) }
			// same input length as in the sizing call above
			written := C.WideCharToMultiByte(dst_codepage, 0, unicode.data, wchar_num, dst.data,
				dst.len, 0, 0)
			if written == 0 {
				return error('WideCharToMultiByte fail: can not convert src to ${tocode}')
			}
		}
	}

	return dst
}
735