v / vlib / encoding / iconv / iconv.v
168 lines · 157 sloc · 4.97 KB · b5b93b092b8c72e76ff578650e01950fcc7c2314
Raw
1module iconv
2
3// Module iconv provides functions to convert between vstring(UTF8) and other encodings.
4import os
5
// reverse_u16 returns `src` with its two bytes swapped.
@[inline]
fn reverse_u16(src u16) u16 {
	upper_moved_down := src >> 8
	lower_moved_up := src << 8
	return u16(lower_moved_up | upper_moved_down)
}
10
// reverse_u32 returns `src` with the order of its four bytes reversed.
@[inline]
fn reverse_u32(src u32) u32 {
	// each lane is the source byte moved to its mirrored position
	lane3 := src << 24
	lane2 := (src << 8) & 0x00FF_0000
	lane1 := (src >> 8) & 0x0000_FF00
	lane0 := src >> 24
	return u32(lane0 | lane1 | lane2 | lane3)
}
15
// vstring_to_encoding converts the V string `str` to a byte array in the `tocode` encoding.
// `tocode` is upper-cased before use, so it is matched case-insensitively.
// The name `LOCAL` is replaced by `ANSI` on Windows and by `UTF-8` elsewhere.
// The names UTF16/UTF-16/UTF32/UTF-32 are rejected with an error, because they do
// not state a byte order; use the explicit LE/BE variants instead.
// Any error returned by the underlying conversion is propagated to the caller.
// tips: use `iconv --list` to check for supported encodings
pub fn vstring_to_encoding(str string, tocode string) ![]u8 {
	mut encoding_name := tocode.to_upper()
	if encoding_name in ['UTF16', 'UTF32', 'UTF-16', 'UTF-32']! {
		// the suggested names use the spelling `UTF-16LE` (not `UTF16-LE`)
		return error('please use UTF-16LE/UTF-16BE/UTF-32LE/UTF-32BE instead')
	}
	if encoding_name == 'LOCAL' {
		$if windows {
			encoding_name = 'ANSI'
		} $else {
			encoding_name = 'UTF-8'
		}
	}
	// V strings are UTF-8, so the source encoding is always 'UTF-8'
	return conv(encoding_name, 'UTF-8', str.str, str.len)
}
32
// encoding_to_vstring converts `bytes`, interpreted in the `fromcode` encoding, to a V string (UTF-8).
// `fromcode` is upper-cased before use, so it is matched case-insensitively.
// The name `LOCAL` is replaced by `ANSI` on Windows and by `UTF-8` elsewhere.
// The names UTF16/UTF-16/UTF32/UTF-32 are rejected with an error, because they do
// not state a byte order; use the explicit LE/BE variants instead.
// Any error returned by the underlying conversion is propagated to the caller.
// tips: use `iconv --list` to check for supported encodings
pub fn encoding_to_vstring(bytes []u8, fromcode string) !string {
	mut encoding_name := fromcode.to_upper()
	if encoding_name in ['UTF16', 'UTF32', 'UTF-16', 'UTF-32']! {
		// the suggested names use the spelling `UTF-16LE` (not `UTF16-LE`)
		return error('please use UTF-16LE/UTF-16BE/UTF-32LE/UTF-32BE instead')
	}
	if encoding_name == 'LOCAL' {
		$if windows {
			encoding_name = 'ANSI'
		} $else {
			encoding_name = 'UTF-8'
		}
	}
	mut dst := conv('UTF-8', encoding_name, bytes.data, bytes.len)!
	dst << 0 // add a tail zero, to build a vstring
	// NOTE: the result is built from a C string, so it presumably ends at the
	// first zero byte of the converted output.
	return unsafe { cstring_to_vstring(dst.data) }
}
51
// create_utf_string_with_bom returns a copy of `src` with the BOM of `utf_type` in front of it.
// `utf_type` is matched case-insensitively; `LOCAL` means `ANSI` on Windows and `UTF-8` elsewhere.
// The prepended bytes are:
//   utf8    -> EF BB BF
//   utf16le -> FF FE
//   utf16be -> FE FF
//   utf32le -> FF FE 00 00
//   utf32be -> 00 00 FE FF
// For any other `utf_type`, an unchanged copy of `src` is returned.
pub fn create_utf_string_with_bom(src []u8, utf_type string) []u8 {
	mut kind := utf_type.to_upper()
	if kind == 'LOCAL' {
		$if windows {
			kind = 'ANSI'
		} $else {
			kind = 'UTF-8'
		}
	}
	header := match kind {
		'UTF8', 'UTF-8' { [u8(0xEF), 0xBB, 0xBF] }
		'UTF16LE', 'UTF-16LE' { [u8(0xFF), 0xFE] }
		'UTF16BE', 'UTF-16BE' { [u8(0xFE), 0xFF] }
		'UTF32LE', 'UTF-32LE' { [u8(0xFF), 0xFE, 0, 0] }
		'UTF32BE', 'UTF-32BE' { [u8(0), 0, 0xFE, 0xFF] }
		else { []u8{} }
	}
	if header.len == 0 {
		// not a known UTF type: nothing to prepend
		return src.clone()
	}
	mut with_bom := []u8{cap: header.len + src.len}
	with_bom << header
	with_bom << src
	return with_bom
}
89
// remove_utf_string_with_bom returns a copy of `src` without the leading BOM of `utf_type`.
// `utf_type` is matched case-insensitively; `LOCAL` means `ANSI` on Windows and `UTF-8` elsewhere.
// The removed bytes are:
//   utf8    -> EF BB BF
//   utf16le -> FF FE
//   utf16be -> FE FF
//   utf32le -> FF FE 00 00
//   utf32be -> 00 00 FE FF
// If `src` does not start with that BOM, or `utf_type` is not one of the types
// above, an unchanged copy of `src` is returned.
// An input that consists of the BOM only yields an empty array.
@[direct_array_access]
pub fn remove_utf_string_with_bom(src []u8, utf_type string) []u8 {
	mut encoding_name := utf_type.to_upper()
	if encoding_name == 'LOCAL' {
		$if windows {
			encoding_name = 'ANSI'
		} $else {
			encoding_name = 'UTF-8'
		}
	}
	bom := match encoding_name {
		'UTF8', 'UTF-8' { [u8(0xEF), 0xBB, 0xBF] }
		'UTF16LE', 'UTF-16LE' { [u8(0xFF), 0xFE] }
		'UTF16BE', 'UTF-16BE' { [u8(0xFE), 0xFF] }
		'UTF32LE', 'UTF-32LE' { [u8(0xFF), 0xFE, 0, 0] }
		'UTF32BE', 'UTF-32BE' { [u8(0), 0, 0xFE, 0xFF] }
		else { []u8{} }
	}
	// `>=` rather than `>`: an input that is exactly the BOM must be stripped too
	if bom.len > 0 && src.len >= bom.len && src[..bom.len] == bom {
		return src[bom.len..].clone()
	}
	return src.clone()
}
150
// write_file_encoding converts `text` into `encoding` and writes the result to the file at `path`.
// If `path` already exists, it will be overwritten.
// When `bom` is true and the upper-cased `encoding` starts with `UTF`, the matching
// BOM header is written in front of the converted bytes.
// Conversion and write errors are propagated to the caller.
pub fn write_file_encoding(path string, text string, encoding string, bom bool) ! {
	converted := vstring_to_encoding(text, encoding)!
	needs_bom := bom && encoding.to_upper().starts_with('UTF')
	if !needs_bom {
		os.write_file_array(path, converted)!
		return
	}
	os.write_file_array(path, create_utf_string_with_bom(converted, encoding))!
}
162
// read_file_encoding reads the file at `path`, interprets its bytes in `encoding`,
// and returns the contents as a V string.
// A leading BOM that matches `encoding` is removed before the conversion.
// It returns an error if the file cannot be read or if the conversion fails.
pub fn read_file_encoding(path string, encoding string) !string {
	// os.read_bytes reports read failures as an error, which is propagated
	// instead of being treated as an empty file
	encoding_bytes := os.read_bytes(path)!
	encoding_without_bom_bytes := remove_utf_string_with_bom(encoding_bytes, encoding)
	return encoding_to_vstring(encoding_without_bom_bytes, encoding)!
}
169