1
+ #pragma once
2
+
3
+ #include < cerrno>
4
+ #include < cstddef>
5
+ #include < cstring>
6
+
7
+ #include < openvic-dataloader/detail/Encoding.hpp>
8
+
9
+ #include < lexy/_detail/memory_resource.hpp>
10
+ #include < lexy/encoding.hpp>
11
+ #include < lexy/input/buffer.hpp>
12
+ #include < lexy/input/file.hpp>
13
+
14
+ #ifdef _WIN32
15
+ #define WIN32_LEAN_AND_MEAN
16
+ #include < windows.h>
17
+ #undef WIN32_LEAN_AND_MEAN
18
+ #elif defined(__unix__) || defined(__APPLE__) || __has_include(<iconv.h>)
19
+ #include < iconv.h>
20
+ #endif
21
+
22
+ namespace ovdl ::convert::gbk {
23
+ template <typename Encoding, lexy::encoding_endianness Endian>
24
+ struct _make_buffer {
25
+ static constexpr size_t small_buffer_size = size_t (4 ) * 1024 ;
26
+
27
+ template <typename MemoryResource = void >
28
+ auto operator ()(detail::Encoding encoding, const void * _memory, std::size_t size,
29
+ MemoryResource* resource = lexy::_detail::get_memory_resource<MemoryResource>()) const {
30
+ constexpr auto native_endianness = LEXY_IS_LITTLE_ENDIAN ? lexy::encoding_endianness::little : lexy::encoding_endianness::big;
31
+
32
+ using char_type = typename Encoding::char_type;
33
+ LEXY_PRECONDITION (size % sizeof (char_type) == 0 );
34
+ auto memory = static_cast <const unsigned char *>(_memory);
35
+
36
+ if constexpr (sizeof (char_type) == 1 || Endian == native_endianness) {
37
+ switch (encoding) {
38
+ using enum detail::Encoding;
39
+ case Ascii:
40
+ case Utf8:
41
+ return lexy::make_buffer_from_raw<Encoding, Endian>(_memory, size, resource);
42
+ default : break ;
43
+ }
44
+
45
+ #if defined(__unix__) || defined(__APPLE__) || __has_include(<iconv.h>)
46
+ iconv_t cd = ::iconv_open (" UTF-8" , " WINDOWS-936" );
47
+ if (cd == (iconv_t )-1 ) {
48
+ return lexy::buffer<Encoding, MemoryResource> { resource };
49
+ }
50
+ #endif
51
+
52
+ size_t in_size = size;
53
+ // While technically illegal, it seems the contract for iconv is wrong, it doesn't modify the content of inbuff
54
+ // It only ever does such for convenience
55
+ char * in_buffer = const_cast <char *>(static_cast <const char *>(_memory));
56
+
57
+ if (in_buffer == nullptr ) {
58
+ return lexy::buffer<Encoding, MemoryResource> { resource };
59
+ }
60
+
61
+ typename lexy::buffer<Encoding, MemoryResource>::builder out_builder (size * 3 );
62
+ char * out_buffer = out_builder.data ();
63
+ size_t out_size = out_builder.size ();
64
+
65
+ auto iconv_err_handler = [&]() {
66
+ if (errno == EILSEQ && in_buffer && in_size >= 1 ) {
67
+ auto full_width_exclaim = [&] {
68
+ // Insert UTF-8 ! (full width exclaimation mark)
69
+ *out_buffer++ = ' \xEF ' ;
70
+ *out_buffer++ = ' \xBC ' ;
71
+ *out_buffer++ = ' \x81 ' ;
72
+ out_size -= 3 ;
73
+ in_buffer += sizeof (char_type);
74
+ --in_size;
75
+ };
76
+ switch (*in_buffer) {
77
+ // Expect non-standard § from Windows-1252, required for color behavior
78
+ case ' \xA7 ' :
79
+ // Insert UTF-8 §
80
+ *out_buffer++ = ' \xC2 ' ;
81
+ *out_buffer++ = ' \xA7 ' ;
82
+ out_size -= 2 ;
83
+ in_buffer += sizeof (char_type);
84
+ --in_size;
85
+ return true ;
86
+ // Expect non-standard ! (full width exclaimation mark), found in some localizations
87
+ case ' \xA1 ' :
88
+ full_width_exclaim ();
89
+ return true ;
90
+ // Expect nothing then non-standard ! (full width exclaimation mark), found in some localizations
91
+ case ' \xAD ' :
92
+ if (in_size >= 2 && in_buffer + 1 && in_buffer[1 ] == ' \xA1 ' ) {
93
+ --out_size;
94
+ in_buffer += sizeof (char_type);
95
+ --in_size;
96
+ full_width_exclaim ();
97
+ }
98
+ return true ;
99
+ // Unexpected error
100
+ default : break ;
101
+ }
102
+ }
103
+ return false ;
104
+ };
105
+ #if defined(_WIN32)
106
+ auto iconv_mimic = [&]() -> int64_t {
107
+ static constexpr size_t CP_GBK = 936 ;
108
+ static constexpr size_t MB_CHAR_MAX = 16 ;
109
+
110
+ static auto mblen = [](const char * buf, int bufsize) {
111
+ int len = 0 ;
112
+
113
+ unsigned char c = *buf;
114
+ if (c < 0x80 ) {
115
+ len = 1 ;
116
+ } else if ((c & 0xE0 ) == 0xC0 ) {
117
+ len = 2 ;
118
+ } else if ((c & 0xF0 ) == 0xE0 ) {
119
+ len = 3 ;
120
+ } else if ((c & 0xF8 ) == 0xF0 ) {
121
+ len = 4 ;
122
+ } else if ((c & 0xFC ) == 0xF8 ) {
123
+ len = 5 ;
124
+ } else if ((c & 0xFE ) == 0xFC ) {
125
+ len = 6 ;
126
+ }
127
+
128
+ if (len == 0 ) {
129
+ errno = EILSEQ;
130
+ return -1 ;
131
+ } else if (bufsize < len) {
132
+ errno = EINVAL;
133
+ return -1 ;
134
+ }
135
+ return len;
136
+ };
137
+
138
+ while (in_size != 0 ) {
139
+ unsigned short wbuf[MB_CHAR_MAX]; /* enough room for one character */
140
+ size_t wsize = MB_CHAR_MAX;
141
+
142
+ int insize = IsDBCSLeadByteEx (CP_GBK, *in_buffer) ? 2 : 1 ;
143
+ if (insize == 2 && in_buffer && in_size >= 2 ) {
144
+ // iconv errors on user-defined double byte characters
145
+ // MultiByteToWideChar/WideCharToMultiByte does not
146
+ unsigned char byte1 = static_cast <unsigned char >(*in_buffer);
147
+ unsigned char byte2 = static_cast <unsigned char >(in_buffer[1 ]);
148
+ if (byte1 >= 0xAA && byte1 <= 0xAF && byte2 >= 0xA1 && byte2 <= 0xFE ) {
149
+ errno = EILSEQ;
150
+ return -1 ;
151
+ }
152
+ if (byte1 >= 0xF8 && byte1 <= 0xFE && byte2 >= 0xA1 && byte2 <= 0xFE ) {
153
+ errno = EILSEQ;
154
+ return -1 ;
155
+ }
156
+ if (byte1 >= 0xA1 && byte1 <= 0xA7 && byte2 >= 0x40 && byte2 <= 0xA0 && byte2 != 0x7F ) {
157
+ errno = EILSEQ;
158
+ return -1 ;
159
+ }
160
+ }
161
+ wsize = MultiByteToWideChar (CP_GBK, MB_ERR_INVALID_CHARS, in_buffer, insize, (wchar_t *)wbuf, wsize);
162
+ if (wsize == 0 ) {
163
+ in_buffer += insize;
164
+ in_size -= insize;
165
+ continue ;
166
+ }
167
+
168
+ if (out_size == 0 ) {
169
+ errno = E2BIG;
170
+ return -1 ;
171
+ }
172
+
173
+ int outsize = WideCharToMultiByte (CP_UTF8, 0 , (const wchar_t *)wbuf, wsize, out_buffer, out_size, NULL , NULL );
174
+ if (outsize == 0 ) {
175
+ switch (GetLastError ()) {
176
+ case ERROR_INVALID_FLAGS:
177
+ case ERROR_INVALID_PARAMETER:
178
+ case ERROR_INSUFFICIENT_BUFFER:
179
+ errno = E2BIG;
180
+ return -1 ;
181
+ default : break ;
182
+ }
183
+ errno = EILSEQ;
184
+ return -1 ;
185
+ } else if (mblen (out_buffer, outsize) != outsize) {
186
+ /* validate result */
187
+ errno = EILSEQ;
188
+ return -1 ;
189
+ }
190
+
191
+ in_buffer += insize;
192
+ out_buffer += outsize;
193
+ in_size -= insize;
194
+ out_size -= outsize;
195
+ }
196
+
197
+ return 0 ;
198
+ };
199
+
200
+ const auto end = in_buffer + size;
201
+ while (in_size > 0 && out_size > 0 && in_buffer != end) {
202
+ if (iconv_mimic () == -1 ) {
203
+ if (!iconv_err_handler ()) {
204
+ break ;
205
+ }
206
+ }
207
+ }
208
+ #elif defined(__unix__) || defined(__APPLE__) || __has_include(<iconv.h>)
209
+ const auto end = in_buffer + size;
210
+ while (in_size > 0 && out_size > 0 && in_buffer != end) {
211
+ if (::iconv (cd, &in_buffer, &in_size, &out_buffer, &out_size) == -1 ) {
212
+ if (!iconv_err_handler ()) {
213
+ break ;
214
+ }
215
+ }
216
+ }
217
+ ::iconv_close (cd);
218
+ #else
219
+ #error "GBK conversion not supported on this platform"
220
+ #endif
221
+ return lexy::buffer<Encoding, MemoryResource> { out_builder.data (), static_cast <size_t >(out_buffer - out_builder.data ()), resource };
222
+ } else {
223
+ return lexy::make_buffer_from_raw<Encoding, Endian>(_memory, size, resource);
224
+ }
225
+ }
226
+ };
227
+
228
+ template <typename Encoding, lexy::encoding_endianness Endianness = lexy::encoding_endianness::bom>
229
+ constexpr auto make_buffer_from_raw = _make_buffer<Encoding, Endianness> {};
230
+
231
+ template <typename Encoding, lexy::encoding_endianness Endian, typename MemoryResource>
232
+ struct _read_file_user_data : lexy::_read_file_user_data<Encoding, Endian, MemoryResource> {
233
+ using base_type = lexy::_read_file_user_data<Encoding, Endian, MemoryResource>;
234
+
235
+ detail::Encoding encoding;
236
+
237
+ _read_file_user_data (detail::Encoding encoding, MemoryResource* resource) : base_type(resource), encoding(encoding) {}
238
+ static auto callback () {
239
+ return [](void * _user_data, const char * memory, std::size_t size) {
240
+ auto user_data = static_cast <_read_file_user_data*>(_user_data);
241
+
242
+ user_data->buffer = make_buffer_from_raw<Encoding, Endian>(user_data->encoding , memory, size, user_data->resource );
243
+ };
244
+ }
245
+ };
246
+
247
+ template <typename Encoding = lexy::default_encoding,
248
+ lexy::encoding_endianness Endian = lexy::encoding_endianness::bom,
249
+ typename MemoryResource = void >
250
+ auto read_file (
251
+ const char * path,
252
+ detail::Encoding encoding,
253
+ MemoryResource* resource = lexy::_detail::get_memory_resource<MemoryResource>())
254
+ -> lexy::read_file_result<Encoding, MemoryResource> {
255
+ _read_file_user_data<Encoding, Endian, MemoryResource> user_data (encoding, resource);
256
+ auto error = lexy::_detail::read_file (path, user_data.callback (), &user_data);
257
+ return lexy::read_file_result (error, LEXY_MOV (user_data.buffer ));
258
+ }
259
+
260
+ // / Reads stdin into a buffer.
261
+ template <typename Encoding = lexy::default_encoding,
262
+ lexy::encoding_endianness Endian = lexy::encoding_endianness::bom,
263
+ typename MemoryResource = void >
264
+ auto read_stdin (
265
+ detail::Encoding encoding,
266
+ MemoryResource* resource = lexy::_detail::get_memory_resource<MemoryResource>())
267
+ -> lexy::read_file_result<Encoding, MemoryResource> {
268
+ _read_file_user_data<Encoding, Endian, MemoryResource> user_data (encoding, resource);
269
+ auto error = lexy::_detail::read_stdin (user_data.callback (), &user_data);
270
+ return lexy::read_file_result (error, LEXY_MOV (user_data.buffer ));
271
+ }
272
+ }
0 commit comments