Skip to content

Commit 3cbcc1f

Browse files
committed
Add support for GBK/CP936 encoding and conversion
1 parent 6bedcf4 commit 3cbcc1f

File tree

7 files changed

+367
-2
lines changed

7 files changed

+367
-2
lines changed

include/openvic-dataloader/detail/Encoding.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ namespace ovdl::detail {
88
Ascii,
99
Utf8,
1010
Windows1251,
11-
Windows1252
11+
Windows1252,
12+
Gbk,
1213
};
1314
}

src/openvic-dataloader/csv/Parser.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ struct Parser::ParseHandler final : detail::BasicFileParseHandler<CsvParseState>
4343
case Utf8:
4444
case Windows1251:
4545
case Windows1252:
46+
case Gbk:
4647
return lexy::parse<Node>(buffer<lexy::utf8_char_encoding>(), parse_state(), parse_state().logger().error_callback());
4748
OVDL_DEFAULT_CASE_UNREACHABLE(Unknown);
4849
}
Lines changed: 272 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,272 @@
1+
#pragma once
2+
3+
#include <cerrno>
4+
#include <cstddef>
5+
#include <cstring>
6+
7+
#include <openvic-dataloader/detail/Encoding.hpp>
8+
9+
#include <lexy/_detail/memory_resource.hpp>
10+
#include <lexy/encoding.hpp>
11+
#include <lexy/input/buffer.hpp>
12+
#include <lexy/input/file.hpp>
13+
14+
#ifdef _WIN32
15+
#define WIN32_LEAN_AND_MEAN
16+
#include <windows.h>
17+
#undef WIN32_LEAN_AND_MEAN
18+
#elif defined(__unix__) || defined(__APPLE__) || __has_include(<iconv.h>)
19+
#include <iconv.h>
20+
#endif
21+
22+
namespace ovdl::convert::gbk {
23+
template<typename Encoding, lexy::encoding_endianness Endian>
24+
struct _make_buffer {
25+
static constexpr size_t small_buffer_size = size_t(4) * 1024;
26+
27+
template<typename MemoryResource = void>
28+
auto operator()(detail::Encoding encoding, const void* _memory, std::size_t size,
29+
MemoryResource* resource = lexy::_detail::get_memory_resource<MemoryResource>()) const {
30+
constexpr auto native_endianness = LEXY_IS_LITTLE_ENDIAN ? lexy::encoding_endianness::little : lexy::encoding_endianness::big;
31+
32+
using char_type = typename Encoding::char_type;
33+
LEXY_PRECONDITION(size % sizeof(char_type) == 0);
34+
auto memory = static_cast<const unsigned char*>(_memory);
35+
36+
if constexpr (sizeof(char_type) == 1 || Endian == native_endianness) {
37+
switch (encoding) {
38+
using enum detail::Encoding;
39+
case Ascii:
40+
case Utf8:
41+
return lexy::make_buffer_from_raw<Encoding, Endian>(_memory, size, resource);
42+
default: break;
43+
}
44+
45+
#if defined(__unix__) || defined(__APPLE__) || __has_include(<iconv.h>)
46+
iconv_t cd = ::iconv_open("UTF-8", "WINDOWS-936");
47+
if (cd == (iconv_t)-1) {
48+
return lexy::buffer<Encoding, MemoryResource> { resource };
49+
}
50+
#endif
51+
52+
size_t in_size = size;
53+
// While technically illegal, it seems the contract for iconv is wrong, it doesn't modify the content of inbuff
54+
// It only ever does such for convenience
55+
char* in_buffer = const_cast<char*>(static_cast<const char*>(_memory));
56+
57+
if (in_buffer == nullptr) {
58+
return lexy::buffer<Encoding, MemoryResource> { resource };
59+
}
60+
61+
typename lexy::buffer<Encoding, MemoryResource>::builder out_builder(size * 3);
62+
char* out_buffer = out_builder.data();
63+
size_t out_size = out_builder.size();
64+
65+
auto iconv_err_handler = [&]() {
66+
if (errno == EILSEQ && in_buffer && in_size >= 1) {
67+
auto full_width_exclaim = [&] {
68+
// Insert UTF-8 ! (full width exclaimation mark)
69+
*out_buffer++ = '\xEF';
70+
*out_buffer++ = '\xBC';
71+
*out_buffer++ = '\x81';
72+
out_size -= 3;
73+
in_buffer += sizeof(char_type);
74+
--in_size;
75+
};
76+
switch (*in_buffer) {
77+
// Expect non-standard § from Windows-1252, required for color behavior
78+
case '\xA7':
79+
// Insert UTF-8 §
80+
*out_buffer++ = '\xC2';
81+
*out_buffer++ = '\xA7';
82+
out_size -= 2;
83+
in_buffer += sizeof(char_type);
84+
--in_size;
85+
return true;
86+
// Expect non-standard ! (full width exclaimation mark), found in some localizations
87+
case '\xA1':
88+
full_width_exclaim();
89+
return true;
90+
// Expect nothing then non-standard ! (full width exclaimation mark), found in some localizations
91+
case '\xAD':
92+
if (in_size >= 2 && in_buffer + 1 && in_buffer[1] == '\xA1') {
93+
--out_size;
94+
in_buffer += sizeof(char_type);
95+
--in_size;
96+
full_width_exclaim();
97+
}
98+
return true;
99+
// Unexpected error
100+
default: break;
101+
}
102+
}
103+
return false;
104+
};
105+
#if defined(_WIN32)
106+
auto iconv_mimic = [&]() -> int64_t {
107+
static constexpr size_t CP_GBK = 936;
108+
static constexpr size_t MB_CHAR_MAX = 16;
109+
110+
static auto mblen = [](const char* buf, int bufsize) {
111+
int len = 0;
112+
113+
unsigned char c = *buf;
114+
if (c < 0x80) {
115+
len = 1;
116+
} else if ((c & 0xE0) == 0xC0) {
117+
len = 2;
118+
} else if ((c & 0xF0) == 0xE0) {
119+
len = 3;
120+
} else if ((c & 0xF8) == 0xF0) {
121+
len = 4;
122+
} else if ((c & 0xFC) == 0xF8) {
123+
len = 5;
124+
} else if ((c & 0xFE) == 0xFC) {
125+
len = 6;
126+
}
127+
128+
if (len == 0) {
129+
errno = EILSEQ;
130+
return -1;
131+
} else if (bufsize < len) {
132+
errno = EINVAL;
133+
return -1;
134+
}
135+
return len;
136+
};
137+
138+
while (in_size != 0) {
139+
unsigned short wbuf[MB_CHAR_MAX]; /* enough room for one character */
140+
size_t wsize = MB_CHAR_MAX;
141+
142+
int insize = IsDBCSLeadByteEx(CP_GBK, *in_buffer) ? 2 : 1;
143+
if (insize == 2 && in_buffer && in_size >= 2) {
144+
// iconv errors on user-defined double byte characters
145+
// MultiByteToWideChar/WideCharToMultiByte does not
146+
unsigned char byte1 = static_cast<unsigned char>(*in_buffer);
147+
unsigned char byte2 = static_cast<unsigned char>(in_buffer[1]);
148+
if (byte1 >= 0xAA && byte1 <= 0xAF && byte2 >= 0xA1 && byte2 <= 0xFE) {
149+
errno = EILSEQ;
150+
return -1;
151+
}
152+
if (byte1 >= 0xF8 && byte1 <= 0xFE && byte2 >= 0xA1 && byte2 <= 0xFE) {
153+
errno = EILSEQ;
154+
return -1;
155+
}
156+
if (byte1 >= 0xA1 && byte1 <= 0xA7 && byte2 >= 0x40 && byte2 <= 0xA0 && byte2 != 0x7F) {
157+
errno = EILSEQ;
158+
return -1;
159+
}
160+
}
161+
wsize = MultiByteToWideChar(CP_GBK, MB_ERR_INVALID_CHARS, in_buffer, insize, (wchar_t*)wbuf, wsize);
162+
if (wsize == 0) {
163+
in_buffer += insize;
164+
in_size -= insize;
165+
continue;
166+
}
167+
168+
if (out_size == 0) {
169+
errno = E2BIG;
170+
return -1;
171+
}
172+
173+
int outsize = WideCharToMultiByte(CP_UTF8, 0, (const wchar_t*)wbuf, wsize, out_buffer, out_size, NULL, NULL);
174+
if (outsize == 0) {
175+
switch (GetLastError()) {
176+
case ERROR_INVALID_FLAGS:
177+
case ERROR_INVALID_PARAMETER:
178+
case ERROR_INSUFFICIENT_BUFFER:
179+
errno = E2BIG;
180+
return -1;
181+
default: break;
182+
}
183+
errno = EILSEQ;
184+
return -1;
185+
} else if (mblen(out_buffer, outsize) != outsize) {
186+
/* validate result */
187+
errno = EILSEQ;
188+
return -1;
189+
}
190+
191+
in_buffer += insize;
192+
out_buffer += outsize;
193+
in_size -= insize;
194+
out_size -= outsize;
195+
}
196+
197+
return 0;
198+
};
199+
200+
const auto end = in_buffer + size;
201+
while (in_size > 0 && out_size > 0 && in_buffer != end) {
202+
if (iconv_mimic() == -1) {
203+
if (!iconv_err_handler()) {
204+
break;
205+
}
206+
}
207+
}
208+
#elif defined(__unix__) || defined(__APPLE__) || __has_include(<iconv.h>)
209+
const auto end = in_buffer + size;
210+
while (in_size > 0 && out_size > 0 && in_buffer != end) {
211+
if (::iconv(cd, &in_buffer, &in_size, &out_buffer, &out_size) == -1) {
212+
if (!iconv_err_handler()) {
213+
break;
214+
}
215+
}
216+
}
217+
::iconv_close(cd);
218+
#else
219+
#error "GBK conversion not supported on this platform"
220+
#endif
221+
return lexy::buffer<Encoding, MemoryResource> { out_builder.data(), static_cast<size_t>(out_buffer - out_builder.data()), resource };
222+
} else {
223+
return lexy::make_buffer_from_raw<Encoding, Endian>(_memory, size, resource);
224+
}
225+
}
226+
};
227+
228+
template<typename Encoding, lexy::encoding_endianness Endianness = lexy::encoding_endianness::bom>
229+
constexpr auto make_buffer_from_raw = _make_buffer<Encoding, Endianness> {};
230+
231+
template<typename Encoding, lexy::encoding_endianness Endian, typename MemoryResource>
232+
struct _read_file_user_data : lexy::_read_file_user_data<Encoding, Endian, MemoryResource> {
233+
using base_type = lexy::_read_file_user_data<Encoding, Endian, MemoryResource>;
234+
235+
detail::Encoding encoding;
236+
237+
_read_file_user_data(detail::Encoding encoding, MemoryResource* resource) : base_type(resource), encoding(encoding) {}
238+
static auto callback() {
239+
return [](void* _user_data, const char* memory, std::size_t size) {
240+
auto user_data = static_cast<_read_file_user_data*>(_user_data);
241+
242+
user_data->buffer = make_buffer_from_raw<Encoding, Endian>(user_data->encoding, memory, size, user_data->resource);
243+
};
244+
}
245+
};
246+
247+
template<typename Encoding = lexy::default_encoding,
248+
lexy::encoding_endianness Endian = lexy::encoding_endianness::bom,
249+
typename MemoryResource = void>
250+
auto read_file(
251+
const char* path,
252+
detail::Encoding encoding,
253+
MemoryResource* resource = lexy::_detail::get_memory_resource<MemoryResource>())
254+
-> lexy::read_file_result<Encoding, MemoryResource> {
255+
_read_file_user_data<Encoding, Endian, MemoryResource> user_data(encoding, resource);
256+
auto error = lexy::_detail::read_file(path, user_data.callback(), &user_data);
257+
return lexy::read_file_result(error, LEXY_MOV(user_data.buffer));
258+
}
259+
260+
/// Reads stdin into a buffer.
261+
template<typename Encoding = lexy::default_encoding,
262+
lexy::encoding_endianness Endian = lexy::encoding_endianness::bom,
263+
typename MemoryResource = void>
264+
auto read_stdin(
265+
detail::Encoding encoding,
266+
MemoryResource* resource = lexy::_detail::get_memory_resource<MemoryResource>())
267+
-> lexy::read_file_result<Encoding, MemoryResource> {
268+
_read_file_user_data<Encoding, Endian, MemoryResource> user_data(encoding, resource);
269+
auto error = lexy::_detail::read_stdin(user_data.callback(), &user_data);
270+
return lexy::read_file_result(error, LEXY_MOV(user_data.buffer));
271+
}
272+
}

src/openvic-dataloader/detail/Detect.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
#include "detail/Detect.hpp"
22

3+
#include <optional>
4+
35
using namespace ovdl;
46
using namespace ovdl::encoding_detect;
57

@@ -23,6 +25,15 @@ std::optional<int64_t> AsciiCandidate::read(const std::span<const cbyte>& buffer
2325
return std::nullopt;
2426
}
2527

28+
std::optional<int64_t> GbkCandidate::read(const std::span<const cbyte>& buffer) {
29+
auto lexy_buffer = lexy::make_buffer_from_raw<lexy::default_encoding, lexy::encoding_endianness::little>(buffer.data(), buffer.size());
30+
if (is_gbk(lexy_buffer)) {
31+
return 2;
32+
}
33+
34+
return std::nullopt;
35+
}
36+
2637
std::optional<int64_t> NonLatinCasedCandidate::read(const std::span<const cbyte>& buffer) {
2738
static constexpr cbyte LATIN_LETTER = 1;
2839
static constexpr int64_t NON_LATIN_MIXED_CASE_PENALTY = -20;
@@ -351,3 +362,5 @@ std::optional<int64_t> LatinCandidate::read(const std::span<const cbyte>& buffer
351362

352363
template struct ovdl::encoding_detect::DetectUtf8<true>;
353364
template struct ovdl::encoding_detect::DetectUtf8<false>;
365+
template struct ovdl::encoding_detect::DetectGbk<true>;
366+
template struct ovdl::encoding_detect::DetectGbk<false>;

0 commit comments

Comments
 (0)