Skip to content

Commit c71166f

Browse files
committed
Fix wstring conversion on Windows
On most Unices, wchar_t is 32 bits and typically holds UCS4 characters, which are big enough for any Unicode code point. On Windows, wchar_t is 16 bits and typically holds UTF-16 code units, which sometimes need to be used in pairs. This is because Windows introduced wchar_t before UCS4, UTF-8 and UTF-16 were invented, originally using it for UCS2 characters back when all of Unicode fit in sixteen bits. That meant that the existing string conversion code, which assumed std::wstring was a fixed-width encoding, would only work on Windows for the first 55296 code points (U+0000 to U+D7FF, where UCS2 and UTF-16 are compatible), so anything outside the Basic Multilingual Plane wouldn't work.
1 parent e026a20 commit c71166f

File tree

2 files changed

+71
-3
lines changed

2 files changed

+71
-3
lines changed

src/vsg/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -376,6 +376,8 @@ if(MSVC)
376376
target_compile_definitions(vsg PUBLIC "_ITERATOR_DEBUG_LEVEL=0")
377377
endif()
378378

379+
# MSVC assumes system-wide eight-bit code page (e.g. CP1252, CP1250) if there's no BOM
380+
target_compile_options(vsg PRIVATE "/utf-8")
379381
endif()
380382

381383

src/vsg/io/convert_utf.cpp

Lines changed: 69 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/* <editor-fold desc="MIT License">
22
3-
Copyright(c) 2022 Robert Osfield
3+
Copyright(c) 2022-2025 Robert Osfield, Chris Djali
44
55
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
66
@@ -13,6 +13,7 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLI
1313
#include <vsg/io/convert_utf.h>
1414

1515
#include <cstdint>
16+
#include <limits>
1617

1718
using namespace vsg;
1819

@@ -108,14 +109,79 @@ bool encode_utf8(Iterator itr, Iterator end, Func op)
108109
return true;
109110
}
110111

112+
// Decode `count` UTF-16 code units read from `itr`, calling `op` once per
// decoded Unicode code point.
//
// Code units outside the surrogate range are passed to `op` unchanged; a high
// surrogate followed by a low surrogate is combined into a single code point
// at or above 0x10000.
//
// Returns true when every code unit was consumed, or false as soon as an
// unpaired surrogate is met. Code points decoded before the failure have
// already been passed to `op`.
template<typename Iterator, class Func>
bool decode_utf16(Iterator itr, size_t count, Func op)
{
    for (size_t remaining = count; remaining > 0;)
    {
        auto lead = *itr++;
        --remaining;

        const bool belowSurrogates = lead >= 0x0000 && lead <= 0xD7FF;
        const bool aboveSurrogates = lead >= 0xE000 && lead <= 0xFFFF;
        if (belowSurrogates || aboveSurrogates)
        {
            // single code unit: the code point is the unit itself
            op(lead);
            continue;
        }

        // a pair must start with a high surrogate and needs a second unit
        if (remaining == 0 || lead >= 0xDC00) return false;

        auto trail = *itr++;
        --remaining;

        // the second unit of a pair must be a low surrogate
        if (!(trail >= 0xDC00 && trail <= 0xDFFF)) return false;

        // ten payload bits from each surrogate, offset past the 16-bit range
        op((((lead - 0xD800) << 10) | (trail - 0xDC00)) + 0x10000);
    }

    return true;
}
141+
142+
// Encode the Unicode code points in [itr, end) as UTF-16, calling `op` once
// per emitted 16-bit code unit (passed as a uint32_t).
//
// Code points in the Basic Multilingual Plane are emitted as a single unit;
// code points from 0x10000 to 0x10FFFF are emitted as a high surrogate
// followed by a low surrogate.
//
// Returns true when the whole range was encoded. Returns false, emitting
// nothing for the offending element, when a value is a surrogate code point
// (0xD800-0xDFFF) or lies beyond the last Unicode code point (0x10FFFF).
// Units for earlier elements have already been passed to `op`.
template<typename Iterator, class Func>
bool encode_utf16(Iterator itr, Iterator end, Func op)
{
    while (itr != end)
    {
        uint32_t c = *itr++;
        if (c <= 0xD7FF || (c >= 0xE000 && c <= 0xFFFF)) // single code unit
        {
            op(c);
            continue;
        }

        // Surrogate code points cannot be encoded, and values above 0x10FFFF
        // do not fit in a surrogate pair: masking them would silently alias
        // them onto a different, valid code point.
        if (c < 0x10000 || c > 0x10FFFF) return false;

        // 20-bit payload, split into two 10-bit halves
        const uint32_t payload = c - 0x10000;
        op(0xD800 + (payload >> 10));   // high surrogate
        op(0xDC00 + (payload & 0x3FF)); // low surrogate
    }

    return true;
}
166+
111167
void vsg::convert_utf(const std::string& utf8, std::wstring& dst)
112168
{
113169
dst.clear();
114-
decode_utf8(utf8.begin(), utf8.size(), [&dst](uint32_t c) { dst.push_back(c); });
170+
if constexpr (std::numeric_limits<wchar_t>::max() == 0xFFFF)
171+
decode_utf8(utf8.begin(), utf8.size(), [&dst](uint32_t c) { encode_utf16(&c, (&c) + 1, [&dst](uint32_t cu) { dst.push_back(cu); }); });
172+
else
173+
decode_utf8(utf8.begin(), utf8.size(), [&dst](uint32_t c) { dst.push_back(c); });
115174
}
116175

117176
void vsg::convert_utf(const std::wstring& src, std::string& utf8)
118177
{
119178
utf8.clear();
120-
encode_utf8(src.begin(), src.end(), [&utf8](uint32_t c) { utf8.push_back(static_cast<char>(c)); });
179+
if constexpr (std::numeric_limits<wchar_t>::max() == 0xFFFF)
180+
{
181+
std::u32string intermediate;
182+
decode_utf16(src.begin(), src.size(), [&intermediate](char32_t c) { intermediate.push_back(c); });
183+
encode_utf8(intermediate.begin(), intermediate.end(), [&utf8](char32_t c) { utf8.push_back(static_cast<char>(c)); });
184+
}
185+
else
186+
encode_utf8(src.begin(), src.end(), [&utf8](uint32_t c) { utf8.push_back(static_cast<char>(c)); });
121187
}

0 commit comments

Comments
 (0)