Skip to content

Commit 7093200

Browse files
Merge pull request #1479 from AnyOldName3/windows-unicode-fixes
Fix wstring conversion on Windows
2 parents f37ef3b + c71166f commit 7093200

File tree

2 files changed

+71
-3
lines changed

2 files changed

+71
-3
lines changed

src/vsg/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -379,6 +379,8 @@ if(MSVC)
379379
target_compile_definitions(vsg PUBLIC "_ITERATOR_DEBUG_LEVEL=0")
380380
endif()
381381

382+
# Without a BOM, MSVC assumes source files are encoded in the system-wide eight-bit code page (e.g. CP1252, CP1250), so tell it explicitly to treat them as UTF-8
383+
target_compile_options(vsg PRIVATE "/utf-8")
382384
endif()
383385

384386

src/vsg/io/convert_utf.cpp

Lines changed: 69 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/* <editor-fold desc="MIT License">
22
3-
Copyright(c) 2022 Robert Osfield
3+
Copyright(c) 2022-2025 Robert Osfield, Chris Djali
44
55
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
66
@@ -13,6 +13,7 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLI
1313
#include <vsg/io/convert_utf.h>
1414

1515
#include <cstdint>
16+
#include <limits>
1617

1718
using namespace vsg;
1819

@@ -108,14 +109,79 @@ bool encode_utf8(Iterator itr, Iterator end, Func op)
108109
return true;
109110
}
110111

112+
// Decode a sequence of UTF-16 code units into Unicode code points.
//
// itr   - iterator positioned at the first code unit
// count - number of code units that may be read from itr
// op    - callable invoked once per decoded code point, passed as a uint32_t
//
// Returns true when all count code units were decoded. Returns false as soon
// as an unpaired surrogate or an out-of-range code unit is met; code points
// decoded before that position have already been handed to op.
template<typename Iterator, class Func>
bool decode_utf16(Iterator itr, size_t count, Func op)
{
    while (count > 0)
    {
        // Widen to an unsigned value before classifying, so that a negative value
        // coming from a signed element type can never be mistaken for a surrogate.
        const uint32_t c0 = static_cast<uint32_t>(*itr++);
        --count;

        if (c0 <= 0xD7FF || (c0 >= 0xE000 && c0 <= 0xFFFF)) // single code unit (BMP) character
        {
            op(c0);
            continue;
        }

        // Anything left must be a high surrogate (0xD800-0xDBFF) followed by another unit;
        // a leading low surrogate, a value above 0xFFFF or a truncated pair is an error.
        if (count == 0 || c0 > 0xDBFF) return false;

        const uint32_t c1 = static_cast<uint32_t>(*itr++);
        --count;

        // the second half of the pair has to be a low surrogate
        if (c1 < 0xDC00 || c1 > 0xDFFF) return false;

        // combine the two 10-bit payloads and re-add the supplementary plane offset
        op((((c0 - 0xD800) << 10) | (c1 - 0xDC00)) + 0x10000);
    }

    return true;
}
141+
142+
// Encode a range of Unicode code points as UTF-16 code units.
//
// itr, end - half-open range of code points
// op       - callable invoked once per emitted UTF-16 code unit, passed as a uint32_t
//
// Returns true when every code point in the range was encoded. Returns false
// as soon as a value that is not a Unicode scalar value is met (a surrogate
// code point, or anything above U+10FFFF); code units for the preceding code
// points have already been handed to op.
template<typename Iterator, class Func>
bool encode_utf16(Iterator itr, Iterator end, Func op)
{
    while (itr != end)
    {
        const uint32_t c = static_cast<uint32_t>(*itr++);

        if (c <= 0xD7FF || (c >= 0xE000 && c <= 0xFFFF)) // single code unit (BMP) character
        {
            op(c);
            continue;
        }

        // Below 0x10000 only the surrogate range is left, which is not encodable on its own.
        // Above 0x10FFFF the value does not fit in a surrogate pair; masking it would silently
        // produce the encoding of a different character, so reject it instead.
        if (c < 0x10000 || c > 0x10FFFF) return false;

        // split the 20-bit offset into two 10-bit halves
        const uint32_t offset = c - 0x10000;
        op(0xD800 + (offset >> 10));   // high surrogate
        op(0xDC00 + (offset & 0x3FF)); // low surrogate
    }

    return true;
}
166+
111167
void vsg::convert_utf(const std::string& utf8, std::wstring& dst)
112168
{
113169
dst.clear();
114-
decode_utf8(utf8.begin(), utf8.size(), [&dst](uint32_t c) { dst.push_back(c); });
170+
if constexpr (std::numeric_limits<wchar_t>::max() == 0xFFFF)
171+
decode_utf8(utf8.begin(), utf8.size(), [&dst](uint32_t c) { encode_utf16(&c, (&c) + 1, [&dst](uint32_t cu) { dst.push_back(cu); }); });
172+
else
173+
decode_utf8(utf8.begin(), utf8.size(), [&dst](uint32_t c) { dst.push_back(c); });
115174
}
116175

117176
void vsg::convert_utf(const std::wstring& src, std::string& utf8)
118177
{
119178
utf8.clear();
120-
encode_utf8(src.begin(), src.end(), [&utf8](uint32_t c) { utf8.push_back(static_cast<char>(c)); });
179+
if constexpr (std::numeric_limits<wchar_t>::max() == 0xFFFF)
180+
{
181+
std::u32string intermediate;
182+
decode_utf16(src.begin(), src.size(), [&intermediate](char32_t c) { intermediate.push_back(c); });
183+
encode_utf8(intermediate.begin(), intermediate.end(), [&utf8](char32_t c) { utf8.push_back(static_cast<char>(c)); });
184+
}
185+
else
186+
encode_utf8(src.begin(), src.end(), [&utf8](uint32_t c) { utf8.push_back(static_cast<char>(c)); });
121187
}

0 commit comments

Comments
 (0)