From c71166f19ae0291e308f17d26cbf4c215dde8980 Mon Sep 17 00:00:00 2001
From: AnyOldName3
Date: Wed, 28 May 2025 17:49:22 +0100
Subject: [PATCH] Fix wstring conversion on Windows

On most Unices, wchar_t is 32 bits and typically holds UCS4 characters,
which are big enough for any Unicode code point.
On Windows, wchar_t is 16 bits and typically holds UTF-16 code units,
which sometimes need to be used in pairs.
This is because Windows introduced wchar_t before UCS4, UTF-8 and UTF-16
were invented, originally using it for UCS2 characters back when all of
Unicode fit in sixteen bits.

That meant that the existing string conversion code, which assumed
std::wstring was a fixed-width encoding, would only work on Windows for
the Basic Multilingual Plane (where UCS2 and UTF-16 are compatible), so
anything outside it wouldn't work.
---
Review notes (not part of the commit message):
 - encode_utf16 now rejects values above U+10FFFF instead of masking them
   into a surrogate pair that encodes a different character.
 - decode_utf16 widens each code unit to uint32_t before range-checking it.
 - The narrowing to wchar_t in convert_utf is explicit, the intermediate
   buffer is reserved up front, and the new helpers are documented.
 - The #include hunk is given without context lines.

 src/vsg/CMakeLists.txt     |  2 ++
 src/vsg/io/convert_utf.cpp | 92 ++++++++++++++++++++++++++++++++++++--
 2 files changed, 91 insertions(+), 3 deletions(-)

diff --git a/src/vsg/CMakeLists.txt b/src/vsg/CMakeLists.txt
index db7e0b93bf..e2898cddd0 100644
--- a/src/vsg/CMakeLists.txt
+++ b/src/vsg/CMakeLists.txt
@@ -376,6 +376,8 @@ if(MSVC)
         target_compile_definitions(vsg PUBLIC "_ITERATOR_DEBUG_LEVEL=0")
     endif()
 
+    # MSVC assumes system-wide eight-bit code page (e.g. CP1252, CP1250) if there's no BOM
+    target_compile_options(vsg PRIVATE "/utf-8")
 endif()
 
 
diff --git a/src/vsg/io/convert_utf.cpp b/src/vsg/io/convert_utf.cpp
index ed39ffb190..e1336f5785 100644
--- a/src/vsg/io/convert_utf.cpp
+++ b/src/vsg/io/convert_utf.cpp
@@ -1,6 +1,6 @@
 /*
 
-Copyright(c) 2022 Robert Osfield
+Copyright(c) 2022-2025 Robert Osfield, Chris Djali
 
 Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 
@@ -15,0 +16 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLI
+#include <limits>
@@ -108,14 +109,99 @@ bool encode_utf8(Iterator itr, Iterator end, Func op)
     return true;
 }
 
+// Decode `count` UTF-16 code units starting at `itr`, calling op(code_point) for
+// each Unicode scalar value decoded. Returns false at the first unpaired
+// surrogate (or value that cannot be a UTF-16 code unit); code points decoded
+// before the error have already been passed to op.
+template<typename Iterator, typename Func>
+bool decode_utf16(Iterator itr, size_t count, Func op)
+{
+    while (count > 0)
+    {
+        // widen first so the range checks behave the same for any code unit type
+        const uint32_t c0 = static_cast<uint32_t>(*itr++);
+        --count;
+
+        if (c0 <= 0xD7FF || (c0 >= 0xE000 && c0 <= 0xFFFF)) // 2-byte UCS2 character
+        {
+            op(c0);
+            continue;
+        }
+
+        // low surrogate without a preceding high one, a value too large for UTF-16,
+        // or a high surrogate at the end of the input
+        if (count == 0 || c0 >= 0xDC00) return false;
+
+        const uint32_t c1 = static_cast<uint32_t>(*itr++);
+        --count;
+        if (c1 < 0xDC00 || c1 > 0xDFFF) return false; // unpaired surrogate
+
+        // 4-byte surrogate pair
+        op((((c0 - 0xD800) << 10) | (c1 - 0xDC00)) + 0x10000);
+    }
+
+    return true;
+}
+
+// Encode the Unicode code points in [itr, end) as UTF-16, calling op(code_unit)
+// once per code unit. Returns false at the first value that is not a Unicode
+// scalar value (a surrogate, or anything above U+10FFFF); code units produced
+// before the error have already been passed to op.
+template<typename Iterator, typename Func>
+bool encode_utf16(Iterator itr, Iterator end, Func op)
+{
+    while (itr != end)
+    {
+        const uint32_t c = *itr++;
+        if (c <= 0xD7FF || (c >= 0xE000 && c <= 0xFFFF)) // 2-byte UCS2 character
+        {
+            op(c);
+            continue;
+        }
+
+        // surrogates are not characters, and nothing above U+10FFFF fits in a
+        // surrogate pair - masking would silently emit the wrong character
+        if (c < 0x10000 || c > 0x10FFFF) return false;
+
+        // 4-byte surrogate pair
+        const uint32_t offset = c - 0x10000;
+        op(0xD800 + (offset >> 10)); // high surrogate
+        op(0xDC00 + (offset & 0x3FF)); // low surrogate
+    }
+
+    return true;
+}
+
 void vsg::convert_utf(const std::string& utf8, std::wstring& dst)
 {
     dst.clear();
-    decode_utf8(utf8.begin(), utf8.size(), [&dst](uint32_t c) { dst.push_back(c); });
+    if constexpr (std::numeric_limits<wchar_t>::max() == 0xFFFF)
+    {
+        // 16-bit wchar_t: std::wstring holds UTF-16, so code points beyond the BMP need surrogate pairs
+        decode_utf8(utf8.begin(), utf8.size(), [&dst](uint32_t c) {
+            encode_utf16(&c, (&c) + 1, [&dst](uint32_t cu) { dst.push_back(static_cast<wchar_t>(cu)); });
+        });
+    }
+    else
+    {
+        // wider wchar_t: one element per code point
+        decode_utf8(utf8.begin(), utf8.size(), [&dst](uint32_t c) { dst.push_back(static_cast<wchar_t>(c)); });
+    }
 }
 
 void vsg::convert_utf(const std::wstring& src, std::string& utf8)
 {
     utf8.clear();
-    encode_utf8(src.begin(), src.end(), [&utf8](uint32_t c) { utf8.push_back(static_cast<char>(c)); });
+    if constexpr (std::numeric_limits<wchar_t>::max() == 0xFFFF)
+    {
+        // 16-bit wchar_t: combine surrogate pairs into code points before encoding as UTF-8
+        std::u32string intermediate;
+        intermediate.reserve(src.size()); // never more code points than UTF-16 code units
+        decode_utf16(src.begin(), src.size(), [&intermediate](char32_t c) { intermediate.push_back(c); });
+        encode_utf8(intermediate.begin(), intermediate.end(), [&utf8](char32_t c) { utf8.push_back(static_cast<char>(c)); });
+    }
+    else
+    {
+        encode_utf8(src.begin(), src.end(), [&utf8](uint32_t c) { utf8.push_back(static_cast<char>(c)); });
+    }
 }