Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

use UTF-8 for XML #584

Merged
merged 27 commits into from
Aug 10, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
0a10c9c
Squashed commit of the following:
Apr 18, 2022
c113aa8
Squashed commit of the following:
Apr 18, 2022
667e747
Squashed commit of the following:
Apr 18, 2022
7ada712
use str::EncodedString to hide str:u8_string messiness
Apr 18, 2022
636132b
Merge branch 'master' into develop/encoded-string
Apr 20, 2022
0fa5f80
Merge branch 'master' into develop/encoded-string
Apr 20, 2022
dbe44f2
Merge branch 'master' into develop/encoded-string
Jul 29, 2022
5e4caea
Squashed commit of the following:
Jul 29, 2022
63c895a
XML is now always UTF-8; no more legacy work-arounds
Jul 29, 2022
f6e7173
Squashed commit of the following:
Jul 29, 2022
5991021
XML is now always UTF-8
Jul 29, 2022
2fbe28b
Merge branch 'master' into develop/encoded-string
Aug 3, 2022
0ce6fc8
coda-oss updates
Aug 3, 2022
c7299c6
no more str::fromUtf8()
Aug 3, 2022
db2961a
Merge branch 'master' into develop/encoded-string
Aug 6, 2022
f1a6b77
Squashed commit of the following:
Aug 6, 2022
44f16e6
Merge branch 'master' into develop/encoded-string
Aug 9, 2022
62d5d31
Update ReleaseNotes.md
Aug 9, 2022
4a16fef
found more parseDataFromString() and toXMLString() routines that shou…
Aug 9, 2022
393939b
use std::u8string for XML
Aug 9, 2022
b040096
changing parseDataFromString() breaks pre-built Python bindings :-(
Aug 9, 2022
63378c5
restore previous toXMLString() overloads so existing SWIG bindings do…
Aug 9, 2022
49f4110
use toXMLString_() in existing Python bindings
Aug 9, 2022
a36c398
Merge branch 'master' into develop/encoded-string
Aug 10, 2022
618b082
Squashed commit of the following:
Aug 10, 2022
c2f81c6
send all XML through UTF-8 routines
Aug 10, 2022
d9c9ba5
remove remaining uses of io::StringStream for XML
Aug 10, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions externals/coda-oss/UnitTest/pch.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#define WIN32_LEAN_AND_MEAN // Exclude rarely-used stuff from Windows headers
#include <WinSock.h>
#include <windows.h>
#include <comdef.h>
#undef min
#undef max

Expand Down
58 changes: 47 additions & 11 deletions externals/coda-oss/modules/c++/str/source/Encoding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ inline void append(std::u32string& result, const coda_oss::u8string& utf8)
}

template<typename TChar>
static void fromWindows1252_(str::W1252string::value_type ch, std::basic_string<TChar>& result)
static void fromWindows1252_(str::W1252string::value_type ch, std::basic_string<TChar>& result, bool strict=false)
{
// ASCII is the same in UTF-8
if (ch < static_cast<str::W1252string::value_type>(0x80))
Expand All @@ -135,18 +135,40 @@ static void fromWindows1252_(str::W1252string::value_type ch, std::basic_string<
}

static const auto map = Windows1252_to_u8string();
const auto it = map.find(static_cast<std::u32string::value_type>(ch));
const auto ch32 = static_cast<std::u32string::value_type>(ch);
const auto it = map.find(ch32);
if (it != map.end())
{
append(result, it->second);
return;
}

// If the input text contains a character that isn't defined in Windows-1252; return a
// "replacement character." Yes, this will **corrupt** the input data as information is lost:
// https://en.wikipedia.org/wiki/Specials_(Unicode_block)#Replacement_character
static const coda_oss::u8string replacement_character = utf8_(0xfffd);
append(result, replacement_character);
switch (static_cast<uint8_t>(ch))
{
case 0x81:
case 0x8d:
case 0x8f:
case 0x90:
case 0x9d:
{
if (strict)
{
// If the input text contains a character that isn't defined in Windows-1252, return a
// "replacement character." Yes, this will **corrupt** the input data, as information is lost:
// https://en.wikipedia.org/wiki/Specials_(Unicode_block)#Replacement_character
static const coda_oss::u8string replacement_character = utf8_(0xfffd);
append(result, replacement_character);
}
else
{
// _bstr_t just preserves these values, do the same
append(result, utf8_(ch32));
}
break;
}
default:
throw std::invalid_argument("Invalid Windows-1252 character.");
}
}
template<typename TChar>
void windows1252_to_string_(str::W1252string::const_pointer p, size_t sz, std::basic_string<TChar>& result)
Expand Down Expand Up @@ -205,7 +227,7 @@ std::map<TValue, TKey> kv_to_vk(const std::map<TKey, TValue>& kv)
static void get_next_utf8_byte(coda_oss::u8string::const_pointer p, size_t sz,
size_t& i, coda_oss::u8string& utf8)
{
if (!(i + i < sz))
if (!(i + 1 < sz))
{
throw std::invalid_argument("No remaining bytes, invalid UTF-8 encoding.");
}
Expand All @@ -220,7 +242,7 @@ static void get_next_utf8_byte(coda_oss::u8string::const_pointer p, size_t sz,
utf8 += coda_oss::u8string{static_cast<coda_oss::u8string::value_type>(b)};
}
template<typename TChar>
static void utf8to1252(coda_oss::u8string::const_pointer p, size_t sz, std::basic_string<TChar>& result)
static void utf8to1252(coda_oss::u8string::const_pointer p, size_t sz, std::basic_string<TChar>& result, bool strict=false)
{
using value_type = typename std::basic_string<TChar>::value_type;
for (size_t i = 0; i < sz; i++)
Expand Down Expand Up @@ -254,10 +276,24 @@ static void utf8to1252(coda_oss::u8string::const_pointer p, size_t sz, std::basi
{
result += static_cast<value_type>(it->second);
}
else if (strict)
{
throw std::invalid_argument("UTF-8 sequence can't be converted to Windows-1252.");
//assert("UTF-8 sequence can't be converted to Windows-1252." && 0);
//result += static_cast<TChar>(0x7F); // <DEL>
}
else
{
assert("UTF-8 sequence can't be converted to Windows-1252." && 0);
result += static_cast<TChar>(0x7F); // <DEL>
// _bstr_t preserves these values
if (utf8.length() == 2)
{
result += static_cast<TChar>(utf8[1]);
}
else
{
assert("UTF-8 sequence can't be converted to Windows-1252." && 0);
result += static_cast<TChar>(0x7F); // <DEL>
}
}
}
}
Expand Down
Loading