From 56b0750b3279afaa09ff870cf4d7a9b680513abf Mon Sep 17 00:00:00 2001
From: Spartan322
Date: Wed, 31 Jul 2024 20:39:48 -0400
Subject: [PATCH] =?UTF-8?q?Add=20`\x8F`=20to=20`=C4=98`=20conversion=20for?=
 =?UTF-8?q?=20Windows-1252=20=09To=20support=20special=20vanilla=20Polish?=
 =?UTF-8?q?=20TODOs=20that=20break=20utf8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add CSV Win1252->Utf8 conversion tests
Fix map_value warning not triggering
Optimize for ascii characters

The ASCII fast path in the conversion loops is a do/while that always
bumps at least one character and treats NUL as ASCII (>= 0). A plain
`while (... > 0)` skip loop never bumps a NUL byte, nor a non-ASCII
byte for which try_parse_map() returns a pass value, so the outer
`while (reader.peek() != eof)` loop would stop making progress.

NOTE(review): the `< 0` / `>= 0` checks assume the reader's character
type is signed; confirm for targets where plain char is unsigned.
---
 src/openvic-dataloader/detail/Convert.hpp | 114 ++++++-----
 tests/src/csv/Parser.cpp                  | 222 ++++++++++++++++++++++
 2 files changed, 290 insertions(+), 46 deletions(-)

diff --git a/src/openvic-dataloader/detail/Convert.hpp b/src/openvic-dataloader/detail/Convert.hpp
index 5d9fca0..6f8f279 100644
--- a/src/openvic-dataloader/detail/Convert.hpp
+++ b/src/openvic-dataloader/detail/Convert.hpp
@@ -23,28 +23,22 @@
 #include "v2script/ParseState.hpp"
 
 namespace ovdl::convert {
-	struct MappedChar {
-		char value;
-		std::string_view utf8;
-
-		constexpr bool is_invalid() const { return value == 0; }
-		constexpr bool is_pass() const { return value == 1; }
-	};
-	constexpr MappedChar invalid_map { 0, "" };
-	constexpr MappedChar pass_map { 1, "" };
-
 	struct map_value {
 		std::string_view _value;
 
 		constexpr map_value() noexcept : _value("") {}
-		constexpr map_value(std::nullptr_t) noexcept : _value("\0") {}
+		constexpr map_value(std::nullptr_t) noexcept : _value("\0", 1) {}
 		constexpr explicit map_value(std::string_view val) noexcept : _value(val) {}
 
-		constexpr bool is_invalid() const {
+		static constexpr map_value invalid_value() noexcept {
+			return map_value(nullptr);
+		}
+
+		constexpr bool is_invalid() const noexcept {
 			return !_value.empty() && _value[0] == '\0';
 		}
 
-		constexpr bool is_pass() const {
+		constexpr bool is_pass() const noexcept {
 			return _value.empty();
 		}
 
@@ -203,13 +197,19 @@ namespace ovdl::convert {
 			.map<'\xFC'>("ü")
 			.map<'\xFD'>("ý")
 			.map<'\xFE'>("þ")
-			.map<'\xFF'>("ÿ");
+			.map<'\xFF'>("ÿ")
+
+			// Paradox being special: '\x8F' is not a valid Windows-1252 byte
+			// Used for (semantically incorrect) Polish localization TODOs
+			.map<'\x8F'>("Ę");
 
 		template
 		static constexpr map_value try_parse(Reader& reader) {
 			auto index = map.try_parse(reader);
 			if (index) {
 				return map_value(map[index]);
+			} else if (*reader.position() < 0) {
+				return map_value::invalid_value();
 			}
 			return {};
 		}
@@ -358,6 +358,8 @@ namespace ovdl::convert {
 			auto index = map.try_parse(reader);
 			if (index) {
 				return map_value(map[index]);
+			} else if (*reader.position() < 0) {
+				return map_value::invalid_value();
 			}
 			return {};
 		}
@@ -405,6 +407,11 @@ namespace ovdl::convert {
 						break;
 					// Skip Ascii and Utf8 encoding
 					default: {
+						// Non-negative (ASCII range) characters are never looked up in the map
+						if (c >= CharT {}) {
+							break;
+						}
+
 						map_value val = {};
 						CharT char_array[] { c, CharT() };
 						auto input = lexy::range_input(&char_array[0], &char_array[1]);
@@ -454,19 +461,24 @@ namespace ovdl::convert {
 				auto begin = reader.position();
 				auto last_it = begin;
 				while (reader.peek() != eof) {
-					map_value val = try_parse_map(state.encoding(), reader);
+					// Only negative (non-ASCII) characters are run through the encoding map
+					if (*reader.position() < 0) {
+						map_value val = try_parse_map(state.encoding(), reader);
+
+						if (val.is_invalid()) {
+							Error::on_invalid_character(state, reader);
+							reader.bump();
+							continue;
+						} else if (!val.is_pass()) {
+							result.append(val._value);
+							last_it = reader.position();
+							continue;
+						}
+					}
 
-					if (val.is_invalid()) {
-						Error::on_invalid_character(state, reader);
+					do {
 						reader.bump();
-						continue;
-					} else if (!val.is_pass()) {
-						result.append(val._value);
-						last_it = reader.position();
-						continue;
-					}
-
-					reader.bump();
+					} while (reader.peek() != eof && *reader.position() >= 0);
 					result.append(last_it, reader.position());
 					last_it = reader.position();
 				}
@@ -503,19 +515,24 @@ namespace ovdl::convert {
 				auto begin = reader.position();
 				auto last_it = begin;
 				while (reader.peek() != eof) {
-					map_value val = try_parse_map(state.encoding(), reader);
+					// Only negative (non-ASCII) characters are run through the encoding map
+					if (*reader.position() < 0) {
+						map_value val = try_parse_map(state.encoding(), reader);
+
+						if (val.is_invalid()) {
+							Error::on_invalid_character(state, reader);
+							reader.bump();
+							continue;
+						} else if (!val.is_pass()) {
+							result.append(val._value);
+							last_it = reader.position();
+							continue;
+						}
+					}
 
-					if (val.is_invalid()) {
-						Error::on_invalid_character(state, reader);
+					do {
 						reader.bump();
-						continue;
-					} else if (!val.is_pass()) {
-						result.append(val._value);
-						last_it = reader.position();
-						continue;
-					}
-
-					reader.bump();
+					} while (reader.peek() != eof && *reader.position() >= 0);
 					result.append(last_it, reader.position());
 					last_it = reader.position();
 				}
@@ -550,19 +567,24 @@ namespace ovdl::convert {
 				auto begin = reader.position();
 				auto last_it = begin;
 				while (reader.peek() != eof) {
-					map_value val = try_parse_map(state.encoding(), reader);
+					// Only negative (non-ASCII) characters are run through the encoding map
+					if (*reader.position() < 0) {
+						map_value val = try_parse_map(state.encoding(), reader);
 
-					if (val.is_invalid()) {
-						Error::on_invalid_character(state, reader);
-						reader.bump();
-						continue;
-					} else if (!val.is_pass()) {
-						result.append(val._value);
-						last_it = reader.position();
-						continue;
+						if (val.is_invalid()) {
+							Error::on_invalid_character(state, reader);
+							reader.bump();
+							continue;
+						} else if (!val.is_pass()) {
+							result.append(val._value);
+							last_it = reader.position();
+							continue;
+						}
 					}
 
-					reader.bump();
+					do {
+						reader.bump();
+					} while (reader.peek() != eof && *reader.position() >= 0);
 					result.append(last_it, reader.position());
 					last_it = reader.position();
 				}
diff --git a/tests/src/csv/Parser.cpp b/tests/src/csv/Parser.cpp
index e72c02a..a55a01f 100644
--- a/tests/src/csv/Parser.cpp
+++ b/tests/src/csv/Parser.cpp
@@ -10,6 +10,7 @@
 #include
 #include
 #include
+#include
 #include
 
 using namespace ovdl;
@@ -568,4 +569,225 @@ TEST_CASE("CSV Parse", "[csv-parse]") {
 			}
 		}
 	}
+
+	SECTION("Score militaire;Militär;;Puntuación militar") {
+		static constexpr auto buffer = "Score militaire;Militär;;Puntuación militar"sv;
+		parser.load_from_string(buffer);
+
+		CHECK_PARSE();
+
+		const std::vector& line_list = parser.get_lines();
+		CHECK_FALSE(line_list.empty());
+		CHECK(ranges::size(line_list) == 1);
+
+		const LineObject& line = line_list.front();
+		CHECK_FALSE(line.empty());
+		CHECK(ranges::size(line) == 3);
+		CHECK(line.value_count() == 4);
+		CHECK(line.prefix_end() == 0);
+		CHECK(line.suffix_end() == 4);
+
+		for (const auto [index, val] : line | ranges::views::enumerate) {
+			CAPTURE(index);
+			CHECK_FALSE_OR_CONTINUE(val.second.empty());
+			switch (index) {
+				case 0:
+					CHECK_OR_CONTINUE(val.first == 0);
+					CHECK_OR_CONTINUE(val.second == "Score militaire"sv);
+					break;
+				case 1:
+					CHECK_OR_CONTINUE(val.first == 1);
+					CHECK_OR_CONTINUE(val.second == "Militär"sv);
+					break;
+				case 2:
+					CHECK_OR_CONTINUE(val.first == 3);
+					CHECK_OR_CONTINUE(val.second == "Puntuación militar"sv);
+					break;
+				default: CHECK_OR_CONTINUE(false); break;
+			}
+		}
+
+		CHECK(line.value_count() == 4);
+
+		for (const auto index : ranges::views::iota(size_t(0), line.value_count())) {
+			CAPTURE(index);
+			switch (index) {
+				case 0: CHECK_OR_CONTINUE(line.get_value_for(index) == "Score militaire"sv); break;
+				case 1: CHECK_OR_CONTINUE(line.get_value_for(index) == "Militär"sv); break;
+				case 2: CHECK_OR_CONTINUE(line.get_value_for(index) == ""sv); break;
+				case 3: CHECK_OR_CONTINUE(line.get_value_for(index) == "Puntuación militar"sv); break;
+				default: CHECK_OR_CONTINUE(false); break;
+			}
+		}
+	}
+
+	SECTION(";§RNo research set§W;§RAucune recherche définie§W;") {
+		static constexpr auto buffer = ";§RNo research set§W;§RAucune recherche définie§W;"sv;
+		parser.load_from_string(buffer);
+
+		CHECK_PARSE();
+
+		const std::vector& line_list = parser.get_lines();
+		CHECK_FALSE(line_list.empty());
+		CHECK(ranges::size(line_list) == 1);
+
+		const LineObject& line = line_list.front();
+		CHECK_FALSE(line.empty());
+		CHECK(ranges::size(line) == 2);
+		CHECK(line.value_count() == 3);
+		CHECK(line.prefix_end() == 1);
+		CHECK(line.suffix_end() == 3);
+
+		for (const auto [index, val] : line | ranges::views::enumerate) {
+			CAPTURE(index);
+			CHECK_FALSE_OR_CONTINUE(val.second.empty());
+			switch (index) {
+				case 0:
+					CHECK_OR_CONTINUE(val.first == 1);
+					CHECK_OR_CONTINUE(val.second == "§RNo research set§W"sv);
+					break;
+				case 1:
+					CHECK_OR_CONTINUE(val.first == 2);
+					CHECK_OR_CONTINUE(val.second == "§RAucune recherche définie§W"sv);
+					break;
+				default: CHECK_OR_CONTINUE(false); break;
+			}
+		}
+
+		CHECK(line.value_count() == 3);
+
+		for (const auto index : ranges::views::iota(size_t(0), line.value_count())) {
+			CAPTURE(index);
+			switch (index) {
+				case 0: CHECK_OR_CONTINUE(line.get_value_for(index) == ""sv); break;
+				case 1: CHECK_OR_CONTINUE(line.get_value_for(index) == "§RNo research set§W"sv); break;
+				case 2: CHECK_OR_CONTINUE(line.get_value_for(index) == "§RAucune recherche définie§W"sv); break;
+				default: CHECK_OR_CONTINUE(false); break;
+			}
+		}
+	}
+
+	SECTION("Württemberg;Wurtemberg;Württemberg;;Württemberg;") {
+		static constexpr auto buffer = "Württemberg;Wurtemberg;Württemberg;;Württemberg;"sv;
+		parser.load_from_string(buffer);
+
+		CHECK_PARSE();
+
+		const std::vector& line_list = parser.get_lines();
+		CHECK_FALSE(line_list.empty());
+		CHECK(ranges::size(line_list) == 1);
+
+		const LineObject& line = line_list.front();
+		CHECK_FALSE(line.empty());
+		CHECK(ranges::size(line) == 4);
+		CHECK(line.value_count() == 5);
+		CHECK(line.prefix_end() == 0);
+		CHECK(line.suffix_end() == 5);
+
+		for (const auto [index, val] : line | ranges::views::enumerate) {
+			CAPTURE(index);
+			CHECK_FALSE_OR_CONTINUE(val.second.empty());
+			switch (index) {
+				case 0:
+					CHECK_OR_CONTINUE(val.first == 0);
+					CHECK_OR_CONTINUE(val.second == "Württemberg"sv);
+					break;
+				case 1:
+					CHECK_OR_CONTINUE(val.first == 1);
+					CHECK_OR_CONTINUE(val.second == "Wurtemberg"sv);
+					break;
+				case 2:
+					CHECK_OR_CONTINUE(val.first == 2);
+					CHECK_OR_CONTINUE(val.second == "Württemberg"sv);
+					break;
+				case 3:
+					CHECK_OR_CONTINUE(val.first == 4);
+					CHECK_OR_CONTINUE(val.second == "Württemberg"sv);
+					break;
+				default: CHECK_OR_CONTINUE(false); break;
+			}
+		}
+
+		CHECK(line.value_count() == 5);
+
+		for (const auto index : ranges::views::iota(size_t(0), line.value_count())) {
+			CAPTURE(index);
+			switch (index) {
+				case 0: CHECK_OR_CONTINUE(line.get_value_for(index) == "Württemberg"sv); break;
+				case 1: CHECK_OR_CONTINUE(line.get_value_for(index) == "Wurtemberg"sv); break;
+				case 2: CHECK_OR_CONTINUE(line.get_value_for(index) == "Württemberg"sv); break;
+				case 3: CHECK_OR_CONTINUE(line.get_value_for(index) == ""sv); break;
+				case 4: CHECK_OR_CONTINUE(line.get_value_for(index) == "Württemberg"sv); break;
+				default: CHECK_OR_CONTINUE(false); break;
+			}
+		}
+	}
+
+	SECTION(";$NAME$ wurde in $PROV$ gebaut.;ID'\\8F' DO;") {
+		// The input is assembled at runtime from three pieces instead of one literal:
+		// workaround for a compiler bug found with Ubuntu 22's GCC-12 distribution
+		static constexpr auto cstring_left = ";$NAME$ wurde in $PROV$ gebaut.;ID";
+		static auto byte = '\x8F';
+		static constexpr auto cstring_right = " DO;";
+		static auto buffer = [&] {
+			static std::string_view cstring_array[] = {
+				cstring_left,
+				std::string_view { &byte, 1 },
+				cstring_right
+			};
+			static auto cstring = cstring_array | ranges::views::join;
+			std::string result;
+			result.reserve(cstring.size());
+			for (auto str : cstring) {
+				result.push_back(str);
+			}
+			return result;
+		}();
+		parser.load_from_string(buffer);
+
+		CHECK_PARSE();
+
+		const std::vector& line_list = parser.get_lines();
+		CHECK_FALSE(line_list.empty());
+		CHECK(ranges::size(line_list) == 1);
+
+		const LineObject& line = line_list.front();
+		CHECK_FALSE(line.empty());
+		CHECK(ranges::size(line) == 2);
+		CHECK(line.value_count() == 3);
+		CHECK(line.prefix_end() == 1);
+		CHECK(line.suffix_end() == 3);
+
+		for (const auto [index, val] : line | ranges::views::enumerate) {
+			CAPTURE(index);
+			CHECK_FALSE_OR_CONTINUE(val.second.empty());
+			switch (index) {
+				case 0:
+					CHECK_OR_CONTINUE(val.first == 1);
+					CHECK_OR_CONTINUE(val.second == "$NAME$ wurde in $PROV$ gebaut."sv);
+					break;
+				case 1:
+					CHECK_OR_CONTINUE(val.first == 2);
+					CHECK_OR_CONTINUE(val.second == "IDĘ DO"sv);
+					break;
+				case 2:
+					CHECK_OR_CONTINUE(val.first == 3);
+					CHECK_OR_CONTINUE(val.second == ""sv);
+					break;
+				default: CHECK_OR_CONTINUE(false); break;
+			}
+		}
+
+		CHECK(line.value_count() == 3);
+
+		for (const auto index : ranges::views::iota(size_t(0), line.value_count())) {
+			CAPTURE(index);
+			switch (index) {
+				case 0: CHECK_OR_CONTINUE(line.get_value_for(index) == ""sv); break;
+				case 1: CHECK_OR_CONTINUE(line.get_value_for(index) == "$NAME$ wurde in $PROV$ gebaut."sv); break;
+				case 2: CHECK_OR_CONTINUE(line.get_value_for(index) == "IDĘ DO"sv); break;
+				default: CHECK_OR_CONTINUE(false); break;
+			}
+		}
+	}
 }
\ No newline at end of file