Skip to content

Commit

Permalink
Add \x8F to Ę conversion for Windows-1252
Browse files Browse the repository at this point in the history
	To support special vanilla Polish TODOs that break UTF-8

Add CSV Win1252->Utf8 conversion tests

Fix map_value warning not triggering
Optimize for ASCII characters
  • Loading branch information
Spartan322 committed Aug 1, 2024
1 parent 8472800 commit 421c450
Show file tree
Hide file tree
Showing 2 changed files with 270 additions and 44 deletions.
110 changes: 66 additions & 44 deletions src/openvic-dataloader/detail/Convert.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,28 +23,22 @@
#include "v2script/ParseState.hpp"

namespace ovdl::convert {
/// Aggregate holding a `char` marker together with a `utf8` string view.
/// A `value` of 0 flags the entry as invalid; a `value` of 1 flags it as "pass".
struct MappedChar {
	char value;
	std::string_view utf8;

	/// True when `value` carries the invalid marker (0).
	constexpr bool is_invalid() const {
		return '\0' == value;
	}

	/// True when `value` carries the pass marker (1).
	constexpr bool is_pass() const {
		return '\x01' == value;
	}
};

/// Sentinel for which `is_invalid()` holds; its `utf8` view is empty.
constexpr MappedChar invalid_map { '\0', "" };
/// Sentinel for which `is_pass()` holds; its `utf8` view is empty.
constexpr MappedChar pass_map { '\x01', "" };

struct map_value {
std::string_view _value;

constexpr map_value() noexcept : _value("") {}
constexpr map_value(std::nullptr_t) noexcept : _value("\0") {}
constexpr map_value(std::nullptr_t) noexcept : _value("\0", 1) {}
constexpr explicit map_value(std::string_view val) noexcept : _value(val) {}

constexpr bool is_invalid() const {
static constexpr map_value invalid_value() noexcept {
return map_value(nullptr);
}

constexpr bool is_invalid() const noexcept {
return !_value.empty() && _value[0] == '\0';
}

constexpr bool is_pass() const {
constexpr bool is_pass() const noexcept {
return _value.empty();
}

Expand Down Expand Up @@ -203,13 +197,19 @@ namespace ovdl::convert {
.map<'\xFC'>("ü")
.map<'\xFD'>("ý")
.map<'\xFE'>("þ")
.map<'\xFF'>("ÿ");
.map<'\xFF'>("ÿ")

// Paradox being special: 0x8F is not a valid Windows-1252 character, but it is
// used in (semantically incorrect) Polish localization TODOs
.map<'\x8F'>("Ę");

/// Tries to convert the input at the reader's current position through `map`.
///
/// @param reader Reader handed to `map.try_parse`; it is also dereferenced at its
///               current position when no mapping matches.
/// @return The mapped value when `map.try_parse` yields a truthy index,
///         `map_value::invalid_value()` when nothing matched and the current code
///         unit lies outside the 7-bit ASCII range, and a default-constructed
///         `map_value` otherwise.
///
/// NOTE(review): presumably callers never invoke this at end of input, since the
/// position is dereferenced unconditionally on the no-match path — confirm.
template<typename Reader>
static constexpr map_value try_parse(Reader& reader) {
	auto index = map.try_parse(reader);
	if (index) {
		return map_value(map[index]);
	}

	// Plain `char` is unsigned on some targets, where a `< 0` test can never fire.
	// Widening to an unsigned type sign-extends negative code units, so any value
	// outside 0x00-0x7F ends up above 0x7F regardless of the signedness of the
	// reader's character type.
	if (static_cast<unsigned long long>(*reader.position()) > 0x7FULL) {
		return map_value::invalid_value();
	}
	return {};
}
Expand Down Expand Up @@ -358,6 +358,8 @@ namespace ovdl::convert {
auto index = map.try_parse(reader);
if (index) {
return map_value(map[index]);
} else if (*reader.position() < 0) {
return map_value::invalid_value();
}
return {};
}
Expand Down Expand Up @@ -405,6 +407,11 @@ namespace ovdl::convert {
break;
// Skip Ascii and Utf8 encoding
default: {
// If within ASCII range
if (c >= CharT {}) {
break;
}

map_value val = {};
CharT char_array[] { c, CharT() };
auto input = lexy::range_input(&char_array[0], &char_array[1]);
Expand Down Expand Up @@ -454,19 +461,24 @@ namespace ovdl::convert {
auto begin = reader.position();
auto last_it = begin;
while (reader.peek() != eof) {
map_value val = try_parse_map(state.encoding(), reader);
// If not within ASCII range
if (*reader.position() < 0) {
map_value val = try_parse_map(state.encoding(), reader);

if (val.is_invalid()) {
Error::on_invalid_character(state, reader);
reader.bump();
continue;
} else if (!val.is_pass()) {
result.append(val._value);
last_it = reader.position();
continue;
}
}

if (val.is_invalid()) {
Error::on_invalid_character(state, reader);
while (reader.peek() != eof && *reader.position() > 0) {
reader.bump();
continue;
} else if (!val.is_pass()) {
result.append(val._value);
last_it = reader.position();
continue;
}

reader.bump();
result.append(last_it, reader.position());
last_it = reader.position();
}
Expand Down Expand Up @@ -503,19 +515,24 @@ namespace ovdl::convert {
auto begin = reader.position();
auto last_it = begin;
while (reader.peek() != eof) {
map_value val = try_parse_map(state.encoding(), reader);
// If not within ASCII range
if (*reader.position() < 0) {
map_value val = try_parse_map(state.encoding(), reader);

if (val.is_invalid()) {
Error::on_invalid_character(state, reader);
reader.bump();
continue;
} else if (!val.is_pass()) {
result.append(val._value);
last_it = reader.position();
continue;
}
}

if (val.is_invalid()) {
Error::on_invalid_character(state, reader);
while (reader.peek() != eof && *reader.position() > 0) {
reader.bump();
continue;
} else if (!val.is_pass()) {
result.append(val._value);
last_it = reader.position();
continue;
}

reader.bump();
result.append(last_it, reader.position());
last_it = reader.position();
}
Expand Down Expand Up @@ -550,19 +567,24 @@ namespace ovdl::convert {
auto begin = reader.position();
auto last_it = begin;
while (reader.peek() != eof) {
map_value val = try_parse_map(state.encoding(), reader);
// If not within ASCII range
if (*reader.position() < 0) {
map_value val = try_parse_map(state.encoding(), reader);

if (val.is_invalid()) {
Error::on_invalid_character(state, reader);
reader.bump();
continue;
} else if (!val.is_pass()) {
result.append(val._value);
last_it = reader.position();
continue;
if (val.is_invalid()) {
Error::on_invalid_character(state, reader);
reader.bump();
continue;
} else if (!val.is_pass()) {
result.append(val._value);
last_it = reader.position();
continue;
}
}

reader.bump();
while (reader.peek() != eof && *reader.position() > 0) {
reader.bump();
}
result.append(last_it, reader.position());
last_it = reader.position();
}
Expand Down
204 changes: 204 additions & 0 deletions tests/src/csv/Parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -568,4 +568,208 @@ TEST_CASE("CSV Parse", "[csv-parse]") {
}
}
}

// Loads one line made of four ';'-separated fields, the third of which is empty,
// then verifies the stored entries and the positional lookup of every value.
SECTION("Score militaire;Militär;;Puntuación militar") {
static constexpr auto buffer = "Score militaire;Militär;;Puntuación militar"sv;
parser.load_from_string(buffer);

CHECK_PARSE();

// Exactly one parsed line is expected.
const std::vector<LineObject>& line_list = parser.get_lines();
CHECK_FALSE(line_list.empty());
CHECK(ranges::size(line_list) == 1);

const LineObject& line = line_list.front();
CHECK_FALSE(line.empty());
// Three entries are iterable while four value positions are reported.
CHECK(ranges::size(line) == 3);
CHECK(line.value_count() == 4);
CHECK(line.prefix_end() == 0);
CHECK(line.suffix_end() == 4);

// Each iterated entry is checked as (position, text); the expected positions
// are 0, 1 and 3, i.e. position 2 (the empty field) is not iterated.
for (const auto [index, val] : line | ranges::views::enumerate) {
CAPTURE(index);
CHECK_FALSE_OR_CONTINUE(val.second.empty());
switch (index) {
case 0:
CHECK_OR_CONTINUE(val.first == 0);
CHECK_OR_CONTINUE(val.second == "Score militaire"sv);
break;
case 1:
CHECK_OR_CONTINUE(val.first == 1);
CHECK_OR_CONTINUE(val.second == "Militär"sv);
break;
case 2:
CHECK_OR_CONTINUE(val.first == 3);
CHECK_OR_CONTINUE(val.second == "Puntuación militar"sv);
break;
// Any further entry is unexpected.
default: CHECK_OR_CONTINUE(false); break;
}
}

CHECK(line.value_count() == 4);

// Positional lookup covers every position, including the empty one at index 2.
for (const auto index : ranges::views::iota(size_t(0), line.value_count())) {
CAPTURE(index);
switch (index) {
case 0: CHECK_OR_CONTINUE(line.get_value_for(index) == "Score militaire"sv); break;
case 1: CHECK_OR_CONTINUE(line.get_value_for(index) == "Militär"sv); break;
case 2: CHECK_OR_CONTINUE(line.get_value_for(index) == ""sv); break;
case 3: CHECK_OR_CONTINUE(line.get_value_for(index) == "Puntuación militar"sv); break;
default: CHECK_OR_CONTINUE(false); break;
}
}
}

// Loads one line that starts and ends with ';' and whose two non-empty fields
// contain '§' markers, then verifies the stored entries and positional lookup.
SECTION(";§RNo research set§W;§RAucune recherche définie§W;") {
static constexpr auto buffer = ";§RNo research set§W;§RAucune recherche définie§W;"sv;
parser.load_from_string(buffer);

CHECK_PARSE();

// Exactly one parsed line is expected.
const std::vector<LineObject>& line_list = parser.get_lines();
CHECK_FALSE(line_list.empty());
CHECK(ranges::size(line_list) == 1);

const LineObject& line = line_list.front();
CHECK_FALSE(line.empty());
// Two entries are iterable while three value positions are reported.
CHECK(ranges::size(line) == 2);
CHECK(line.value_count() == 3);
CHECK(line.prefix_end() == 1);
CHECK(line.suffix_end() == 3);

// Each iterated entry is checked as (position, text); the expected positions
// are 1 and 2, i.e. the empty leading field is not iterated.
for (const auto [index, val] : line | ranges::views::enumerate) {
CAPTURE(index);
CHECK_FALSE_OR_CONTINUE(val.second.empty());
switch (index) {
case 0:
CHECK_OR_CONTINUE(val.first == 1);
CHECK_OR_CONTINUE(val.second == "§RNo research set§W"sv);
break;
case 1:
CHECK_OR_CONTINUE(val.first == 2);
CHECK_OR_CONTINUE(val.second == "§RAucune recherche définie§W"sv);
break;
// Any further entry is unexpected.
default: CHECK_OR_CONTINUE(false); break;
}
}

CHECK(line.value_count() == 3);

// Positional lookup covers every position, including the empty one at index 0.
for (const auto index : ranges::views::iota(size_t(0), line.value_count())) {
CAPTURE(index);
switch (index) {
case 0: CHECK_OR_CONTINUE(line.get_value_for(index) == ""sv); break;
case 1: CHECK_OR_CONTINUE(line.get_value_for(index) == "§RNo research set§W"sv); break;
case 2: CHECK_OR_CONTINUE(line.get_value_for(index) == "§RAucune recherche définie§W"sv); break;
default: CHECK_OR_CONTINUE(false); break;
}
}
}

// Loads one line with five value positions, the fourth of which is empty, where
// several fields contain 'ü', then verifies the stored entries and positional lookup.
SECTION("Württemberg;Wurtemberg;Württemberg;;Württemberg;") {
static constexpr auto buffer = "Württemberg;Wurtemberg;Württemberg;;Württemberg;"sv;
parser.load_from_string(buffer);

CHECK_PARSE();

// Exactly one parsed line is expected.
const std::vector<LineObject>& line_list = parser.get_lines();
CHECK_FALSE(line_list.empty());
CHECK(ranges::size(line_list) == 1);

const LineObject& line = line_list.front();
CHECK_FALSE(line.empty());
// Four entries are iterable while five value positions are reported.
CHECK(ranges::size(line) == 4);
CHECK(line.value_count() == 5);
CHECK(line.prefix_end() == 0);
CHECK(line.suffix_end() == 5);

// Each iterated entry is checked as (position, text); the expected positions
// are 0, 1, 2 and 4, i.e. position 3 (the empty field) is not iterated.
for (const auto [index, val] : line | ranges::views::enumerate) {
CAPTURE(index);
CHECK_FALSE_OR_CONTINUE(val.second.empty());
switch (index) {
case 0:
CHECK_OR_CONTINUE(val.first == 0);
CHECK_OR_CONTINUE(val.second == "Württemberg"sv);
break;
case 1:
CHECK_OR_CONTINUE(val.first == 1);
CHECK_OR_CONTINUE(val.second == "Wurtemberg"sv);
break;
case 2:
CHECK_OR_CONTINUE(val.first == 2);
CHECK_OR_CONTINUE(val.second == "Württemberg"sv);
break;
case 3:
CHECK_OR_CONTINUE(val.first == 4);
CHECK_OR_CONTINUE(val.second == "Württemberg"sv);
break;
// Any further entry is unexpected.
default: CHECK_OR_CONTINUE(false); break;
}
}

CHECK(line.value_count() == 5);

// Positional lookup covers every position, including the empty one at index 3.
for (const auto index : ranges::views::iota(size_t(0), line.value_count())) {
CAPTURE(index);
switch (index) {
case 0: CHECK_OR_CONTINUE(line.get_value_for(index) == "Württemberg"sv); break;
case 1: CHECK_OR_CONTINUE(line.get_value_for(index) == "Wurtemberg"sv); break;
case 2: CHECK_OR_CONTINUE(line.get_value_for(index) == "Württemberg"sv); break;
case 3: CHECK_OR_CONTINUE(line.get_value_for(index) == ""sv); break;
case 4: CHECK_OR_CONTINUE(line.get_value_for(index) == "Württemberg"sv); break;
default: CHECK_OR_CONTINUE(false); break;
}
}
}

// Loads one line whose last non-empty field contains the raw byte 0x8F and
// verifies that the parsed text holds "Ę" in its place, both through the stored
// entries and through positional lookup.
SECTION(";$NAME$ wurde in $PROV$ gebaut.;ID\\x8F DO;") {
	static constexpr auto cstring = ";$NAME$ wurde in $PROV$ gebaut.;ID\x8F DO;";
	static constexpr auto buffer = std::string_view { cstring };
	parser.load_from_string(buffer);

	CHECK_PARSE();

	// Exactly one parsed line is expected.
	const std::vector<LineObject>& line_list = parser.get_lines();
	CHECK_FALSE(line_list.empty());
	CHECK(ranges::size(line_list) == 1);

	const LineObject& line = line_list.front();
	CHECK_FALSE(line.empty());
	// Two entries are iterable while three value positions are reported.
	CHECK(ranges::size(line) == 2);
	CHECK(line.value_count() == 3);
	CHECK(line.prefix_end() == 1);
	CHECK(line.suffix_end() == 3);

	// Each iterated entry is checked as (position, text); the expected positions
	// are 1 and 2. Only two entries are expected, so a third one is reported as
	// a failure by the default branch, matching the sibling sections.
	for (const auto [index, val] : line | ranges::views::enumerate) {
		CAPTURE(index);
		CHECK_FALSE_OR_CONTINUE(val.second.empty());
		switch (index) {
			case 0:
				CHECK_OR_CONTINUE(val.first == 1);
				CHECK_OR_CONTINUE(val.second == "$NAME$ wurde in $PROV$ gebaut."sv);
				break;
			case 1:
				CHECK_OR_CONTINUE(val.first == 2);
				CHECK_OR_CONTINUE(val.second == "IDĘ DO"sv);
				break;
			default: CHECK_OR_CONTINUE(false); break;
		}
	}

	CHECK(line.value_count() == 3);

	// Positional lookup covers every position, including the empty one at index 0.
	for (const auto index : ranges::views::iota(size_t(0), line.value_count())) {
		CAPTURE(index);
		switch (index) {
			case 0: CHECK_OR_CONTINUE(line.get_value_for(index) == ""sv); break;
			case 1: CHECK_OR_CONTINUE(line.get_value_for(index) == "$NAME$ wurde in $PROV$ gebaut."sv); break;
			case 2: CHECK_OR_CONTINUE(line.get_value_for(index) == "IDĘ DO"sv); break;
			default: CHECK_OR_CONTINUE(false); break;
		}
	}
}
}

0 comments on commit 421c450

Please sign in to comment.