Rewrote EU4Parser sideproject in C++

2kai2kai2 · 2kai2kai2 · commit bb1bca290c24 · 2021-02-24T23:26:00.000-08:00
This should probably be a separate repo or at least a separate branch. Oh well. It's just me on the project so who cares. I don't think anybody even reads these and I rarely do because I'm the one who wrote it.
Overall, an approximately 40% time reduction on my machine (6.4s to 3.8s on a 1444 save).
Added a custom C++ dict-like class (EU4Dict) that allows multiple entries with the same key.
Between EU4Dict and using py::list, we avoid the problems in std::variant with recursive variants, meaning we can have groups nested within each other indefinitely.
I'm sure that a lot more performance can be gained by refining the C++ code:
- Large substrings mean a lot of copying. If we were to use pointers to just refer to smaller sections of the original string, that may significantly improve performance.
- There's probably a better system for EU4Dict such as a buckets system and/or custom containers.
- Not sure how much of an issue this is: long longs are used to accommodate the large numbers in save files, even though most numbers are much smaller. However, that mostly affects memory, and once Python gets ahold of the data it'll be huge anyway.
- Should we be using python lists?
diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -4,7 +4,13 @@
     // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
     "version": "0.2.0",
     "configurations": [
-
+        {
+            "name": "Python: Current File",
+            "type": "python",
+            "request": "launch",
+            "program": "${file}",
+            "console": "integratedTerminal"
+        },
         {
             "name": "Debug EU4Bot",
             "type": "python",
diff --git a/EU4Parser.py b/EU4Parser.py
@@ -155,43 +155,6 @@
 }
 """
 
-eu4Types = Union[str, int, float, EU4cppparser.EU4Date, List[str], dict]
-
-
-def parseGroup(group: List[str]) -> Union[List[eu4Types], dict]:
-    """
-    Parses the text of a list of string items (either values or key-value pairs)
-
-    Returns either a list or a dict of 
-    """
-    if len(group) == 0:
-        return []
-    elif "=" in group[0] and ("{" not in group[0] or (group[0].index("=") < group[0].index("{"))):
-        # It's a dict.
-        dictGroup = {}
-        for item in group:
-            key, value = item.split("=", maxsplit=1)
-            dictGroup[key] = parseType(value)
-        return dictGroup
-    else:
-        # It's a list.
-        return list(map(parseType, group))
-
-
-def parseType(text: str) -> eu4Types:
-    text = text.strip()
-    if text.isdigit():  # int
-        return int(text)
-    elif text.isdecimal():  # float
-        return float(text)
-    elif EU4cppparser.EU4Date.stringValid(text):  # date
-        return EU4cppparser.EU4Date(text)
-    elif text[0] == "{" and text[-1] == "}":  # group
-        # The string starts and ends with {} so we need to remove that for splitting
-        return parseGroup(EU4cppparser.splitStrings(text[1:-1]))
-    else:  # str
-        return text.strip("\"")
-
 
 def formatFix(text: str) -> str:
     return text.replace("map_area_data{", "map_area_data={").replace("EU4txt", "")
@@ -207,7 +170,7 @@ def formatFix(text: str) -> str:
 starttime = time.time()
 count = 10
 for i in range(count):
-    parseGroup(EU4cppparser.splitStrings(text))
+    EU4cppparser.parseValue("{"+text+"}")
     print(f"Finished {i + 1}/{count}")
 totaltime = time.time() - starttime
 print(f"Parsing: {totaltime/count}s. | {totaltime/len(text)/count}s/char")
diff --git a/EU4cppparser.cpp b/EU4cppparser.cpp
@@ -45,6 +45,108 @@ std::string trim(const std::string &text) {
     return "";
 }
 
+// Scalar types a key can be parsed into.
+using EU4Key = std::variant<std::string, long long, float, EU4Date>;
+// Forward declaration: EU4Value has to name EU4Dict before the class itself is defined.
+class EU4Dict;
+// Any parsed value: one of the scalar types, a Python list, or an EU4Dict.
+using EU4Value = std::variant<std::string, long long, float, EU4Date, py::list, EU4Dict>;
+
+// A single key/value entry.
+using VKPair = std::pair<EU4Key, EU4Value>;
+/**
+ * This is like a dict that can have multiple of the same key, since EU4 save files can do so.
+ * Entries are stored in insertion order in a flat vector, so every key lookup is a linear scan.
+ * https://docs.python.org/3/library/stdtypes.html#typesmapping
+ */
+class EU4Dict {
+    std::vector<VKPair> data;
+
+public:
+    /** Creates an empty dict. */
+    EU4Dict() = default;
+
+    /** Copies every entry of `dict`, in the map's iteration order. */
+    EU4Dict(const std::map<EU4Key, EU4Value> &dict) {
+        data.reserve(dict.size());
+        // `const auto &` binds directly to the map's pair<const Key, Value>. Spelling the element type as
+        // pair<Key, Value> would create a converted temporary copy of every element first.
+        for (const auto &pair : dict) {
+            data.emplace_back(pair.first, pair.second);
+        }
+    }
+
+    /** Returns a copy of all entries in insertion order. */
+    std::vector<VKPair> allPairs() const {
+        return data;
+    }
+
+    /**
+     * Returns a copy of the entry at `index`.
+     * Throws std::out_of_range (which pybind11 reports as IndexError) when `index >= length()`
+     * instead of reading past the end of the vector.
+     */
+    VKPair getPair(size_t index) const {
+        return data.at(index);
+    }
+
+    /** Replaces the entry at `index`. Throws std::out_of_range when `index >= length()`. */
+    void setPair(size_t index, VKPair pair) {
+        data.at(index) = std::move(pair);
+    }
+    // A setPair(index, key, value) overload is deliberately left out so there is no overloading to mess up pybind11.
+
+    /** Returns copies of the values of every entry whose key equals `key`, in insertion order. */
+    std::vector<EU4Value> getAll(const EU4Key &key) const {
+        std::vector<EU4Value> values;
+        for (const VKPair &entry : data) {
+            if (entry.first == key)
+                values.emplace_back(entry.second);
+        }
+        return values;
+    }
+
+    /**
+     * Returns the value of the first entry whose key equals `key`.
+     * Throws py::key_error when there is no such entry. EU4Value has no "empty" alternative, so a
+     * null pointer cannot stand in for "missing": it would have to be converted into one of the value types.
+     */
+    EU4Value getFirst(const EU4Key &key) const {
+        for (const VKPair &entry : data) {
+            if (entry.first == key)
+                return entry.second;
+        }
+        throw py::key_error("key not found in EU4Dict");
+    }
+
+    /**
+     * Returns the value of the last entry whose key equals `key`.
+     * Throws py::key_error when there is no such entry.
+     */
+    EU4Value getLast(const EU4Key &key) const {
+        // Reverse iterators visit index 0 as well and are safe on an empty vector; counting an unsigned
+        // index down from size() - 1 with `i > 0` would skip the first entry and wrap around when empty.
+        for (auto it = data.rbegin(); it != data.rend(); ++it) {
+            if (it->first == key)
+                return it->second;
+        }
+        throw py::key_error("key not found in EU4Dict");
+    }
+
+    /** Number of entries; duplicate keys are counted individually. */
+    size_t length() const {
+        return data.size();
+    }
+
+    /** Appends a new entry at the end. An existing entry with the same key is left untouched. */
+    void add(EU4Key key, EU4Value value) {
+        data.emplace_back(std::move(key), std::move(value));
+    }
+
+    /**
+     * Removes the last entry and returns it.
+     * Throws py::index_error when the dict is empty.
+     */
+    VKPair popBack() {
+        if (data.empty())
+            throw py::index_error("popBack from empty EU4Dict");
+        // Move the entry out rather than copying it; the moved-from slot is destroyed right after.
+        VKPair out = std::move(data.back());
+        data.pop_back();
+        return out;
+    }
+
+    /** Renders the entries as "{key: value, key: value}" using Python's str() of each key and value. */
+    std::string toString() const {
+        std::string s = "{";
+        for (size_t i = 0; i < data.size(); ++i) {
+            if (i != 0)
+                s += ", ";
+            // TODO: All this casting is bad. Not sure if there's a better way to do this.
+            s += std::string(py::str(py::cast(data[i].first)));
+            s += ": ";
+            s += std::string(py::str(py::cast(data[i].second)));
+        }
+        return s + "}";
+    }
+
+    /** Two dicts are equal when they hold equal entries in the same order. */
+    bool operator==(const EU4Dict &other) const {
+        return data == other.data;
+    }
+    bool operator!=(const EU4Dict &other) const {
+        return !(*this == other);
+    }
+
+    /** Unchecked positional access: `index` must be smaller than length(). */
+    VKPair operator[](size_t index) const {
+        return data[index];
+    }
+    VKPair &operator[](size_t index) {
+        return data[index];
+    }
+};
+
 std::list<std::string> splitStrings(const std::string &text) {
     std::list<std::string> out;
     unsigned char bracketCount = 0;
@@ -71,6 +173,82 @@ std::list<std::string> splitStrings(const std::string &text) {
     return out;
 }
 
+/**
+ * Parses the text of a key into its typed form.
+ *
+ * The text is trimmed first, then classified by its characters:
+ *   - digits only                 -> long long (std::stoll)
+ *   - digits with exactly one '.' -> float (std::stof)
+ *   - digits with exactly two '.' -> EU4Date
+ *   - anything else, including an empty string or text made only of dots -> the trimmed string
+ * A leading sign is not recognised, so text such as "-1" is returned as a string.
+ *
+ * std::stoll / std::stof can still throw std::out_of_range for numbers too large for the target type.
+ */
+EU4Key parseKey(const std::string &text) {
+    const std::string trimmed = trim(text);
+    unsigned char dots = 0;
+    bool sawDigit = false;
+    // Classify the trimmed text, not the raw input, so surrounding whitespace cannot turn a number into a string.
+    for (const char c : trimmed) {
+        if (c == '.') {
+            if (++dots > 2)
+                return trimmed; // STRING
+        } else if (std::isdigit(static_cast<unsigned char>(c))) {
+            // The cast matters: std::isdigit is undefined for negative char values (non-ASCII bytes).
+            sawDigit = true;
+        } else {
+            return trimmed; // STRING
+        }
+    }
+    // Empty text, "." and ".." contain no digit; std::stoll / std::stof would throw std::invalid_argument on them.
+    if (!sawDigit)
+        return trimmed; // STRING
+    // So at this point we have only run into 0, 1, or 2 '.' and the rest of the characters have been digits
+    if (dots == 0)
+        return std::stoll(trimmed); // INT
+    else if (dots == 1)
+        return std::stof(trimmed); // FLOAT
+    else                           // (dots == 2)
+        return EU4Date(trimmed);   // DATE
+}
+
+/**
+ * Parses the text of a value into its typed form, recursing into nested groups.
+ *
+ * The text is trimmed first. If it is wrapped in '{' ... '}' it is a group: the inside is split with
+ * splitStrings and becomes
+ *   - an empty py::list when there are no items,
+ *   - an EU4Dict when the first item contains '=' before any '{',
+ *   - a py::list of parsed items otherwise.
+ * Anything else is classified as a scalar:
+ *   - digits only                 -> long long (std::stoll)
+ *   - digits with exactly one '.' -> float (std::stof)
+ *   - digits with exactly two '.' -> EU4Date
+ *   - everything else, including an empty string or text made only of dots -> the trimmed string
+ * A leading sign is not recognised, so text such as "-1.000" is returned as a string.
+ *
+ * std::stoll / std::stof can still throw std::out_of_range for numbers too large for the target type.
+ */
+EU4Value parseValue(const std::string &text) {
+    const std::string trimmed = trim(text);
+    // First check for a group. The check uses the trimmed text (the same string that is sliced below) and
+    // requires both braces to be present, so empty input and a lone "{" fall through to the scalar branch.
+    if (trimmed.size() >= 2 && trimmed.front() == '{' && trimmed.back() == '}') {
+        const std::list<std::string> items = splitStrings(trimmed.substr(1, trimmed.size() - 2));
+        if (items.empty())
+            return py::list();
+
+        // If we run into a '=' before a potential '{' then it's a dict, and may or may not be a dict with dict keys
+        // If we run into a '{' before '=' (or neither) then it's a list; its elements may themselves be dicts.
+        const std::string &first = items.front();
+        const size_t marker = first.find_first_of("={");
+        const bool isDict = marker != std::string::npos && first[marker] == '=';
+
+        if (isDict) {
+            EU4Dict dict;
+            for (const std::string &item : items) {
+                // NOTE(review): an item without '=' makes find() return npos, so both substr calls
+                // below yield the whole item and it is stored as its own key and value.
+                const size_t eqIndex = item.find('=');
+                dict.add(parseKey(item.substr(0, eqIndex)), parseValue(item.substr(eqIndex + 1)));
+            }
+            return dict;
+        }
+
+        py::list list;
+        for (const std::string &item : items) {
+            list.append(parseValue(item));
+        }
+        return list;
+    }
+    // Next check int/float/date
+    unsigned char dots = 0;
+    bool sawDigit = false;
+    for (const char c : trimmed) {
+        if (c == '.') {
+            if (++dots > 2)
+                return trimmed; // STRING
+        } else if (std::isdigit(static_cast<unsigned char>(c))) {
+            // The cast matters: std::isdigit is undefined for negative char values (non-ASCII bytes).
+            sawDigit = true;
+        } else {
+            return trimmed; // STRING
+        }
+    }
+    // Empty text, "." and ".." contain no digit; std::stoll / std::stof would throw std::invalid_argument on them.
+    if (!sawDigit)
+        return trimmed; // STRING
+    // So at this point we have only run into 0, 1, or 2 '.' and the rest of the characters have been digits
+    if (dots == 0)
+        return std::stoll(trimmed); // INT
+    else if (dots == 1)
+        return std::stof(trimmed); // FLOAT
+    else                           // (dots == 2)
+        return EU4Date(trimmed);   // DATE
+}
+
 PYBIND11_MODULE(EU4cppparser, m) {
     m.doc() = "EU4 parser C++ library.";
 
@@ -91,6 +269,24 @@ PYBIND11_MODULE(EU4cppparser, m) {
         .def("isEU4Date", &EU4Date::isEU4Date)
         .def_static("stringValid", &EU4Date::stringValid, py::arg("text"));
 
+    py::class_<EU4Dict>(m, "EU4Dict")
+        .def(py::init<>())
+        .def(py::init<const std::map<EU4Key, EU4Value> &>(), py::arg("map"))
+        .def("allPairs", &EU4Dict::allPairs)
+        .def("__getitem__", &EU4Dict::getPair, py::arg("index"))
+        .def("__setitem__", &EU4Dict::setPair)
+        .def("getAll", &EU4Dict::getAll, py::arg("key"))
+        .def("getFirst", &EU4Dict::getFirst, py::arg("key"))
+        .def("getLast", &EU4Dict::getLast, py::arg("key"))
+        .def("__len__", &EU4Dict::length)
+        .def("append", &EU4Dict::add, py::arg("key"), py::arg("value"))
+        .def("popBack", &EU4Dict::popBack)
+        .def("__repr__", &EU4Dict::toString)
+        .def(py::self == py::self)
+        .def(py::self != py::self);
+
+    m.def("parseValue", &parseValue, "Parses a value.", py::arg("text"));
+
     m.def("isEmpty", &isEmpty, "Returns true if this string does not contain characters other than whitespace.", py::arg("text"));
     m.def("splitStrings", &splitStrings, py::arg("text"));
 }