Skip to content
This repository was archived by the owner on Oct 14, 2024. It is now read-only.

Commit bb1bca2

Browse files
committed
Rewrote EU4Parser sideproject in C++
This should probably be a separate repo, or at least a separate branch. Oh well. It's just me on the project, so who cares. I don't think anybody even reads these, and I rarely do because I'm the one who wrote them. Overall, an approximately 40% time reduction on my machine (6.4s to 3.8s on a 1444 save). Adds a custom C++ dict-like class (EU4Dict) that allows multiple entries with the same key. Between EU4Dict and using py::list, we avoid the problems std::variant has with recursive variants, meaning we can have groups nested within each other indefinitely. I'm sure that a lot more performance can be gained by refining the C++ code. - Large substrings mean a lot of copying. If we were to use pointers to just refer to smaller sections of the original string, that may significantly improve performance. - There's probably a better system for EU4Dict, such as a bucket system and/or custom containers. - Not sure how much of an issue this is, but long longs are used to accommodate the large numbers in save files, even though most numbers are much smaller. However, that's mostly a memory concern, and once Python gets hold of the data it'll be huge anyway. - Should we be using Python lists?
1 parent 670794f commit bb1bca2

File tree

3 files changed

+204
-39
lines changed

3 files changed

+204
-39
lines changed

.vscode/launch.json

+7-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,13 @@
44
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
55
"version": "0.2.0",
66
"configurations": [
7-
7+
{
8+
"name": "Python: Current File",
9+
"type": "python",
10+
"request": "launch",
11+
"program": "${file}",
12+
"console": "integratedTerminal"
13+
},
814
{
915
"name": "Debug EU4Bot",
1016
"type": "python",

EU4Parser.py

+1-38
Original file line numberDiff line numberDiff line change
@@ -155,43 +155,6 @@
155155
}
156156
"""
157157

158-
eu4Types = Union[str, int, float, EU4cppparser.EU4Date, List[str], dict]
159-
160-
161-
def parseGroup(group: List[str]) -> Union[List[eu4Types], dict]:
162-
"""
163-
Parses the text of a list of string items (either values or key-value pairs)
164-
165-
Returns either a list or a dict of
166-
"""
167-
if len(group) == 0:
168-
return []
169-
elif "=" in group[0] and ("{" not in group[0] or (group[0].index("=") < group[0].index("{"))):
170-
# It's a dict.
171-
dictGroup = {}
172-
for item in group:
173-
key, value = item.split("=", maxsplit=1)
174-
dictGroup[key] = parseType(value)
175-
return dictGroup
176-
else:
177-
# It's a list.
178-
return list(map(parseType, group))
179-
180-
181-
def parseType(text: str) -> eu4Types:
182-
text = text.strip()
183-
if text.isdigit(): # int
184-
return int(text)
185-
elif text.isdecimal(): # float
186-
return float(text)
187-
elif EU4cppparser.EU4Date.stringValid(text): # date
188-
return EU4cppparser.EU4Date(text)
189-
elif text[0] == "{" and text[-1] == "}": # group
190-
# The string starts and ends with {} so we need to remove that for splitting
191-
return parseGroup(EU4cppparser.splitStrings(text[1:-1]))
192-
else: # str
193-
return text.strip("\"")
194-
195158

196159
def formatFix(text: str) -> str:
197160
return text.replace("map_area_data{", "map_area_data={").replace("EU4txt", "")
@@ -207,7 +170,7 @@ def formatFix(text: str) -> str:
207170
starttime = time.time()
208171
count = 10
209172
for i in range(count):
210-
parseGroup(EU4cppparser.splitStrings(text))
173+
EU4cppparser.parseValue("{"+text+"}")
211174
print(f"Finished {i + 1}/{count}")
212175
totaltime = time.time() - starttime
213176
print(f"Parsing: {totaltime/count}s. | {totaltime/len(text)/count}s/char")

EU4cppparser.cpp

+196
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,108 @@ std::string trim(const std::string &text) {
4545
return "";
4646
}
4747

48+
typedef std::variant<std::string, long long, float, EU4Date> EU4Key;
49+
class EU4Dict;
50+
typedef std::variant<std::string, long long, float, EU4Date, py::list, EU4Dict> EU4Value;
51+
52+
typedef std::pair<EU4Key, EU4Value> VKPair;
53+
/**
54+
* This is like a dict that can have multiple of the same key, since EU4 save files can do so.
55+
* https://docs.python.org/3/library/stdtypes.html#typesmapping
56+
*/
57+
class EU4Dict {
58+
std::vector<VKPair> data;
59+
60+
public:
61+
EU4Dict() : data(std::vector<VKPair>()) {}
62+
63+
EU4Dict(const std::map<EU4Key, EU4Value> dict) : data(std::vector<VKPair>()) {
64+
for (const std::pair<EU4Key, EU4Value> &pair : dict) {
65+
data.emplace_back(pair);
66+
}
67+
}
68+
69+
std::vector<VKPair> allPairs() const {
70+
return data;
71+
}
72+
73+
VKPair getPair(size_t &index) const {
74+
return data[index];
75+
}
76+
77+
void setPair(size_t &index, VKPair pair) {
78+
data[index] = pair;
79+
}
80+
/* Comment out so there is no overloading to mess up pybind11
81+
void setPair(size_t &index, EU4Key key, EU4Value value) {
82+
data[index] = VKPair(key, value);
83+
}
84+
*/
85+
86+
std::vector<EU4Value> getAll(const EU4Key &key) const {
87+
std::vector<EU4Value> values = std::vector<EU4Value>();
88+
for (size_t i = 0; i < data.size(); ++i) {
89+
if (data[i].first == key)
90+
values.emplace_back(data[i].second);
91+
}
92+
return values;
93+
}
94+
EU4Value getFirst(const EU4Key &key) const {
95+
for (size_t i = 0; i < data.size(); ++i) {
96+
if (data[i].first == key)
97+
return data[i].second;
98+
}
99+
return nullptr;
100+
}
101+
EU4Value getLast(const EU4Key &key) const {
102+
for (size_t i = data.size() - 1; i > 0; --i) {
103+
if (data[i].first == key)
104+
return data[i].second;
105+
}
106+
return nullptr;
107+
}
108+
109+
size_t length() const {
110+
return data.size();
111+
}
112+
113+
void add(EU4Key key, EU4Value value) {
114+
data.emplace_back(VKPair(key, value));
115+
}
116+
117+
// Kinda inefficient because it copies before deleting
118+
VKPair popBack() {
119+
VKPair out = data.back();
120+
data.pop_back();
121+
return out;
122+
}
123+
124+
std::string toString() {
125+
std::string s = "{";
126+
for (size_t i = 0; i < data.size(); ++i) {
127+
// TODO: All this casting is bad. Not sure if there's a better way to do this.
128+
s += std::string(py::str(py::cast(data[i].first))) + ": " + std::string(py::str(py::cast(data[i].second)));
129+
if (i != data.size() - 1)
130+
s += ", ";
131+
}
132+
return s + "}";
133+
}
134+
135+
bool operator==(const EU4Dict &other) const {
136+
return data == other.data;
137+
}
138+
bool operator!=(const EU4Dict &other) const {
139+
return !(*this == other);
140+
}
141+
142+
VKPair operator[](size_t index) const {
143+
return data[index];
144+
}
145+
VKPair &operator[](size_t index) {
146+
return data[index];
147+
}
148+
};
149+
48150
std::list<std::string> splitStrings(const std::string &text) {
49151
std::list<std::string> out;
50152
unsigned char bracketCount = 0;
@@ -71,6 +173,82 @@ std::list<std::string> splitStrings(const std::string &text) {
71173
return out;
72174
}
73175

176+
/**
 * Parses the text of a key into the most specific type it can represent.
 *
 * The text is passed through trim() and then classified by its characters alone:
 *  - only digits                         -> long long
 *  - digits with exactly one '.'         -> float
 *  - digits with exactly two '.'         -> EU4Date
 *  - anything else, or no digits at all  -> std::string (the trimmed text)
 *
 * A sign character is neither a digit nor a '.', so text such as "-5" is returned as a string.
 * std::stoll / std::stof may still throw std::out_of_range for values too large to represent.
 */
EU4Key parseKey(const std::string &text) {
    const std::string trimmed = trim(text);
    unsigned char dots = 0;
    bool sawDigit = false;
    // Classify the same string that gets converted below; scanning the untrimmed text would
    // reject a number because of characters that trim() removes.
    for (const char c : trimmed) {
        if (c == '.') {
            if (++dots > 2)
                return trimmed; // STRING
        } else if (std::isdigit(static_cast<unsigned char>(c))) {
            // The cast matters: passing a negative char to std::isdigit is undefined behaviour.
            sawDigit = true;
        } else {
            return trimmed; // STRING
        }
    }
    // Empty text or text made only of dots cannot be converted to a number, so keep it as a string.
    if (!sawDigit)
        return trimmed; // STRING
    // So at this point we have only run into 0, 1, or 2 '.' and the rest of the characters have been digits
    if (dots == 0)
        return std::stoll(trimmed); // INT
    else if (dots == 1)
        return std::stof(trimmed); // FLOAT
    else // (dots == 2)
        return EU4Date(trimmed); // DATE
}
195+
196+
/**
 * Parses the text of a value into an EU4Value.
 *
 * The text is passed through trim() first. If the result is wrapped in '{' ... '}', the inside
 * is split with splitStrings() and parsed recursively:
 *  - no items                                        -> empty py::list
 *  - first item has a '=' before any '{'             -> EU4Dict of key=value items
 *  - otherwise                                       -> py::list of values
 * Any other text is classified by its characters alone:
 *  - only digits                         -> long long
 *  - digits with exactly one '.'         -> float
 *  - digits with exactly two '.'         -> EU4Date
 *  - anything else, or no digits at all  -> std::string (the trimmed text)
 *
 * A sign character is neither a digit nor a '.', so text such as "-5" is returned as a string.
 * std::stoll / std::stof may still throw std::out_of_range for values too large to represent.
 */
EU4Value parseValue(const std::string &text) {
    const std::string trimmed = trim(text);
    // First check for a group. The test is made on the trimmed text (the same string the
    // substr below is taken from) and requires both braces to be present.
    if (trimmed.size() >= 2 && trimmed.front() == '{' && trimmed.back() == '}') {
        const std::list<std::string> items = splitStrings(trimmed.substr(1, trimmed.size() - 2));
        if (items.empty())
            return py::list();

        // If we run into a '=' before a potential '{' then it's a dict, and may or may not be a dict with dict keys
        // If we run into a '{' before '=' then it's a list of groups but not a dict. The groups may be dicts.
        const std::string &first = items.front();
        const size_t marker = first.find_first_of("={");
        const bool isDict = marker != std::string::npos && first[marker] == '=';

        if (isDict) {
            EU4Dict dict = EU4Dict();
            for (const std::string &item : items) {
                // If an item has no '=', find returns npos: substr(0, npos) is the whole item and
                // npos + 1 wraps to 0, so key and value are both parsed from the whole item.
                const size_t eqIndex = item.find('=');
                dict.add(parseKey(item.substr(0, eqIndex)), parseValue(item.substr(eqIndex + 1)));
            }
            return dict;
        }

        py::list list = py::list();
        for (const std::string &item : items) {
            list.append(parseValue(item));
        }
        return list;
    }
    // Next check int/float/date
    unsigned char dots = 0;
    bool sawDigit = false;
    for (const char c : trimmed) {
        if (c == '.') {
            if (++dots > 2)
                return trimmed; // STRING
        } else if (std::isdigit(static_cast<unsigned char>(c))) {
            // The cast matters: passing a negative char to std::isdigit is undefined behaviour.
            sawDigit = true;
        } else {
            return trimmed; // STRING
        }
    }
    // Empty text or text made only of dots cannot be converted to a number, so keep it as a string.
    if (!sawDigit)
        return trimmed; // STRING
    // So at this point we have only run into 0, 1, or 2 '.' and the rest of the characters have been digits
    if (dots == 0)
        return std::stoll(trimmed); // INT
    else if (dots == 1)
        return std::stof(trimmed); // FLOAT
    else // (dots == 2)
        return EU4Date(trimmed); // DATE
}
251+
74252
PYBIND11_MODULE(EU4cppparser, m) {
75253
m.doc() = "EU4 parser C++ library.";
76254

@@ -91,6 +269,24 @@ PYBIND11_MODULE(EU4cppparser, m) {
91269
.def("isEU4Date", &EU4Date::isEU4Date)
92270
.def_static("stringValid", &EU4Date::stringValid, py::arg("text"));
93271

272+
py::class_<EU4Dict>(m, "EU4Dict")
273+
.def(py::init<>())
274+
.def(py::init<const std::map<EU4Key, EU4Value> &>(), py::arg("map"))
275+
.def("allPairs", &EU4Dict::allPairs)
276+
.def("__getitem__", &EU4Dict::getPair, py::arg("index"))
277+
.def("__setitem__", &EU4Dict::setPair)
278+
.def("getAll", &EU4Dict::getAll, py::arg("key"))
279+
.def("getFirst", &EU4Dict::getFirst, py::arg("key"))
280+
.def("getLast", &EU4Dict::getLast, py::arg("key"))
281+
.def("__len__", &EU4Dict::length)
282+
.def("append", &EU4Dict::add, py::arg("key"), py::arg("value"))
283+
.def("popBack", &EU4Dict::popBack)
284+
.def("__repr__", &EU4Dict::toString)
285+
.def(py::self == py::self)
286+
.def(py::self != py::self);
287+
288+
m.def("parseValue", &parseValue, "Parses a value.", py::arg("text"));
289+
94290
m.def("isEmpty", &isEmpty, "Returns true if this string does not contain characters other than whitespace.", py::arg("text"));
95291
m.def("splitStrings", &splitStrings, py::arg("text"));
96292
}

0 commit comments

Comments
 (0)