-
Notifications
You must be signed in to change notification settings - Fork 0
/
analyze.py
385 lines (293 loc) · 13.6 KB
/
analyze.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
import pickle, os, math
from collections import defaultdict
# Local libraries, used so that this file isn't a million lines long
from loader import DataLoader
from entry import Entry
from visualize import Visualizer
from indicators import Indicators
# Salary thresholds. Completely arbitrary for now; they will be replaced by
# values derived from real data.
LOW_SALARY = 20000
HIGH_SALARY = 60000

# Lookup tables between coded numbers and their human-readable values.
# Keys are variable names (the first element of each mapTypes row); each value
# is a dict mapping a number to its human-readable label.
maps = {}

# The variables we care about, one row per variable:
#   [variable name, col_start - 1, col_end]
# The final "income" row carries a name only, with no column range.
mapTypes = [
    ["serial", 16, 28],
    ["gq", 36, 38],
    ["regnro", 38, 39],
    ["ownership", 39, 40],
    ["rooms", 43, 45],
    ["hhtype", 48, 50],
    ["ro2011a_livarea", 50, 53],
    ["pernum", 53, 57],
    ["momloc", 65, 68],
    ["parrule", 68, 70],
    ["stepmom", 70, 71],
    ["age", 71, 74],
    ["sex", 74, 75],
    ["marst", 75, 76],
    ["chborn", 79, 81],
    ["religion", 81, 82],
    ["educro", 86, 89],
    ["empstat", 89, 90],
    ["eempstat", 93, 96],
    ["indgen", 96, 99],
    ["hrsmain", 99, 102],
    ["ro2011a_ethnic", 102, 104],
    ["income"],
]
def getEntriesWithProps(entries, props):
    """Return a list of the entries for which ``entry.hasProps(props)`` is truthy.

    The result keeps the order of ``entries``. ``props`` is handed to each
    entry's ``hasProps`` unchanged.
    """
    return [entry for entry in entries if entry.hasProps(props)]
def calculateDistribution(entries, variable):
    """Print and return, per Bucharest sector, the share of each value of ``variable``.

    For each of the six hard-coded sector names the entries whose "location"
    property matches are selected. For every label in ``maps[variable]`` the
    fraction of that sector's entries carrying the label is computed. Non-zero
    fractions are printed from highest to lowest, followed by a blank line.

    Args:
        entries: entry objects accepted by ``getEntriesWithProps``.
        variable: variable name; must be a key of the module-level ``maps``.

    Returns:
        dict mapping each sector name to ``{label: fraction}``, holding only
        the non-zero fractions. A sector without entries maps to an empty dict.

    Raises:
        KeyError: if ``variable`` is not a key of ``maps``.
    """
    regions = [f"Bucharest Sector {n}" for n in range(1, 7)]
    labels = list(maps[variable].values())

    distributions = {}
    for region in regions:
        print(f"Region: {region}")
        # Get all entries within that region
        in_region = getEntriesWithProps(entries, {"location": region})

        shares = {}
        # A sector with no entries has no meaningful shares; skipping it here
        # avoids dividing by zero.
        if in_region:
            region_total = len(in_region)
            for label in labels:
                matching = getEntriesWithProps(in_region, {variable: label})
                fraction_of_pop = len(matching) / region_total
                # Only keep labels that actually occur in this region
                if fraction_of_pop > 0:
                    shares[label] = fraction_of_pop
        distributions[region] = shares

        # Report by share, highest to lowest
        ranked = sorted(shares.items(), key=lambda item: item[1], reverse=True)
        for label, fraction in ranked:
            print(f"{variable}: {label}\t\t\t\t\t\t% Pop: {fraction*100:.3f}")
        print()

    return distributions
########################################
# BASE VARIABLES FOR ARMAS CALCULATION #
########################################
# See indicators.py
########################################
########################################
# INDICATOR PROCESSING #
########################################
# indicatorMap = {"elderly": 0,
# "female": 1,
# "children": 2,
# "widows": 3,
# "wage_earners": 4,
# "minimum_education": 5,
# "women_with_3_children": 6,
# "unemployed": 7,
# ""}
def getIndicators(entries, all):
    """Compute the 16 indicator values for one group of entries.

    The values are returned in a fixed order, the same order as the indicator
    column names that ``analyze`` writes after its label and count columns:

        Social
         0  countElderly / number of entries
         1  countWomen / number of entries
         2  countChildren / number of entries
         3  countWidows / countWomen
         4  countEconomicallyActive / countHouseholds
         5  countWithMinimumEducation / number of entries
         6  countWomenWith3Children / countWomen
        Economic
         7  countUnemployed / number of entries
         8  countLowIncome / number of entries
         9  countHighIncomeWomen / countWomen
        10  countHighIncomeMen / countMen
        Housing
        11  number of entries / countRooms
        12  countTotalLivingArea / countRooms
        13  number of entries / countHouseholds
        14  countPrivateWith5Rooms / countPrivateHomes
        15  countTotalLivingArea / number of entries

    Housing density is skipped on purpose: it is 1 / (average room area per
    person), which is already covered by indicator 15.

    Args:
        entries: sized collection of entry objects passed to the
            ``Indicators`` counting helpers.
        all: unused. Kept so existing callers keep working (note that it
            shadows the builtin ``all`` inside this function).

    Returns:
        list of 16 numbers. A ratio whose denominator is zero (for example a
        group with no men, or no privately owned homes) is reported as
        ``float("nan")`` instead of raising ZeroDivisionError.
    """
    def ratio(numerator, denominator):
        # Undefined ratios become NaN so one empty sub-population does not
        # abort the whole analysis run.
        return numerator / denominator if denominator else float("nan")

    # Denominators shared by several indicators are counted once.
    population = len(entries)
    women = Indicators.countWomen(entries)
    men = Indicators.countMen(entries)
    households = Indicators.countHouseholds(entries)
    rooms = Indicators.countRooms(entries)
    living_area = Indicators.countTotalLivingArea(entries)

    return [
        ### SOCIAL ###
        # 0: Ratio elderly
        ratio(Indicators.countElderly(entries), population),
        # 1: Ratio female
        ratio(women, population),
        # 2: Ratio children
        ratio(Indicators.countChildren(entries), population),
        # 3: Ratio widows in female population
        ratio(Indicators.countWidows(entries), women),
        # 4: Average wage earners per household
        ratio(Indicators.countEconomicallyActive(entries), households),
        # 5: Minimum level of education
        ratio(Indicators.countWithMinimumEducation(entries), population),
        # 6: Ratio of women with 3 children and more to women.
        # Some reports use 3, others 5; 3 was chosen here.
        ratio(Indicators.countWomenWith3Children(entries), women),
        ### ECONOMIC ###
        # 7: Ratio of unemployed
        ratio(Indicators.countUnemployed(entries), population),
        # 8: Ratio of low income
        ratio(Indicators.countLowIncome(entries), population),
        # 9: Ratio of high income women
        ratio(Indicators.countHighIncomeWomen(entries), women),
        # 10: Ratio of high income men
        ratio(Indicators.countHighIncomeMen(entries), men),
        ### HOUSING ###
        # 11: Room occupancy per household
        ratio(population, rooms),
        # 12: Average household room area
        ratio(living_area, rooms),
        # 13: Household population density
        ratio(population, households),
        # 14: Avg no. of private/owned households w/ 5 or more rooms
        ratio(Indicators.countPrivateWith5Rooms(entries),
              Indicators.countPrivateHomes(entries)),
        # 15: Average room area per person
        ratio(living_area, population),
    ]
########################################
########################################
### INTERFACE MODES ###
########################################
# Run armas analysis and save data to indicators.csv
def analyze(min_entries=20):
    """Run the indicator analysis and save the results to ``indicators.csv``.

    One row is written for every religion label and then every ethnicity label
    (taken from the module-level ``maps``) that has more than ``min_entries``
    matching people in the module-level ``entries_in_bucharest``. Each row
    holds the label, the number of matching entries, and the values returned
    by ``getIndicators``. Smaller groups are skipped and reported on stdout.

    Rows are written with ``csv.writer`` so that a label containing a comma or
    a quote is quoted instead of silently shifting the columns.

    Args:
        min_entries: a group needs strictly more than this many entries to be
            analysed. Defaults to 20.
    """
    # Local import keeps this block self-contained.
    import csv

    cols = ["", "Count", "Ratio elderly", "Ratio female", "Ratio children",
            "Ratio widows in female population",
            "Average wage earners per household",
            "Minimum level of education",
            "Ratio of women with 3 children and more to women",
            "Ratio of unemployed", "Ratio of low income",
            "Ratio of high income women", "Ratio of high income men",
            "Room occupancy per household", "Average household room area",
            "Household population density",
            "Avg no. of private/owned households w/ 5 or more rooms",
            "Average room area per person"]

    print(maps["religion"])

    # newline="" is what the csv module expects; lineterminator="\n" keeps the
    # "\n" line endings the file was previously written with.
    with open("indicators.csv", "w", newline="") as f:
        writer = csv.writer(f, lineterminator="\n")
        writer.writerow(cols)

        # Religions first, then ethnicities, one row per sufficiently large group
        for variable in ("religion", "ro2011a_ethnic"):
            for label in maps[variable].values():
                entries = [p for p in entries_in_bucharest
                           if p.getProp(variable) == label]
                if len(entries) > min_entries:
                    indicators = getIndicators(entries, entries_in_bucharest)
                    writer.writerow([label, len(entries), *indicators])
                    print(f"Got indicators for {label}")
                else:
                    print(f"Skipped {label}: only {len(entries)} entries "
                          f"(need more than {min_entries})")

    print("Wrote to indicators.csv, go see if it worked!")
# Run the interactive data explorer
def interactive():
    """Interactive explorer: count the Bucharest entries that match a variable value.

    Repeatedly asks for a variable name and then a value, and prints how many
    entries of the module-level ``entries_in_bucharest`` have that value, both
    as a count and as a percentage of all entries.

    Variable prompt: "?" lists the names in ``mapTypes``, "q" leaves the loop.
    Value prompt: "?" lists the labels in ``maps[variable]``, "all" prints the
    statistics for every label in turn. Any invalid input restarts the loop at
    the variable prompt.
    """
    def count_matches(name, value):
        # Number of Bucharest entries whose property `name` matches `value`
        return len([person for person in entries_in_bucharest
                    if person.hasProps({name: value})])

    while True:
        var_name = input("Variable name (? for all, q to quit): ")
        if var_name == "q":
            break

        variable_names = [spec[0] for spec in mapTypes]
        if var_name == "?":
            for name in variable_names:
                print(name)
            continue
        if var_name not in variable_names:
            print("Invalid variable name.")
            continue

        val = input("Value (? for possible values): ")
        labels = maps[var_name].values()
        if val == "?":
            for label in labels:
                print(label)
            continue

        if val == "all":
            # Statistics for every known label of this variable
            print(f"All % for {var_name}:")
            for label in labels:
                print(label)
                hits = count_matches(var_name, label)
                print(f"\tTotal count: {hits}")
                print(f"\tTotal percent: {hits/len(entries_in_bucharest)*100:.3f}%")
        elif val in labels:
            hits = count_matches(var_name, val)
            print(f"Total count: {hits}")
            print(f"Total percent: {hits/len(entries_in_bucharest)*100:.3f}%")
        else:
            print("Invalid value.")
# Run an interactive plotting tool
def graph(entries_in_bucharest):
    """Interactive plotting loop over one religion or ethnicity group.

    Each round asks for a group ("r" for religion, "e" for ethnicity, "q" to
    quit), then for a variable named in ``mapTypes``, then for a graph type.
    The only graph type handled is "histogram". For the "hrsmain" variable the
    group is first narrowed to entries whose "eempstat" is "Employed".

    NOTE(review): ``plt`` and ``getNumericHistogramVariable`` are not defined
    or imported in the visible part of this file, so the histogram branch
    looks like it raises NameError as written. Confirm where they are meant
    to come from.

    Args:
        entries_in_bucharest: iterable of entry objects exposing ``getProp``.
    """
    # mode -> (prompt shown to the user, property compared against the answer)
    selectors = {
        "r": ("Which religion: ", "religion"),
        "e": ("Which ethnicity: ", "ro2011a_ethnic"),
    }

    while True:
        mode = input("Religion or ethnicity? (r or e, q to quit): ")
        if mode == "q":
            break

        title = ""
        entries = []
        if mode in selectors:
            prompt, prop = selectors[mode]
            wanted = input(prompt)
            title = wanted
            entries = [e for e in entries_in_bucharest if e.getProp(prop) == wanted]

        # Reached for an unknown mode as well as for a group with no members
        if not entries:
            print("Invalid identifier, or empty set. Try again.")
            continue

        variable = input("Which variable: ")
        known_variables = [spec[0] for spec in mapTypes]
        if variable not in known_variables:
            print("Invalid variable.")
            continue

        if variable == "hrsmain":
            entries = [e for e in entries if e.getProp("eempstat") == "Employed"]
        title += f": {variable} (n={len(entries)})"

        graph_type = input("What type of graph (help for all types): ")
        if graph_type == "help":
            print("Supported types: histogram")
        elif graph_type == "histogram":
            data = getNumericHistogramVariable(entries, variable)
            plt.hist(data, bins=25, color="royalblue")
            plt.title(title)
            plt.ylabel("Count")
            plt.xlabel(f"{variable.capitalize()}")
            plt.show()
        else:
            print("Invalid graph type.")
def graphAge():
    """Unfinished stub: builds two local label lists and returns None.

    Neither list is used yet. Nothing is plotted.
    """
    religions = ["Jewish", ""]
    ethnicities = ["Romanian", "Armenian", "Ukranian"]
########################################
# All lines from the data file (only filled when the cache has to be rebuilt)
lines = []
entries_in_bucharest = []
# Every entry created from the data file. Stays empty when the Bucharest
# entries are loaded from the pickle cache instead.
all_people = []
entries_by_pernum = defaultdict(dict)
maps = DataLoader.loadMaps(mapTypes)

if os.path.exists("bucharest_entries.dat"):
    print("Data has been processed, loading .dat")
    # NOTE: unpickling can execute arbitrary code. Only load a cache file
    # that this script wrote itself.
    with open("bucharest_entries.dat", "rb") as cache:
        entries_in_bucharest = pickle.load(cache)
else:
    print("Loading from IPUMS data")
    with open("ipumsi_00006.dat") as data:
        lines = data.readlines()
    print(f"Data length: {len(lines)}")
    # Create an entry for each data line
    all_people = [Entry.createEntry(line, mapTypes, maps) for line in lines]
    print("Created data set. Pulling all in Bucharest...")
    entries_in_bucharest = [p for p in all_people if p.hasProps({"regnro": "Bucharest"})]
    # Cache the Bucharest subset so later runs can skip the parsing above
    with open("bucharest_entries.dat", "wb") as cache:
        pickle.dump(entries_in_bucharest, cache)
    print("Pulled all Bucharest data. Entering interactive terminal to check different parameters for those in Bucharest")

print(f"Entries in bucharest: {len(entries_in_bucharest)}")

# Create dict indexed by pernum, for speedy lookups when connecting mother and children
# NOTE(review): entries sharing a pernum overwrite each other here. Confirm
# that pernum is unique across the data set before relying on this index.
for person in entries_in_bucharest:
    entries_by_pernum[person.getProp("pernum")] = person

while True:
    mode = input("What would you like to do? Type `help` for all modes: ")
    if mode == "help":
        print("Different modes: help, analyze, graph, interactive, age, unemployment, "
              "income, education, density, median, ethnicity, religion, quit")
    elif mode in ("q", "quit"):
        # Gives the loop a clean exit; previously only an interrupt ended it
        break
    elif mode == "analyze":
        analyze()
    elif mode == "graph":
        graph(entries_in_bucharest)
    elif mode == "interactive":
        interactive()
    elif mode == "age":
        Visualizer.ageDistribution(entries_in_bucharest, ["Romanian", "Jewish", "Armenian", "Roma", "Muslim"])
    elif mode == "unemployment":
        Visualizer.unemploymentChart(entries_in_bucharest, maps, 5)
    elif mode == "income":
        Visualizer.incomeDistribution(entries_in_bucharest)
    elif mode == "education":
        Visualizer.educationDistribution(entries_in_bucharest, ["Roma", "Unknown ethnicity", "Aromanian", "Muslim", "Romanian"])
    elif mode == "density":
        Visualizer.densityChart(entries_in_bucharest, maps, 5)
    elif mode == "median":
        print("Calculating income median across the country.")
        if not all_people:
            # all_people is only filled when the data file is parsed, so there
            # is nothing to compute after a cache load.
            print("Country-wide data is not loaded (entries came from bucharest_entries.dat). "
                  "Delete that file and restart to compute the median.")
            continue
        print("Extracting incomes")
        all_incomes = (int(p.getProp("income")) for p in all_people)
        incomes = [income for income in all_incomes if income > 0]
        if not incomes:
            print("No positive incomes found.")
            continue
        print("Sorting")
        incomes.sort()
        # Upper median: the element at index floor(n / 2) of the sorted list
        print(f"Median income: {incomes[len(incomes) // 2]}")
    elif mode == "ethnicity":
        Visualizer.popChart(entries_in_bucharest, maps)
    elif mode == "religion":
        Visualizer.popChart(entries_in_bucharest, maps, category="religion")
    else:
        print("Try again.")