-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdot_stat_learning.py
305 lines (241 loc) · 11.4 KB
/
dot_stat_learning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
#!/usr/bin/env python3
""" This file contains code trying to predict Direction of Trade communities from World Bank country data """
import pickle
import pandas as pd
import math
import matplotlib.pyplot as plt
import numpy as np
import pprint
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import confusion_matrix
import itertools
class NetworkLearning:
    """
    Predicts trade-network communities of countries from World Bank country features.

    On construction the per-year network information is loaded from
    'data/communities.pkl'; each year's entry is read as a dictionary with the keys
    'modularity', 'Average out degree', 'Number of edges', 'Number of nodes' and
    'communities'. Per-year targets (country -> community number) are derived from it.
    """

    # words ignored when fuzzy-matching country names between the two datasets
    _IGNORE_WORDS = ('Rep.', 'of', 'North', 'South', 'Republic', 'Democratic', 'and', '&', 'P.R.:', 'Middle',
                     'Islands', 'Dem.', 'the', 'Arab', 'Asia', 'French', 'China', 'Islamic', 'Africa',
                     'Other', 'St.', 'The', 'Kingdom', 'Central', 'Europe', 'East', 'West', 'PDR', 'People\'s',
                     'middle', 'New', 'Northern')

    def __init__(self):
        """
        Reads in network info from 'data/communities.pkl' to assign targets
        """
        # read in network info datafile
        # NOTE: pickle.load can execute arbitrary code; only load trusted data files
        with open('data/communities.pkl', 'rb') as f:
            self.network_info = pickle.load(f)
        # assign targets from data
        self.targets = self.create_country_targets()

    def plot_modularity_over_time(self):  # pragma: no cover
        """
        Saves plot of modularity of networks over time.
        :return: nothing, saves plot to 'plots/modularity.png'
        """
        # materialise the keys: a dict_keys view is not a sequence matplotlib can rely on
        years = list(self.network_info)
        mods = [info['modularity'] for info in self.network_info.values()]
        plt.plot(years, mods)
        plt.xlabel('Year')
        plt.ylabel('Modularity')
        plt.title('Modularity through time')
        plt.savefig('plots/modularity.png')

    def plot_degrees_through_time(self):  # pragma: no cover
        """
        Saves plot of average degree of networks with number of nodes & edges over time.
        :return: nothing, saves plot to 'plots/degrees.png'
        """
        years = list(self.network_info)
        infos = list(self.network_info.values())
        out_deg = [float(info['Average out degree']) for info in infos]
        edges = [int(info['Number of edges']) for info in infos]
        nodes = [int(info['Number of nodes']) for info in infos]

        fig, ax1 = plt.subplots()
        ax1.set_xlabel('Year')
        ax1.set_ylabel('Degree/Number nodes')
        # the series plotted is 'Average out degree', so label it as such
        ln1 = ax1.plot(years, out_deg, color='orange', label="Out Degree")
        ln2 = ax1.plot(years, nodes, color='red', label="Nodes")
        ax1.set_yticks(np.arange(30, 250, 20))

        ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis
        ax2.set_ylabel('Number edges')
        ln3 = ax2.plot(years, edges, color='blue', label="Edges")
        ax2.set_yticks(np.arange(3000, 33000, 3000))

        # one combined legend for the lines of both axes
        lns = ln1 + ln2 + ln3
        labs = [line.get_label() for line in lns]
        plt.legend(lns, labs)
        plt.title('Average degree and number of nodes/edges through time')
        plt.tight_layout()
        plt.savefig('plots/degrees.png')
        plt.close(fig)

    def create_country_targets(self):
        """
        Returns dictionary mapping years, then countries, to community number, built from self.network_info.
        :return: dictionary with years, then countries, as keys, and community numbers as values
        """
        comm_dict = {}
        for year, info in self.network_info.items():
            year_dict = {}
            # renumber communities from 0 in iteration order, whatever keys the data file uses
            for community, members in enumerate(info['communities'].values()):
                for country in members:
                    year_dict[country] = community
            comm_dict[year] = year_dict
        return comm_dict

    @staticmethod
    def _significant_words(name, ignore_words):
        """
        Splits a country name on spaces, drops ignored words, then strips commas from the rest.
        :param name: country name string
        :param ignore_words: collection of words to drop (compared before commas are stripped)
        :return: set of remaining words
        """
        return {word.replace(',', '') for word in name.split(' ') if word not in ignore_words}

    def identify_country_name_mapping(self, feat_dict, year):
        """
        Matches World Bank country names to IMF country names; returns matches and unmatched names.
        :param feat_dict: dictionary of countries and their features
        :param year: integer year for which to do name mapping
        :return: 1. dictionary mapping country names from feature dataset to target dataset
                 2. list of target country names not matched
                 3. list of feature country names not matched
        """
        ignore_words = frozenset(self._IGNORE_WORDS)
        year_targets = self.targets[year]
        # word sets of the target names do not depend on the feature country: build them once
        target_words = {name: self._significant_words(name, ignore_words) for name in year_targets}

        mapping = {}
        missing_features = []
        # map country names from features to targets
        for country in feat_dict:
            if country in year_targets:
                # perfect match
                mapping[country] = country
                continue
            # otherwise take the first target sharing at least one significant word
            feature_words = self._significant_words(country, ignore_words)
            match = next((name for name, words in target_words.items() if words & feature_words), None)
            if match is None:
                # if no matches found, add country to missing features
                missing_features.append(country)
            else:
                mapping[country] = match

        # get countries from targets with no match in features
        matched_targets = set(mapping.values())
        missing_targets = [name for name in year_targets if name not in matched_targets]
        return mapping, missing_targets, missing_features

    def predict_all_years(self, years=np.arange(1960, 2020, 5)):
        """
        Trains classifier for each year of data separately and plots mean cross-val score over time.
        Also plots the number of countries and features used per year.
        :param years: integer years for which to train classifier
        :return: nothing, saves plots to 'plots/cross_val.png' and 'plots/num_feats.png'
        """
        scores = []
        num_countries = []
        num_features = []
        for year in years:
            print('\n', year)
            score, shape = self.predict_communities(year)
            scores.append(score)
            # shape of the feature matrix is (num countries, num features)
            num_countries.append(shape[0])
            num_features.append(shape[1])

        # plot cross-val score
        plt.figure()
        plt.plot(years, scores)
        plt.xlabel('Year')
        plt.ylabel('Mean cross-val score')
        plt.ylim([0, 1])
        plt.title('Cross-validation score through time')
        plt.savefig('plots/cross_val.png')
        plt.close()

        # plot num features and countries
        plt.figure()
        plt.plot(years, num_countries, label='countries')
        plt.plot(years, num_features, label='features')
        plt.xlabel('Year')
        plt.ylabel('Number')
        plt.title('Number of countries and features through time')
        # without a legend the line labels are never shown
        plt.legend()
        plt.savefig('plots/num_feats.png')
        plt.close()

    @staticmethod
    def _normalize_rows(cm):
        """
        Divides each row of a matrix by its row sum; all-zero rows are left as zeros.
        :param cm: 2D array-like of counts
        :return: float np array of the same shape
        """
        cm = np.asarray(cm, dtype=float)
        row_sums = cm.sum(axis=1, keepdims=True)
        # a class absent from the test split has an empty row: avoid 0/0 -> nan
        row_sums[row_sums == 0] = 1.0
        return cm / row_sums

    def plot_confusion_matrix(self, year, X, y, classes, normalize=False):
        """
        This function plots the confusion matrix from a random forest trained on X, y data.
        Normalization can be applied by setting `normalize=True`.
        :param year: integer year of interest
        :param X: np array of data features
        :param y: np array of data targets
        :param classes: list of class names, used as tick labels when there is one per label in y
        :param normalize: optional boolean for whether or not to normalize the matrix
        :return: nothing, saves plot to 'plots/confusion_mat_<year>.png'
        """
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        forest = RandomForestClassifier().fit(X_train, y_train)
        y_pred = forest.predict(X_test)

        # fix the matrix to every label in y so its size does not depend on the random split
        labels = np.unique(y)
        cm = confusion_matrix(y_test, y_pred, labels=labels)
        if normalize:
            cm = self._normalize_rows(cm)
        # fall back to the raw labels if the supplied names do not line up with the matrix
        tick_labels = list(classes) if len(classes) == len(labels) else list(labels)

        plt.figure()
        plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
        plt.title('Confusion matrix ' + str(year))
        plt.colorbar()
        tick_marks = np.arange(len(tick_labels))
        plt.xticks(tick_marks, tick_labels)
        plt.yticks(tick_marks, tick_labels)

        fmt = '.2f' if normalize else 'd'
        # cells darker than half the maximum get white text for contrast
        thresh = cm.max() / 2.
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            plt.text(j, i, format(cm[i, j], fmt),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        plt.tight_layout()
        plt.savefig('plots/confusion_mat_' + str(year) + '.png')
        plt.close()

    @staticmethod
    def _format_feature_ranking(importances, indices, feat_names):
        """
        Builds the text lines of the feature ranking, one per feature, most important first.
        :param importances: sequence of importance values, indexed by feature column
        :param indices: feature column indices ordered by decreasing importance
        :param feat_names: sequence of feature names, indexed by feature column
        :return: list of newline-terminated strings
        """
        return ["%d. feature %d (%f): %s\n" % (rank, idx, importances[idx], feat_names[idx])
                for rank, idx in enumerate(indices, start=1)]

    def plot_feature_importance(self, year, X, y, feat_names):
        """
        Plots feature importance from random forest classifier and prints to txt file.
        :param year: integer year of analysis
        :param X: features to train on
        :param y: targets
        :param feat_names: list of string names of features, in the column order of X
        :return: nothing, saves feature importance plot and text to files
        """
        forest = RandomForestClassifier().fit(X, y)
        importances = forest.feature_importances_
        # spread of each feature's importance across the individual trees
        std = np.std([tree.feature_importances_ for tree in forest.estimators_],
                     axis=0)
        indices = np.argsort(importances)[::-1]

        # Print the feature ranking (name looked up by feature index, not by rank)
        with open('plots/feature_ranking_' + str(year) + '.txt', 'w') as file:
            file.write("Feature ranking:\n")
            file.writelines(self._format_feature_ranking(importances, indices, feat_names))

        # Plot the feature importances of the forest
        plt.figure()
        plt.title("Feature importances " + str(year))
        plt.bar(range(len(indices)), importances[indices],
                color="b", yerr=std[indices], align="center")
        plt.xticks(range(len(indices)), indices)
        plt.tight_layout()
        plt.savefig('plots/feature_imp_' + str(year) + '.png')
        plt.close()

    def predict_communities(self, year):
        """
        Reads in features from year's pkl file and predicts communities using random forest.
        Runs plotting of feature importance and confusion matrices.
        :param year: year for which to do predictions
        :return: 1. mean k-fold cross-validation score of classifier with k=5
                 2. shape of features matrix (num of data pts, num features)
        :raises ValueError: if no country in the feature file could be matched to a target
        """
        # NOTE: pickle.load can execute arbitrary code; only load trusted data files
        with open('data/world_bank_' + str(year) + ' [YR' + str(year) + '].pkl', 'rb') as f:
            feats = pickle.load(f)
        name_mapping, _, _ = self.identify_country_name_mapping(feats, year)

        year_feats = []
        year_targs = []
        feat_names = None
        for country, feat_dict in feats.items():
            # countries without a matched target name are skipped
            target_name = name_mapping.get(country)
            if target_name is None:
                continue
            keys = sorted(feat_dict.keys())
            if feat_names is None:
                # assumes every country carries the same feature keys -- TODO confirm
                feat_names = keys
            # each feature value supports .iloc; only its first entry is used
            year_feats.append([feat_dict[key].iloc[0] for key in keys])
            year_targs.append(self.targets[year][target_name])

        if not year_feats:
            raise ValueError('no countries with matching targets for year ' + str(year))

        X = np.array(year_feats)
        y = np.array(year_targs)

        # cross-val scores
        forest = RandomForestClassifier()
        scores = cross_val_score(forest, X, y, cv=5)

        # feature importance
        self.plot_feature_importance(year, X, y, feat_names)

        # confusion matrix, labelled with the communities actually present this year
        self.plot_confusion_matrix(year, X, y, classes=np.unique(y))

        return np.mean(scores), X.shape
def main():
    """
    Builds a NetworkLearning instance and trains/plots classifiers for the default set of years.
    """
    learner = NetworkLearning()
    learner.predict_all_years()


if __name__ == "__main__":
    main()