Skip to content

Commit

Permalink
First commit
Browse files Browse the repository at this point in the history
  • Loading branch information
aywaldron committed Nov 3, 2019
0 parents commit f8fb74f
Show file tree
Hide file tree
Showing 5 changed files with 814 additions and 0 deletions.
200 changes: 200 additions & 0 deletions dot_network.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
#!/usr/bin/env python3

""" This file contains code for reading Direction of Trade data from the IMF into a weighted, directed network
and using it to find trade communities """

import networkx as nx
import pandas as pd
import math
from modularity_maximization import partition
from modularity_maximization.utils import get_modularity
import pickle


def create_network_dict(df, years):
    """
    Builds one trade network per requested year.
    :param df: pandas dataframe of exports, handed unchanged to create_dot_network
    :param years: iterable of integer years for which to create networks
    :return: dictionary mapping each integer year to the graph built for that year
    """

    def build(year):
        # announce the year before building so progress is visible while the build runs
        print('Creating network for %d...' % year)
        # create_dot_network selects the year column by its string label
        return create_dot_network(df, str(year))

    return {year: build(year) for year in years}


def create_dot_network(df, year):
    """
    Builds a networkx directed graph of international trade for a single year.
    :param df: pandas dataframe of trade exports with each row representing trade between 2 countries
               and one column per year of data
    :param year: string label of the year column to build the network from
    :return graph: networkx directed graph whose nodes are integer country codes and whose edges carry
                   that row's value from the year column as their 'weight'
    """

    source_col = 'Country Code'
    target_col = 'Counterpart Country Code'

    # narrow the frame to the two endpoint columns plus the requested year
    trade = df[[source_col, target_col, year]]

    graph = nx.DiGraph()
    for _, record in trade.iterrows():
        amount = record[year]
        # rows with no value recorded for this year contribute no edge
        if math.isnan(float(amount)):
            continue
        # codes are cast to int so node ids stay integral whatever dtype the row carries
        graph.add_edge(int(record[source_col]), int(record[target_col]),
                       weight=amount)

    return graph


def extract_relevant_rows(df, column_name, column_value, not_equal=False):
    """
    Filters a dataframe down to the rows whose value in one column matches (or does not match) a target.
    :param df: pandas dataframe to extract rows from
    :param column_name: name of the column to compare
    :param column_value: value the column is compared against
    :param not_equal: False (default) keeps rows where the column equals column_value;
                      True keeps rows where it differs
    :return: pandas dataframe holding only the selected rows, with their original index labels
    """

    column = df[column_name]
    # pick the comparison up front so a single .loc lookup serves both modes
    selected = (column != column_value) if not_equal else (column == column_value)

    return df.loc[selected]


# Names the DOT file lists in its country columns that are regional or analytical
# groupings rather than individual countries; rows involving any of them are dropped.
_AGGREGATE_NAMES = (
    'Europe', 'Emerging and Developing Europe', 'Emerging and Developing Asia',
    'Middle East, North Africa, and Pakistan', 'Export earnings: nonfuel',
    'Sub-Saharan Africa', 'Export earnings: fuel', 'Western Hemisphere',
    'World', 'Special Categories', 'Advanced Economies', 'CIS',
    'Emerging and Developing Economies',
)


def prepare_data(filename='data/DOT_timeSeries.csv'):
    """
    Reads in DOT datafile and filters for relevant information.

    A row is kept only when all of the following hold:
      * neither 'Country Name' nor 'Counterpart Country Name' is one of the aggregate groupings,
      * 'Indicator Code' is 'TXG_FOB_USD' (the export indicator),
      * 'Attribute' is 'Value'.

    :param filename: path (or file-like object) of the csv file, passed straight to pandas.read_csv
    :return: pandas dataframe of the surviving rows, in file order and with their original index labels
    """

    # read data file into pandas dataframe
    df = pd.read_csv(filename)

    # build every condition as a boolean mask and apply them in one pass, rather than
    # materialising a new intermediate dataframe for each excluded name and each filter
    is_aggregate = (df['Country Name'].isin(_AGGREGATE_NAMES)
                    | df['Counterpart Country Name'].isin(_AGGREGATE_NAMES))
    is_export = df['Indicator Code'] == 'TXG_FOB_USD'
    is_value = df['Attribute'] == 'Value'

    return df.loc[~is_aggregate & is_export & is_value]


def create_country_code_dict(df):
    """
    Creates a dictionary of country names with country codes as keys from the passed in dataframe.

    Both the 'Country' and the 'Counterpart Country' code/name column pairs are scanned. Within a
    column pair the first name seen for a code is used; when a code appears in both pairs, the name
    from the 'Counterpart Country' pair wins because it is processed last. Rows whose code is
    missing (NaN) are skipped, since they cannot be converted to an integer key.

    :param df: pandas dataframe from which to extract country codes & names
    :return: dictionary with integer country codes as keys and country names as values
    """

    code_dict = {}

    # check both country and counterpart country columns for unique country codes
    for col in ('Country', 'Counterpart Country'):
        code_col, name_col = col + ' Code', col + ' Name'

        # one row per distinct code (first occurrence kept), found in a single pass
        # instead of re-filtering the whole dataframe once per code
        pairs = (df[[code_col, name_col]]
                 .dropna(subset=[code_col])
                 .drop_duplicates(subset=code_col))

        for code, name in zip(pairs[code_col], pairs[name_col]):
            code_dict[int(code)] = name

    return code_dict


def find_and_print_network_communities(G, code_dict=None):
    """
    Partitions a network into communities and groups its nodes by community.

    The partition comes from modularity_maximization.partition, whose result is used here as a
    mapping from node to community label. Nothing is printed by this function itself.

    :param G: networkx Graph to find communities in
    :param code_dict: optional dictionary mapping country codes to names - when given, every node is
                      replaced by code_dict[node] in the returned member lists (a node missing from
                      the dictionary raises KeyError)
    :return: 1. dictionary with community labels as keys and lists of members as values
             2. modularity of the discovered partition, as reported by get_modularity
    """

    membership = partition(G)

    comm_members = {}
    for label in set(membership.values()):
        # collect members in the order the partition mapping lists them
        members = [node for node, assigned in membership.items() if assigned == label]
        comm_members[label] = (members if code_dict is None
                               else [code_dict[code] for code in members])

    return comm_members, get_modularity(G, membership)


def get_network_info_dict(network):
    """
    Returns dictionary of network characteristics obtained from networkx.info method.

    The text returned by networkx.info is read line by line; each 'name: value' line becomes one
    entry. Lines without a colon are ignored, and only the first colon on a line separates the
    name from the value, so values that themselves contain ':' are kept whole.

    :param network: network to get info on
    :return: dictionary mapping network characteristic name to its value as a stripped string
    """
    # NOTE(review): networkx.info was deprecated in NetworkX 2.7 and is absent from 3.x -
    # confirm the NetworkX version this project pins before upgrading.
    info_str = nx.info(network)

    info_dict = {}
    for line in info_str.split('\n'):
        name, separator, value = line.partition(':')
        if not separator:
            # blank or free-form line: nothing to record
            continue
        info_dict[name] = value.strip()

    return info_dict


def save_all_community_information(networks, code_dict=None, filename='data/communities.pkl'):
    """
    Finds communities in each network and pickles modularity, network info, and community members.

    For every year the saved entry holds the network info fields plus two extra keys,
    'modularity' and 'communities'.

    :param networks: dictionary mapping integer years to networks
    :param code_dict: dictionary mapping country codes to names - if passed in, will use mappings for
                      recording community members
    :param filename: string name, including extension, of file to save info to
    :return: nothing, the per-year dictionary is written to filename
    """

    def summarize(year, network):
        print('Finding communities for %d network...' % year)
        members, modularity = find_and_print_network_communities(network, code_dict)
        # start from the general network info, then layer the community results on top
        summary = dict(get_network_info_dict(network))
        summary.update({'modularity': modularity,
                        'communities': members})
        return summary

    save_dict = {year: summarize(year, network) for year, network in networks.items()}

    with open(filename, 'wb') as f:
        pickle.dump(save_dict, f, pickle.HIGHEST_PROTOCOL)


def main():
    """
    Runs the whole pipeline: load and filter the DOT data, build one network per year
    for range(1948, 2018), then find and save the communities of every network.
    """
    exports = prepare_data()

    # code -> name lookup so saved communities list country names rather than codes
    names_by_code = create_country_code_dict(exports)

    yearly_networks = create_network_dict(exports, years=range(1948, 2018))

    save_all_community_information(yearly_networks, code_dict=names_by_code)


# only run the pipeline when executed as a script, not when imported
if __name__ == "__main__":
    main()
106 changes: 106 additions & 0 deletions dot_network_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
#!/usr/bin/env python3

""" This file contains tests for the dot_network.py file that creates DOT networks """

import pandas as pd
import dot_network as dot
import networkx as nx
import pickle


class test_dot_network:
    """ Tests for dot_network, run against a small Angola / Colombia / Moldova sample of the DOT data """

    def setup(self):
        """ Setup method creates the test csv file, writes it to data/test_DOT_file.csv and builds fixtures """
        # extract test data from DOT data frame
        df = pd.read_csv('data/DOT_timeSeries.csv')

        def trade_rows(country, counterpart):
            """ All rows of the full data set for one (country, counterpart) pair """
            return df.loc[(df['Country Name'] == country) &
                          (df['Counterpart Country Name'] == counterpart)]

        # the 'World' rows are included on purpose: prepare_data is expected to filter them out
        test_df = pd.concat([trade_rows('Angola', 'Colombia'),
                             trade_rows('Angola', 'Moldova'),
                             trade_rows('Moldova', 'Angola'),
                             trade_rows('World', 'Moldova')])

        self.filename = 'data/test_DOT_file.csv'
        self.test_data = test_df

        # save test dataframe to file BEFORE anything reads it back, so the fixtures below
        # work on a clean checkout and never pick up a stale file from an earlier run
        test_df.to_csv(self.filename)

        self.filtered_data = dot.prepare_data(self.filename)
        self.code_dict = dot.create_country_code_dict(self.filtered_data)
        self.network = dot.create_dot_network(self.filtered_data, '2007')
        self.network_dict = dot.create_network_dict(self.filtered_data, range(2007, 2010))

    def test_extract_relevant_rows(self):
        """ Tests that extract_relevant_rows only returns the relevant rows """
        df = dot.extract_relevant_rows(self.test_data,
                                       column_name='Country Name',
                                       column_value='Angola')
        assert (df['Country Name'] == 'Angola').all()

    def test_extract_relevant_rows_not_equal(self):
        """ Tests that extract_relevant_rows filters out undesired rows when not_equal=True"""
        df = dot.extract_relevant_rows(self.test_data,
                                       column_name='Country Name',
                                       column_value='Angola',
                                       not_equal=True)
        assert not (df['Country Name'] == 'Angola').any()

    def test_prepare_data(self):
        """ Tests the prepare_data function by asserting that only relevant rows are returned """
        df = self.filtered_data
        assert len(df) == 3
        assert ((df['Country Name'] == 'Angola') & (df['Counterpart Country Name'] == 'Colombia')).any()
        assert ((df['Country Name'] == 'Angola') & (df['Counterpart Country Name'] == 'Moldova')).any()
        assert ((df['Country Name'] == 'Moldova') & (df['Counterpart Country Name'] == 'Angola')).any()

    def test_create_dot_network(self):
        """ Tests that create_dot_network returns the correct network """
        assert list(self.network.edges.data()) == [(614.0, 233.0, {'weight': 73172520.0}),
                                                   (921.0, 614.0, {'weight': 263001.0})]

    def test_create_network_dict(self):
        """ Tests that create_network_dict returns a dictionary of networks """
        # all(...) is required: asserting the bare list would pass for any non-empty list
        assert all(isinstance(self.network_dict[year], nx.DiGraph) for year in range(2007, 2010))

    def test_code_dict_creation(self):
        """ Tests that code dict created is correct """
        assert self.code_dict == {921: 'Moldova', 614: 'Angola', 233: 'Colombia'}

    def test_community_finding(self):
        """ Tests that network community finding function is working properly """
        comm, mod = dot.find_and_print_network_communities(self.network, code_dict=self.code_dict)
        assert comm == {0: ['Colombia', 'Angola', 'Moldova']}
        assert mod == 0.0

    def test_network_info_saving(self):
        """ Tests that network community info is correctly saved to file """
        dot.save_all_community_information(self.network_dict, code_dict=self.code_dict, filename='data/test.pkl')
        with open('data/test.pkl', 'rb') as f:
            loaded = pickle.load(f)

        assert loaded == {2008: {'communities': {0: ['Moldova', 'Angola']},
                                 'Average in degree': '1.0000',
                                 'Type': 'DiGraph',
                                 'Number of edges': '2',
                                 'Name': '',
                                 'Number of nodes': '2',
                                 'Average out degree': '1.0000',
                                 'modularity': 0.0},
                          2009: {'communities': {0: ['Moldova', 'Angola']},
                                 'Average in degree': '0.5000',
                                 'Type': 'DiGraph',
                                 'Number of edges': '1',
                                 'Name': '',
                                 'Number of nodes': '2',
                                 'Average out degree': '0.5000',
                                 'modularity': 0.0},
                          2007: {'communities': {0: ['Colombia', 'Angola', 'Moldova']},
                                 'Average in degree': '0.6667',
                                 'Type': 'DiGraph',
                                 'Number of edges': '2',
                                 'Name': '',
                                 'Number of nodes': '3',
                                 'Average out degree': '0.6667',
                                 'modularity': 0.0}}
Loading

0 comments on commit f8fb74f

Please sign in to comment.