-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_wrangling.py
389 lines (328 loc) · 15.4 KB
/
data_wrangling.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Dec 6 10:44:03 2023
@author: denizaycan
"""
#Part 1: Data Wrangling
import os
import json
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
from mpl_toolkits.axes_grid1 import make_axes_locatable
from datetime import datetime
import wbdata
from country_converter import CountryConverter
import requests
import xml.etree.ElementTree as ET
import pycountry
from pypdf import PdfReader
import re
import statsmodels.api as sm
from linearmodels.panel import PanelOLS
import sklearn
from sklearn.feature_extraction.text import TfidfTransformer
from wordcloud import WordCloud
import nltk
original_data_path = r'/users/denizaycan/Documents/GitHub/final-project-deniz/data/unmodified'
final_data_path = r'/users/denizaycan/Documents/GitHub/final-project-deniz/data/final'
#Part 1: downloading datas
def parse_excel_tabs(base_path, file_name):
"""
Parse each tab of an Excel file as separate DataFrames and print the results.
Parameters:
- base_path (str): Base path where the Excel file is located.
- file_name (str): Name of the Excel file.
Returns:
- dict: A dictionary where keys are sheet names and values are corresponding DataFrames.
"""
# Construct the full file path
file_path = os.path.join(base_path, file_name)
all_dfs = pd.read_excel(file_path, sheet_name=None)
for sheet_name, df in all_dfs.items():
print(f"Sheet Name: {sheet_name}")
print(df)
print("\n")
return all_dfs
result = parse_excel_tabs(original_data_path, 'CFU-Website-MASTER-February-2023.xlsx')
if result:
ODI_pledges = result['Pledges']
ODI_projects = result['Projects']
def load_andprep_excel_data(base_path, file_name):
'''
This function loads and prepares the excel file by removing extra lines.
Arguments: base_path (str)
file_name (str)
Returns the df.
'''
full_path = os.path.join(base_path, file_name)
df = pd.read_excel(full_path)
return df
UK_ICF = load_andprep_excel_data(original_data_path, 'UK ICF data 2011-2023_read only.xlsx')
def load_csv_data(base_path, file_name):
'''
This function loads and prepares the csv file.
Arguments: base_path (str)
file_name (str)
Returns the df.
'''
full_path = os.path.join(base_path, file_name)
df = pd.read_csv(full_path)
return df
emissions = load_csv_data(original_data_path, 'ghg-emissions_percapita.csv')
colonial_emissions = load_csv_data(original_data_path, 'full_year_emissions_summary_table_1850_2023.csv')
colonial_for_plot = load_csv_data(original_data_path, 'full_summary_table_2023.csv')
def download_wb_data(indicator):
data = wbdata.get_dataframe(indicators={indicator: 'GDP_ppp'}, country='all')
data.reset_index(inplace=True)
return data
GDP_PPP_1 = download_wb_data('NY.GDP.PCAP.PP.KD')
def get_oecd_data(api_url):
response = requests.get(api_url)
if response.status_code == 200:
try:
# Parse XML content
root = ET.fromstring(response.content)
# Extract information from XML
observations = []
for obs in root.findall(".//{http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic}Obs"):
obs_data = {}
for key in obs.findall(".//{http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic}ObsKey"):
for value in key.findall(".//{http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic}Value"):
obs_data[value.attrib['id']] = value.attrib['value']
obs_value = obs.find(".//{http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic}ObsValue")
if obs_value is not None:
obs_data['ObsValue'] = obs_value.attrib['value']
observations.append(obs_data)
# Convert to DataFrame
OECD = pd.DataFrame(observations)
return OECD
except ET.ParseError as e:
print(f"Error parsing XML: {e}")
print("Response content:", response.text)
return None
else:
print(f"Error: {response.status_code}")
return None
api_url = "https://sdmx.oecd.org/public/rest/data/OECD.ENV.EPI,DSD_CAPMF@DF_CAPMF,1.0/MLT+IND+ZAF+GBR+NZL+CAN+AUS.A.POL_COUNT+POL_STRINGENCY.LEV3_GHG_ACC+LEV3_UNFCCC+LEV3_EVAL_BR+LEV3_BAN_FF_ABROAD+LEV3_BAN_CREDIT+LEV2_INT_GHGREP+LEV2_INT_C_FIN+LEV2_INT_C_COORD+LEV2_CROSS_SEC_CG+LEV2_CROSS_SEC_FFPP+LEV2_CROSS_SEC_RDD+LEV2_CROSS_SEC_GHGTAR+LEV2_SEC_T_NMBI+LEV2_SEC_T_MBI+LEV2_SEC_B_NMBI+LEV2_SEC_B_MBI+LEV2_SEC_I_NMBI+LEV2_SEC_I_MBI+LEV2_SEC_E_NMBI+LEV2_SEC_E_MBI+LEV1_INT+LEV1_CROSS_SEC+LEV1_SEC.0_TO_10+PL?startPeriod=1990&endPeriod=2022&dimensionAtObservation=AllDimensions"
OECD = get_oecd_data(api_url)
#Note that when we download the OECD data from csv instead of API, the data comes with more columns, so we have to drop them first to make it look like what we get it from API.
#Same goes for WB API.
def process_OECD_data(df):
"""
Process OECD data with the following steps:
1. Rename the first column to 'Year' and the second column to 'countrycode'.
2. Remove the 3rd column.
3. Limit the 4th column to 'POL_STRINGENCY'.
4. Remove the 6th column.
5. Rename the 7th column to 'Score'.
Parameters:
- OECD (pd.DataFrame): Input DataFrame containing OECD data.
Returns:
- pd.DataFrame: Processed DataFrame."""
df1 = df.copy()
df1 = df1.rename(columns={'TIME_PERIOD': 'Year', 'REF_AREA': 'countrycode'})
df1 = df1.drop(columns=['FREQ'])
df1 = df1[df1['MEASURE'] == 'POL_STRINGENCY']
df1 = df1.drop(columns=['MEASURE'])
df1 = df1.drop(columns=['UNIT_MEASURE'])
df1 = df1.rename(columns={'ObsValue': 'Score'})
return df1
processed_OECD = process_OECD_data(OECD)
wide_OECD = processed_OECD.pivot(index=['Year', 'countrycode'], columns='CLIM_ACT_POL', values='Score').reset_index()
#I will do the merges based on countrycodes, so I am adding countrycodes to my dfs
def get_country_code(country_name):
try:
country = pycountry.countries.get(name=country_name)
if country:
return country.alpha_3
except AttributeError:
return None
#Country dictionary
commonwealth_countries = {
"Africa": [
"Botswana", "Cameroon", "Gabon", "Gambia", "The Gambia", "Gambia, The", "Ghana",
"Kenya", "Kingdom of Eswatini", "Lesotho", "Malawi", "Mauritius",
"Mozambique", "Namibia", "Nigeria", "Rwanda", "Seychelles", "Sierra Leone",
"South Africa",
"Eswatini", "Togo", "Uganda", "Tanzania", "United Republic of Tanzania", "Tanzania", "Zambia"
],
"Asia": [
"Bangladesh", "Brunei Darussalam", "India", "Malaysia", "Maldives", "Pakistan",
"Singapore", "Sri Lanka"
],
"Caribbean and Americas": [
"Antigua and Barbuda", "Bahamas", "The Bahamas", "Bahamas, The", "Barbados", "Belize",
"Canada", "Dominica", "Grenada",
"Guyana", "Jamaica", "Saint Kitts and Nevis", "St. Kitts and Nevis", "Saint Lucia", "St. Lucia",
"Saint Vincent and the Grenadines", "St. Vincent and the Grenadines",
"Trinidad and Tobago"
],
"Europe": [
"Cyprus", "Malta", "United Kingdom"
],
"Pacific": [
"Australia", "Fiji", "Kiribati", "Nauru", "New Zealand",
"Papua New Guinea", "Samoa",
"Solomon Islands", "Tonga", "Tuvalu", "Vanuatu"
]
}
# Create a list of all Commonwealth countries
all_commonwealth_countries = [country for countries in commonwealth_countries.values() for country in countries]
ODI_projects_cw = ODI_projects[ODI_projects['Country'].isin(all_commonwealth_countries) | ODI_projects['Country'].str.contains('global', case=False)]
ODI_projects_cw['countrycode'] = ODI_projects_cw['Country'].apply(get_country_code)
#Data preparation for word clouds
ODI_projects.rename(columns={'Approved year': 'Year', 'Amount of Funding Approved (USD millions)': 'Grant', 'Summary ': 'Summary'}, inplace=True)
def process_ICF_data(df):
df1 = df.copy()
df1.drop(columns=['Unnamed: 0', 'URL','Dept', 'Start','End', 'Total ICF spend £000'], inplace=True)
column_mapping = {
'2011/12 ICF spend £000': 2011,
'2012/13 ICF spend £000': 2012,
'2013/14 ICF spend £000': 2013,
'2014/15 ICF spend £000': 2014,
'2015/16 ICF spend £000': 2015,
'2016/17 ICF spend £000': 2016,
'2017/2018 ICF spend £000': 2017,
'2018/2019 ICF spend £000': 2018,
'2019/2020 ICF spend £000': 2019,
'2020/2021 ICF spend £000': 2020,
'2021/2022 ICF spend £000': 2021,
'2022/2023 ICF spend £000': 2022
}
df1.rename(columns=column_mapping, inplace=True)
columns_to_stay = ['Project title', 'Country/region', 'Description']
df_long = pd.melt(df1, id_vars=columns_to_stay, var_name='Year', value_name= 'Grant')
df_long = df_long.dropna(subset=['Grant'])
df_long.rename(columns={'Country/region' : 'Country', 'Description': 'Summary'}, inplace=True)
return df_long
UK_ICF_long = process_ICF_data(UK_ICF)
#Data preparation for plots:
def process_col_for_plot_emissions(df):
"""
Process emissions data with the following steps:
Returns:
- pd.DataFrame: Processed DataFrame.
"""
df1 = df.copy()
df1.rename(columns={'country': 'Country'}, inplace=True)
df2= df1[['Country','Population (2023), millions', 'Colonial Total/GtCO2', 'Colonial consumption per yearly capita /tCO2/pc']]
df2['Country'] = df2['Country'].replace('USA', 'United States')
df2['Country'] = df2['Country'].replace('Russia', 'Russian Federation')
return df2
processed_colonial_for_plot = process_col_for_plot_emissions(colonial_for_plot)
def process_ODI_pledges(df):
"""
Process emissions data with the following steps:
Returns:
- pd.DataFrame: Processed DataFrame.
"""
df1 = df.copy()
df1.rename(columns={'country': 'Country'}, inplace=True)
# Select relevant columns
df2 = df1[['Country', 'Pledged (USD million current)']]
# Group by country and calculate the sum of pledges
grouped_df = df2.groupby('Country').agg({'Pledged (USD million current)': 'sum'}).reset_index()
# Calculate the percentage of each country's pledge to the total sum of pledges
grouped_df['Pledge Percentage (Non per capita)'] = (grouped_df['Pledged (USD million current)'] / grouped_df['Pledged (USD million current)'].sum()) * 100
return grouped_df
processed_ODI_pledges = process_ODI_pledges(ODI_pledges)
def merge_and_plot_data(df1, df2):
"""
Merge two dataframes, calculate colonial emissions percentage, and pledge per capita.
Parameters:
- df1 (pd.DataFrame): The first dataframe containing processed ODI pledges data.
- df2 (pd.DataFrame): The second dataframe containing processed colonial data for plotting.
Returns:
- pd.DataFrame: Merged dataframe with additional calculated columns.
"""
plot_merge = pd.merge(df1, df2, on=['Country'], how='left')
plot_merge = plot_merge.dropna()
total_colonial_emissions = plot_merge['Colonial Total/GtCO2'].sum()
plot_merge['Colonial Percentage/GtCO2'] = (plot_merge['Colonial Total/GtCO2'] / total_colonial_emissions) * 100
plot_merge['Pledge per Capita (2023)'] = (plot_merge['Pledged (USD million current)'] / plot_merge['Population (2023), millions'])
return plot_merge
plot_merge = merge_and_plot_data(processed_ODI_pledges, processed_colonial_for_plot)
#Data preparation for panel regressions and shiny plots
def process_gdp_data(df):
df1 = df.copy()
df1['countrycode'] = df1['country'].apply(get_country_code)
country_code_mapping = {
'British Virgin Islands': 'VGB',
'Channel Islands': 'CHI',
'St. Kitts and Nevis': 'KNA',
'St. Lucia': 'LCA',
'St. Vincent and the Grenadines': 'VCT',
'Tanzania': 'TZA',
'Gambia, The': 'GMB',
'Bahamas, The': 'BHS'
}
df1['countrycode'] = df1.apply(lambda row: country_code_mapping.get(row['country'], row['countrycode']), axis=1)
df_cw = df1[df1['country'].isin(all_commonwealth_countries)]
df_cw.rename(columns={'date': 'Year'}, inplace=True)
return df_cw
GDP_PPP_2 = process_gdp_data(GDP_PPP_1)
def process_emissions_data(df):
"""
Process emissions data with the following steps:
Returns:
- pd.DataFrame: Processed DataFrame.
"""
df1 = df.copy()
df1.rename(columns={'iso': 'countrycode'}, inplace=True)
df_cw = df1[df1['Country/Region'].isin(all_commonwealth_countries)]
df_cw.drop(columns=['Country/Region'], inplace=True)
df_cw.drop(columns=['unit'], inplace=True)
df2 = pd.melt(df_cw, id_vars='countrycode', var_name='Year', value_name='Emissions')
df2['Emissions'] = pd.to_numeric(df2['Emissions'], errors='coerce')
return df2
processed_emissions = process_emissions_data(emissions)
def process_col_emissions_data(df):
"""
Process emissions data with the following steps:
Returns:
- pd.DataFrame: Processed DataFrame.
"""
df1 = df.copy()
df1.rename(columns={'iso_code': 'countrycode'}, inplace=True)
df1.rename(columns={'country': 'Country'}, inplace=True)
df1.rename(columns={'year': 'Year'}, inplace=True)
df1.drop(columns=['Unnamed: 0', 'region',
'Fossil CO2/GtCO2',
'Land CO2/GtCO2', 'Territorial Total/GtCO2', 'Fossil Cons CO2/GtCO2',
'Consumption Total/GtCO2', 'Emp Fossil Total/GtCO2',
'Col Fossil Total/GtCO2', 'Emp Land Total/GtCO2',
'Col Land Total/GtCO2', 'Emp Fossil Cons Total/GtCO2',
'Col Fossil Cons Total/GtCO2', 'Emp Cons Total/GtCO2', 'Col Cons Total/GtCO2'], inplace=True)
df_cw = df1[df1['Country'].isin(all_commonwealth_countries)]
return df_cw
processed_colonial_emissions = process_col_emissions_data(colonial_emissions)
panel1 = pd.merge(GDP_PPP_2, processed_emissions, on=['countrycode', 'Year'], how='right')
panel1['Year'] = pd.to_numeric(panel1['Year'], errors='coerce')
processed_colonial_emissions['Year'] = pd.to_numeric(processed_colonial_emissions['Year'], errors='coerce')
panel2 = pd.merge(panel1, processed_colonial_emissions, on=['countrycode', 'Year'], how='left')
wide_OECD['Year'] = pd.to_numeric(wide_OECD['Year'], errors='coerce')
panel3 = pd.merge(panel2, wide_OECD, on=['countrycode', 'Year'], how='left')
#Saving the datasets as excels and lists:
#Datasets for text processing
file_path = os.path.join(final_data_path, 'all_commonwealth_countries.txt')
with open(file_path, 'w') as f:
for item in all_commonwealth_countries:
f.write(str(item) + '\n')
ODI_projects_excel_file_path = os.path.join(final_data_path, 'ODI_projects_final.xlsx')
ODI_projects.to_excel(ODI_projects_excel_file_path, index=False)
ODI_projects_cw_excel_file_path = os.path.join(final_data_path, 'ODI_projects_cw_final.xlsx')
ODI_projects_cw.to_excel(ODI_projects_cw_excel_file_path, index=False)
UK_ICF_long_excel_file_path = os.path.join(final_data_path, 'UK_ICF_long_final.xlsx')
UK_ICF_long.to_excel(UK_ICF_long_excel_file_path, index=False)
#Datasets for plotting & regressions
OECD_excel_file_path = os.path.join(final_data_path, 'OECD_final.xlsx')
OECD.to_excel(OECD_excel_file_path, index=False)
PANEL_excel_file_path = os.path.join(final_data_path, 'panel3_final.xlsx')
panel3.to_excel(PANEL_excel_file_path, index=False)
plot_merge_excel_file_path = os.path.join(final_data_path, 'plot_merge_final.xlsx')
plot_merge.to_excel(plot_merge_excel_file_path, index=False)