-
Notifications
You must be signed in to change notification settings - Fork 0
/
script.py
229 lines (191 loc) · 9.85 KB
/
script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
# -*- coding: utf-8 -*-
# Derya Gumustel
# Ocean 340
# Final Project
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import cartopy.crs as ccrs
import cartopy.feature as cfeature
from cartopy.mpl.gridliner import LONGITUDE_FORMATTER, LATITUDE_FORMATTER
from collections import OrderedDict
from scipy import stats
def getFileList(path):
    """Takes in a relative path (str), gets list of files in path location, returns lists of the
    .csv file names with and without the '.csv' extension.

    Only entries whose name ends in '.csv' (case-insensitive) are returned, so the script itself
    and any other non-data file in the folder are skipped. The two returned lists are parallel
    (same length, same order) and keep the order produced by os.listdir.
    """
    fileListOut = []
    locationNames = []
    for dataset in os.listdir(path):  # go through every entry in the folder
        stem, extension = os.path.splitext(dataset)
        if extension.lower() != ".csv":  # want only .csv files
            continue
        fileListOut.append(dataset)  # add to file list with .csv
        locationNames.append(stem)  # add to file list without .csv
    return fileListOut, locationNames
def openFile(path, fileName):
    """Takes in file path (str) and file name (str), creates pandas dataframe of provided file,
    returns dataframe.

    The folder and file name are combined with os.path.join, so path may be given with or without
    a trailing separator.
    """
    return pd.read_csv(os.path.join(path, fileName))
def getShallowData(df):
    """Takes in a pandas dataframe, returns a new dataframe without the rows whose ' Depth (Ft)'
    entry is one of the strings ' -10 ft' through ' -99 ft'. Every other row is kept, in its
    original order and with its original index label.
    """
    # build every undesired depth label once, then drop the matching rows in a single pass
    unwanted = [" -" + str(depth) + " ft" for depth in np.arange(10, 100)]
    isShallow = ~df[" Depth (Ft)"].isin(unwanted)
    return df[isShallow]
def indexByDateTime(df, var):
    """Takes in a pandas dataframe and a column name (str), returns a pandas series holding that
    column's values indexed by the dataframe's 'Date and Time' column converted to a
    DatetimeIndex.
    """
    values = list(df[var])  # requested column as a plain list
    timestamps = pd.DatetimeIndex(df['Date and Time'])  # datetime index for the series
    return pd.Series(values, index=timestamps)
def avgData(indexedData):
    """Takes in a pandas series indexed by datetime, groups it into calendar days and returns the
    series of daily mean values.
    """
    daily = indexedData.resample('D')  # one bin per day
    return daily.mean()
def mergeData(s1, s2):
    """Takes in two pandas series indexed by time (s1 chlorophyll, s2 oxygen), turns each into a
    two-column dataframe ('Date and Time' plus 'Chlorophyll' or 'Oxygen') and returns the result
    of pd.merge_asof on the shared 'Date and Time' column.
    """
    # build both frames the same way: index becomes the datetime column, values the data column
    left, right = (
        pd.DataFrame({'Date and Time': series.index, label: series.values})
        for series, label in ((s1, 'Chlorophyll'), (s2, 'Oxygen'))
    )
    return pd.merge_asof(left, right, on='Date and Time')
def plotAvg(series, figNum, nrows, ncols, index, title, location, ylabel, figsize=[20,10], legend=False):
    """Takes in a pandas series indexed by time, a figure number (int), the number of rows and
    columns to be used for defining a subplot (ints), the index of that subplot to plot on (int), a
    plot title (str), a data collection location (str) used as the series label, and a y-label
    (str). figsize is passed to plt.figure. legend=True produces a plot legend. Plots the time
    series data as points on the desired subplot within the figure, fixes the x-axis to
    2019-01-04 through 2019-03-08, and shows the plot. Returns nothing.
    """
    plt.figure(num=figNum, figsize=figsize) # select figure by number
    plt.subplot(nrows, ncols, index) # select subplot within the nrows x ncols grid
    plt.subplots_adjust(hspace=0.5) # vertical spacing between subplot rows
    # add title and axes labels
    plt.title(title, fontsize='xx-large')
    plt.xlabel("Date")
    plt.ylabel(ylabel)
    ax = series.plot(style='o', label=location) # plot series as points, label by location
    if legend == True:
        ax.legend() # create legend from series.plot labels
    ax.set_xlim(pd.Timestamp('2019-01-04'), pd.Timestamp('2019-03-08')) # set fixed x-axis range
    ax.xaxis.set_major_locator(mdates.WeekdayLocator()) # set major ticks to one week intervals
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %d')) # major tick labels like "Jan 04"
    ax.set_xticklabels([],minor=True) # give the minor ticks an empty label list
    plt.show()
def plotCorr(df, figNum, nrows, ncols, index, title, figsize=[20,10]):
    """Takes in a pandas dataframe with 'Chlorophyll' and 'Oxygen' columns, a figure number (int),
    the number of rows and columns of the subplot grid (ints), the index of the subplot to draw on
    (int), and a plot title (str). Scatters oxygen against chlorophyll, fits a linear regression
    to the rows where neither value is NaN, draws the fitted line, shows the plot and returns the
    r squared value of the fit.
    """
    # select the figure and subplot to draw on
    plt.figure(num=figNum, figsize=figsize)
    plt.subplot(nrows, ncols, index)
    plt.subplots_adjust(hspace=0.5)

    # title and axes labels
    plt.title(title, fontsize='xx-large')
    plt.xlabel("Chlorophyll (ug/L)")
    plt.ylabel("Oxygen (mg/L)")

    # scatter the raw data on top of the trendline (higher zorder)
    chlorophyll = df['Chlorophyll']
    oxygen = df['Oxygen']
    plt.plot(chlorophyll, oxygen, 'o', zorder=2)

    # regression uses only rows where both variables are present
    valid = ~(np.isnan(chlorophyll) | np.isnan(oxygen))
    fit = stats.linregress(chlorophyll[valid], oxygen[valid])

    # evaluate the fitted line at every chlorophyll value and draw it
    trendX = np.array(chlorophyll)
    trendY = fit.slope * trendX + fit.intercept
    plt.plot(trendX, trendY, color='lightsteelblue', ls='-', zorder=1)
    plt.show()
    return fit.rvalue ** 2  # r**2 for statistical analysis
def plotMap(locationDict, cmap='rainbow', figsize=[10,10]):
    """Takes in an ordered dictionary with data collection sites as keys and corresponding
    [latitude, longitude] coordinate pairs as values, plus a matplotlib colormap name (str) and a
    figure size. Creates and shows a plot of the Puget Sound area with every data collection site
    plotted as a point; each site gets its own color, evenly spaced along the colormap, and an
    entry in the legend. Returns nothing.
    """
    plt.figure(num=4, figsize=figsize) # create figure
    ax = plt.axes(projection=ccrs.PlateCarree()) # set map projection
    # add coastlines and axes ranges
    ax.coastlines(resolution='10m')
    ax.set_extent([-125.5, -122, 46.5, 49.5])
    # add colored land and water (ocean feature has long loading time)
    land = cfeature.NaturalEarthFeature('physical', 'land', '10m', facecolor='darkseagreen')
    ax.add_feature(land, zorder=1)
    #ocean = cfeature.NaturalEarthFeature('physical', 'ocean', '10m', facecolor='royalblue')
    #ax.add_feature(ocean, zorder=1)
    # add title and gridlines, customize axes labels
    plt.title("Data Collection Sites In and Around Puget Sound", fontsize='xx-large')
    gl = ax.gridlines(crs=ccrs.PlateCarree(), draw_labels=True,
                      linewidth=1, color='white', alpha=0.5, linestyle=':', zorder=2)
    gl.xlabels_top = False
    gl.xformatter = LONGITUDE_FORMATTER
    gl.yformatter = LATITUDE_FORMATTER
    # plot locations
    # look the colormap up by name instead of eval-ing a string built from the cmap argument
    colors = plt.get_cmap(cmap)(np.linspace(0, 1, len(locationDict)))
    for c, (site, coords) in zip(colors, locationDict.items()): # for every location
        lat = coords[0]
        lon = coords[1]
        plt.scatter(lon, lat, marker='o', color=c, zorder=3, label=site) # plot location
    ax.legend() # add legend
    plt.show()
def main():
    """Runs the whole analysis on the .csv files that sit next to this script.

    For every data file: keeps the shallow rows, builds a daily-averaged time series, and plots it
    on figure 1 (one subplot per file) and figure 2 (all chlorophyll series on the top subplot, all
    oxygen series on the bottom one). A file with a ' Chlorophyll' column is treated as a
    chlorophyll file, any other file as an oxygen file. Each oxygen file is merged with the most
    recently processed chlorophyll series and a correlation plot is drawn on figure 3. Finally the
    map of collection sites is drawn. Returns the list of r squared values, one per oxygen file.

    Raises ValueError if an oxygen file is met before any chlorophyll file.
    """
    # absolute folder of this script, with a trailing separator so file names can be appended
    script_dir = os.path.dirname(os.path.abspath(__file__))
    path = os.path.join(script_dir, "")
    fileList, locationNames = getFileList(path) # use formatted path to access data files
    # create and fill ordered location dictionary
    locationDict = OrderedDict()
    locationDict["Bellingham"] = [48.7237, -122.5765]
    locationDict["Coast"] = [47.9627, -124.9580]
    locationDict["Point Williams"] = [47.5372, -122.4061]
    locationDict["Yacht Club, Vashon"] = [47.3942, -122.4635]
    locations = list(locationDict.keys())
    rsquaredvals = []
    chlAvg = None # daily chlorophyll series of the location currently being processed
    count3 = 0 # number of locations completed so far (index into locations)
    # count is the 1-based file number, used as the subplot index on figure 1
    for count, (dataset, place) in enumerate(zip(fileList, locationNames), start=1):
        df = openFile(path, dataset) # get dataframe
        if ' Chlorophyll' in df.columns: # for every chlorophyll file
            chldf = getShallowData(df) # get shallow data
            chl = indexByDateTime(chldf, " Chlorophyll") # get time series
            chlAvg = avgData(chl) # average series to daily values
            # get plots of chlorophyll
            plotAvg(chlAvg, 1, 4, 2, count, place, locations[count3], "Chlorophyll (ug/L)")
            plotAvg(chlAvg, 2, 2, 1, 1, "Chlorophyll Concentrations Across Puget Sound Region",
                    locations[count3], "Chlorophyll (ug/L)", legend=True)
        else: # for every oxygen file
            oxydf = getShallowData(df) # get shallow data
            oxy = indexByDateTime(oxydf, " Oxygen Conc. (mg/L)") # get time series
            oxyAvg = avgData(oxy) # average series to daily values
            # get plots of oxygen
            plotAvg(oxyAvg, 1, 4, 2, count, place, locations[count3], "Oxygen (mg/L)")
            plotAvg(oxyAvg, 2, 2, 1, 2, "Oxygen Concentrations Across Puget Sound Region",
                    locations[count3], "Oxygen (mg/L)", legend=True)
            if chlAvg is None:
                raise ValueError("oxygen file '" + dataset + "' was read before any chlorophyll file")
            # merge chlorophyll and oxygen data, occurs once for each location
            df = mergeData(chlAvg, oxyAvg)
            # get r squared values for analysis, get correlation plot
            # place[:-21] drops the last 21 characters of the file name for the plot title
            # (presumably a fixed-length suffix in the data file names - TODO confirm)
            rsquaredvals.append(plotCorr(df, 3, 2, 2, count3+1, place[:-21]))
            count3 += 1
    plotMap(locationDict) # get map of Puget Sound showing data collection sites
    return rsquaredvals
# run the analysis only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
    main()