diff --git a/Chapter2-DataManipulation/2.6_resampling.html b/Chapter2-DataManipulation/2.6_resampling.html
index 2c3e8bd2..296ab9b5 100644
--- a/Chapter2-DataManipulation/2.6_resampling.html
+++ b/Chapter2-DataManipulation/2.6_resampling.html
@@ -801,12 +801,32 @@
# Fix the random seed for reproducibility. Setting it once at the top reproduces the entire notebook in one pass; setting it in every cell lets each cell be re-run independently.
+np.random.seed(42)
+
# We define a random generator. Note that np.random.seed does not seed default_rng();
# pass a seed explicitly (as done below) to make the Generator itself reproducible.
rng = np.random.default_rng()
rng
+
Generator(PCG64) at 0x318353900
+
# We begin with two datasets, A and B
+np.random.seed(42)
+rng = np.random.default_rng(seed=42)
A = rng.normal(5, 2.5, 100)
B = rng.normal(5.5, 2.5, 100)
@@ -834,8 +856,8 @@ 1.1 Randomization
-The means of A and B are, 4.916816615939592 and 5.629794831291764, respectively.
-The difference of means is, -0.7129782153521722.
+The means of A and B are, 4.874325971290356 and 5.473420405447642, respectively.
+The difference of means is, -0.5990944341572852.
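A randomization (permutation) test makes this comparison concrete: pool A and B, shuffle, and re-split to build the null distribution of the difference of means. A minimal sketch, assuming A, B, and rng as defined above (the 10,000 permutations are an arbitrary choice):

# Permutation test: how often does a random re-split of the pooled data produce
# a difference of means at least as extreme as the observed one?
pooled = np.concatenate([A, B])
observed = np.mean(A) - np.mean(B)
n_perm = 10000
diffs = np.zeros(n_perm)
for i in range(n_perm):
    shuffled = rng.permutation(pooled)
    diffs[i] = np.mean(shuffled[:len(A)]) - np.mean(shuffled[len(A):])
print('Approximate two-sided p-value:', np.mean(np.abs(diffs) >= np.abs(observed)))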
@@ -904,7 +926,7 @@ 1.1 Randomization
-
+
We can verify that the Pearson correlation coefficient is close to our target using the numpy function corrcoef.
# The correlation matrix of the first and second columns of correlated_data
correlation_matrix = np.corrcoef(correlated_data[:, 0], correlated_data[:, 1])
-
-# Check the correlation coefficient
+
correlation_matrix
+
array([[ 1. , -0.72540304],
+ [-0.72540304, 1. ]])
+
# Check the correlation coefficient
print('The correlation coefficient is: ' + str(correlation_matrix[0,1]) + '.')
The correlation coefficient is: -0.7589433978306135.
+The correlation coefficient is: -0.7254030411293304.
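For reference, the cell that builds correlated_data is not part of this hunk; a common way to draw pairs with a chosen Pearson coefficient is to mix two independent standard normal variables. A minimal sketch with a hypothetical target_r (not necessarily the notebook's actual target):

# Mix two independent standard normals so that corr(x, y) is approximately target_r
target_r = -0.75                      # hypothetical target, for illustration only
x = rng.standard_normal(1000)
z = rng.standard_normal(1000)
y = target_r * x + np.sqrt(1 - target_r**2) * z
demo_data = np.column_stack((x, y))
print(np.corrcoef(demo_data[:, 0], demo_data[:, 1])[0, 1])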
nsubset=10
-subset = rng.choice(correlated_data, size=nsubset, replace=False)
+subset = rng.choice(correlated_data, size=nsubset, replace=True)
+
subset
+
array([[ 1.08149552, -0.75473902],
+ [-0.15064708, -0.88421426],
+ [ 0.03331803, 0.51951422],
+ [ 0.58669726, 0.51967288],
+ [ 0.42511267, 0.47888931],
+ [-0.05371048, 0.15998988],
+ [ 0.13428352, 0.24026582],
+ [-0.13976786, 0.57151718],
+ [-0.74081486, 0.22763649],
+ [ 0.02037839, -0.36898269]])
The correlation coefficient of our sample is: -0.8537303444308123.
+The correlation coefficient of our sample is: -0.19742697173045337.
-
+
Now, we will bootstrap. Once again, we define a number of runs.
@@ -1002,11 +1063,62 @@ subset
+
nsubset=500
# Now, for each run
for i in range(number_runs):
    # We want to draw nsubset samples WITH REPLACEMENT
- new_pairs = rng.choice(subset, size=length_sub, replace=True)
+ new_pairs = rng.choice(correlated_data, size=nsubset, replace=True)
# Calculate and store the correlation coefficient
corr_coef_collector[i] = np.corrcoef(new_pairs[:, 0], new_pairs[:, 1])[0,1]
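Once the loop has filled corr_coef_collector, the bootstrap distribution can be summarized; a minimal sketch (the 95% percentile interval is an arbitrary choice):

# Summarize the bootstrap distribution of the correlation coefficient
boot_median = np.median(corr_coef_collector)
ci_low, ci_high = np.percentile(corr_coef_collector, [2.5, 97.5])
print('Bootstrap median:', boot_median, '95% interval:', (ci_low, ci_high))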
@@ -1024,7 +1136,7 @@ 1.2 Bootstrapping
-
+
We see that the median Pearson coefficient is not exactly the true Pearson coefficient. Why is that?
@@ -1076,7 +1188,7 @@ We now estimate \(\pi\) by using the number of points that fall inside the quarter circle relative to the total number of points in the square as an approximation of the ratio of their areas.
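A minimal sketch of this Monte Carlo estimate, assuming points drawn uniformly in the unit square with the rng defined above (the number of points is arbitrary):

# Estimate pi: the fraction of uniform points in the unit square that fall inside
# the quarter circle of radius 1 approximates pi/4
n_points = 100000
xy = rng.uniform(0, 1, size=(n_points, 2))
inside = np.sum(xy[:, 0]**2 + xy[:, 1]**2 <= 1.0)
print('pi estimate:', 4 * inside / n_points)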
@@ -1091,7 +1203,7 @@ # We now make a data frame
-crap={'station':sta,'date':date,'date_year':date_year,'east':ue,'north':un,'up':uv}
-if len(df)==0:
- df = pd.DataFrame(crap, columns = ['station', 'date','date_year','east','north','up'])
-else:
- df=pd.concat([df,pd.DataFrame(crap, columns = ['station', 'date','date_year','east','north','up'])])
-df.describe()
+
+http://geodesy.unr.edu/gps_timeseries/tenv/IGS14/P395.tenv
+P395 06JAN25 2006.0671 53760 1359 3 -123.9 3347.67917 4987420.31375 53.03678 0.0083 0.00069 0.00105 0.00327 -0.04832 0.01695 -0.31816
+
+The DataFrame columns are: station ID (SSSS), date (yymmmdd), decimal year,
+modified Julian day, GPS week, day of GPS week, longitude (degrees) of reference meridian,
+delta e (m), delta n (m), delta v (m), antenna height (m), sigma e (m), sigma n (m),
+sigma v (m), correlation en, correlation ev, correlation nv,
+new delta e (m), new delta n (m), new delta v (m).
+
+(The previous df.describe() summary is replaced by the first five rows for station P395,
+06JAN25 through 06JAN29 2006.)
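A minimal sketch of how such a DataFrame could be built from the .tenv file above; the read_csv call and the derived "new delta" columns (displacements relative to the first epoch) are assumptions, not the notebook's verbatim cell:

import pandas as pd

# Read the whitespace-delimited .tenv file (17 raw columns, no header row)
url = 'http://geodesy.unr.edu/gps_timeseries/tenv/IGS14/P395.tenv'
raw_cols = ['station ID (SSSS)', 'date (yymmmdd)', 'decimal year', 'modified Julian day',
            'GPS week', 'day of GPS week', 'longitude (degrees) of reference meridian',
            'delta e (m)', 'delta n (m)', 'delta v (m)', 'antenna height (m)',
            'sigma e (m)', 'sigma n (m)', 'sigma v (m)',
            'correlation en', 'correlation ev', 'correlation nv']
df = pd.read_csv(url, sep=r'\s+', names=raw_cols)

# Derive displacements relative to the first epoch
for old, new in [('delta e (m)', 'new delta e (m)'),
                 ('delta n (m)', 'new delta n (m)'),
                 ('delta v (m)', 'new delta v (m)')]:
    df[new] = df[old] - df[old].iloc[0]
df.head()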
@@ -1227,20 +1588,15 @@ 2.1 Plate Motion - Geodetic Data
-# Plot the GPS time series
-fig,ax=plt.subplots(3,1,figsize=(11,8),sharex=True)
-ax[0].plot(df['date_year'][df['station']==sta],df['east'][df['station']==sta]);ax[0].grid(True);ax[0].set_ylabel('Easting (mm)')
-ax[1].plot(df['date_year'][df['station']==sta],df['north'][df['station']==sta]);ax[1].grid(True);ax[1].set_ylabel('Northing (mm)')
-ax[2].plot(df['date_year'][df['station']==sta],df['up'][df['station']==sta]);ax[2].grid(True);ax[2].set_ylabel('Up (mm)')
-ax[2].set_xlabel('Time (years)')
+plt.plot(df['decimal year'], df['new delta e (m)'], label='East displacement')
-Text(0.5, 0, 'Time (years)')
+[<matplotlib.lines.Line2D at 0x319394a90>]
-
+
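The single plt.plot call above only shows the east component; a sketch of the original three-panel figure adapted to the new column names (assuming matplotlib is imported as plt and df as built above):

import matplotlib.pyplot as plt

# Plot the east, north, and up displacements of station P395 in three stacked panels
fig, ax = plt.subplots(3, 1, figsize=(11, 8), sharex=True)
components = [('new delta e (m)', 'East (m)'),
              ('new delta n (m)', 'North (m)'),
              ('new delta v (m)', 'Up (m)')]
for axis, (col, label) in zip(ax, components):
    axis.plot(df['decimal year'], df[col])
    axis.grid(True)
    axis.set_ylabel(label)
ax[2].set_xlabel('Time (years)')
plt.show()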
@@ -1260,21 +1616,389 @@ 2.2 Linear regression\(R^2\) is to one.
+# drop rows with NaN in the 'delta e (m)' column and replace df with the cleaned DataFrame
+df = df.dropna(subset=['delta e (m)'])
+
+
+
+
+
+
+df.head()
+
+(output: the same first five rows as shown above)
+
+plt.plot(df['decimal year'], df['new delta e (m)'], label='East displacement')
+
+
+
+
+[<matplotlib.lines.Line2D at 0x3195279a0>]
+
# now let's find the trends and detrend the data.
from scipy import stats
# linear regression such that: displacement = Velocity * time
# velocity in the East component.
-Ve, intercept, r_value, p_value, std_err = stats.linregress(df['date_year'][df['station']==sta],df['east'][df['station']==sta])
+
+Ve, intercept, r_value, p_value, std_err = stats.linregress(df['decimal year'],df['new delta e (m)'])
# horizontal plate motion:
-print(sta,"overall plate motion there",Ve,'mm/year')
+print(sta,"overall plate motion there",Ve,'m/year')
print("parameters: Coefficient of determination %f4.2, P-value %f4.2, standard deviation of errors %f4.2"\
%(r_value,p_value,std_err))
-P395 overall plate motion there 0.0 mm/year
-parameters: Coefficient of determination 0.0000004.2, P-value 1.0000004.2, standard deviation of errors 0.0000004.2
+P395 overall plate motion there -0.0064397312911273945 m/year
+parameters: Coefficient of determination -0.997008, P-value 0.000000, standard deviation of errors 0.000006
+
+
+
+
+
+
+np.isnan(df['new delta e (m)']).any()
+
+
+
+
+False
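With the slope and intercept from linregress, the detrending step announced in the comment above is a one-liner; a minimal sketch, assuming Ve and intercept as computed in this cell:

# Remove the best-fit linear trend from the east component
trend = intercept + Ve * df['decimal year']
east_detrended = df['new delta e (m)'] - trend
print('Standard deviation of the detrended east component (m):', east_detrended.std())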
@@ -1284,8 +2008,10 @@ 2.2 Linear regression
from sklearn.linear_model import LinearRegression
# convert the data into numpy arrays.
-E = np.asarray(df['east'][df['station']==sta]).reshape(-1, 1)# reshaping was necessary to be an argument of Linear regress
-t = np.asarray(df['date_year'][df['station']==sta]).reshape(-1, 1)
+E = np.asarray(df['new delta e (m)']).reshape(-1, 1)  # reshape to a column vector, as required by LinearRegression
+# E = np.asarray(df['east'][df['station']==sta]).reshape(-1, 1)  # previous version with the old column names
+# make the corresponding time array
+t = np.asarray(df['decimal year']).reshape(-1, 1)
tt = np.linspace(np.min(t),np.max(t),1000)
# perform the linear regression. First we will use the entire available data
@@ -1297,21 +2023,19 @@ 2.2 Linear regression # The coefficients
print('Coefficient / Velocity eastward (m/year): ', regr.coef_[0][0])
-
-plt.plot(t,E);ax[0].grid(True);ax[0].set_ylabel('East (mm)')
-plt.plot(t,Epred,color="red")
-plt.grid(True)
-plt.xticks(())
-plt.yticks(())
-plt.show()
+# plot the data
+plt.plot(t,E,'b',label='data')
-Coefficient / Velocity eastward (mm/year): 0.0
+Coefficient / Velocity eastward (m/year): -0.006439731291127403
-
+[<matplotlib.lines.Line2D at 0x318690940>]
+
+
+
To evaluate the errors of the model fit using the module sklearn, we will use the following function.
@@ -1329,7 +2053,7 @@ 2.2 Linear regression
Mean squared error (mm): 0.00
-Coefficient of determination: 1.00
+Coefficient of determination: 0.99
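A minimal sketch of such an evaluation with sklearn.metrics, assuming regr was fit on t and E as in this section:

from sklearn.metrics import mean_squared_error, r2_score

# Compare the model prediction against the observed east displacements
Epred = regr.predict(t)
print('Mean squared error (m^2): %.6f' % mean_squared_error(E, Epred))
print('Coefficient of determination: %.2f' % r2_score(E, Epred))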
@@ -1363,16 +2087,18 @@ 2.3 Bootstrapping # the data clearly shows a trend, so the bootstrap estimates of the trend are close to each other:
print("mean of the velocity estimates %f4.2 and the standard deviation %f4.2"%(np.mean(vel),np.std(vel)))
-plt.hist(vel,50);plt.title('Distribution of eastward velocities (mm/year)');plt.grid(True)
+plt.hist(vel,10);plt.title('Distribution of eastward velocities (mm/year)');plt.grid(True)
+# only show a few values in the x-axis
+
plt.show()
-mean of the velocity estimates 0.0000004.2 and the standard deviation 0.0000004.2
+mean of the velocity estimates -0.006440 and the standard deviation 0.000006
-
+
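The loop that fills vel is outside this hunk; a minimal sketch of one way to build it, resampling (time, displacement) pairs with replacement and refitting linregress each time (the number of runs is an arbitrary choice):

from scipy import stats

# Bootstrap the eastward velocity estimate
n_boot = 1000
t_all = df['decimal year'].values
e_all = df['new delta e (m)'].values
vel = np.zeros(n_boot)
for i in range(n_boot):
    idx = rng.integers(0, len(t_all), size=len(t_all))   # indices drawn with replacement
    vel[i] = stats.linregress(t_all[idx], e_all[idx]).slope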
@@ -1414,10 +2140,10 @@ 2.4 Cross validation
-<matplotlib.legend.Legend at 0x30baba520>
+<matplotlib.legend.Legend at 0x3196996a0>
-
+
Now fit the data and evaluate the error
@@ -1452,16 +2178,16 @@ 2.4 Cross validation
-Training set: Coefficient / Velocity eastward (mm/year): 0.0
+Training set: Coefficient / Velocity eastward (mm/year): -0.006437910455226973
MSE (mean square error) on training set (mm): 0.00
-Coefficient of determination on training set: 1.00
-MSE on validation set (mm): 0.00 and coefficient of determiniation on 1.00
+Coefficient of determination on training set: 0.99
+MSE on validation set (mm): 0.00 and coefficient of determiniation on 0.99
Text(0.5, 1.0, 'Random selection for data split')
-
+
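A minimal sketch of the random split, assuming sklearn's train_test_split and the arrays t and E defined above (the 20% validation fraction is an arbitrary choice):

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Randomly hold out 20% of the samples for validation, fit on the rest
t_train, t_val, E_train, E_val = train_test_split(t, E, test_size=0.2, random_state=42)
regr_cv = LinearRegression().fit(t_train, E_train)
print('Training velocity (m/year):', regr_cv.coef_[0][0])
print('Validation R^2:', regr_cv.score(t_val, E_val))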
We can also select the training and validation sets chronologically. If the “state” of the data changes through time, this may induce a bias in the training. But let’s see.
@@ -1498,14 +2224,14 @@ 2.4 Cross validation
- Training set: Coefficient / Velocity eastward (mm/year): 0.0
-Validation set MSE (mm) and Coef of Determination: 0.00,1.00
+ Training set: Coefficient / Velocity eastward (mm/year): -0.006250720119447979
+Validation set MSE (mm) and Coef of Determination: 0.00,0.92
Text(0.5, 1.0, 'Chronological selection for data split')
-
+
Now you see that the choice of training versus validation data is important for fitting a model that will generalize.
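For comparison, a sketch of a chronological split that simply cuts the time-ordered arrays at 80% (assuming t and E are sorted by time, as they are here):

from sklearn.linear_model import LinearRegression

# Train on the first 80% of the epochs, validate on the most recent 20%
split = int(0.8 * len(t))
regr_chrono = LinearRegression().fit(t[:split], E[:split])
print('Training velocity (m/year):', regr_chrono.coef_[0][0])
print('Validation R^2:', regr_chrono.score(t[split:], E[split:]))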
@@ -1560,7 +2286,7 @@ 2.5 Leave One Out Cross Validation
-