Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
ZihengSun committed Mar 17, 2023
1 parent d7dd5f1 commit fe14199
Show file tree
Hide file tree
Showing 7 changed files with 276 additions and 0 deletions.
29 changes: 29 additions & 0 deletions code/data_preparation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# import functions and packages
from functions_book_chapter_SCA import *

dir_raster = './data/planet/20180528_181110_1025_3B_AnalyticMS_SR_clip.tif'
planet = rasterio.open(dir_raster).read()/10000
planet = np.where(planet[0,:,:] == 0, np.nan, planet) # the default nan data value is 0, replace with np.nan

fig, axs = plt.subplots(2,2,figsize=(15,7.5))
im1 = axs[0,0].imshow(planet[0,:,:],cmap='jet')
axs[0,0].set_title("Surface reflectance of blue band", fontsize=16)

im2 = axs[0,1].imshow(planet[1,:,:], cmap='jet')
axs[0,1].set_title("Surface reflectance of green band", fontsize=16)

im3 = axs[1,0].imshow(planet[2,:,:], cmap='jet')
axs[1,0].set_title("Surface reflectance of red band", fontsize=16)

im4 = axs[1,1].imshow(planet[3,:,:], cmap='jet')
axs[1,1].set_title("Surface reflectance of NIR band", fontsize=16)

cbar_ax = fig.add_axes([0.95, 0.15, 0.02, 0.7])
fig.colorbar(im1, cax=cbar_ax)

# read model input features and labels
data = pd.read_csv('./data/samples/sample_100K.csv', index_col = False)
print("Sample dimentions:".format(), data.shape)
print(data.head())
X = data[['blue','green','red','nir']]
y = data['label']
71 changes: 71 additions & 0 deletions code/hyper_parameter_tuning.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# prepare data
data = pd.read_csv('./data/samples/sample_10K.csv', index_col = False)
print("Sample dimentions:".format(), data.shape)
data.head()
X = data[['blue','green','red','nir']]
y = data['label']

# customize models with different sample sizes
models = get_models_size()
results, names = list(), list()
for name, model in models.items():
# evaluate models using k-fold cross-validation
scores = evaluate_model(model, X, y)
results.append(scores)
names.append(name)
# print the mean and standard deviation of models
print('>%s Mean Score: %.6f (Score SD: %.6f)' % ('Sample size: ' + str(int(float(name) * 10000)), scores.mean(), scores.std()))

# display model performance
plt.figure(figsize=(10,5))
plt.boxplot(results, labels=names, showmeans=True)
plt.show()

# customize models with different model feature sizes
models = get_models_feature()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
# evaluate models using k-fold cross-validation
scores = evaluate_model(model, X, y)
results.append(scores)
names.append(name)
# print the mean and standard deviation of models
# print('>%s %.3f (%.3f)' % (name, scores.mean(), scores.std()))
print('>%s Mean Score: %.6f (Score SD: %.6f)' % ('Features: ' + name, scores.mean(), scores.std()))
# display model performance
plt.boxplot(results, labels=names, showmeans=True)
plt.show()

# customize models with different tree numbers
models = get_models_tree()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
# evaluate models using k-fold cross-validation
scores = evaluate_model(model, X, y)
results.append(scores)
names.append(name)
# print the mean and standard deviation of models
# print('>%s %.3f (%.3f)' % (name, scores.mean(), scores.std()))
print('>%s Mean Score: %.6f (Score SD: %.6f)' % ('Tree numbers: ' + name, scores.mean(), scores.std()))
# display model performance
plt.boxplot(results, labels=names, showmeans=True)
plt.show()

# customize models with different tree depths
models = get_models_depth()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
# evaluate models using k-fold cross-validation
scores = evaluate_model(model, X, y)
results.append(scores)
names.append(name)
# print the mean and standard deviation of models
# print('>%s %.3f (%.3f)' % (name, scores.mean(), scores.std()))
print('>%s Mean Score: %.6f (Score SD: %.6f)' % ('Tree Depth: ' + name, scores.mean(), scores.std()))
# display model performance
plt.figure(figsize=(10,5))
plt.boxplot(results, labels=names, showmeans=True)
plt.show()
42 changes: 42 additions & 0 deletions code/modeL-training.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# read model input features and labels
data = pd.read_csv('./data/samples/sample_100K.csv', index_col = False)
print("Sample dimentions:".format(), data.shape)
print(data.head())
X = data[['blue','green','red','nir']]
y = data['label']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.96,random_state=1)

# define the model
model = RandomForestClassifier(n_estimators=10, max_depth=10, max_features=4)

# evaluate the model
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=1000)
n_scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
# report model performance
print('Mean Score: %.6f (SD: %.6f)' % (n_scores.mean(),n_scores.std()))

# the histogram of the scores
n, bins, patches = plt.hist(n_scores, density=True, facecolor='blue', alpha=0.75)
plt.text(0.91, 15, r'mean = ' + str(n_scores.mean().round(6)) + ' '+ 'SD = ' + str(n_scores.std().round(6)))
plt.xlim(0.9, 1.01)
plt.xlabel('Acuuracy')
plt.ylabel('Probability (%)')
plt.grid(True)
plt.show()

model.fit(X_train,y_train)
result = permutation_importance(model, X_train, y_train, n_repeats=1000, random_state=42, n_jobs=2)
print('Permutation importance - average:'.format(), X_train.columns)
print([round(i, 6) for i in result.importances_mean])

# displace feature importance
fig, ax = plt.subplots(figsize=(6,5))
ax.boxplot(result.importances.T)
ax.set_title("Permutation Importances", fontsize = 16)
ax.set_xticklabels(labels=X_train.columns, fontsize=14)
plt.show()

# save model
dir_model = "./models/random_forest_SCA_binary.joblib"
joblib.dump(model, dir_model)
91 changes: 91 additions & 0 deletions code/model_evaluation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
dir_aso = './data/ASO/ASO_3M_SD_USCATE_20180528_clip.tif'
raso = rasterio.open(dir_aso,'r').read()
raso = np.where(raso[0,:,:] < 0, np.nan, raso)

th = 0.1 # using 10 cm threshold
raso_binary = np.where(raso >= 0.1, 1, 0) # if the SD is higher than 10 cm, then snow; otherwise, no-snow

fig, axs = plt.subplots(1,2,figsize=(16,6))
im1 = axs[0].imshow(raso[0,:,:],cmap = 'Blues',vmin = 0, vmax = 5)
axs[0].set_title('ASO snow depth', fontsize=16)
fig.colorbar(im1, ax = axs[0], label = 'snow depth (meter)', extend = 'max')

im2 = axs[1].imshow(raso_binary[0,:,:], cmap = 'gray', interpolation = 'none')
axs[1].set_title('ASO snow cover (TH = 10 cm)', fontsize=16)

dir_model = "./models/random_forest_SCA_binary.joblib"
# load model
model = joblib.load(dir_model)

df = pd.DataFrame()
df['obs'] = y_test
df['predict'] = model.predict(X_test)

# Cross-tabulate predictions
print(pd.crosstab(df['obs'], df['predict'], margins=True))
print(calculate_metrics(df))

dir_raster = './data/planet/20180528_181110_1025_3B_AnalyticMS_SR_clip.tif'
dir_out = './data/SCA/'
nodata_flag = 9
run_sca_prediction(dir_raster, dir_out, nodata_flag, model)

dir_planet = './data/planet/20180528_181110_1025_3B_AnalyticMS_SR_clip.tif'
r_na_flag = rasterio.open(dir_planet, 'r').read()
r_planet = rasterio.open(dir_planet, 'r').read([4,3,2])/10000

dir_sca = './data/SCA/20180528_181110_1025_3B_AnalyticMS_SR_clip_SCA.tif'
r_sca = rasterio.open(dir_sca, 'r')

fig, (ax1, ax2) = plt.subplots(1,2, figsize = (16,4))
show(r_planet, ax= ax1, cmap='jet', interpolation = 'none',title = 'Planet false color Image')
show(r_sca.read().squeeze(), ax= ax2, interpolation = 'none',title = 'Planet Snow Cover')

dir_planet_ext = './data/GIS/extent/CATE_20180528_181110_img_ext.shp'
with fiona.open(dir_planet_ext, "r") as shapefile:
shapes = [feature["geometry"] for feature in shapefile]

dir_aso = "./data/ASO/ASO_3M_SD_USCATE_20180528_binary_clip.tif"
with rasterio.open(dir_aso,'r') as src:
r_aso = rasterio.mask.mask(src, shapes, crop=True, filled = False)

dir_pred = './data/SCA/20180528_181110_1025_3B_AnalyticMS_SR_clip_SCA.tif'
with rasterio.open(dir_pred,'r') as src:
r_predict = rasterio.mask.mask(src, shapes, crop=True, filled = False)

dir_watermask = './data/mask/waterbody_TB_UTM11_clip.tif'
with rasterio.open(dir_watermask,'r') as src:
r_watermask = rasterio.mask.mask(src, shapes, crop=True, filled = False)

dir_glaciermask = './data/mask/02_rgi60_WesternCanadaUS_hypso_TB_clip.tif'
with rasterio.open(dir_glaciermask,'r') as src:
r_glaciermask = rasterio.mask.mask(src, shapes, crop=True, filled = False)


df = pd.DataFrame()
df['predict'] = r_predict[0].ravel()
df['obs'] = r_aso[0].ravel()
df['watermask'] = r_watermask[0].ravel()
df['glaciermask'] = r_watermask[0].ravel()

# remove NA data region, water bodies, and glaciers
df_mask = df[(df.predict >= 0) & (df.watermask != 0) & (df.glaciermask != 0)]
# print(df)

print("overall model performance:")
print(calculate_metrics(df_mask))

file_landcover = './data/ASO/ASO_3M_CHM_USCATB_20140827_binary_clip.tif' # 1 - forest, 0 open area
with rasterio.open(file_landcover,'r') as src:
r_landcover = rasterio.mask.mask(src, shapes, crop=True, filled = False)

df['landcover'] = r_landcover[0].ravel()

df_mask = df[(df.predict >= 0) & (df.watermask != 0) & (df.glaciermask != 0)]
df_open = df_mask[df_mask.landcover == 0]
print("Model performance in open areas:")
print(calculate_metrics(df_open))

df_forest = df_mask[df_mask.landcover == 1]
print("Model performance in forested areas:")
print(calculate_metrics(df_forest))
33 changes: 33 additions & 0 deletions code/process.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
[{
"id" : "rgfp44",
"name" : "data_preparation",
"description" : "python",
"code" : "# import functions and packages\nfrom functions_book_chapter_SCA import *\n\ndir_raster = './data/planet/20180528_181110_1025_3B_AnalyticMS_SR_clip.tif'\nplanet = rasterio.open(dir_raster).read()/10000\nplanet = np.where(planet[0,:,:] == 0, np.nan, planet) # the default nan data value is 0, replace with np.nan\n\nfig, axs = plt.subplots(2,2,figsize=(15,7.5))\nim1 = axs[0,0].imshow(planet[0,:,:],cmap='jet')\naxs[0,0].set_title(\"Surface reflectance of blue band\", fontsize=16)\n\nim2 = axs[0,1].imshow(planet[1,:,:], cmap='jet')\naxs[0,1].set_title(\"Surface reflectance of green band\", fontsize=16)\n\nim3 = axs[1,0].imshow(planet[2,:,:], cmap='jet')\naxs[1,0].set_title(\"Surface reflectance of red band\", fontsize=16)\n\nim4 = axs[1,1].imshow(planet[3,:,:], cmap='jet')\naxs[1,1].set_title(\"Surface reflectance of NIR band\", fontsize=16)\n\ncbar_ax = fig.add_axes([0.95, 0.15, 0.02, 0.7])\nfig.colorbar(im1, cax=cbar_ax)\n\n# read model input features and labels \ndata = pd.read_csv('./data/samples/sample_100K.csv', index_col = False)\nprint(\"Sample dimentions:\".format(), data.shape)\nprint(data.head())\nX = data[['blue','green','red','nir']]\ny = data['label']",
"lang" : "python",
"owner" : "111111",
"confidential" : "FALSE"
},{
"id" : "4p0kit",
"name" : "hyper_parameter_tuning",
"description" : "python",
"code" : "# prepare data \ndata = pd.read_csv('./data/samples/sample_10K.csv', index_col = False)\nprint(\"Sample dimentions:\".format(), data.shape)\ndata.head()\nX = data[['blue','green','red','nir']]\ny = data['label']\n\n# customize models with different sample sizes\nmodels = get_models_size()\nresults, names = list(), list()\nfor name, model in models.items():\n # evaluate models using k-fold cross-validation\n scores = evaluate_model(model, X, y)\n results.append(scores)\n names.append(name)\n # print the mean and standard deviation of models \n print('>%s Mean Score: %.6f (Score SD: %.6f)' % ('Sample size: ' + str(int(float(name) * 10000)), scores.mean(), scores.std()))\n \n# display model performance \nplt.figure(figsize=(10,5))\nplt.boxplot(results, labels=names, showmeans=True)\nplt.show()\n\n# customize models with different model feature sizes\nmodels = get_models_feature()\n# evaluate the models and store results\nresults, names = list(), list()\nfor name, model in models.items():\n # evaluate models using k-fold cross-validation\n scores = evaluate_model(model, X, y)\n results.append(scores)\n names.append(name)\n # print the mean and standard deviation of models \n # print('>%s %.3f (%.3f)' % (name, scores.mean(), scores.std()))\n print('>%s Mean Score: %.6f (Score SD: %.6f)' % ('Features: ' + name, scores.mean(), scores.std()))\n# display model performance \nplt.boxplot(results, labels=names, showmeans=True)\nplt.show()\n\n# customize models with different tree numbers\nmodels = get_models_tree()\n# evaluate the models and store results\nresults, names = list(), list()\nfor name, model in models.items():\n # evaluate models using k-fold cross-validation\n scores = evaluate_model(model, X, y)\n results.append(scores)\n names.append(name)\n # print the mean and standard deviation of models \n # print('>%s %.3f (%.3f)' % (name, scores.mean(), scores.std()))\n print('>%s Mean Score: %.6f (Score SD: %.6f)' % ('Tree numbers: ' + name, scores.mean(), scores.std()))\n# display model performance \nplt.boxplot(results, labels=names, showmeans=True)\nplt.show()\n\n# customize models with different tree depths\nmodels = get_models_depth()\n# evaluate the models and store results\nresults, names = list(), list()\nfor name, model in models.items():\n # evaluate models using k-fold cross-validation\n scores = evaluate_model(model, X, y)\n results.append(scores)\n names.append(name)\n # print the mean and standard deviation of models \n # print('>%s %.3f (%.3f)' % (name, scores.mean(), scores.std()))\n print('>%s Mean Score: %.6f (Score SD: %.6f)' % ('Tree Depth: ' + name, scores.mean(), scores.std()))\n# display model performance \nplt.figure(figsize=(10,5))\nplt.boxplot(results, labels=names, showmeans=True)\nplt.show()",
"lang" : "python",
"owner" : "111111",
"confidential" : "FALSE"
},{
"id" : "xcw4h1",
"name" : "model_evaluation",
"description" : "python",
"code" : "dir_aso = './data/ASO/ASO_3M_SD_USCATE_20180528_clip.tif'\nraso = rasterio.open(dir_aso,'r').read()\nraso = np.where(raso[0,:,:] < 0, np.nan, raso)\n\nth = 0.1 # using 10 cm threshold\nraso_binary = np.where(raso >= 0.1, 1, 0) # if the SD is higher than 10 cm, then snow; otherwise, no-snow\n\nfig, axs = plt.subplots(1,2,figsize=(16,6))\nim1 = axs[0].imshow(raso[0,:,:],cmap = 'Blues',vmin = 0, vmax = 5)\naxs[0].set_title('ASO snow depth', fontsize=16)\nfig.colorbar(im1, ax = axs[0], label = 'snow depth (meter)', extend = 'max')\n\nim2 = axs[1].imshow(raso_binary[0,:,:], cmap = 'gray', interpolation = 'none')\naxs[1].set_title('ASO snow cover (TH = 10 cm)', fontsize=16)\n\ndir_model = \"./models/random_forest_SCA_binary.joblib\"\n# load model\nmodel = joblib.load(dir_model)\n\ndf = pd.DataFrame()\ndf['obs'] = y_test\ndf['predict'] = model.predict(X_test)\n\n# Cross-tabulate predictions\nprint(pd.crosstab(df['obs'], df['predict'], margins=True))\nprint(calculate_metrics(df))\n\ndir_raster = './data/planet/20180528_181110_1025_3B_AnalyticMS_SR_clip.tif'\ndir_out = './data/SCA/'\nnodata_flag = 9\nrun_sca_prediction(dir_raster, dir_out, nodata_flag, model)\n\ndir_planet = './data/planet/20180528_181110_1025_3B_AnalyticMS_SR_clip.tif'\nr_na_flag = rasterio.open(dir_planet, 'r').read()\nr_planet = rasterio.open(dir_planet, 'r').read([4,3,2])/10000\n\ndir_sca = './data/SCA/20180528_181110_1025_3B_AnalyticMS_SR_clip_SCA.tif'\nr_sca = rasterio.open(dir_sca, 'r')\n\nfig, (ax1, ax2) = plt.subplots(1,2, figsize = (16,4))\nshow(r_planet, ax= ax1, cmap='jet', interpolation = 'none',title = 'Planet false color Image')\nshow(r_sca.read().squeeze(), ax= ax2, interpolation = 'none',title = 'Planet Snow Cover')\n\ndir_planet_ext = './data/GIS/extent/CATE_20180528_181110_img_ext.shp'\nwith fiona.open(dir_planet_ext, \"r\") as shapefile:\n shapes = [feature[\"geometry\"] for feature in shapefile]\n \ndir_aso = \"./data/ASO/ASO_3M_SD_USCATE_20180528_binary_clip.tif\"\nwith rasterio.open(dir_aso,'r') as src:\n r_aso = rasterio.mask.mask(src, shapes, crop=True, filled = False)\n \ndir_pred = './data/SCA/20180528_181110_1025_3B_AnalyticMS_SR_clip_SCA.tif'\nwith rasterio.open(dir_pred,'r') as src:\n r_predict = rasterio.mask.mask(src, shapes, crop=True, filled = False)\n\ndir_watermask = './data/mask/waterbody_TB_UTM11_clip.tif'\nwith rasterio.open(dir_watermask,'r') as src:\n r_watermask = rasterio.mask.mask(src, shapes, crop=True, filled = False)\n \ndir_glaciermask = './data/mask/02_rgi60_WesternCanadaUS_hypso_TB_clip.tif'\nwith rasterio.open(dir_glaciermask,'r') as src:\n r_glaciermask = rasterio.mask.mask(src, shapes, crop=True, filled = False)\n \n \ndf = pd.DataFrame()\ndf['predict'] = r_predict[0].ravel()\ndf['obs'] = r_aso[0].ravel()\ndf['watermask'] = r_watermask[0].ravel()\ndf['glaciermask'] = r_watermask[0].ravel()\n\n# remove NA data region, water bodies, and glaciers \ndf_mask = df[(df.predict >= 0) & (df.watermask != 0) & (df.glaciermask != 0)]\n# print(df)\n\nprint(\"overall model performance:\")\nprint(calculate_metrics(df_mask))\n\nfile_landcover = './data/ASO/ASO_3M_CHM_USCATB_20140827_binary_clip.tif' # 1 - forest, 0 open area\nwith rasterio.open(file_landcover,'r') as src:\n r_landcover = rasterio.mask.mask(src, shapes, crop=True, filled = False)\n \ndf['landcover'] = r_landcover[0].ravel()\n\ndf_mask = df[(df.predict >= 0) & (df.watermask != 0) & (df.glaciermask != 0)]\ndf_open = df_mask[df_mask.landcover == 0]\nprint(\"Model performance in open areas:\")\nprint(calculate_metrics(df_open))\n\ndf_forest = df_mask[df_mask.landcover == 1]\nprint(\"Model performance in forested areas:\")\nprint(calculate_metrics(df_forest))",
"lang" : "python",
"owner" : "111111",
"confidential" : "FALSE"
},{
"id" : "8e59lt",
"name" : "modeL-training",
"description" : "python",
"code" : "# read model input features and labels \ndata = pd.read_csv('./data/samples/sample_100K.csv', index_col = False)\nprint(\"Sample dimentions:\".format(), data.shape)\nprint(data.head())\nX = data[['blue','green','red','nir']]\ny = data['label']\n\nX_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.96,random_state=1)\n\n# define the model\nmodel = RandomForestClassifier(n_estimators=10, max_depth=10, max_features=4)\n\n# evaluate the model\ncv = RepeatedStratifiedKFold(n_splits=10, n_repeats=1000)\nn_scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)\n# report model performance\nprint('Mean Score: %.6f (SD: %.6f)' % (n_scores.mean(),n_scores.std()))\n\n# the histogram of the scores\nn, bins, patches = plt.hist(n_scores, density=True, facecolor='blue', alpha=0.75)\nplt.text(0.91, 15, r'mean = ' + str(n_scores.mean().round(6)) + ' '+ 'SD = ' + str(n_scores.std().round(6)))\nplt.xlim(0.9, 1.01)\nplt.xlabel('Acuuracy')\nplt.ylabel('Probability (%)')\nplt.grid(True)\nplt.show()\n\nmodel.fit(X_train,y_train)\nresult = permutation_importance(model, X_train, y_train, n_repeats=1000, random_state=42, n_jobs=2)\nprint('Permutation importance - average:'.format(), X_train.columns)\nprint([round(i, 6) for i in result.importances_mean])\n\n# displace feature importance\nfig, ax = plt.subplots(figsize=(6,5))\nax.boxplot(result.importances.T)\nax.set_title(\"Permutation Importances\", fontsize = 16)\nax.set_xticklabels(labels=X_train.columns, fontsize=14)\nplt.show()\n\n# save model \ndir_model = \"./models/random_forest_SCA_binary.joblib\"\njoblib.dump(model, dir_model)",
"lang" : "python",
"owner" : "111111",
"confidential" : "FALSE"
}]
1 change: 1 addition & 0 deletions history/m8s75d7cn7seb8mf2z2q.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[]
Loading

0 comments on commit fe14199

Please sign in to comment.