first commit

geo-smart · Mar 17, 2023 · fe14199 · fe14199
1 parent d7dd5f1
commit fe14199
Show file tree

Hide file tree

Showing 7 changed files with 276 additions and 0 deletions.
diff --git a/code/data_preparation.py b/code/data_preparation.py
@@ -0,0 +1,29 @@
+# import functions and packages
+from functions_book_chapter_SCA import *
+
+dir_raster = './data/planet/20180528_181110_1025_3B_AnalyticMS_SR_clip.tif'
+planet = rasterio.open(dir_raster).read()/10000
+planet = np.where(planet[0,:,:] == 0, np.nan, planet) # the default nan data value is 0, replace with np.nan
+
+fig, axs = plt.subplots(2,2,figsize=(15,7.5))
+im1 = axs[0,0].imshow(planet[0,:,:],cmap='jet')
+axs[0,0].set_title("Surface reflectance of blue band", fontsize=16)
+
+im2 = axs[0,1].imshow(planet[1,:,:], cmap='jet')
+axs[0,1].set_title("Surface reflectance of green band", fontsize=16)
+
+im3 = axs[1,0].imshow(planet[2,:,:], cmap='jet')
+axs[1,0].set_title("Surface reflectance of red band", fontsize=16)
+
+im4 = axs[1,1].imshow(planet[3,:,:], cmap='jet')
+axs[1,1].set_title("Surface reflectance of NIR band", fontsize=16)
+
+cbar_ax = fig.add_axes([0.95, 0.15, 0.02, 0.7])
+fig.colorbar(im1, cax=cbar_ax)
+
+# read model input features and labels 
+data = pd.read_csv('./data/samples/sample_100K.csv', index_col = False)
+print("Sample dimentions:".format(), data.shape)
+print(data.head())
+X = data[['blue','green','red','nir']]
+y = data['label']
diff --git a/code/hyper_parameter_tuning.py b/code/hyper_parameter_tuning.py
@@ -0,0 +1,71 @@
+# prepare data 
+data = pd.read_csv('./data/samples/sample_10K.csv', index_col = False)
+print("Sample dimentions:".format(), data.shape)
+data.head()
+X = data[['blue','green','red','nir']]
+y = data['label']
+
+# customize models with different sample sizes
+models = get_models_size()
+results, names = list(), list()
+for name, model in models.items():
+    # evaluate models using k-fold cross-validation
+    scores = evaluate_model(model, X, y)
+    results.append(scores)
+    names.append(name)
+    # print the mean and standard deviation of models 
+    print('>%s   Mean Score: %.6f (Score SD: %.6f)' % ('Sample size: ' + str(int(float(name) * 10000)), scores.mean(), scores.std()))
+
+# display model performance 
+plt.figure(figsize=(10,5))
+plt.boxplot(results, labels=names, showmeans=True)
+plt.show()
+
+# customize models with different model feature sizes
+models = get_models_feature()
+# evaluate the models and store results
+results, names = list(), list()
+for name, model in models.items():
+    # evaluate models using k-fold cross-validation
+    scores = evaluate_model(model, X, y)
+    results.append(scores)
+    names.append(name)
+    # print the mean and standard deviation of models 
+    # print('>%s %.3f (%.3f)' % (name, scores.mean(), scores.std()))
+    print('>%s   Mean Score: %.6f (Score SD: %.6f)' % ('Features: ' + name, scores.mean(), scores.std()))
+# display model performance 
+plt.boxplot(results, labels=names, showmeans=True)
+plt.show()
+
+# customize models with different tree numbers
+models = get_models_tree()
+# evaluate the models and store results
+results, names = list(), list()
+for name, model in models.items():
+    # evaluate models using k-fold cross-validation
+    scores = evaluate_model(model, X, y)
+    results.append(scores)
+    names.append(name)
+    # print the mean and standard deviation of models 
+    # print('>%s %.3f (%.3f)' % (name, scores.mean(), scores.std()))
+    print('>%s   Mean Score: %.6f (Score SD: %.6f)' % ('Tree numbers: ' + name, scores.mean(), scores.std()))
+# display model performance 
+plt.boxplot(results, labels=names, showmeans=True)
+plt.show()
+
+# customize models with different tree depths
+models = get_models_depth()
+# evaluate the models and store results
+results, names = list(), list()
+for name, model in models.items():
+     # evaluate models using k-fold cross-validation
+    scores = evaluate_model(model, X, y)
+    results.append(scores)
+    names.append(name)
+    # print the mean and standard deviation of models 
+    # print('>%s %.3f (%.3f)' % (name, scores.mean(), scores.std()))
+    print('>%s   Mean Score: %.6f (Score SD: %.6f)' % ('Tree Depth: ' + name, scores.mean(), scores.std()))
+# display model performance 
+plt.figure(figsize=(10,5))
+plt.boxplot(results, labels=names, showmeans=True)
+plt.show()
diff --git a/code/modeL-training.py b/code/modeL-training.py
@@ -0,0 +1,42 @@
+# read model input features and labels 
+data = pd.read_csv('./data/samples/sample_100K.csv', index_col = False)
+print("Sample dimentions:".format(), data.shape)
+print(data.head())
+X = data[['blue','green','red','nir']]
+y = data['label']
+
+X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.96,random_state=1)
+
+# define the model
+model = RandomForestClassifier(n_estimators=10, max_depth=10, max_features=4)
+
+# evaluate the model
+cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=1000)
+n_scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
+# report model performance
+print('Mean Score: %.6f (SD: %.6f)' % (n_scores.mean(),n_scores.std()))
+
+# the histogram of the scores
+n, bins, patches = plt.hist(n_scores, density=True, facecolor='blue', alpha=0.75)
+plt.text(0.91, 15, r'mean = ' + str(n_scores.mean().round(6)) + '  '+ 'SD = ' + str(n_scores.std().round(6)))
+plt.xlim(0.9, 1.01)
+plt.xlabel('Acuuracy')
+plt.ylabel('Probability (%)')
+plt.grid(True)
+plt.show()
+
+model.fit(X_train,y_train)
+result = permutation_importance(model, X_train, y_train, n_repeats=1000, random_state=42, n_jobs=2)
+print('Permutation importance - average:'.format(), X_train.columns)
+print([round(i, 6) for i in result.importances_mean])
+
+# displace feature importance
+fig, ax = plt.subplots(figsize=(6,5))
+ax.boxplot(result.importances.T)
+ax.set_title("Permutation Importances", fontsize = 16)
+ax.set_xticklabels(labels=X_train.columns, fontsize=14)
+plt.show()
+
+# save model 
+dir_model = "./models/random_forest_SCA_binary.joblib"
+joblib.dump(model, dir_model)
diff --git a/code/model_evaluation.py b/code/model_evaluation.py
@@ -0,0 +1,91 @@
+dir_aso = './data/ASO/ASO_3M_SD_USCATE_20180528_clip.tif'
+raso = rasterio.open(dir_aso,'r').read()
+raso = np.where(raso[0,:,:] < 0, np.nan, raso)
+
+th = 0.1 # using 10 cm threshold
+raso_binary = np.where(raso >= 0.1, 1, 0) # if the SD is higher than 10 cm, then snow; otherwise, no-snow
+
+fig, axs = plt.subplots(1,2,figsize=(16,6))
+im1 = axs[0].imshow(raso[0,:,:],cmap = 'Blues',vmin = 0, vmax = 5)
+axs[0].set_title('ASO snow depth', fontsize=16)
+fig.colorbar(im1, ax = axs[0], label = 'snow depth (meter)', extend = 'max')
+
+im2 = axs[1].imshow(raso_binary[0,:,:], cmap = 'gray', interpolation = 'none')
+axs[1].set_title('ASO snow cover (TH = 10 cm)', fontsize=16)
+
+dir_model = "./models/random_forest_SCA_binary.joblib"
+# load model
+model = joblib.load(dir_model)
+
+df = pd.DataFrame()
+df['obs'] = y_test
+df['predict'] = model.predict(X_test)
+
+# Cross-tabulate predictions
+print(pd.crosstab(df['obs'], df['predict'], margins=True))
+print(calculate_metrics(df))
+
+dir_raster = './data/planet/20180528_181110_1025_3B_AnalyticMS_SR_clip.tif'
+dir_out = './data/SCA/'
+nodata_flag = 9
+run_sca_prediction(dir_raster, dir_out, nodata_flag, model)
+
+dir_planet = './data/planet/20180528_181110_1025_3B_AnalyticMS_SR_clip.tif'
+r_na_flag = rasterio.open(dir_planet, 'r').read()
+r_planet = rasterio.open(dir_planet, 'r').read([4,3,2])/10000
+
+dir_sca = './data/SCA/20180528_181110_1025_3B_AnalyticMS_SR_clip_SCA.tif'
+r_sca = rasterio.open(dir_sca, 'r')
+
+fig, (ax1, ax2) = plt.subplots(1,2, figsize = (16,4))
+show(r_planet, ax= ax1, cmap='jet', interpolation = 'none',title = 'Planet false color Image')
+show(r_sca.read().squeeze(), ax= ax2, interpolation = 'none',title = 'Planet Snow Cover')
+
+dir_planet_ext = './data/GIS/extent/CATE_20180528_181110_img_ext.shp'
+with fiona.open(dir_planet_ext, "r") as shapefile:
+    shapes = [feature["geometry"] for feature in shapefile]
+
+dir_aso = "./data/ASO/ASO_3M_SD_USCATE_20180528_binary_clip.tif"
+with rasterio.open(dir_aso,'r') as src:
+    r_aso = rasterio.mask.mask(src, shapes, crop=True, filled = False)
+
+dir_pred = './data/SCA/20180528_181110_1025_3B_AnalyticMS_SR_clip_SCA.tif'
+with rasterio.open(dir_pred,'r') as src:
+    r_predict = rasterio.mask.mask(src, shapes, crop=True, filled = False)
+
+dir_watermask = './data/mask/waterbody_TB_UTM11_clip.tif'
+with rasterio.open(dir_watermask,'r') as src:
+    r_watermask = rasterio.mask.mask(src, shapes, crop=True, filled = False)
+
+dir_glaciermask = './data/mask/02_rgi60_WesternCanadaUS_hypso_TB_clip.tif'
+with rasterio.open(dir_glaciermask,'r') as src:
+    r_glaciermask = rasterio.mask.mask(src, shapes, crop=True, filled = False)
+
+
+df = pd.DataFrame()
+df['predict'] = r_predict[0].ravel()
+df['obs'] = r_aso[0].ravel()
+df['watermask'] = r_watermask[0].ravel()
+df['glaciermask'] = r_watermask[0].ravel()
+
+# remove NA data region, water bodies, and glaciers 
+df_mask = df[(df.predict >= 0) & (df.watermask != 0) & (df.glaciermask != 0)]
+# print(df)
+
+print("overall model performance:")
+print(calculate_metrics(df_mask))
+
+file_landcover = './data/ASO/ASO_3M_CHM_USCATB_20140827_binary_clip.tif' # 1 - forest, 0 open area
+with rasterio.open(file_landcover,'r') as src:
+    r_landcover = rasterio.mask.mask(src, shapes, crop=True, filled = False)
+
+df['landcover'] = r_landcover[0].ravel()
+
+df_mask = df[(df.predict >= 0) & (df.watermask != 0) & (df.glaciermask != 0)]
+df_open = df_mask[df_mask.landcover == 0]
+print("Model performance in open areas:")
+print(calculate_metrics(df_open))
+
+df_forest = df_mask[df_mask.landcover == 1]
+print("Model performance in forested areas:")
+print(calculate_metrics(df_forest))
diff --git a/code/process.json b/code/process.json
@@ -0,0 +1,33 @@
+[{
+  "id" : "rgfp44",
+  "name" : "data_preparation",
+  "description" : "python",
+  "code" : "# import functions and packages\nfrom functions_book_chapter_SCA import *\n\ndir_raster = './data/planet/20180528_181110_1025_3B_AnalyticMS_SR_clip.tif'\nplanet = rasterio.open(dir_raster).read()/10000\nplanet = np.where(planet[0,:,:] == 0, np.nan, planet) # the default nan data value is 0, replace with np.nan\n\nfig, axs = plt.subplots(2,2,figsize=(15,7.5))\nim1 = axs[0,0].imshow(planet[0,:,:],cmap='jet')\naxs[0,0].set_title(\"Surface reflectance of blue band\", fontsize=16)\n\nim2 = axs[0,1].imshow(planet[1,:,:], cmap='jet')\naxs[0,1].set_title(\"Surface reflectance of green band\", fontsize=16)\n\nim3 = axs[1,0].imshow(planet[2,:,:], cmap='jet')\naxs[1,0].set_title(\"Surface reflectance of red band\", fontsize=16)\n\nim4 = axs[1,1].imshow(planet[3,:,:], cmap='jet')\naxs[1,1].set_title(\"Surface reflectance of NIR band\", fontsize=16)\n\ncbar_ax = fig.add_axes([0.95, 0.15, 0.02, 0.7])\nfig.colorbar(im1, cax=cbar_ax)\n\n# read model input features and labels \ndata = pd.read_csv('./data/samples/sample_100K.csv', index_col = False)\nprint(\"Sample dimentions:\".format(), data.shape)\nprint(data.head())\nX = data[['blue','green','red','nir']]\ny = data['label']",
+  "lang" : "python",
+  "owner" : "111111",
+  "confidential" : "FALSE"
+},{
+  "id" : "4p0kit",
+  "name" : "hyper_parameter_tuning",
+  "description" : "python",
+  "code" : "# prepare data \ndata = pd.read_csv('./data/samples/sample_10K.csv', index_col = False)\nprint(\"Sample dimentions:\".format(), data.shape)\ndata.head()\nX = data[['blue','green','red','nir']]\ny = data['label']\n\n# customize models with different sample sizes\nmodels = get_models_size()\nresults, names = list(), list()\nfor name, model in models.items():\n    # evaluate models using k-fold cross-validation\n    scores = evaluate_model(model, X, y)\n    results.append(scores)\n    names.append(name)\n    # print the mean and standard deviation of models \n    print('>%s   Mean Score: %.6f (Score SD: %.6f)' % ('Sample size: ' + str(int(float(name) * 10000)), scores.mean(), scores.std()))\n    \n# display model performance \nplt.figure(figsize=(10,5))\nplt.boxplot(results, labels=names, showmeans=True)\nplt.show()\n\n# customize models with different model feature sizes\nmodels = get_models_feature()\n# evaluate the models and store results\nresults, names = list(), list()\nfor name, model in models.items():\n    # evaluate models using k-fold cross-validation\n    scores = evaluate_model(model, X, y)\n    results.append(scores)\n    names.append(name)\n    # print the mean and standard deviation of models \n    # print('>%s %.3f (%.3f)' % (name, scores.mean(), scores.std()))\n    print('>%s   Mean Score: %.6f (Score SD: %.6f)' % ('Features: ' + name, scores.mean(), scores.std()))\n# display model performance \nplt.boxplot(results, labels=names, showmeans=True)\nplt.show()\n\n# customize models with different tree numbers\nmodels = get_models_tree()\n# evaluate the models and store results\nresults, names = list(), list()\nfor name, model in models.items():\n    # evaluate models using k-fold cross-validation\n    scores = evaluate_model(model, X, y)\n    results.append(scores)\n    names.append(name)\n    # print the mean and standard deviation of models \n    # print('>%s %.3f (%.3f)' % (name, scores.mean(), scores.std()))\n    print('>%s   Mean Score: %.6f (Score SD: %.6f)' % ('Tree numbers: ' + name, scores.mean(), scores.std()))\n# display model performance \nplt.boxplot(results, labels=names, showmeans=True)\nplt.show()\n\n# customize models with different tree depths\nmodels = get_models_depth()\n# evaluate the models and store results\nresults, names = list(), list()\nfor name, model in models.items():\n     # evaluate models using k-fold cross-validation\n    scores = evaluate_model(model, X, y)\n    results.append(scores)\n    names.append(name)\n    # print the mean and standard deviation of models \n    # print('>%s %.3f (%.3f)' % (name, scores.mean(), scores.std()))\n    print('>%s   Mean Score: %.6f (Score SD: %.6f)' % ('Tree Depth: ' + name, scores.mean(), scores.std()))\n# display model performance \nplt.figure(figsize=(10,5))\nplt.boxplot(results, labels=names, showmeans=True)\nplt.show()",
+  "lang" : "python",
+  "owner" : "111111",
+  "confidential" : "FALSE"
+},{
+  "id" : "xcw4h1",
+  "name" : "model_evaluation",
+  "description" : "python",
+  "code" : "dir_aso = './data/ASO/ASO_3M_SD_USCATE_20180528_clip.tif'\nraso = rasterio.open(dir_aso,'r').read()\nraso = np.where(raso[0,:,:] < 0, np.nan, raso)\n\nth = 0.1 # using 10 cm threshold\nraso_binary = np.where(raso >= 0.1, 1, 0) # if the SD is higher than 10 cm, then snow; otherwise, no-snow\n\nfig, axs = plt.subplots(1,2,figsize=(16,6))\nim1 = axs[0].imshow(raso[0,:,:],cmap = 'Blues',vmin = 0, vmax = 5)\naxs[0].set_title('ASO snow depth', fontsize=16)\nfig.colorbar(im1, ax = axs[0], label = 'snow depth (meter)', extend = 'max')\n\nim2 = axs[1].imshow(raso_binary[0,:,:], cmap = 'gray', interpolation = 'none')\naxs[1].set_title('ASO snow cover (TH = 10 cm)', fontsize=16)\n\ndir_model = \"./models/random_forest_SCA_binary.joblib\"\n# load model\nmodel = joblib.load(dir_model)\n\ndf = pd.DataFrame()\ndf['obs'] = y_test\ndf['predict'] = model.predict(X_test)\n\n# Cross-tabulate predictions\nprint(pd.crosstab(df['obs'], df['predict'], margins=True))\nprint(calculate_metrics(df))\n\ndir_raster = './data/planet/20180528_181110_1025_3B_AnalyticMS_SR_clip.tif'\ndir_out = './data/SCA/'\nnodata_flag = 9\nrun_sca_prediction(dir_raster, dir_out, nodata_flag, model)\n\ndir_planet = './data/planet/20180528_181110_1025_3B_AnalyticMS_SR_clip.tif'\nr_na_flag = rasterio.open(dir_planet, 'r').read()\nr_planet = rasterio.open(dir_planet, 'r').read([4,3,2])/10000\n\ndir_sca = './data/SCA/20180528_181110_1025_3B_AnalyticMS_SR_clip_SCA.tif'\nr_sca = rasterio.open(dir_sca, 'r')\n\nfig, (ax1, ax2) = plt.subplots(1,2, figsize = (16,4))\nshow(r_planet, ax= ax1, cmap='jet', interpolation = 'none',title = 'Planet false color Image')\nshow(r_sca.read().squeeze(), ax= ax2, interpolation = 'none',title = 'Planet Snow Cover')\n\ndir_planet_ext = './data/GIS/extent/CATE_20180528_181110_img_ext.shp'\nwith fiona.open(dir_planet_ext, \"r\") as shapefile:\n    shapes = [feature[\"geometry\"] for feature in shapefile]\n    \ndir_aso = \"./data/ASO/ASO_3M_SD_USCATE_20180528_binary_clip.tif\"\nwith rasterio.open(dir_aso,'r') as src:\n    r_aso = rasterio.mask.mask(src, shapes, crop=True, filled = False)\n    \ndir_pred = './data/SCA/20180528_181110_1025_3B_AnalyticMS_SR_clip_SCA.tif'\nwith rasterio.open(dir_pred,'r') as src:\n    r_predict = rasterio.mask.mask(src, shapes, crop=True, filled = False)\n\ndir_watermask = './data/mask/waterbody_TB_UTM11_clip.tif'\nwith rasterio.open(dir_watermask,'r') as src:\n    r_watermask = rasterio.mask.mask(src, shapes, crop=True, filled = False)\n    \ndir_glaciermask = './data/mask/02_rgi60_WesternCanadaUS_hypso_TB_clip.tif'\nwith rasterio.open(dir_glaciermask,'r') as src:\n    r_glaciermask = rasterio.mask.mask(src, shapes, crop=True, filled = False)\n    \n    \ndf = pd.DataFrame()\ndf['predict'] = r_predict[0].ravel()\ndf['obs'] = r_aso[0].ravel()\ndf['watermask'] = r_watermask[0].ravel()\ndf['glaciermask'] = r_watermask[0].ravel()\n\n# remove NA data region, water bodies, and glaciers \ndf_mask = df[(df.predict >= 0) & (df.watermask != 0) & (df.glaciermask != 0)]\n# print(df)\n\nprint(\"overall model performance:\")\nprint(calculate_metrics(df_mask))\n\nfile_landcover = './data/ASO/ASO_3M_CHM_USCATB_20140827_binary_clip.tif' # 1 - forest, 0 open area\nwith rasterio.open(file_landcover,'r') as src:\n    r_landcover = rasterio.mask.mask(src, shapes, crop=True, filled = False)\n    \ndf['landcover'] = r_landcover[0].ravel()\n\ndf_mask = df[(df.predict >= 0) & (df.watermask != 0) & (df.glaciermask != 0)]\ndf_open = df_mask[df_mask.landcover == 0]\nprint(\"Model performance in open areas:\")\nprint(calculate_metrics(df_open))\n\ndf_forest = df_mask[df_mask.landcover == 1]\nprint(\"Model performance in forested areas:\")\nprint(calculate_metrics(df_forest))",
+  "lang" : "python",
+  "owner" : "111111",
+  "confidential" : "FALSE"
+},{
+  "id" : "8e59lt",
+  "name" : "modeL-training",
+  "description" : "python",
+  "code" : "# read model input features and labels \ndata = pd.read_csv('./data/samples/sample_100K.csv', index_col = False)\nprint(\"Sample dimentions:\".format(), data.shape)\nprint(data.head())\nX = data[['blue','green','red','nir']]\ny = data['label']\n\nX_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.96,random_state=1)\n\n# define the model\nmodel = RandomForestClassifier(n_estimators=10, max_depth=10, max_features=4)\n\n# evaluate the model\ncv = RepeatedStratifiedKFold(n_splits=10, n_repeats=1000)\nn_scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)\n# report model performance\nprint('Mean Score: %.6f (SD: %.6f)' % (n_scores.mean(),n_scores.std()))\n\n# the histogram of the scores\nn, bins, patches = plt.hist(n_scores, density=True, facecolor='blue', alpha=0.75)\nplt.text(0.91, 15, r'mean = ' + str(n_scores.mean().round(6)) + '  '+ 'SD = ' + str(n_scores.std().round(6)))\nplt.xlim(0.9, 1.01)\nplt.xlabel('Acuuracy')\nplt.ylabel('Probability (%)')\nplt.grid(True)\nplt.show()\n\nmodel.fit(X_train,y_train)\nresult = permutation_importance(model, X_train, y_train, n_repeats=1000, random_state=42, n_jobs=2)\nprint('Permutation importance - average:'.format(), X_train.columns)\nprint([round(i, 6) for i in result.importances_mean])\n\n# displace feature importance\nfig, ax = plt.subplots(figsize=(6,5))\nax.boxplot(result.importances.T)\nax.set_title(\"Permutation Importances\", fontsize = 16)\nax.set_xticklabels(labels=X_train.columns, fontsize=14)\nplt.show()\n\n# save model \ndir_model = \"./models/random_forest_SCA_binary.joblib\"\njoblib.dump(model, dir_model)",
+  "lang" : "python",
+  "owner" : "111111",
+  "confidential" : "FALSE"
+}]
diff --git a/history/m8s75d7cn7seb8mf2z2q.json b/history/m8s75d7cn7seb8mf2z2q.json
@@ -0,0 +1 @@
+[]