add estonly for mgwr

Ziqi-Li · Mar 26, 2021 · a8d18c2 · a8d18c2
1 parent ae701b6
commit a8d18c2
Show file tree

Hide file tree

Showing 12 changed files with 444 additions and 203 deletions.
diff --git a/.DS_Store b/.DS_Store
diff --git a/README.md b/README.md
@@ -2,7 +2,7 @@
 ![GitHub](https://img.shields.io/github/license/Ziqi-Li/fastgwr)
 
 # FastGWR
-A command line tool for fast parallel computation of Geographically Weighted Regression models.
+A command line tool for fast parallel computation of Geographically Weighted Regression models (GWR and MGWR).
 ### New feature:
 Multi-scale GWR model added!
 
@@ -34,37 +34,36 @@ $ fastgwr testmgwr
 ```
 
 
-# Examples
+## Examples
 Example call to the `fastgwr` to fit GWR model:
 
 ```bash
-$ fastgwr run -np 4 -data input.csv -adaptive -constant
+$ fastgwr run -np 4 -data input.csv
 ```
 
 Example call to the `fastgwr` to fit MGWR model:
 
 ```bash
-$ fastgwr run -np 4 -data input.csv -adaptive -constant -mgwr
+$ fastgwr run -np 4 -data input.csv -mgwr
 ```
 where:
 
 ```bash
 -np 4             Number of processors (e.g. 4).
 -data input.csv   Input data matrix. (e.g. input.csv)
-                  Can also be URL (e.g. https://raw.github.com/
+                  Can also be a URL (e.g. https://raw.github.com/
                   Ziqi-Li/FastGWR/master/Zillow-test-dataset/zillow_1k.csv)
 -out results.csv  Output GWR results matrix including local parameter 
                   estimates, standard errors and local diagnostics.
--adaptive         Adaptive Bisquare kernel.
--fixed            Fixed Gaussian kernel.
--constant         Adding a constant column vector of 1 to the design matrix.
+-adaptive/-fixed  Adaptive Bisquare kernel (default) or Fixed Gaussian kernel.
 -bw 1000          Pre-defined bandwidth parameter. If missing, it will
                   search (golden-section) for the optimal bandwidth and use
                   that to fit the GWR model.
 -minbw 45         Lower bound in golden-section search. (e.g. 45)
 -mgwr             Fitting an MGWR model.
 -chunks           Number of chunks for MGWR computation (set to a larger 
                   number to reduce memory footprint).
+-estonly          Output MGWR parameter estimates only (no standard errors
+                  or model diagnostics).
 ```
 
 The input needs to be prepared in this order:
@@ -84,12 +83,12 @@ X1...Xk: independent variables
 ```
 See the example Zillow datasets in the repository.
 
-### Results Validation
+## Results Validation
 
 The results are validated against the [mgwr](https://github.com/pysal/mgwr), which can be seen in the [notebooks here](https://github.com/Ziqi-Li/FastGWR/tree/master/validation%20notebook).
 
 
-### Citations
+## Citations
 
 This program is developed based on these two papers:
 

diff --git a/fastgwr/FastGWR.py b/fastgwr/FastGWR.py
@@ -89,6 +89,7 @@ def parse_gwr_args(self):
         self.fout  = parser_arg.out
         self.fixed = parser_arg.fixed
         self.constant = parser_arg.constant
+        self.estonly = parser_arg.estonly
 
         if parser_arg.bw:
             if self.fixed:
@@ -112,7 +113,7 @@ def parse_gwr_args(self):
 
             print("Data Input Path:",self.fname)
             print("Output Result Path:",self.fout)
-            print("Constant:",self.constant)
+            print("Intercept:",self.constant)
 
 
     def read_file(self):

diff --git a/fastgwr/FastMGWR.py b/fastgwr/FastMGWR.py
@@ -129,6 +129,19 @@ def backfitting(self):
         self.err = err
         self.params = newbetas
 
+        if self.comm.rank == 0 and self.estonly:
+            header="index,residual,"
+            varNames = np.genfromtxt(self.fname, dtype=str, delimiter=',',names=True, max_rows=1).dtype.names[3:]
+            if self.constant:
+                varNames = ['intercept'] + list(varNames)
+            for x in varNames:
+                header += ("b_"+x+',')
+
+            self.output_diag(None,None,self.R2)
+            index = np.arange(self.n).reshape(-1,1)
+            output = np.hstack([index,self.err.reshape(-1,1),self.params])
+            self.save_results(output,header)
+
 
     def _chunk_compute_R(self, chunk_id=0):
         """
@@ -189,7 +202,8 @@ def mgwr_fit(self,n_chunks=2):
         """
         Fit MGWR model and output results
         """
-
+        if self.estonly:
+            return
         if self.comm.rank ==0:
             print("Computing Inference with",n_chunks,"Chunk(s)")
         self.n_chunks = self.comm.size * n_chunks

diff --git a/fastgwr/__main__.py b/fastgwr/__main__.py
@@ -3,7 +3,7 @@
 import fastgwr
 
 @click.group()
-@click.version_option("0.2.7")
+@click.version_option("0.2.8")
 def main():
     pass
 
@@ -12,44 +12,45 @@ def main():
 @click.option("-data", required=True)
 @click.option("-out", default="fastgwr_rslt.csv", required=False)
 @click.option("-adaptive/-fixed" ,default=True, required=True)
-@click.option("-constant", required=False, is_flag=True)
 @click.option("-bw", required=False)
 @click.option("-minbw", required=False)
-@click.option("-chunks", required=False)
 @click.option("-mgwr", default=False, required=False, is_flag=True)
-def run(np, data, out, adaptive, constant, bw, minbw, mgwr, chunks):
+@click.option("-chunks", required=False)
+@click.option("-estonly", default=False, is_flag=True)
+def run(np, data, out, adaptive, bw, minbw, mgwr, chunks, estonly):
     """
     Fast(M)GWR
     
-    -np: number of processors to use
-    
-    -data: input data matrix containing y and X
+    -np:       number of processors to use. (default: 4)
     
-    -out: output GWR results (default: "fastgwr_rslt.csv")
+    -data:     input data matrix containing y and X. Can be URL:
+               e.g. https://raw.github.com/Ziqi-Li/FastGWR/master/Zillow-test-dataset/zillow_1k.csv
     
-    -adaptive: using adaptive bisquare kernel
+    -out:      output GWR results (default: "fastgwr_rslt.csv").
     
-    -fixed: using fixed gaussian kernel
+    -adaptive/-fixed: using an adaptive bisquare kernel (default) or a fixed gaussian kernel.
     
-    -constant: adding a constant vector to the X matrix
+    -bw:       using a pre-specified bandwidth to fit GWR.
     
-    -bw: using a pre-specified bandwidth to fit GWR
+    -minbw:    lower bound in the golden section search in GWR.
     
-    -minbw: lower bound in the golden section search
+    -mgwr:     fitting an MGWR model.
     
-    -mgwr: fitting an MGWR model
+    -chunks:   number of chunks for MGWR computation (default: 1).
+               Increase the number if run out of memory but should keep it as low as possible.
     
-    -chunks: number of chunks for MGWR computation (default: 1).
-             Increase the number if run out of memory but should keep it as low as possible.
+    -estonly:  output the parameter estimation only for MGWR, no standard errors of the estimates
+               and model diagnostics. Ideal for quick model checking (default: False).
     
     """
 
     mpi_path = os.path.dirname(fastgwr.__file__) + '/fastgwr_mpi.py'
 
     command = 'mpiexec ' + ' -np ' + str(np) + ' python ' + mpi_path + ' -data ' + data + ' -out ' + out
+    command += ' -c '
+
     if mgwr:
         command += ' -mgwr '
-
     if adaptive:
         command += ' -a '
     else:
@@ -58,10 +59,10 @@ def run(np, data, out, adaptive, constant, bw, minbw, mgwr, chunks):
         command += (' -bw ' + bw)
     if minbw:
         command += (' -minbw ' + minbw)
-    if constant:
-        command += ' --constant '
     if chunks:
         command += (' -chunks ' + chunks)
+    if estonly:
+        command += (' -estonly ')
 
     os.system(command)
     pass
@@ -88,8 +89,8 @@ def testmgwr():
 
     print("Testing MGWR with zillow data:")
     mpi_path = os.path.dirname(fastgwr.__file__) + '/fastgwr_mpi.py'
-
-    command = "mpiexec -np 2 python " + mpi_path + " -data https://raw.github.com/Ziqi-Li/FastGWR/master/Zillow-test-dataset/zillow_1k.csv -c -mgwr"
+    print(mpi_path)
+    command = "mpiexec -np 2 python " + mpi_path + " -data https://raw.github.com/Ziqi-Li/FastGWR/master/Zillow-test-dataset/zillow_1k.csv -mgwr -c"
     os.system(command)
     pass
 

diff --git a/fastgwr/fastgwr_mpi.py b/fastgwr/fastgwr_mpi.py
@@ -10,8 +10,8 @@
 
 #Direct Example Call:
 #You can direct call this script by:
-#mpiexec -np 4 python fastgwr_mpi.py -data ../Zillow-test-dataset/zillow_1k.csv -c
-#mpiexec -np 4 python fastgwr_mpi.py -data ../Zillow-test-dataset/zillow_1k.csv -c -mgwr
+#mpiexec -np 4 python fastgwr_mpi.py -data ../Zillow-test-dataset/zillow_1k.csv
+#mpiexec -np 4 python fastgwr_mpi.py -data ../Zillow-test-dataset/zillow_1k.csv -mgwr -c
 
 if __name__ == "__main__":
 
@@ -27,6 +27,8 @@
     parser.add_argument("-bw")
     parser.add_argument("-minbw")
     parser.add_argument("-chunks",default=1)
+
+    parser.add_argument('-estonly',action='store_true')
     parser.add_argument('-mgwr',action='store_true')
     parser.add_argument('-f','--fixed',action='store_true')
     parser.add_argument('-a','--adaptive',action='store_true')

diff --git a/paper/paper.md b/paper/paper.md
@@ -31,11 +31,9 @@ As geospatial data are increasingly available from different sources such as rem
 
 # State of the Field
 
-There are currently existing packages in different languages that allow users to fit GWR and MGWR models. Two most popular open-source options are `mgwr` in python [@oshan2019mgwr] and `GWmodel` in R [@gollini2013gwmodel], both of which provide friendly APIs and are actively maintained. `GWmodel` supports a wide array of geographically weighted models and analysis tools; however, the performance of `GWmodel` is lagged behind and not suitable for large data sets. A comprehensive performance comparison between `GWmodel` and `fastgwr` can be found in @li2019fast and @li2020computational. As for `mgwr`, as mentioned in the previous section, the parallsim of `fastgwr` has been built into `mgwr` by leveraging the `multiprocessing` package. However, the major advantage of `fastgwr` is that the use of MPI-based parallelism allows the program to run in parallel across multiple computer nodes. In this way, `fastgwr` is the only option if the analyst wants to run the GWR program on a high performance computing cluster, which empowers larger-scale analysis that is impossible for a single workstation.  `fastgwr` has been tested on the University of Arizona's Ocelote cluster, and the scalability can be seen in \autoref{fig:example}.
+Several existing packages in different languages allow users to fit GWR and MGWR models. The two most popular open-source options are `mgwr` in Python [@oshan2019mgwr] and `GWmodel` in R [@gollini2013gwmodel], both of which provide friendly APIs and are actively maintained. `GWmodel` supports a wide array of geographically weighted models and analysis tools; however, the performance of `GWmodel` lags behind and it is not suitable for large data sets. A comprehensive performance comparison between `GWmodel` and `fastgwr` can be found in @li2019fast and @li2020computational. As for `mgwr`, as mentioned in the previous section, the parallelism of `fastgwr` has been built into `mgwr` by leveraging the `multiprocessing` package. However, the major advantage of `fastgwr` is that the use of MPI-based parallelism allows the program to run in parallel across multiple computer nodes. In this way, `fastgwr` is the only option if the analyst wants to run the GWR program on a high performance computing cluster, which empowers larger-scale analysis that is impossible for a single workstation.  `fastgwr` has been tested on the University of Arizona's Ocelote cluster, and the scalability can be seen in \autoref{fig:example}. The model fitting results of `fastgwr` have been validated against `mgwr`; the validation can be found in the [notebooks](https://github.com/Ziqi-Li/FastGWR/tree/master/validation%20notebook) in the attached [GitHub repository](https://github.com/Ziqi-Li/FastGWR).
 
-The model fitting results of `fastgwr` have been validated against `mgwr` which can be found in the [notebooks](https://github.com/Ziqi-Li/FastGWR/tree/master/validation%20notebook) in the attached [Gituhb repository](https://github.com/Ziqi-Li/FastGWR).
-
-![Caption for example figure.\label{fig:example}](scalability.png)
+![Caption for example figure.\label{fig:example}](scalability.png){ width=20% }
 
 
 # Installation
@@ -67,43 +65,40 @@ $ fastgwr testmgwr
 
 
 # Examples
-Example call to `fastgwr` to fit a GWR model:
+An example call to `fastgwr` to fit a GWR model:
 
 ```bash
-$ fastgwr run -np 4 -data input.csv -adaptive -constant
+$ fastgwr run -np 4 -data input.csv
 ```
 
-Example call to `fastgwr` to fit an MGWR model:
+An example call to `fastgwr` to fit an MGWR model:
 
 ```bash
-$ fastgwr run -np 4 -data input.csv -adaptive -constant -mgwr
+$ fastgwr run -np 4 -data input.csv -mgwr
 ```
 where:
 
 ```bash
 -np 4             Number of processors (e.g. 4).
 -data input.csv   Input data matrix. (e.g. input.csv)
-                  Can also be URL (e.g. https://raw.github.com/
+                  Can also be a URL (e.g. https://raw.github.com/
                   Ziqi-Li/FastGWR/master/Zillow-test-dataset/zillow_1k.csv)
 -out results.csv  Output GWR results matrix including local parameter 
                   estimates, standard errors and local diagnostics.
--adaptive         Adaptive Bisquare kernel.
--fixed            Fixed Gaussian kernel.
--constant         Adding a constant column vector of 1 to the design matrix.
+-adaptive/-fixed  Adaptive Bisquare kernel (default) or Fixed Gaussian kernel.
 -bw 1000          Pre-defined bandwidth parameter. If missing, it will
                   search (golden-section) for the optimal bandwidth and use
                   that to fit the GWR model.
 -minbw 45         Lower bound in golden-section search. (e.g. 45)
 -mgwr             Fitting an MGWR model.
 -chunks           Number of chunks for MGWR computation (set to a larger 
                   number to reduce memory footprint).
+-estonly          Output MGWR parameter estimates only (no standard errors
+                  or model diagnostics).
 ```
 
 Alternatively, users can call the CLI from a Jupyter Notebook by prefixing the `fastgwr` command with the `!` character. [Example notebooks](https://github.com/Ziqi-Li/FastGWR/tree/master/validation%20notebook) are available in the [GitHub repository](https://github.com/Ziqi-Li/FastGWR).
 
 
-
-
 # Dependencies
 
 `fastgwr` is based on the following dependencies:

diff --git a/setup.py b/setup.py
@@ -21,7 +21,7 @@
 
 setup(
     name="fastgwr",
-    version='0.2.7',
+    version='0.2.8',
     description="Fast Parallel Computation of Geographically Weighted Regression",
     long_description="A MPI-based command line tool for calibrating Geographically Weighted Regression models.",
     author="Ziqi Li",

diff --git a/validation notebook/.DS_Store b/validation notebook/.DS_Store