cleanup (#47)

slin96 · web-flow · commit a53cef2bf2a2 · 2021-07-07T01:33:51.000+02:00
* updated main readme

* typos readme

* move examples

* moved examples

* wip examples for probing

* added gitkeep to have empty directory in examples

* approach examples

* remove storage intensive tests

* better explanation for config file
diff --git a/README.md b/README.md
@@ -1,7 +1,22 @@
 # mmlib
 
-- A library for model management and related tasks.
-
+mmlib is a library that implements different approaches to save and recover models. It was developed as part of my
+master's thesis ([link to thesis repo](https://github.com/slin96/master-thesis)).
+
+The approach names in the thesis match the following implementations:
+- baseline approach 
+    - implemented by the `BaselineSaveService`
+- parameter update approach 
+    - implemented by `WeightUpdateSaveService` (set `improved_version=False`)
+- improved parameter update approach 
+    - implemented by `WeightUpdateSaveService` (set `improved_version=True`)
+- provenance approach
+    - implemented by `ProvenanceSaveService`
+    
+In addition to the approaches to save and recover models, we also implemented a **probing tool**:
+- the corresponding code is in `probe.py`
+- examples are provided under [examples](examples)
+    
 ## Installation
 
 ### Option 1: Docker
@@ -10,15 +25,15 @@
 - **Build Library**
     - clone this repo
     - run the script `generate-archives-docker.sh`
-        - it runs a docker container and builds the *mmlib* in it.
+        - it runs a docker container and builds the *mmlib* in it
         - the created `dist` directory is copied back to repository root
         - it contains the `.whl` file that can be used to install the library with pip (see below)
 - **Install**
     - to install mmlib run: `pip install <PATH>/dist/mmlib-0.0.1-py3-none-any.whl`
 
 ### Option 2: Local Build
 
-- **Requirements**: Python 3.8
+- **Requirements**: Python 3.8 and Python `venv`
 - **Build Library**
     - run the script `generate-archives.sh`
         - it creates a virtual environment, activates it, and installs all requirements
@@ -28,7 +43,7 @@
 
 ## Examples
 
-- For examples on how to use mmlib checkout the [examples](mmlib/examples) directory.
+- For examples on how to use mmlib, check out the [examples](examples) directory.
 
 
 
diff --git a/examples/README.md b/examples/README.md
@@ -2,12 +2,28 @@
 
 This directory contains examples of how to use the functionality offered by the *mmlib*.
 
+## Approaches to save and recover models
+- to execute all examples we use a MongoDB; in all examples the MongoDB is started using docker
+- if you don't have docker installed, you have to either install it or slightly adjust the examples
+- in `baseline_save.py` we provide an example of how to save and recover a model using the baseline approach
+- for all other approaches we do not give explicit examples and refer to our [tests for the approaches](../tests/save)
+
+
+## Probing Tool
+
+We provide some basic examples that show the different use cases of the probing tool.
+
+### Create a probe summary for a given model
 - *probe_store.py* - Creates and stores a probe summary of the training process of a GoogLeNet.
-    - execution: `python probe_store.py --path <optional path to store probe summary>`
+- execution: `python probe_store.py --path <optional path to store probe summary>`
+    
+### Create new summary and compare to given one 
 - *probe_load_compare.py* - Creates a probe summary of the training process of a GoogLeNet and compares it to a stored
   probe summary
-    - execution: `python probe_load_compare.py --path <path to the already stored probe summary>`
-    - note: To generate and store a probe summary to compare to use the *probe_store.py* script.
+- execution: `python probe_load_compare.py --path <path to the already stored probe summary>`
+- note: to generate and store a probe summary to compare against, use the *probe_store.py* script.
+    
+### Extensive example
 - *probe_example.py* - Shows extensively how the probe functionality offered by the *mmlib* can be used to make the
   PyTorch implementation of GoogLeNet reproducible. It runs the following steps:
     - simple summary
diff --git a/examples/__init__.py b/examples/__init__.py
diff --git a/examples/baseline_save.py b/examples/baseline_save.py
@@ -3,7 +3,8 @@
 from mmlib.equal import model_equal
 from mmlib.persistence import FileSystemPersistenceService, MongoDictPersistenceService
 from mmlib.save import BaselineSaveService
-from mmlib.schema import ModelSaveInfoBuilder
+from mmlib.schema.save_info_builder import ModelSaveInfoBuilder
+from mmlib.track_env import track_current_environment
 from mmlib.util.dummy_data import imagenet_input
 from tests.example_files.mynets.mobilenet import mobilenet_v2
 
@@ -24,8 +25,9 @@
     # initialize instance of mobilenet_v2
     model = mobilenet_v2(pretrained=True)
     # create the info to save the model
+    env = track_current_environment()
     save_info_builder = ModelSaveInfoBuilder()
-    save_info_builder.add_model_info(model=model)
+    save_info_builder.add_model_info(model=model, env=env)
     save_info = save_info_builder.build()
     # given the save info we can store the model, and get a model id back
     model_id = save_service.save_model(save_info)
diff --git a/examples/filesystem-tmp/.gitkeep b/examples/filesystem-tmp/.gitkeep
diff --git a/examples/probe_example.py b/examples/probe_example.py
diff --git a/examples/probe_load_compare.py b/examples/probe_load_compare.py
@@ -1,18 +1,20 @@
 import argparse
 
-from mmlib.examples.probe_store import _generate_probe_training_summary
+from examples.probe_store import _generate_probe_training_summary
 from mmlib.probe import ProbeSummary, ProbeInfo
 
 
 def main(args):
+    # we use the functionality from the probe_store script to generate a summary for the GoogLeNet
     summary = _generate_probe_training_summary()
-
+    # We load the summary from the path given in the args
     loaded_summary = ProbeSummary(summary_path=args.path)
-
-    common = [ProbeInfo.LAYER_NAME]
+    # we specify the fields both summaries have in common (they are excluded from the comparison)
+    common = [ProbeInfo.LAYER_NAME, ProbeInfo.FORWARD_INDEX]
+    # we define the fields we want to compare; in this case different kinds of tensors for the forward and backward pass
     compare = [ProbeInfo.INPUT_TENSOR, ProbeInfo.OUTPUT_TENSOR, ProbeInfo.GRAD_INPUT_TENSOR,
                ProbeInfo.GRAD_OUTPUT_TENSOR]
-
+    # having created one summary and loaded another, we compare them and print the comparison to the console
     summary.compare_to(loaded_summary, common, compare)
 
 
diff --git a/examples/probe_store.py b/examples/probe_store.py
@@ -11,19 +11,25 @@
 
 
 def main(args):
+    # we create a probe summary and get it back as an object
     summary = _generate_probe_training_summary()
-
+    # we can save the summary to the path given in the args
     output_path = os.path.join(args.path, 'summary')
     summary.save(output_path)
 
 
 def _generate_probe_training_summary():
+    # First, we force the implementation to be deterministic using mmlib's set_deterministic() function
     set_deterministic()
+    # as an example we want to probe the GoogLeNet architecture
+    model = models.googlenet(pretrained=True)
+    # to probe the forward and backward pass we have to create some dummy data
+    # we need: input, target, loss function and optimizer
     dummy_input = imagenet_input()
     dummy_target = imagenet_target(dummy_input)
     loss_func = nn.CrossEntropyLoss()
-    model = models.googlenet(pretrained=True)
     optimizer = torch.optim.SGD(model.parameters(), 1e-3)
+    # having created the model and all dummy data we can execute a probe run and return the summary
     summary = probe_training(model, dummy_input, optimizer, loss_func, dummy_target)
     return summary
 
diff --git a/mmlib/examples/provenance_save.py b/mmlib/examples/provenance_save.py
diff --git a/mmlib/schema/schema_obj.py b/mmlib/schema/schema_obj.py
@@ -13,7 +13,7 @@
 
 class SchemaObj(metaclass=abc.ABCMeta):
 
-    def __init__(self, store_id: str = None, logging=True):
+    def __init__(self, store_id: str = None, logging=False):
         self.store_id = store_id
         self.logging = logging
 
diff --git a/tests/model_equal/test_equal.py b/tests/model_equal/test_equal.py
@@ -5,8 +5,8 @@
 
 from mmlib.deterministic import set_deterministic
 from mmlib.equal import state_dict_equal, model_equal
-from mmlib.util import state_dict_hash, tensor_hash
 from mmlib.util.dummy_data import imagenet_input
+from mmlib.util.hash import state_dict_hash, tensor_hash
 
 
 class TestStateDictEqual(unittest.TestCase):
@@ -58,12 +58,6 @@ def test_resnet18_pretrained(self):
 
         self.assertTrue(model_equal(mod1, mod2, imagenet_input))
 
-    def test_resnet50_pretrained(self):
-        mod1 = models.resnet50(pretrained=True)
-        mod2 = models.resnet50(pretrained=True)
-
-        self.assertTrue(model_equal(mod1, mod2, imagenet_input))
-
     def test_googlenet_pretrained(self):
         mod1 = models.googlenet(pretrained=True)
         mod2 = models.googlenet(pretrained=True)
@@ -76,9 +70,9 @@ def test_mobilenet_v2_pretrained(self):
 
         self.assertTrue(model_equal(mod1, mod2, imagenet_input))
 
-    def test_resnet18_resnet152_pretrained(self):
+    def test_resnet18_mobilenet_pretrained(self):
         mod1 = models.resnet18(pretrained=True)
-        mod2 = models.resnet152(pretrained=True)
+        mod2 = models.mobilenet_v2(pretrained=True)
 
         self.assertFalse(model_equal(mod1, mod2, imagenet_input))
 
diff --git a/tests/save/test_prov_save_servcie.py b/tests/save/test_prov_save_servcie.py
@@ -32,7 +32,11 @@ class TestProvSaveService(TestBaselineSaveService):
     def setUp(self) -> None:
         super().setUp()
         assert os.path.isfile(CONFIG), \
-            'to run these tests define your onw config file named \'local-config\' with respect to the template file'
+            'to run these tests define your own config file named \'local-config\'; ' \
+            'to do so copy the file under tests/example_files/config.ini, and place it in the same directory, ' \
+            'rename it to local-config.ini, create an empty directory and put the path to it in the newly created ' \
+            'config file, to define the current_data_root'
+        # store the path of the config file checked above in the environment under the MMLIB_CONFIG key
         os.environ[MMLIB_CONFIG] = CONFIG
 
     def init_save_service(self, dict_pers_service, file_pers_service):