From 9959217affaeafa046f78ea0ee8dc0a50fb472c5 Mon Sep 17 00:00:00 2001
From: lauraporta <29216006+lauraporta@users.noreply.github.com>
Date: Mon, 7 Oct 2024 16:33:18 +0100
Subject: [PATCH 01/37] Add classes to read and write folders and the
 beginning of a possible pipeline

---
 MANIFEST.in                                 |  4 ++
 calcium_imaging_automation/core/__init__.py |  0
 calcium_imaging_automation/core/reader.py   | 37 +++++++++++++++++
 calcium_imaging_automation/core/writer.py   | 32 +++++++++++++++
 examples/__init__.py                        |  0
 examples/example_usage.py                   | 45 +++++++++++++++++++++
 examples/example_usage.sh                   |  4 ++
 7 files changed, 122 insertions(+)
 create mode 100644 calcium_imaging_automation/core/__init__.py
 create mode 100644 calcium_imaging_automation/core/reader.py
 create mode 100644 calcium_imaging_automation/core/writer.py
 create mode 100644 examples/__init__.py
 create mode 100644 examples/example_usage.py
 create mode 100755 examples/example_usage.sh

diff --git a/MANIFEST.in b/MANIFEST.in
index e16ea33..53a61e3 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -2,6 +2,10 @@ include LICENSE
 include README.md
 exclude .pre-commit-config.yaml
 
+recursive-include calcium_imaging_automation *.py
+recursive-include examples *.py
+recursive-include examples *.sh
+
 recursive-exclude * __pycache__
 recursive-exclude * *.py[co]
 recursive-exclude docs *
diff --git a/calcium_imaging_automation/core/__init__.py b/calcium_imaging_automation/core/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/calcium_imaging_automation/core/reader.py b/calcium_imaging_automation/core/reader.py
new file mode 100644
index 0000000..6a5c500
--- /dev/null
+++ b/calcium_imaging_automation/core/reader.py
@@ -0,0 +1,37 @@
+from pathlib import Path
+from typing import List
+
+
+class ReadAllPathsInFolder:
+    def __init__(
+        self, raw_data_folder: Path, filetypes_of_interest: List[str]
+    ):
+        self.filetypes_of_interest = filetypes_of_interest
+        self.datasets_paths = self.get_folders_first_layer(raw_data_folder)
+        self.dataset_names = [
+            dataset_path.name for dataset_path in self.datasets_paths
+        ]
+
+    def get_folders_first_layer(self, file_path: Path) -> List[Path]:
+        return list(file_path.glob("*"))
+
+    def get_files_paths(self, folder: Path) -> List[Path]:
+        return [
+            file
+            for filetype in self.filetypes_of_interest
+            for file in folder.rglob(f"*.{filetype}")
+        ]
+
+    def total_objects_by_filetype(self, folder: Path) -> dict:
+        return {
+            filetype: len(self.get_files_paths(folder))
+            for filetype in self.filetypes_of_interest
+        }
+
+    def max_session_number(self, filetype="tif", max_allowed=5) -> int:
+        total_tif_number = [
+            self.total_objects_by_filetype(dataset_path).get(filetype, 0)
+            for dataset_path in self.datasets_paths
+        ]
+
+        return min(max(total_tif_number), max_allowed)
diff --git a/calcium_imaging_automation/core/writer.py b/calcium_imaging_automation/core/writer.py
new file mode 100644
index 0000000..d61d056
--- /dev/null
+++ b/calcium_imaging_automation/core/writer.py
@@ -0,0 +1,32 @@
+from pathlib import Path
+from typing import List
+
+from datashuttle.configs.config_class import Configs
+from datashuttle.utils import folders
+
+
+class DatashuttleWrapper:
+    def __init__(self, output_path: Path) -> None:
+        # This is supposed to run in the cluster and have direct access
+        # to the central storages
+        self.datashuttle_cfg = Configs(
+            project_name=output_path.name,
+            file_path=output_path,
+            input_dict={
+                "local_path": output_path,
+                "central_path": "",
+                "connection_method": "local_filesystem",
+            },
+        )
+
+    def create_folders(self, dataset_names: List[str], session_number) -> None:
+        folders.create_folder_trees(
+            cfg=self.datashuttle_cfg,
+            top_level_folder="derivatives",
+            sub_names=[
+                f"sub-{i}_{dataset_name}"
+                for i, dataset_name in enumerate(dataset_names)
+            ],
+            ses_names=[f"ses-{i}" for i in range(session_number)],
+            datatype="funcimg",
+        )
diff --git a/examples/__init__.py b/examples/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/examples/example_usage.py b/examples/example_usage.py
new file mode 100644
index 0000000..f65eded
--- /dev/null
+++ b/examples/example_usage.py
@@ -0,0 +1,45 @@
+import argparse
+from pathlib import Path
+
+from calcium_imaging_automation.core.reader import ReadAllPathsInFolder
+from calcium_imaging_automation.core.writer import DatashuttleWrapper
+
+
+def main(raw_data_path: Path, output_path: Path, filetypes_of_interest: list):
+    """
+    Draft usage of the pipeline, now consisting of read and write operations.
+    """
+    reader = ReadAllPathsInFolder(raw_data_path, filetypes_of_interest)
+
+    writer = DatashuttleWrapper(output_path)
+    number_of_tiffs = reader.max_session_number(filetype="tif")
+    writer.create_folders(reader.dataset_names, session_number=number_of_tiffs)
+
+    # [Placeholder for data processing]
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Example usage of the pipeline manager."
+    )
+
+    parser.add_argument(
+        "raw_data_path", type=Path, help="Path to the raw data."
+    )
+    parser.add_argument(
+        "output_path", type=Path, help="Path to the output data."
+    )
+    parser.add_argument(
+        "--filetypes",
+        type=list,
+        nargs="+",
+        help="Filetypes of interest.",
+        default=["tif", "bin"],
+    )
+
+    args = parser.parse_args()
+    raw_data_path = args.raw_data_path
+    output_path = args.output_path
+    file_types = args.filetypes
+
+    main(raw_data_path, output_path, file_types)
diff --git a/examples/example_usage.sh b/examples/example_usage.sh
new file mode 100755
index 0000000..3e5d965
--- /dev/null
+++ b/examples/example_usage.sh
@@ -0,0 +1,4 @@
+#! 
/bin/bash +python ./examples/example_usage.py \ + /Users/lauraporta/local_data/rotation/ \ + /Users/lauraporta/local_data/test/ From a1c703e0c21bad593374c86467d68f9832dc7fb8 Mon Sep 17 00:00:00 2001 From: lauraporta Date: Tue, 5 Nov 2024 15:27:14 +0000 Subject: [PATCH 02/37] Update dependencies --- pyproject.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 5a9258d..7506ae9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,9 @@ readme = "README.md" requires-python = ">=3.9.0" dynamic = ["version"] -dependencies = [] +dependencies = [ + "datashuttle", +] license = {text = "BSD-3-Clause"} From 98ac2b5b4115a0f00248d8121e6347c43eae9029 Mon Sep 17 00:00:00 2001 From: lauraporta Date: Tue, 5 Nov 2024 15:27:40 +0000 Subject: [PATCH 03/37] Add logging and usage of pattern for glob --- calcium_imaging_automation/core/reader.py | 13 +++++--- examples/example_usage.py | 40 ++++++++++++++++++----- examples/example_usage.sh | 5 +-- 3 files changed, 42 insertions(+), 16 deletions(-) diff --git a/calcium_imaging_automation/core/reader.py b/calcium_imaging_automation/core/reader.py index 6a5c500..400044d 100644 --- a/calcium_imaging_automation/core/reader.py +++ b/calcium_imaging_automation/core/reader.py @@ -4,16 +4,19 @@ class ReadAllPathsInFolder: def __init__( - self, raw_data_folder: Path, filetypes_of_interest: List[str] + self, + raw_data_folder: Path, + filetypes_of_interest: List[str], + folder_read_pattern: str, ): self.filetypes_of_interest = filetypes_of_interest + self.folder_read_pattern = folder_read_pattern + self.datasets_paths = self.get_folders_first_layer(raw_data_folder) - self.dataset_names = [ - dataset_path.name for dataset_path in self.datasets_paths - ] + self.dataset_names = [dataset_path.name for dataset_path in self.datasets_paths] def get_folders_first_layer(self, file_path: Path) -> List[Path]: - return list(file_path.glob("*")) + return list(file_path.glob(self.folder_read_pattern)) def get_files_paths(self, folder: Path) -> List[Path]: return [ diff --git a/examples/example_usage.py b/examples/example_usage.py index f65eded..4b19a1b 100644 --- a/examples/example_usage.py +++ b/examples/example_usage.py @@ -1,34 +1,49 @@ import argparse from pathlib import Path +import logging from calcium_imaging_automation.core.reader import ReadAllPathsInFolder from calcium_imaging_automation.core.writer import DatashuttleWrapper -def main(raw_data_path: Path, output_path: Path, filetypes_of_interest: list): +def main( + raw_data_path: Path, + output_path: Path, + filetypes_of_interest: list, + folder_read_pattern: str, +): """ Draft usage of the pipeline, now consisting of read and write operations. """ - reader = ReadAllPathsInFolder(raw_data_path, filetypes_of_interest) + logging.basicConfig( + filename=output_path / "logs" / "pipeline.log", + level=logging.INFO, + format="%(asctime)s - %(message)s", + ) + + reader = ReadAllPathsInFolder( + raw_data_path, filetypes_of_interest, folder_read_pattern + ) + logging.info(f"Found {len(reader.datasets_paths)} datasets.") + logging.info(f"Dataset names: {reader.dataset_names}") writer = DatashuttleWrapper(output_path) + number_of_tiffs = reader.max_session_number(filetype="tif") writer.create_folders(reader.dataset_names, session_number=number_of_tiffs) # [Placeholder for data processing] + logging.info("Pipeline finished.") + if __name__ == "__main__": parser = argparse.ArgumentParser( description="Example usage of the pipeline manager." 
) - parser.add_argument( - "raw_data_path", type=Path, help="Path to the raw data." - ) - parser.add_argument( - "output_path", type=Path, help="Path to the output data." - ) + parser.add_argument("raw_data_path", type=Path, help="Path to the raw data.") + parser.add_argument("output_path", type=Path, help="Path to the output data.") parser.add_argument( "--filetypes", type=list, @@ -36,10 +51,17 @@ def main(raw_data_path: Path, output_path: Path, filetypes_of_interest: list): help="Filetypes of interest.", default=["tif", "bin"], ) + parser.add_argument( + "--folder_read_pattern", + type=str, + help="Glob pattern for reading files.", + default="*", + ) args = parser.parse_args() raw_data_path = args.raw_data_path output_path = args.output_path file_types = args.filetypes + folder_read_pattern = args.folder_read_pattern - main(raw_data_path, output_path, file_types) + main(raw_data_path, output_path, file_types, folder_read_pattern) diff --git a/examples/example_usage.sh b/examples/example_usage.sh index 3e5d965..3d047c0 100755 --- a/examples/example_usage.sh +++ b/examples/example_usage.sh @@ -1,4 +1,5 @@ #! /bin/bash python ./examples/example_usage.py \ - /Users/lauraporta/local_data/rotation/ \ - /Users/lauraporta/local_data/test/ + /Volumes/winstor/swc/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/ \ + /Users/laura/local_data/calcimaut/ \ + --folder_read_pattern '2*' \ From b7a1f34645e4b2f4317e269fbfc7c80057417172 Mon Sep 17 00:00:00 2001 From: lauraporta Date: Tue, 5 Nov 2024 15:29:29 +0000 Subject: [PATCH 04/37] Pre-commit hook modifications --- calcium_imaging_automation/core/reader.py | 4 +++- examples/example_usage.py | 16 ++++++++++------ 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/calcium_imaging_automation/core/reader.py b/calcium_imaging_automation/core/reader.py index 400044d..ea77848 100644 --- a/calcium_imaging_automation/core/reader.py +++ b/calcium_imaging_automation/core/reader.py @@ -13,7 +13,9 @@ def __init__( self.folder_read_pattern = folder_read_pattern self.datasets_paths = self.get_folders_first_layer(raw_data_folder) - self.dataset_names = [dataset_path.name for dataset_path in self.datasets_paths] + self.dataset_names = [ + dataset_path.name for dataset_path in self.datasets_paths + ] def get_folders_first_layer(self, file_path: Path) -> List[Path]: return list(file_path.glob(self.folder_read_pattern)) diff --git a/examples/example_usage.py b/examples/example_usage.py index 4b19a1b..3e638ca 100644 --- a/examples/example_usage.py +++ b/examples/example_usage.py @@ -1,6 +1,6 @@ import argparse -from pathlib import Path import logging +from pathlib import Path from calcium_imaging_automation.core.reader import ReadAllPathsInFolder from calcium_imaging_automation.core.writer import DatashuttleWrapper @@ -19,7 +19,7 @@ def main( filename=output_path / "logs" / "pipeline.log", level=logging.INFO, format="%(asctime)s - %(message)s", - ) + ) reader = ReadAllPathsInFolder( raw_data_path, filetypes_of_interest, folder_read_pattern @@ -28,22 +28,26 @@ def main( logging.info(f"Dataset names: {reader.dataset_names}") writer = DatashuttleWrapper(output_path) - + number_of_tiffs = reader.max_session_number(filetype="tif") writer.create_folders(reader.dataset_names, session_number=number_of_tiffs) # [Placeholder for data processing] logging.info("Pipeline finished.") - + if __name__ == "__main__": parser = argparse.ArgumentParser( description="Example usage of the pipeline manager." 
) - parser.add_argument("raw_data_path", type=Path, help="Path to the raw data.") - parser.add_argument("output_path", type=Path, help="Path to the output data.") + parser.add_argument( + "raw_data_path", type=Path, help="Path to the raw data." + ) + parser.add_argument( + "output_path", type=Path, help="Path to the output data." + ) parser.add_argument( "--filetypes", type=list, From e11b2e2d7f1b40b351e5ffbef90678a96cc60750 Mon Sep 17 00:00:00 2001 From: lauraporta Date: Tue, 5 Nov 2024 17:07:29 +0000 Subject: [PATCH 05/37] Fix error with logging filepath --- examples/example_usage.py | 3 ++- examples/example_usage.sh | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/example_usage.py b/examples/example_usage.py index 3e638ca..999f7b5 100644 --- a/examples/example_usage.py +++ b/examples/example_usage.py @@ -15,8 +15,9 @@ def main( """ Draft usage of the pipeline, now consisting of read and write operations. """ + (output_path / "logs").mkdir(exist_ok=True) logging.basicConfig( - filename=output_path / "logs" / "pipeline.log", + filename=str(output_path / "logs" / "pipeline.log"), level=logging.INFO, format="%(asctime)s - %(message)s", ) diff --git a/examples/example_usage.sh b/examples/example_usage.sh index 3d047c0..890024f 100755 --- a/examples/example_usage.sh +++ b/examples/example_usage.sh @@ -1,5 +1,5 @@ #! /bin/bash python ./examples/example_usage.py \ - /Volumes/winstor/swc/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/ \ - /Users/laura/local_data/calcimaut/ \ + /nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/ \ + /ceph/margrie/laura/cimaut/ \ --folder_read_pattern '2*' \ From e85206cefc0480e2b8acbbb18e89ad75541e1a21 Mon Sep 17 00:00:00 2001 From: lauraporta Date: Tue, 5 Nov 2024 18:29:26 +0000 Subject: [PATCH 06/37] WIP on filename --- calcium_imaging_automation/core/reader.py | 2 ++ examples/example_usage.py | 21 +++++++++++++++++++-- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/calcium_imaging_automation/core/reader.py b/calcium_imaging_automation/core/reader.py index ea77848..f1ccf9e 100644 --- a/calcium_imaging_automation/core/reader.py +++ b/calcium_imaging_automation/core/reader.py @@ -8,9 +8,11 @@ def __init__( raw_data_folder: Path, filetypes_of_interest: List[str], folder_read_pattern: str, + file_read_pattern: str, ): self.filetypes_of_interest = filetypes_of_interest self.folder_read_pattern = folder_read_pattern + self.file_read_pattern = file_read_pattern self.datasets_paths = self.get_folders_first_layer(raw_data_folder) self.dataset_names = [ diff --git a/examples/example_usage.py b/examples/example_usage.py index 999f7b5..616138b 100644 --- a/examples/example_usage.py +++ b/examples/example_usage.py @@ -11,6 +11,7 @@ def main( output_path: Path, filetypes_of_interest: list, folder_read_pattern: str, + file_read_pattern: str, ): """ Draft usage of the pipeline, now consisting of read and write operations. 
@@ -23,7 +24,10 @@ def main( ) reader = ReadAllPathsInFolder( - raw_data_path, filetypes_of_interest, folder_read_pattern + raw_data_path, + filetypes_of_interest, + folder_read_pattern, + file_read_pattern, ) logging.info(f"Found {len(reader.datasets_paths)} datasets.") logging.info(f"Dataset names: {reader.dataset_names}") @@ -59,6 +63,12 @@ def main( parser.add_argument( "--folder_read_pattern", type=str, + help="Glob pattern for reading folder.", + default="*", + ) + parser.add_argument( + "--file_read_pattern", + type=str, help="Glob pattern for reading files.", default="*", ) @@ -68,5 +78,12 @@ def main( output_path = args.output_path file_types = args.filetypes folder_read_pattern = args.folder_read_pattern + file_read_pattern = args.file_read_pattern - main(raw_data_path, output_path, file_types, folder_read_pattern) + main( + raw_data_path, + output_path, + file_types, + folder_read_pattern, + file_read_pattern, + ) From 08cdd2ea8dd0cde94529fe4cd0e1d08e6cea4ecc Mon Sep 17 00:00:00 2001 From: lauraporta Date: Tue, 5 Nov 2024 18:31:07 +0000 Subject: [PATCH 07/37] Improve log name --- examples/example_usage.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/examples/example_usage.py b/examples/example_usage.py index 999f7b5..5fb2813 100644 --- a/examples/example_usage.py +++ b/examples/example_usage.py @@ -1,4 +1,5 @@ import argparse +import datetime import logging from pathlib import Path @@ -17,7 +18,12 @@ def main( """ (output_path / "logs").mkdir(exist_ok=True) logging.basicConfig( - filename=str(output_path / "logs" / "pipeline.log"), + # save also time anda date + filename=str( + output_path + / "logs" + / f"{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}.log" + ), level=logging.INFO, format="%(asctime)s - %(message)s", ) From f59b4aa2b3cceb296557656a5d07ca43d52d7fa7 Mon Sep 17 00:00:00 2001 From: lauraporta Date: Tue, 5 Nov 2024 18:36:37 +0000 Subject: [PATCH 08/37] Be stricter on allowed sessions --- calcium_imaging_automation/core/reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/calcium_imaging_automation/core/reader.py b/calcium_imaging_automation/core/reader.py index f1ccf9e..7aeb499 100644 --- a/calcium_imaging_automation/core/reader.py +++ b/calcium_imaging_automation/core/reader.py @@ -35,7 +35,7 @@ def total_objects_by_filetype(self, folder: Path) -> dict: for filetype in self.filetypes_of_interest } - def max_session_number(self, filetype="tif", max_allowed=5) -> int: + def max_session_number(self, filetype="tif", max_allowed=1) -> int: total_tif_number = [ self.total_objects_by_filetype(dataset_path).get(filetype, 0) for dataset_path in self.datasets_paths From bb41c55622d160c321e491646c131221d46cb323 Mon Sep 17 00:00:00 2001 From: lauraporta <29216006+lauraporta@users.noreply.github.com> Date: Fri, 8 Nov 2024 15:48:35 +0000 Subject: [PATCH 09/37] Change usage of file pattern in reader --- calcium_imaging_automation/core/reader.py | 14 ++++++-------- examples/example_usage.py | 15 ++------------- examples/example_usage.sh | 1 + 3 files changed, 9 insertions(+), 21 deletions(-) diff --git a/calcium_imaging_automation/core/reader.py b/calcium_imaging_automation/core/reader.py index f1ccf9e..65d1630 100644 --- a/calcium_imaging_automation/core/reader.py +++ b/calcium_imaging_automation/core/reader.py @@ -6,11 +6,9 @@ class ReadAllPathsInFolder: def __init__( self, raw_data_folder: Path, - filetypes_of_interest: List[str], folder_read_pattern: str, - file_read_pattern: str, + file_read_pattern: list, ): - 
self.filetypes_of_interest = filetypes_of_interest self.folder_read_pattern = folder_read_pattern self.file_read_pattern = file_read_pattern @@ -25,19 +23,19 @@ def get_folders_first_layer(self, file_path: Path) -> List[Path]: def get_files_paths(self, folder: Path) -> List[Path]: return [ file - for filetype in self.filetypes_of_interest - for file in folder.rglob(f"*.{filetype}") + for pattern in self.folder_read_pattern + for file in folder.rglob(pattern) ] - def total_objects_by_filetype(self, folder: Path) -> dict: + def total_objects_by_format(self, folder: Path) -> dict: return { filetype: len(self.get_files_paths(folder)) - for filetype in self.filetypes_of_interest + for filetype in self.folder_read_pattern.split(".")[-1] } def max_session_number(self, filetype="tif", max_allowed=5) -> int: total_tif_number = [ - self.total_objects_by_filetype(dataset_path).get(filetype, 0) + self.total_objects_by_format(dataset_path).get(filetype, 0) for dataset_path in self.datasets_paths ] diff --git a/examples/example_usage.py b/examples/example_usage.py index 616138b..d3a48b5 100644 --- a/examples/example_usage.py +++ b/examples/example_usage.py @@ -9,7 +9,6 @@ def main( raw_data_path: Path, output_path: Path, - filetypes_of_interest: list, folder_read_pattern: str, file_read_pattern: str, ): @@ -25,7 +24,6 @@ def main( reader = ReadAllPathsInFolder( raw_data_path, - filetypes_of_interest, folder_read_pattern, file_read_pattern, ) @@ -53,13 +51,6 @@ def main( parser.add_argument( "output_path", type=Path, help="Path to the output data." ) - parser.add_argument( - "--filetypes", - type=list, - nargs="+", - help="Filetypes of interest.", - default=["tif", "bin"], - ) parser.add_argument( "--folder_read_pattern", type=str, @@ -68,22 +59,20 @@ def main( ) parser.add_argument( "--file_read_pattern", - type=str, + type=list, help="Glob pattern for reading files.", - default="*", + default=["*.tif", "*.bin"], ) args = parser.parse_args() raw_data_path = args.raw_data_path output_path = args.output_path - file_types = args.filetypes folder_read_pattern = args.folder_read_pattern file_read_pattern = args.file_read_pattern main( raw_data_path, output_path, - file_types, folder_read_pattern, file_read_pattern, ) diff --git a/examples/example_usage.sh b/examples/example_usage.sh index 890024f..9998c69 100755 --- a/examples/example_usage.sh +++ b/examples/example_usage.sh @@ -3,3 +3,4 @@ python ./examples/example_usage.py \ /nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/ \ /ceph/margrie/laura/cimaut/ \ --folder_read_pattern '2*' \ + --file_read_pattern 'rotation_00001.tif, *.bin' \ From 805448465548bfcf1b9fc404190d71f59891d3ba Mon Sep 17 00:00:00 2001 From: lauraporta Date: Fri, 8 Nov 2024 16:04:09 +0000 Subject: [PATCH 10/37] Change folder --- examples/example_usage.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/example_usage.sh b/examples/example_usage.sh index 890024f..52e8599 100755 --- a/examples/example_usage.sh +++ b/examples/example_usage.sh @@ -1,5 +1,5 @@ #! 
/bin/bash python ./examples/example_usage.py \ - /nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/ \ + /nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/ \ /ceph/margrie/laura/cimaut/ \ --folder_read_pattern '2*' \ From ad5d0ae0ff241f52db253ce9bcf92f7e3c3eb2ad Mon Sep 17 00:00:00 2001 From: lauraporta Date: Fri, 8 Nov 2024 17:31:36 +0000 Subject: [PATCH 11/37] Fix error related to file reading patterns --- calcium_imaging_automation/core/reader.py | 18 +++++++----------- examples/example_usage.py | 12 ++++++++---- examples/example_usage.sh | 11 +++++++---- 3 files changed, 22 insertions(+), 19 deletions(-) diff --git a/calcium_imaging_automation/core/reader.py b/calcium_imaging_automation/core/reader.py index 085ee00..2180a27 100644 --- a/calcium_imaging_automation/core/reader.py +++ b/calcium_imaging_automation/core/reader.py @@ -7,7 +7,7 @@ def __init__( self, raw_data_folder: Path, folder_read_pattern: str, - file_read_pattern: list, + file_read_pattern: List[str], ): self.folder_read_pattern = folder_read_pattern self.file_read_pattern = file_read_pattern @@ -20,20 +20,16 @@ def __init__( def get_folders_first_layer(self, file_path: Path) -> List[Path]: return list(file_path.glob(self.folder_read_pattern)) - def get_files_paths(self, folder: Path) -> List[Path]: - return [ - file - for pattern in self.folder_read_pattern - for file in folder.rglob(pattern) - ] - + def get_files_paths_by_format(self, folder: Path, filetype="tif") -> List[Path]: + return list(folder.rglob(filetype)) + def total_objects_by_format(self, folder: Path) -> dict: return { - filetype: len(self.get_files_paths(folder)) - for filetype in self.folder_read_pattern.split(".")[-1] + filetype.split(".")[-1]: len(self.get_files_paths_by_format(folder, filetype)) + for filetype in self.file_read_pattern } - def max_session_number(self, filetype="tif", max_allowed=1) -> int: + def max_session_number(self, filetype="tif", max_allowed=5) -> int: total_tif_number = [ self.total_objects_by_format(dataset_path).get(filetype, 0) for dataset_path in self.datasets_paths diff --git a/examples/example_usage.py b/examples/example_usage.py index 28043ea..cd82703 100644 --- a/examples/example_usage.py +++ b/examples/example_usage.py @@ -2,6 +2,7 @@ import datetime import logging from pathlib import Path +from typing import List from calcium_imaging_automation.core.reader import ReadAllPathsInFolder from calcium_imaging_automation.core.writer import DatashuttleWrapper @@ -11,7 +12,7 @@ def main( raw_data_path: Path, output_path: Path, folder_read_pattern: str, - file_read_pattern: str, + file_read_pattern: List[str], ): """ Draft usage of the pipeline, now consisting of read and write operations. 
@@ -36,9 +37,12 @@ def main( logging.info(f"Found {len(reader.datasets_paths)} datasets.") logging.info(f"Dataset names: {reader.dataset_names}") + writer = DatashuttleWrapper(output_path) number_of_tiffs = reader.max_session_number(filetype="tif") + logging.info(f"Max of tiffs found: {number_of_tiffs}") + writer.create_folders(reader.dataset_names, session_number=number_of_tiffs) # [Placeholder for data processing] @@ -65,9 +69,9 @@ def main( ) parser.add_argument( "--file_read_pattern", - type=list, - help="Glob pattern for reading files.", - default=["*.tif", "*.bin"], + type=str, + help="List of glob patterns for reading files.", + action="append", ) args = parser.parse_args() diff --git a/examples/example_usage.sh b/examples/example_usage.sh index 9a0fa0b..1b6d8d2 100755 --- a/examples/example_usage.sh +++ b/examples/example_usage.sh @@ -1,6 +1,9 @@ #! /bin/bash -python ./examples/example_usage.py \ - /nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/ \ - /ceph/margrie/laura/cimaut/ \ + +python examples/example_usage.py \ + '/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/' \ + '/ceph/margrie/laura/cimaut/' \ --folder_read_pattern '2*' \ - --file_read_pattern 'rotation_00001.tif, *.bin' \ + --file_read_pattern 'rotation_00001.tif' \ + --file_read_pattern '*.bin' \ + \ No newline at end of file From 8a34d2ab972ea2c96d4ea9779ae09abfa85fb390 Mon Sep 17 00:00:00 2001 From: lauraporta Date: Fri, 8 Nov 2024 17:31:51 +0000 Subject: [PATCH 12/37] Add script useful to launch debugging --- examples/debugging.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 examples/debugging.py diff --git a/examples/debugging.py b/examples/debugging.py new file mode 100644 index 0000000..da17601 --- /dev/null +++ b/examples/debugging.py @@ -0,0 +1,10 @@ +from example_usage import main +from pathlib import Path + + +main( + raw_data_path = Path('/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/'), + output_path = Path('/ceph/margrie/laura/cimaut/'), + folder_read_pattern = '2*', + file_read_pattern = ['rotation_00001.tif', '*.bin'] +) \ No newline at end of file From 9c49db16e45e7bc33b67481f29e16b64e57785c9 Mon Sep 17 00:00:00 2001 From: lauraporta Date: Fri, 8 Nov 2024 18:06:05 +0000 Subject: [PATCH 13/37] Add minimal wandb implementation --- examples/example_usage.py | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/examples/example_usage.py b/examples/example_usage.py index cd82703..5063669 100644 --- a/examples/example_usage.py +++ b/examples/example_usage.py @@ -3,6 +3,8 @@ import logging from pathlib import Path from typing import List +import wandb +import numpy as np from calcium_imaging_automation.core.reader import ReadAllPathsInFolder from calcium_imaging_automation.core.writer import DatashuttleWrapper @@ -17,6 +19,7 @@ def main( """ Draft usage of the pipeline, now consisting of read and write operations. 
""" + # --- Setup --- (output_path / "logs").mkdir(exist_ok=True) logging.basicConfig( # save also time anda date @@ -29,6 +32,11 @@ def main( format="%(asctime)s - %(message)s", ) + wandb.init(project="example_usage") + run_id = wandb.run.id + + + # --- Read folders and files --- reader = ReadAllPathsInFolder( raw_data_path, folder_read_pattern, @@ -37,16 +45,33 @@ def main( logging.info(f"Found {len(reader.datasets_paths)} datasets.") logging.info(f"Dataset names: {reader.dataset_names}") - - writer = DatashuttleWrapper(output_path) - number_of_tiffs = reader.max_session_number(filetype="tif") logging.info(f"Max of tiffs found: {number_of_tiffs}") + + # --- Write folders and files --- + writer = DatashuttleWrapper(output_path) writer.create_folders(reader.dataset_names, session_number=number_of_tiffs) - # [Placeholder for data processing] + for dataset in reader.datasets_paths: + dataset = dataset.stem + for session in range(1, number_of_tiffs + 1): + logging.info(f"Processing dataset {dataset} session {session}...") + + # mock processing + data = np.random.rand(100, 100) + metric_measured = np.random.rand() + + wandb.log({ + "dataset": dataset, + "session": session, + "metric_measured": metric_measured, + "image": wandb.Image(data), + "run_id": run_id + }) + + wandb.finish() logging.info("Pipeline finished.") From 831ed716aebb4f7cedbdc1bcfcf94f70be8c1d2e Mon Sep 17 00:00:00 2001 From: lauraporta Date: Fri, 8 Nov 2024 18:55:35 +0000 Subject: [PATCH 14/37] =?UTF-8?q?WIP:=20saving=20images=20=F0=9F=90=9B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- calcium_imaging_automation/core/reader.py | 10 +++++-- calcium_imaging_automation/core/writer.py | 26 +++++++++++++++-- examples/debugging.py | 16 ++++++----- examples/example_usage.py | 34 ++++++++++++++--------- examples/example_usage.sh | 1 - 5 files changed, 61 insertions(+), 26 deletions(-) diff --git a/calcium_imaging_automation/core/reader.py b/calcium_imaging_automation/core/reader.py index 2180a27..e600844 100644 --- a/calcium_imaging_automation/core/reader.py +++ b/calcium_imaging_automation/core/reader.py @@ -20,12 +20,16 @@ def __init__( def get_folders_first_layer(self, file_path: Path) -> List[Path]: return list(file_path.glob(self.folder_read_pattern)) - def get_files_paths_by_format(self, folder: Path, filetype="tif") -> List[Path]: + def get_files_paths_by_format( + self, folder: Path, filetype="tif" + ) -> List[Path]: return list(folder.rglob(filetype)) - + def total_objects_by_format(self, folder: Path) -> dict: return { - filetype.split(".")[-1]: len(self.get_files_paths_by_format(folder, filetype)) + filetype.split(".")[-1]: len( + self.get_files_paths_by_format(folder, filetype) + ) for filetype in self.file_read_pattern } diff --git a/calcium_imaging_automation/core/writer.py b/calcium_imaging_automation/core/writer.py index d61d056..109c5a3 100644 --- a/calcium_imaging_automation/core/writer.py +++ b/calcium_imaging_automation/core/writer.py @@ -1,14 +1,17 @@ from pathlib import Path -from typing import List +from typing import Dict, List +import numpy as np from datashuttle.configs.config_class import Configs from datashuttle.utils import folders +from PIL import Image class DatashuttleWrapper: def __init__(self, output_path: Path) -> None: # This is supposed to run in the cluster and have direct access # to the central storages + self.output_path = output_path self.datashuttle_cfg = Configs( project_name=output_path.name, file_path=output_path, @@ -20,7 +23,8 @@ def 
__init__(self, output_path: Path) -> None: ) def create_folders(self, dataset_names: List[str], session_number) -> None: - folders.create_folder_trees( + # all_paths is a dictionary with keys: sub, ses + self.all_paths: Dict[str, List[Path]] = folders.create_folder_trees( cfg=self.datashuttle_cfg, top_level_folder="derivatives", sub_names=[ @@ -30,3 +34,21 @@ def create_folders(self, dataset_names: List[str], session_number) -> None: ses_names=[f"ses-{i}" for i in range(session_number)], datatype="funcimg", ) + + def get_dataset_path(self, dataset_name: str) -> Path: + print((self.output_path / "derivatives")) + return next( + (self.output_path / "derivatives").glob(f"*{dataset_name}*"), None + ) + + def save_image( + self, + image: np.ndarray, + run_id: int, + dataset_name: str, + session_number: int, + filename: str, + ) -> None: + path = self.get_dataset_path(dataset_name) + image = Image.fromarray(image) + image.save(path / f"ses-{session_number}" / f"{filename}-{run_id}") diff --git a/examples/debugging.py b/examples/debugging.py index da17601..550faca 100644 --- a/examples/debugging.py +++ b/examples/debugging.py @@ -1,10 +1,12 @@ -from example_usage import main -from pathlib import Path +from pathlib import Path +from example_usage import main main( - raw_data_path = Path('/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/'), - output_path = Path('/ceph/margrie/laura/cimaut/'), - folder_read_pattern = '2*', - file_read_pattern = ['rotation_00001.tif', '*.bin'] -) \ No newline at end of file + raw_data_path=Path( + "/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/" + ), + output_path=Path("/ceph/margrie/laura/cimaut/"), + folder_read_pattern="2*", + file_read_pattern=["rotation_00001.tif", "*.bin"], +) diff --git a/examples/example_usage.py b/examples/example_usage.py index 5063669..fce3890 100644 --- a/examples/example_usage.py +++ b/examples/example_usage.py @@ -3,9 +3,10 @@ import logging from pathlib import Path from typing import List -import wandb + import numpy as np +import wandb from calcium_imaging_automation.core.reader import ReadAllPathsInFolder from calcium_imaging_automation.core.writer import DatashuttleWrapper @@ -35,7 +36,6 @@ def main( wandb.init(project="example_usage") run_id = wandb.run.id - # --- Read folders and files --- reader = ReadAllPathsInFolder( raw_data_path, @@ -48,12 +48,10 @@ def main( number_of_tiffs = reader.max_session_number(filetype="tif") logging.info(f"Max of tiffs found: {number_of_tiffs}") - # --- Write folders and files --- writer = DatashuttleWrapper(output_path) writer.create_folders(reader.dataset_names, session_number=number_of_tiffs) - for dataset in reader.datasets_paths: dataset = dataset.stem for session in range(1, number_of_tiffs + 1): @@ -63,15 +61,25 @@ def main( data = np.random.rand(100, 100) metric_measured = np.random.rand() - wandb.log({ - "dataset": dataset, - "session": session, - "metric_measured": metric_measured, - "image": wandb.Image(data), - "run_id": run_id - }) - - wandb.finish() + wandb.log( + { + "dataset": dataset, + "session": session, + "metric_measured": metric_measured, + "run_id": run_id, + } + ) + + # save image in session folder + writer.save_image( + image=data, + run_id=run_id, + dataset_name=dataset, + session_number=session, + filename="image", + ) + + wandb.finish() logging.info("Pipeline finished.") diff --git a/examples/example_usage.sh b/examples/example_usage.sh index 1b6d8d2..2fc7a15 100755 --- a/examples/example_usage.sh +++ 
b/examples/example_usage.sh @@ -6,4 +6,3 @@ python examples/example_usage.py \ --folder_read_pattern '2*' \ --file_read_pattern 'rotation_00001.tif' \ --file_read_pattern '*.bin' \ - \ No newline at end of file From ca67671afe2c7be835c4a97c14268ce252684bbe Mon Sep 17 00:00:00 2001 From: lauraporta Date: Mon, 11 Nov 2024 17:13:03 +0000 Subject: [PATCH 15/37] Fix image save bug --- calcium_imaging_automation/core/writer.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/calcium_imaging_automation/core/writer.py b/calcium_imaging_automation/core/writer.py index 109c5a3..120deca 100644 --- a/calcium_imaging_automation/core/writer.py +++ b/calcium_imaging_automation/core/writer.py @@ -36,9 +36,8 @@ def create_folders(self, dataset_names: List[str], session_number) -> None: ) def get_dataset_path(self, dataset_name: str) -> Path: - print((self.output_path / "derivatives")) return next( - (self.output_path / "derivatives").glob(f"*{dataset_name}*"), None + (self.output_path / "derivatives").glob(f"*{dataset_name}*") ) def save_image( @@ -50,5 +49,8 @@ def save_image( filename: str, ) -> None: path = self.get_dataset_path(dataset_name) - image = Image.fromarray(image) - image.save(path / f"ses-{session_number}" / f"{filename}-{run_id}") + image = Image.fromarray(image).convert("L") + image.save( + path / f"ses-{session_number}" / f"{filename}-{run_id}.png", + mode="PNG", + ) From e2381bc7e7b16d214958425b6a64a5461a806650 Mon Sep 17 00:00:00 2001 From: lauraporta Date: Mon, 11 Nov 2024 17:17:31 +0000 Subject: [PATCH 16/37] Move from wandb to mlflow --- examples/example_usage.py | 73 ++++++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 32 deletions(-) diff --git a/examples/example_usage.py b/examples/example_usage.py index fce3890..02ada0b 100644 --- a/examples/example_usage.py +++ b/examples/example_usage.py @@ -4,9 +4,9 @@ from pathlib import Path from typing import List +import mlflow import numpy as np -import wandb from calcium_imaging_automation.core.reader import ReadAllPathsInFolder from calcium_imaging_automation.core.writer import DatashuttleWrapper @@ -20,10 +20,10 @@ def main( """ Draft usage of the pipeline, now consisting of read and write operations. 
""" - # --- Setup --- + # --- Setup experiment-wide logging to file --- (output_path / "logs").mkdir(exist_ok=True) logging.basicConfig( - # save also time anda date + # Save also time and date filename=str( output_path / "logs" @@ -33,9 +33,6 @@ def main( format="%(asctime)s - %(message)s", ) - wandb.init(project="example_usage") - run_id = wandb.run.id - # --- Read folders and files --- reader = ReadAllPathsInFolder( raw_data_path, @@ -53,33 +50,45 @@ def main( writer.create_folders(reader.dataset_names, session_number=number_of_tiffs) for dataset in reader.datasets_paths: - dataset = dataset.stem + dataset_name = dataset.stem for session in range(1, number_of_tiffs + 1): - logging.info(f"Processing dataset {dataset} session {session}...") - - # mock processing - data = np.random.rand(100, 100) - metric_measured = np.random.rand() - - wandb.log( - { - "dataset": dataset, - "session": session, - "metric_measured": metric_measured, - "run_id": run_id, - } - ) - - # save image in session folder - writer.save_image( - image=data, - run_id=run_id, - dataset_name=dataset, - session_number=session, - filename="image", - ) - - wandb.finish() + with ( + mlflow.start_run() + ): # Start a new MLflow run for each dataset-session + # Log session-specific parameters + mlflow.log_param("dataset_name", dataset_name) + mlflow.log_param("session_number", session) + mlflow.log_param("raw_data_path", str(raw_data_path)) + mlflow.log_param("output_path", str(output_path)) + mlflow.log_param("folder_read_pattern", folder_read_pattern) + mlflow.log_param("file_read_pattern", file_read_pattern) + + logging.info( + f"Processing dataset {dataset_name} session {session}..." + ) + + # Mock processing + data = np.random.rand(100, 100) + metric_measured = np.random.rand() + + # Log metric with MLflow + mlflow.log_metric("metric_measured", metric_measured) + + # Save image in session folder + writer.save_image( + image=data, + run_id=session, + dataset_name=dataset_name, + session_number=session, + filename="image", + ) + + # Log that the run is complete for this session + logging.info( + f"Completed MLflow run for dataset {dataset_name} " + + f"session {session}" + ) + logging.info("Pipeline finished.") From 789147db1bfda1af2d49f65e729c869299f77535 Mon Sep 17 00:00:00 2001 From: lauraporta Date: Mon, 11 Nov 2024 18:20:40 +0000 Subject: [PATCH 17/37] Store mlflow folder differently, link artifacts --- calcium_imaging_automation/core/writer.py | 9 ++++-- examples/example_usage.py | 35 ++++++++++++++++------- 2 files changed, 32 insertions(+), 12 deletions(-) diff --git a/calcium_imaging_automation/core/writer.py b/calcium_imaging_automation/core/writer.py index 120deca..2fbbc5c 100644 --- a/calcium_imaging_automation/core/writer.py +++ b/calcium_imaging_automation/core/writer.py @@ -47,10 +47,15 @@ def save_image( dataset_name: str, session_number: int, filename: str, - ) -> None: + ) -> Path: path = self.get_dataset_path(dataset_name) image = Image.fromarray(image).convert("L") + image_path = ( + path / f"ses-{session_number}" / f"{filename}-{run_id}.png" + ) image.save( - path / f"ses-{session_number}" / f"{filename}-{run_id}.png", + image_path, mode="PNG", ) + + return image_path diff --git a/examples/example_usage.py b/examples/example_usage.py index 02ada0b..3789fa9 100644 --- a/examples/example_usage.py +++ b/examples/example_usage.py @@ -17,13 +17,9 @@ def main( folder_read_pattern: str, file_read_pattern: List[str], ): - """ - Draft usage of the pipeline, now consisting of read and write operations. 
- """ # --- Setup experiment-wide logging to file --- (output_path / "logs").mkdir(exist_ok=True) logging.basicConfig( - # Save also time and date filename=str( output_path / "logs" @@ -33,6 +29,11 @@ def main( format="%(asctime)s - %(message)s", ) + # --- Setup MLflow tracking --- + mlflow_tracking_dir = output_path / "derivatives" / "mlflow" + mlflow.set_tracking_uri(str(mlflow_tracking_dir)) + mlflow.set_experiment("calcium_imaging_pipeline") + # --- Read folders and files --- reader = ReadAllPathsInFolder( raw_data_path, @@ -51,10 +52,9 @@ def main( for dataset in reader.datasets_paths: dataset_name = dataset.stem - for session in range(1, number_of_tiffs + 1): - with ( - mlflow.start_run() - ): # Start a new MLflow run for each dataset-session + for session in range(0, number_of_tiffs): + # Start a new MLflow run for each dataset-session + with mlflow.start_run(): # Log session-specific parameters mlflow.log_param("dataset_name", dataset_name) mlflow.log_param("session_number", session) @@ -75,7 +75,7 @@ def main( mlflow.log_metric("metric_measured", metric_measured) # Save image in session folder - writer.save_image( + image_path = writer.save_image( image=data, run_id=session, dataset_name=dataset_name, @@ -83,7 +83,22 @@ def main( filename="image", ) - # Log that the run is complete for this session + # Log the image as an artifact in MLflow + mlflow.log_artifact( + image_path, + artifact_path=f"{dataset_name}/session_{session}", + ) + + logging.info( + f"MLflow run_id: {mlflow.active_run().info.run_id}" + ) + logging.info( + "MLflow experiment_id: " + + f"{mlflow.active_run().info.experiment_id}" + ) + logging.info( + f"MLflow tracking_uri: {mlflow.get_tracking_uri()}" + ) logging.info( f"Completed MLflow run for dataset {dataset_name} " + f"session {session}" From 385ffecc41f8ee8fd6f3c214112342603c22d02a Mon Sep 17 00:00:00 2001 From: lauraporta Date: Mon, 11 Nov 2024 18:34:10 +0000 Subject: [PATCH 18/37] Update manifest --- MANIFEST.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MANIFEST.in b/MANIFEST.in index 53a61e3..5089c71 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -4,9 +4,9 @@ exclude .pre-commit-config.yaml recursive-include calcium_imaging_automation *.py recursive-include examples *.py -recursive-include examples *.sh recursive-exclude * __pycache__ recursive-exclude * *.py[co] recursive-exclude docs * recursive-exclude tests * +recursive-exclude examples *.sh From b44e7ff1d8a76e8c012f1fd9eeb54f1930428861 Mon Sep 17 00:00:00 2001 From: lauraporta Date: Mon, 11 Nov 2024 18:34:25 +0000 Subject: [PATCH 19/37] Remove test script --- .gitignore | 3 +++ examples/example_usage.sh | 8 -------- 2 files changed, 3 insertions(+), 8 deletions(-) delete mode 100755 examples/example_usage.sh diff --git a/.gitignore b/.gitignore index aedc8d7..22f3445 100644 --- a/.gitignore +++ b/.gitignore @@ -81,3 +81,6 @@ venv/ # written by setuptools_scm **/_version.py + +# custom scripts +examples/*.sh diff --git a/examples/example_usage.sh b/examples/example_usage.sh deleted file mode 100755 index 2fc7a15..0000000 --- a/examples/example_usage.sh +++ /dev/null @@ -1,8 +0,0 @@ -#! 
/bin/bash - -python examples/example_usage.py \ - '/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/' \ - '/ceph/margrie/laura/cimaut/' \ - --folder_read_pattern '2*' \ - --file_read_pattern 'rotation_00001.tif' \ - --file_read_pattern '*.bin' \ From 3893ef7abbe4bb3b91392d460d5c6c04536de36e Mon Sep 17 00:00:00 2001 From: lauraporta Date: Mon, 11 Nov 2024 19:16:30 +0000 Subject: [PATCH 20/37] Add first group of docstrings --- calcium_imaging_automation/core/reader.py | 94 ++++++++++++++++++++++- examples/example_usage.py | 4 +- 2 files changed, 93 insertions(+), 5 deletions(-) diff --git a/calcium_imaging_automation/core/reader.py b/calcium_imaging_automation/core/reader.py index e600844..17b5e72 100644 --- a/calcium_imaging_automation/core/reader.py +++ b/calcium_imaging_automation/core/reader.py @@ -2,13 +2,33 @@ from typing import List -class ReadAllPathsInFolder: +class ReadAquiredData: def __init__( self, raw_data_folder: Path, folder_read_pattern: str, file_read_pattern: List[str], ): + """ + Class to handle filepaths and dataset names in the raw data folder. + It can load folders and files based on the provided patterns, allowing + flexibility in the data structure of origin. + It also provides the maximum number of sessions for each dataset based + on the total number of files found in the dataset folders, by default + it searches for tif files. + + Parameters + ---------- + raw_data_folder : Path + The path to the raw data folder. + folder_read_pattern : str + The pattern to search for folders in the raw data folder. It + corresponds to the naming convention of the datasets. + file_read_pattern : List[str] + The patterns to search for files in the dataset folders. It + corresponds to the naming convention of the files in the dataset + folders. + """ self.folder_read_pattern = folder_read_pattern self.file_read_pattern = file_read_pattern @@ -18,14 +38,62 @@ def __init__( ] def get_folders_first_layer(self, file_path: Path) -> List[Path]: + """ + Get the first layer of folders in the raw data folder. The rest + of the class assumes that the first layer of folders corresponds + to the dataset folders. + + Parameters + ---------- + file_path : Path + The path to the raw data folder. + + Returns + ------- + List[Path] + The list of paths to the dataset folders. + """ return list(file_path.glob(self.folder_read_pattern)) def get_files_paths_by_format( self, folder: Path, filetype="tif" ) -> List[Path]: + """ + Get the paths to the files in the dataset folders based on the + provided file type. By default, it searches for tif files. + + Parameters + ---------- + folder : Path + The path to the dataset folder. + filetype : str, optional + The file type to search for in the dataset folder, by default + "tif". + + Returns + ------- + List[Path] + The list of paths to the files in the dataset folder. + """ return list(folder.rglob(filetype)) - def total_objects_by_format(self, folder: Path) -> dict: + def total_objects_by_extension(self, folder: Path) -> dict: + """ + Get the total number of files in the dataset folder based on the + extensions included in the file_read_pattern. + + Parameters + ---------- + folder : Path + The path to the dataset folder. + + Returns + ------- + dict + The dictionary with the number of files for each extension in the + patterns found in file_read_pattern. 
+ """ + return { filetype.split(".")[-1]: len( self.get_files_paths_by_format(folder, filetype) @@ -34,8 +102,28 @@ def total_objects_by_format(self, folder: Path) -> dict: } def max_session_number(self, filetype="tif", max_allowed=5) -> int: + """ + Get the maximum number of sessions for each dataset based on the total + number of files found in the dataset folders. By default, it searches + for tif files and allows a maximum of 5 sessions. It assumes that every + tif file corresponds to an experimental session. + + Parameters + ---------- + filetype : str, optional + The file type to search for in the dataset folder, by default + "tif". + max_allowed : int, optional + The maximum number of sessions allowed, by default 5. + + Returns + ------- + int + The maximum number of sessions for each dataset. + """ + total_tif_number = [ - self.total_objects_by_format(dataset_path).get(filetype, 0) + self.total_objects_by_extension(dataset_path).get(filetype, 0) for dataset_path in self.datasets_paths ] diff --git a/examples/example_usage.py b/examples/example_usage.py index 3789fa9..fada0f3 100644 --- a/examples/example_usage.py +++ b/examples/example_usage.py @@ -7,7 +7,7 @@ import mlflow import numpy as np -from calcium_imaging_automation.core.reader import ReadAllPathsInFolder +from calcium_imaging_automation.core.reader import ReadAquiredData from calcium_imaging_automation.core.writer import DatashuttleWrapper @@ -35,7 +35,7 @@ def main( mlflow.set_experiment("calcium_imaging_pipeline") # --- Read folders and files --- - reader = ReadAllPathsInFolder( + reader = ReadAquiredData( raw_data_path, folder_read_pattern, file_read_pattern, From b580fbc78d5d5d15da7d15a0916f2136244a3428 Mon Sep 17 00:00:00 2001 From: lauraporta Date: Tue, 12 Nov 2024 16:46:21 +0000 Subject: [PATCH 21/37] =?UTF-8?q?WIP:=20nested=20runs,=20=F0=9F=90=9B=20on?= =?UTF-8?q?=20artifacts=20saving?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- calcium_imaging_automation/core/writer.py | 3 +- examples/example_usage.py | 101 ++++++++++++---------- 2 files changed, 56 insertions(+), 48 deletions(-) diff --git a/calcium_imaging_automation/core/writer.py b/calcium_imaging_automation/core/writer.py index 2fbbc5c..885d5f7 100644 --- a/calcium_imaging_automation/core/writer.py +++ b/calcium_imaging_automation/core/writer.py @@ -43,7 +43,6 @@ def get_dataset_path(self, dataset_name: str) -> Path: def save_image( self, image: np.ndarray, - run_id: int, dataset_name: str, session_number: int, filename: str, @@ -51,7 +50,7 @@ def save_image( path = self.get_dataset_path(dataset_name) image = Image.fromarray(image).convert("L") image_path = ( - path / f"ses-{session_number}" / f"{filename}-{run_id}.png" + path / f"ses-{session_number}" / f"{filename}.png" ) image.save( image_path, diff --git a/examples/example_usage.py b/examples/example_usage.py index fada0f3..a85b436 100644 --- a/examples/example_usage.py +++ b/examples/example_usage.py @@ -3,6 +3,7 @@ import logging from pathlib import Path from typing import List +import setuptools_scm import mlflow import numpy as np @@ -16,6 +17,7 @@ def main( output_path: Path, folder_read_pattern: str, file_read_pattern: List[str], + experiment_name: str = "pipeline_test", ): # --- Setup experiment-wide logging to file --- (output_path / "logs").mkdir(exist_ok=True) @@ -30,9 +32,9 @@ def main( ) # --- Setup MLflow tracking --- - mlflow_tracking_dir = output_path / "derivatives" / "mlflow" + mlflow_tracking_dir = output_path / "mlflow" 
mlflow.set_tracking_uri(str(mlflow_tracking_dir)) - mlflow.set_experiment("calcium_imaging_pipeline") + mlflow.set_experiment(experiment_name) # --- Read folders and files --- reader = ReadAquiredData( @@ -53,8 +55,11 @@ def main( for dataset in reader.datasets_paths: dataset_name = dataset.stem for session in range(0, number_of_tiffs): - # Start a new MLflow run for each dataset-session - with mlflow.start_run(): + # Generate mock data + data = np.random.rand(100, 100) + + # Start a new MLflow experiment for each dataset-session + with mlflow.start_run() as parent_run: # Log session-specific parameters mlflow.log_param("dataset_name", dataset_name) mlflow.log_param("session_number", session) @@ -62,48 +67,49 @@ def main( mlflow.log_param("output_path", str(output_path)) mlflow.log_param("folder_read_pattern", folder_read_pattern) mlflow.log_param("file_read_pattern", file_read_pattern) + mlflow.log_param("local_changes_hash", setuptools_scm.get_version()) logging.info( - f"Processing dataset {dataset_name} session {session}..." - ) - - # Mock processing - data = np.random.rand(100, 100) - metric_measured = np.random.rand() - - # Log metric with MLflow - mlflow.log_metric("metric_measured", metric_measured) - - # Save image in session folder - image_path = writer.save_image( - image=data, - run_id=session, - dataset_name=dataset_name, - session_number=session, - filename="image", + f"Starting MLflow experiment for dataset {dataset_name} session {session}..." ) - # Log the image as an artifact in MLflow - mlflow.log_artifact( - image_path, - artifact_path=f"{dataset_name}/session_{session}", - ) + # Mock processing for different runs within the experiment + for i in range(1, 11): # 10 runs with varying parameters + # Start a child run under the main dataset-session run + with mlflow.start_run(nested=True): + + # Mock metric calculation + metric_measured = np.mean(data) * i + + # Log parameters and metrics specific to this run + mlflow.log_param("data_size", f"{i * 10}x100") + mlflow.log_param("run_iteration", i) + mlflow.log_param("run_id", mlflow.active_run().info.run_id) + mlflow.log_metric("metric_measured", metric_measured) + + # Log the generated data as an artifact if desired + # Here, simulate an image or data file save path + image_path = writer.save_image( + image=data, + dataset_name=dataset_name, + session_number=session, + filename=f"image_run_{i}", + ) + + mlflow.log_artifact( + image_path, + artifact_path=f"{dataset_name}/session_{session}/run_{i}", + ) + + logging.info( + f"Completed MLflow run iteration {i} for dataset {dataset_name} session {session}" + ) logging.info( - f"MLflow run_id: {mlflow.active_run().info.run_id}" - ) - logging.info( - "MLflow experiment_id: " - + f"{mlflow.active_run().info.experiment_id}" - ) - logging.info( - f"MLflow tracking_uri: {mlflow.get_tracking_uri()}" - ) - logging.info( - f"Completed MLflow run for dataset {dataset_name} " - + f"session {session}" + f"Completed MLflow experiment for dataset {dataset_name} session {session}" ) + logging.info("Pipeline finished.") @@ -130,16 +136,19 @@ def main( help="List of glob patterns for reading files.", action="append", ) + parser.add_argument( + "--experiment_name", + type=str, + help="Name of the experiment.", + default="pipeline_test", + ) args = parser.parse_args() - raw_data_path = args.raw_data_path - output_path = args.output_path - folder_read_pattern = args.folder_read_pattern - file_read_pattern = args.file_read_pattern main( - raw_data_path, - output_path, - folder_read_pattern, - 
file_read_pattern, + args.raw_data_path, + args.output_path, + args.folder_read_pattern, + args.file_read_pattern, + args.experiment_name, ) From 9a3502e2ebd80422ff2063961c3c993783c162cd Mon Sep 17 00:00:00 2001 From: lauraporta Date: Mon, 18 Nov 2024 11:53:55 +0000 Subject: [PATCH 22/37] Add dependencies --- pyproject.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 7506ae9..70f7fcd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,6 +8,9 @@ dynamic = ["version"] dependencies = [ "datashuttle", + "setuptools_scm", + "mlflow", + "numpy", ] license = {text = "BSD-3-Clause"} From ca90cfebd0ea300b94c3890dd59868ce959991d5 Mon Sep 17 00:00:00 2001 From: lauraporta Date: Mon, 18 Nov 2024 11:59:04 +0000 Subject: [PATCH 23/37] Refactoring --- calcium_imaging_automation/core/app.py | 44 +++++ calcium_imaging_automation/core/pipeline.py | 175 ++++++++++++++++++++ calcium_imaging_automation/core/writer.py | 4 +- examples/debugging.py | 4 +- examples/example_usage.py | 154 ----------------- 5 files changed, 222 insertions(+), 159 deletions(-) create mode 100644 calcium_imaging_automation/core/app.py create mode 100644 calcium_imaging_automation/core/pipeline.py delete mode 100644 examples/example_usage.py diff --git a/calcium_imaging_automation/core/app.py b/calcium_imaging_automation/core/app.py new file mode 100644 index 0000000..3491a25 --- /dev/null +++ b/calcium_imaging_automation/core/app.py @@ -0,0 +1,44 @@ +import argparse +from pathlib import Path + +from calcium_imaging_automation.core.pipeline import pipeline + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Example usage of the pipeline manager." + ) + + parser.add_argument( + "raw_data_path", type=Path, help="Path to the raw data." + ) + parser.add_argument( + "output_path", type=Path, help="Path to the output data." 
+ ) + parser.add_argument( + "--folder_read_pattern", + type=str, + help="Glob pattern for reading folder.", + default="*", + ) + parser.add_argument( + "--file_read_pattern", + type=str, + help="List of glob patterns for reading files.", + action="append", + ) + parser.add_argument( + "--experiment_name", + type=str, + help="Name of the experiment.", + default="pipeline_test", + ) + + args = parser.parse_args() + + pipeline( + args.raw_data_path, + args.output_path, + args.folder_read_pattern, + args.file_read_pattern, + args.experiment_name, + ) diff --git a/calcium_imaging_automation/core/pipeline.py b/calcium_imaging_automation/core/pipeline.py new file mode 100644 index 0000000..cc8a324 --- /dev/null +++ b/calcium_imaging_automation/core/pipeline.py @@ -0,0 +1,175 @@ +import datetime +import logging +from pathlib import Path +from typing import List + +import mlflow +import numpy as np +import setuptools_scm + +from calcium_imaging_automation.core.reader import ReadAquiredData +from calcium_imaging_automation.core.writer import DatashuttleWrapper + + +def pipeline( + raw_data_path: Path, + output_path: Path, + folder_read_pattern: str, + file_read_pattern: List[str], + experiment_name: str = "pipeline_test", +): + # --- Setup logging and MLflow --- + logging_setup(output_path) + mlflow_setup(output_path) + + # --- Read folders and files --- + reader = ReadAquiredData( + raw_data_path, + folder_read_pattern, + file_read_pattern, + ) + logging.info(f"Found {len(reader.datasets_paths)} datasets.") + logging.info(f"Dataset names: {reader.dataset_names}") + + number_of_tiffs = reader.max_session_number(filetype="tif") + logging.info(f"Max of tiffs found: {number_of_tiffs}") + + # --- Write folders and files --- + writer = DatashuttleWrapper(output_path) + writer.create_folders(reader.dataset_names, session_number=number_of_tiffs) + + # --- Start processing --- + for dataset in reader.datasets_paths: + dataset_name = dataset.stem + + for session in range(0, number_of_tiffs): + mlflow_set_experiment(experiment_name, dataset_name, session) + + # Generate mock data + data = np.random.rand(100, 100) + + # Start a new MLflow experiment for each dataset-session + with mlflow.start_run(): # this is the parent run + mlflow_parent_run_logs( + dataset_name, + session, + raw_data_path, + output_path, + folder_read_pattern, + file_read_pattern, + ) + + logging.info( + f"Starting MLflow experiment for dataset {dataset_name} " + + f"session {session}..." 
+ ) + + # Mock processing for different runs within the experiment + for i in range(0, 10): # n runs with varying parameters + # Start a child run under the main dataset-session run + with mlflow.start_run(nested=True): + # Mock metric calculation + metric_measured = np.mean(data) * i + + # Log the generated data as an artifact if desired + # Here, simulate an image or data file save path + image_path = writer.save_image( + image=data, + dataset_name=dataset_name, + session_number=session, + filename=f"image_{mlflow.active_run().info.run_id}.png", + ) + + mlflow_log_run( + i, + dataset_name, + session, + metric_measured, + image_path, + ) + + logging.info( + f"Completed MLflow run iteration {i} for dataset " + + f"{dataset_name} session {session}" + ) + + logging.info( + f"Completed MLflow experiment for dataset {dataset_name}" + + f" session {session}" + ) + + logging.info("Pipeline finished.") + + +def logging_setup(output_path: Path): + # --- Setup experiment-wide logging to file --- + (output_path / "logs").mkdir(exist_ok=True) + logging.basicConfig( + filename=str( + output_path + / "logs" + / f"{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}.log" + ), + level=logging.INFO, + format="%(asctime)s - %(message)s", + ) + + +def mlflow_setup(output_path: Path): + # --- Setup MLflow tracking --- + mlflow_tracking_dir = output_path / "mlflow" + mlflow.set_tracking_uri(str(mlflow_tracking_dir)) + + +def mlflow_set_experiment( + experiment_name: str, dataset_name: str, session: int +): + # Start a new MLflow experiment for each dataset and session + mlflow.set_experiment( + f"{experiment_name}/{dataset_name}/session_{session}" + ) + + +def mlflow_parent_run_logs( + dataset_name: str, + session: int, + raw_data_path: Path, + output_path: Path, + folder_read_pattern: str, + file_read_pattern: List[str], +): + # give specific name to the parent run + mlflow.set_tag("mlflow.runName", f"{dataset_name}_session_{session}") + + # Log session-specific parameters + mlflow.log_param("mlflow.Dataset", dataset_name) + mlflow.log_param("session_number", session) + mlflow.log_param("raw_data_path", str(raw_data_path)) + mlflow.log_param("output_path", str(output_path)) + mlflow.log_param("folder_read_pattern", folder_read_pattern) + mlflow.log_param("file_read_pattern", file_read_pattern) + mlflow.log_param("local_changes_hash", setuptools_scm.get_version()) + + +def mlflow_log_run( + i: int, + dataset_name: str, + session: int, + metric_measured: float, + image_path: Path, +): + # give specific name to the run + mlflow.set_tag("mlflow.runName", f"param_{i}") + + # Log parameters and metrics specific to this run + mlflow.log_param("data_size", f"{i * 10}x100") + mlflow.log_param("run_iteration", i) + mlflow.log_param("run_id", mlflow.active_run().info.run_id) + mlflow.log_metric("metric_measured", metric_measured) + + mlflow.log_artifact( + # where I am storing the image according to Neuroblueprint + # I think it gets copied in the mlflow data structure + image_path, + artifact_path=f"{dataset_name}/session_{session}/run_{i}", + ) diff --git a/calcium_imaging_automation/core/writer.py b/calcium_imaging_automation/core/writer.py index 885d5f7..6e713c4 100644 --- a/calcium_imaging_automation/core/writer.py +++ b/calcium_imaging_automation/core/writer.py @@ -49,9 +49,7 @@ def save_image( ) -> Path: path = self.get_dataset_path(dataset_name) image = Image.fromarray(image).convert("L") - image_path = ( - path / f"ses-{session_number}" / f"{filename}.png" - ) + image_path = path / f"ses-{session_number}" / 
"funcimg" / f"{filename}" image.save( image_path, mode="PNG", diff --git a/examples/debugging.py b/examples/debugging.py index 550faca..47e953f 100644 --- a/examples/debugging.py +++ b/examples/debugging.py @@ -1,8 +1,8 @@ from pathlib import Path -from example_usage import main +from calcium_imaging_automation.core.pipeline import pipeline -main( +pipeline( raw_data_path=Path( "/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/" ), diff --git a/examples/example_usage.py b/examples/example_usage.py deleted file mode 100644 index a85b436..0000000 --- a/examples/example_usage.py +++ /dev/null @@ -1,154 +0,0 @@ -import argparse -import datetime -import logging -from pathlib import Path -from typing import List -import setuptools_scm - -import mlflow -import numpy as np - -from calcium_imaging_automation.core.reader import ReadAquiredData -from calcium_imaging_automation.core.writer import DatashuttleWrapper - - -def main( - raw_data_path: Path, - output_path: Path, - folder_read_pattern: str, - file_read_pattern: List[str], - experiment_name: str = "pipeline_test", -): - # --- Setup experiment-wide logging to file --- - (output_path / "logs").mkdir(exist_ok=True) - logging.basicConfig( - filename=str( - output_path - / "logs" - / f"{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}.log" - ), - level=logging.INFO, - format="%(asctime)s - %(message)s", - ) - - # --- Setup MLflow tracking --- - mlflow_tracking_dir = output_path / "mlflow" - mlflow.set_tracking_uri(str(mlflow_tracking_dir)) - mlflow.set_experiment(experiment_name) - - # --- Read folders and files --- - reader = ReadAquiredData( - raw_data_path, - folder_read_pattern, - file_read_pattern, - ) - logging.info(f"Found {len(reader.datasets_paths)} datasets.") - logging.info(f"Dataset names: {reader.dataset_names}") - - number_of_tiffs = reader.max_session_number(filetype="tif") - logging.info(f"Max of tiffs found: {number_of_tiffs}") - - # --- Write folders and files --- - writer = DatashuttleWrapper(output_path) - writer.create_folders(reader.dataset_names, session_number=number_of_tiffs) - - for dataset in reader.datasets_paths: - dataset_name = dataset.stem - for session in range(0, number_of_tiffs): - # Generate mock data - data = np.random.rand(100, 100) - - # Start a new MLflow experiment for each dataset-session - with mlflow.start_run() as parent_run: - # Log session-specific parameters - mlflow.log_param("dataset_name", dataset_name) - mlflow.log_param("session_number", session) - mlflow.log_param("raw_data_path", str(raw_data_path)) - mlflow.log_param("output_path", str(output_path)) - mlflow.log_param("folder_read_pattern", folder_read_pattern) - mlflow.log_param("file_read_pattern", file_read_pattern) - mlflow.log_param("local_changes_hash", setuptools_scm.get_version()) - - logging.info( - f"Starting MLflow experiment for dataset {dataset_name} session {session}..." 
- ) - - # Mock processing for different runs within the experiment - for i in range(1, 11): # 10 runs with varying parameters - # Start a child run under the main dataset-session run - with mlflow.start_run(nested=True): - - # Mock metric calculation - metric_measured = np.mean(data) * i - - # Log parameters and metrics specific to this run - mlflow.log_param("data_size", f"{i * 10}x100") - mlflow.log_param("run_iteration", i) - mlflow.log_param("run_id", mlflow.active_run().info.run_id) - mlflow.log_metric("metric_measured", metric_measured) - - # Log the generated data as an artifact if desired - # Here, simulate an image or data file save path - image_path = writer.save_image( - image=data, - dataset_name=dataset_name, - session_number=session, - filename=f"image_run_{i}", - ) - - mlflow.log_artifact( - image_path, - artifact_path=f"{dataset_name}/session_{session}/run_{i}", - ) - - logging.info( - f"Completed MLflow run iteration {i} for dataset {dataset_name} session {session}" - ) - - logging.info( - f"Completed MLflow experiment for dataset {dataset_name} session {session}" - ) - - - logging.info("Pipeline finished.") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Example usage of the pipeline manager." - ) - - parser.add_argument( - "raw_data_path", type=Path, help="Path to the raw data." - ) - parser.add_argument( - "output_path", type=Path, help="Path to the output data." - ) - parser.add_argument( - "--folder_read_pattern", - type=str, - help="Glob pattern for reading folder.", - default="*", - ) - parser.add_argument( - "--file_read_pattern", - type=str, - help="List of glob patterns for reading files.", - action="append", - ) - parser.add_argument( - "--experiment_name", - type=str, - help="Name of the experiment.", - default="pipeline_test", - ) - - args = parser.parse_args() - - main( - args.raw_data_path, - args.output_path, - args.folder_read_pattern, - args.file_read_pattern, - args.experiment_name, - ) From 472008674522247a2e8fe409c8435749f9048619 Mon Sep 17 00:00:00 2001 From: lauraporta Date: Tue, 26 Nov 2024 16:34:25 +0000 Subject: [PATCH 24/37] Use with real data and add integration with submitit --- calcium_imaging_automation/core/app.py | 10 +- calcium_imaging_automation/core/pipeline.py | 176 ++++++++++++-------- pyproject.toml | 1 + 3 files changed, 115 insertions(+), 72 deletions(-) diff --git a/calcium_imaging_automation/core/app.py b/calcium_imaging_automation/core/app.py index 3491a25..35cf0a4 100644 --- a/calcium_imaging_automation/core/app.py +++ b/calcium_imaging_automation/core/app.py @@ -1,7 +1,7 @@ import argparse from pathlib import Path -from calcium_imaging_automation.core.pipeline import pipeline +from calcium_imaging_automation.core.pipeline import mlflow_orchestrator if __name__ == "__main__": parser = argparse.ArgumentParser( @@ -32,13 +32,19 @@ help="Name of the experiment.", default="pipeline_test", ) + parser.add_argument( + "--compute_metric", + type=Path, + help="Path to the suite2p ops file.", + ) args = parser.parse_args() - pipeline( + mlflow_orchestrator( args.raw_data_path, args.output_path, args.folder_read_pattern, args.file_read_pattern, args.experiment_name, + args.compute_metric, ) diff --git a/calcium_imaging_automation/core/pipeline.py b/calcium_imaging_automation/core/pipeline.py index cc8a324..e4d0a14 100644 --- a/calcium_imaging_automation/core/pipeline.py +++ b/calcium_imaging_automation/core/pipeline.py @@ -1,27 +1,34 @@ import datetime import logging +import time from pathlib import Path 
-from typing import List +from typing import Callable, List import mlflow -import numpy as np import setuptools_scm +import submitit +from submitit import AutoExecutor from calcium_imaging_automation.core.reader import ReadAquiredData from calcium_imaging_automation.core.writer import DatashuttleWrapper -def pipeline( +def mlflow_orchestrator( raw_data_path: Path, output_path: Path, folder_read_pattern: str, file_read_pattern: List[str], + preprocessing_function: Callable, + compute_metric: Callable, experiment_name: str = "pipeline_test", ): # --- Setup logging and MLflow --- logging_setup(output_path) mlflow_setup(output_path) + # mkdir for submitit logs submitit / timestamp + (output_path / "submitit").mkdir(exist_ok=True) + # --- Read folders and files --- reader = ReadAquiredData( raw_data_path, @@ -39,68 +46,97 @@ def pipeline( writer.create_folders(reader.dataset_names, session_number=number_of_tiffs) # --- Start processing --- - for dataset in reader.datasets_paths: - dataset_name = dataset.stem - - for session in range(0, number_of_tiffs): - mlflow_set_experiment(experiment_name, dataset_name, session) - - # Generate mock data - data = np.random.rand(100, 100) - - # Start a new MLflow experiment for each dataset-session - with mlflow.start_run(): # this is the parent run - mlflow_parent_run_logs( - dataset_name, - session, - raw_data_path, - output_path, - folder_read_pattern, - file_read_pattern, - ) - - logging.info( - f"Starting MLflow experiment for dataset {dataset_name} " - + f"session {session}..." - ) - - # Mock processing for different runs within the experiment - for i in range(0, 10): # n runs with varying parameters - # Start a child run under the main dataset-session run - with mlflow.start_run(nested=True): - # Mock metric calculation - metric_measured = np.mean(data) * i - - # Log the generated data as an artifact if desired - # Here, simulate an image or data file save path - image_path = writer.save_image( - image=data, - dataset_name=dataset_name, - session_number=session, - filename=f"image_{mlflow.active_run().info.run_id}.png", - ) - - mlflow_log_run( - i, - dataset_name, - session, - metric_measured, - image_path, - ) - - logging.info( - f"Completed MLflow run iteration {i} for dataset " - + f"{dataset_name} session {session}" - ) - - logging.info( - f"Completed MLflow experiment for dataset {dataset_name}" - + f" session {session}" - ) + results, errors = launch_job_array( + datasets=reader.datasets_paths, + output_path=output_path, + analysis_pipeline=analysis_pipeline, + writer=writer, + preprocessing_function=preprocessing_function, + compute_metric=compute_metric, + ) + + # --- Log all results with MLflow --- + for dataset, result, error in zip(reader.dataset_names, results, errors): + mlflow_set_experiment(experiment_name, dataset, 0) + + with mlflow.start_run(): + mlflow_parent_run_logs( + dataset, + 0, + raw_data_path, + output_path, + folder_read_pattern, + file_read_pattern, + ) + + # log error if any + if error: + mlflow.log_param("error", error) + + if result: + mlflow.log_metric("stability", result) + + mlflow.end_run() logging.info("Pipeline finished.") +def launch_job_array( + datasets, + output_path, + analysis_pipeline, + writer, + preprocessing_function, + compute_metric, +): + executor = AutoExecutor(folder=output_path / "submitit") + executor.update_parameters( + timeout_min=30, + slurm_partition="fast", + cpus_per_task=1, + tasks_per_node=1, + slurm_mem="16G", + slurm_array_parallelism=20, + ) + + logging.info(f"Running {len(datasets)} 
jobs.") + jobs = executor.map_array( + analysis_pipeline, + datasets, + [writer.get_dataset_path(dataset.stem) for dataset in datasets], + [preprocessing_function] * len(datasets), + [compute_metric] * len(datasets), + ) + + results = [] + errors = [] + for job in jobs: + while not job.done(): + time.sleep(10) + try: + results.append(job.result()) + errors.append(None) + except submitit.core.utils.FailedJobError as e: + logging.error(f"Job {job.job_id} failed: {e}") + results.append(None) + errors.append(job.stderr()) + + return results, errors + + +def analysis_pipeline( + dataset, output_path_dataset, preprocessing_function, compute_metric +): + import os + + os.system("module load miniconda") + os.system("source activate /nfs/nhome/live/lporta/.conda/envs/cimat") + output_path_dataset = output_path_dataset / "ses-0/funcimg/" + data = preprocessing_function(dataset, output_path_dataset) + metric_measured = compute_metric(data) + return metric_measured + + def logging_setup(output_path: Path): # --- Setup experiment-wide logging to file --- (output_path / "logs").mkdir(exist_ok=True) @@ -156,7 +192,7 @@ def mlflow_log_run( dataset_name: str, session: int, metric_measured: float, - image_path: Path, + # image_path: Path, ): # give specific name to the run mlflow.set_tag("mlflow.runName", f"param_{i}") @@ -165,11 +201,11 @@ def mlflow_log_run( mlflow.log_param("data_size", f"{i * 10}x100") mlflow.log_param("run_iteration", i) mlflow.log_param("run_id", mlflow.active_run().info.run_id) - mlflow.log_metric("metric_measured", metric_measured) - - mlflow.log_artifact( - # where I am storing the image according to Neuroblueprint - # I think it gets copied in the mlflow data structure - image_path, - artifact_path=f"{dataset_name}/session_{session}/run_{i}", - ) + mlflow.log_metric("stability", metric_measured) + + # mlflow.log_artifact( + # # where I am storing the image according to Neuroblueprint + # # I think it gets copied in the mlflow data structure + # image_path, + # artifact_path=f"{dataset_name}/session_{session}/run_{i}", + # ) diff --git a/pyproject.toml b/pyproject.toml index 70f7fcd..0617e07 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,6 +11,7 @@ dependencies = [ "setuptools_scm", "mlflow", "numpy", + "submitit", ] license = {text = "BSD-3-Clause"} From dc886dd960067aa9f740a65a8dae51767054af10 Mon Sep 17 00:00:00 2001 From: lauraporta Date: Tue, 26 Nov 2024 16:35:07 +0000 Subject: [PATCH 25/37] Reduce min sessions --- calcium_imaging_automation/core/reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/calcium_imaging_automation/core/reader.py b/calcium_imaging_automation/core/reader.py index 17b5e72..f0fcb3c 100644 --- a/calcium_imaging_automation/core/reader.py +++ b/calcium_imaging_automation/core/reader.py @@ -101,7 +101,7 @@ def total_objects_by_extension(self, folder: Path) -> dict: for filetype in self.file_read_pattern } - def max_session_number(self, filetype="tif", max_allowed=5) -> int: + def max_session_number(self, filetype="tif", max_allowed=1) -> int: """ Get the maximum number of sessions for each dataset based on the total number of files found in the dataset folders. 
By default, it searches From f7eb891039770ad1e3ef91ae6d8c133ce282500d Mon Sep 17 00:00:00 2001 From: lauraporta Date: Thu, 28 Nov 2024 17:20:42 +0000 Subject: [PATCH 26/37] =?UTF-8?q?Bye=20bye=20mlflow=20=F0=9F=91=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- calcium_imaging_automation/core/app.py | 4 +- calcium_imaging_automation/core/pipeline.py | 103 +++----------------- pyproject.toml | 1 - 3 files changed, 17 insertions(+), 91 deletions(-) diff --git a/calcium_imaging_automation/core/app.py b/calcium_imaging_automation/core/app.py index 35cf0a4..6ca2b60 100644 --- a/calcium_imaging_automation/core/app.py +++ b/calcium_imaging_automation/core/app.py @@ -1,7 +1,7 @@ import argparse from pathlib import Path -from calcium_imaging_automation.core.pipeline import mlflow_orchestrator +from calcium_imaging_automation.core.pipeline import orchestrator if __name__ == "__main__": parser = argparse.ArgumentParser( @@ -40,7 +40,7 @@ args = parser.parse_args() - mlflow_orchestrator( + orchestrator( args.raw_data_path, args.output_path, args.folder_read_pattern, diff --git a/calcium_imaging_automation/core/pipeline.py b/calcium_imaging_automation/core/pipeline.py index e4d0a14..0270c9a 100644 --- a/calcium_imaging_automation/core/pipeline.py +++ b/calcium_imaging_automation/core/pipeline.py @@ -4,8 +4,7 @@ from pathlib import Path from typing import Callable, List -import mlflow -import setuptools_scm +import pandas as pd import submitit from submitit import AutoExecutor @@ -13,7 +12,7 @@ from calcium_imaging_automation.core.writer import DatashuttleWrapper -def mlflow_orchestrator( +def orchestrator( raw_data_path: Path, output_path: Path, folder_read_pattern: str, @@ -24,7 +23,6 @@ def mlflow_orchestrator( ): # --- Setup logging and MLflow --- logging_setup(output_path) - mlflow_setup(output_path) # mkdir for submitit logs submitit / timestamp (output_path / "submitit").mkdir(exist_ok=True) @@ -55,28 +53,11 @@ def mlflow_orchestrator( compute_metric=compute_metric, ) - # --- Log all results with MLflow --- - for dataset, result, error in zip(reader.dataset_names, results, errors): - mlflow_set_experiment(experiment_name, dataset, 0) - - with mlflow.start_run(): - mlflow_parent_run_logs( - dataset, - 0, - raw_data_path, - output_path, - folder_read_pattern, - file_read_pattern, - ) - - # log error if any - if error: - mlflow.log_param("error", error) - - if result: - mlflow.log_metric("stability", result) - - mlflow.end_run() + # save the results and errors as csv + results_df = pd.DataFrame(results) + results_df.to_csv(output_path / "results.csv") + errors_df = pd.DataFrame(errors) + errors_df.to_csv(output_path / "errors.csv") logging.info("Pipeline finished.") @@ -132,8 +113,14 @@ def analysis_pipeline( os.system("module load miniconda") os.system("source activate /nfs/nhome/live/lporta/.conda/envs/cimat") output_path_dataset = output_path_dataset / "ses-0/funcimg/" - data = preprocessing_function(dataset, output_path_dataset) - metric_measured = compute_metric(data) + try: + data = preprocessing_function(dataset, output_path_dataset) + metric_measured = compute_metric(data) + with open(output_path_dataset / "metric.txt", "w") as f: + f.write(str(metric_measured)) + except Exception as e: + with open(output_path_dataset / "error.txt", "w") as f: + f.write(str(e.args)) return metric_measured @@ -149,63 +136,3 @@ def logging_setup(output_path: Path): level=logging.INFO, format="%(asctime)s - %(message)s", ) - - -def mlflow_setup(output_path: 
Path): - # --- Setup MLflow tracking --- - mlflow_tracking_dir = output_path / "mlflow" - mlflow.set_tracking_uri(str(mlflow_tracking_dir)) - - -def mlflow_set_experiment( - experiment_name: str, dataset_name: str, session: int -): - # Start a new MLflow experiment for each dataset and session - mlflow.set_experiment( - f"{experiment_name}/{dataset_name}/session_{session}" - ) - - -def mlflow_parent_run_logs( - dataset_name: str, - session: int, - raw_data_path: Path, - output_path: Path, - folder_read_pattern: str, - file_read_pattern: List[str], -): - # give specific name to the parent run - mlflow.set_tag("mlflow.runName", f"{dataset_name}_session_{session}") - - # Log session-specific parameters - mlflow.log_param("mlflow.Dataset", dataset_name) - mlflow.log_param("session_number", session) - mlflow.log_param("raw_data_path", str(raw_data_path)) - mlflow.log_param("output_path", str(output_path)) - mlflow.log_param("folder_read_pattern", folder_read_pattern) - mlflow.log_param("file_read_pattern", file_read_pattern) - mlflow.log_param("local_changes_hash", setuptools_scm.get_version()) - - -def mlflow_log_run( - i: int, - dataset_name: str, - session: int, - metric_measured: float, - # image_path: Path, -): - # give specific name to the run - mlflow.set_tag("mlflow.runName", f"param_{i}") - - # Log parameters and metrics specific to this run - mlflow.log_param("data_size", f"{i * 10}x100") - mlflow.log_param("run_iteration", i) - mlflow.log_param("run_id", mlflow.active_run().info.run_id) - mlflow.log_metric("stability", metric_measured) - - # mlflow.log_artifact( - # # where I am storing the image according to Neuroblueprint - # # I think it gets copied in the mlflow data structure - # image_path, - # artifact_path=f"{dataset_name}/session_{session}/run_{i}", - # ) diff --git a/pyproject.toml b/pyproject.toml index 0617e07..a9d5d73 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,6 @@ dynamic = ["version"] dependencies = [ "datashuttle", "setuptools_scm", - "mlflow", "numpy", "submitit", ] From dd445297f54acfe590e0c04a975b6628e4a72136 Mon Sep 17 00:00:00 2001 From: lauraporta Date: Tue, 3 Dec 2024 17:16:10 +0000 Subject: [PATCH 27/37] Add basic snakemake script (test) --- .gitignore | 3 + MANIFEST.in | 1 + Snakefile | 6 ++ .../core/rules/setup.py | 70 +++++++++++++++++++ 4 files changed, 80 insertions(+) create mode 100644 Snakefile create mode 100644 calcium_imaging_automation/core/rules/setup.py diff --git a/.gitignore b/.gitignore index 22f3445..ce92f74 100644 --- a/.gitignore +++ b/.gitignore @@ -84,3 +84,6 @@ venv/ # custom scripts examples/*.sh + +# snakemake +.snakemake/* diff --git a/MANIFEST.in b/MANIFEST.in index 5089c71..699c0d3 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,5 +1,6 @@ include LICENSE include README.md +include Snakefile exclude .pre-commit-config.yaml recursive-include calcium_imaging_automation *.py diff --git a/Snakefile b/Snakefile new file mode 100644 index 0000000..5cb7849 --- /dev/null +++ b/Snakefile @@ -0,0 +1,6 @@ +rule setup: + input: + datasets_path="/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/", + writing_path="/ceph/margrie/laura/cimaut/", + output: "setup_output.txt" + shell: "python calcium_imaging_automation/core/rules/setup.py {input.datasets_path} {input.writing_path} --folder_read_pattern '2*' --file_read_pattern 'rotation_00001.tif' --file_read_pattern '*.bin' > {output}" diff --git a/calcium_imaging_automation/core/rules/setup.py b/calcium_imaging_automation/core/rules/setup.py new file 
mode 100644 index 0000000..446cc66 --- /dev/null +++ b/calcium_imaging_automation/core/rules/setup.py @@ -0,0 +1,70 @@ +import argparse +import shutil +from pathlib import Path + +from calcium_imaging_automation.core.reader import ReadAquiredData +from calcium_imaging_automation.core.writer import DatashuttleWrapper + + +def setup(raw_data_path, folder_read_pattern, file_read_pattern, output_path): + try: + shutil.rmtree("/ceph/margrie/laura/cimaut/derivatives/") + shutil.rmtree("/ceph/margrie/laura/cimaut/submitit/") + except FileNotFoundError: + print("No derivatives folder found") + + print(f"Reading data from {raw_data_path}") + reader = ReadAquiredData( + raw_data_path, + folder_read_pattern, + file_read_pattern, + ) + print(f"Found {len(reader.datasets_paths)} datasets.") + + number_of_tiffs = reader.max_session_number(filetype="tif") + print(f"Max of tiffs found: {number_of_tiffs}") + + writer = DatashuttleWrapper(output_path) + print(f"Dataset names: {reader.dataset_names}") + writer.create_folders(reader.dataset_names, session_number=number_of_tiffs) + print("Folders created") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Example usage of the pipeline manager." + ) + + parser.add_argument( + "raw_data_path", type=Path, help="Path to the raw data." + ) + parser.add_argument( + "output_path", type=Path, help="Path to the output data." + ) + parser.add_argument( + "--folder_read_pattern", + type=str, + help="Glob pattern for reading folder.", + default="*", + ) + parser.add_argument( + "--file_read_pattern", + type=str, + help="List of glob patterns for reading files.", + action="append", + ) + + args = parser.parse_args() + + try: + setup( + args.raw_data_path, + args.folder_read_pattern, + args.file_read_pattern, + args.output_path, + ) + + print("Success") + except Exception as e: + print(f"Error: {e.args}") + print(e.with_traceback(e.__traceback__)) From 4947e31e5ef32ff56710e4154fb0fe6ba007e401 Mon Sep 17 00:00:00 2001 From: lauraporta Date: Wed, 4 Dec 2024 11:55:09 +0000 Subject: [PATCH 28/37] Add working snakemake rule for preprocessing --- Snakefile | 19 +++++++++++++++ .../core/rules/preprocess.py | 24 +++++++++++++++++++ .../core/rules/setup.py | 15 +++++++++++- 3 files changed, 57 insertions(+), 1 deletion(-) create mode 100644 calcium_imaging_automation/core/rules/preprocess.py diff --git a/Snakefile b/Snakefile index 5cb7849..df34677 100644 --- a/Snakefile +++ b/Snakefile @@ -4,3 +4,22 @@ rule setup: writing_path="/ceph/margrie/laura/cimaut/", output: "setup_output.txt" shell: "python calcium_imaging_automation/core/rules/setup.py {input.datasets_path} {input.writing_path} --folder_read_pattern '2*' --file_read_pattern 'rotation_00001.tif' --file_read_pattern '*.bin' > {output}" + +import pandas as pd + +paths = pd.read_csv("datasets.csv") + +rule all: + input: + expand("preprocess_output_{index}.txt", index=paths["index"]) + +rule preprocess: + input: + lambda wildcards: paths.loc[int(wildcards.index), "read_dataset_path"], + lambda wildcards: paths.loc[int(wildcards.index), "write_dataset_path"], + output: + "preprocess_output_{index}.txt" + params: + index=lambda wildcards: wildcards.index + script: + "calcium_imaging_automation/core/rules/preprocess.py" diff --git a/calcium_imaging_automation/core/rules/preprocess.py b/calcium_imaging_automation/core/rules/preprocess.py new file mode 100644 index 0000000..d45d7fd --- /dev/null +++ b/calcium_imaging_automation/core/rules/preprocess.py @@ -0,0 +1,24 @@ +from pathlib import Path + 
+from derotation.analysis.metrics import stability_of_most_detected_blob +from derotation.derotate_batch import derotate +from snakemake.script import snakemake + +try: + # Input arguments + read_dataset_path = Path(snakemake.input[0]) + write_dataset_path = Path(snakemake.input[1]) + output = snakemake.output[0] + + # os.system("module load miniconda") + # os.system("source activate /nfs/nhome/live/lporta/.conda/envs/cimat") + output_path_dataset = write_dataset_path / "ses-0/funcimg/" + + data = derotate(read_dataset_path, output_path_dataset) + metric_measured = stability_of_most_detected_blob(data) + with open(output, "w") as f: + f.write(f"dataset: {read_dataset_path.stem} metric: {metric_measured}") +except Exception as e: + print(e.args) + with open(output, "w") as f: + f.write(str(e.args)) diff --git a/calcium_imaging_automation/core/rules/setup.py b/calcium_imaging_automation/core/rules/setup.py index 446cc66..98a88da 100644 --- a/calcium_imaging_automation/core/rules/setup.py +++ b/calcium_imaging_automation/core/rules/setup.py @@ -2,6 +2,8 @@ import shutil from pathlib import Path +import pandas as pd + from calcium_imaging_automation.core.reader import ReadAquiredData from calcium_imaging_automation.core.writer import DatashuttleWrapper @@ -14,6 +16,7 @@ def setup(raw_data_path, folder_read_pattern, file_read_pattern, output_path): print("No derivatives folder found") print(f"Reading data from {raw_data_path}") + reader = ReadAquiredData( raw_data_path, folder_read_pattern, @@ -25,10 +28,20 @@ def setup(raw_data_path, folder_read_pattern, file_read_pattern, output_path): print(f"Max of tiffs found: {number_of_tiffs}") writer = DatashuttleWrapper(output_path) - print(f"Dataset names: {reader.dataset_names}") writer.create_folders(reader.dataset_names, session_number=number_of_tiffs) print("Folders created") + datasets = pd.DataFrame( + { + "read_dataset_path": reader.datasets_paths, + "write_dataset_path": [ + writer.get_dataset_path(dt.stem) + for dt in reader.datasets_paths + ], + } + ) + datasets.to_csv("datasets.csv", index=True, index_label="index") + if __name__ == "__main__": parser = argparse.ArgumentParser( From 4499d50d56ae5cbe491aaf78e7bfa4db72e68a00 Mon Sep 17 00:00:00 2001 From: lauraporta Date: Wed, 4 Dec 2024 17:14:29 +0000 Subject: [PATCH 29/37] =?UTF-8?q?Run=20datasets=20on=20the=20cluster=20?= =?UTF-8?q?=E2=9C=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 24 ++++++++++++++++++++++++ Snakefile | 6 ++++++ 2 files changed, 30 insertions(+) diff --git a/README.md b/README.md index 8175b08..128e7cc 100644 --- a/README.md +++ b/README.md @@ -1 +1,25 @@ # calcium-imaging-automation + +CIMAT simplifies the analysis of multi-photon calcium imaging data by integrating algorithms from tools like Suite2p and Caiman into a modular Snakemake pipeline. Researchers can evaluate, compare, and combine methods for each processing step, such as registration or source extraction, and explore metrics to identify the best fit for their datasets. + +With support for local or cluster-based parallelization, CIMAT provides visualization tools, reports, and guides to streamline decision-making and enhance reproducibility. 
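The modular design described above is what the later patches in this series implement concretely: a `setup` rule writes a `datasets.csv` of per-dataset input/output paths, and a `preprocess` rule runs a small script on each dataset. As a minimal sketch of that per-dataset step — reusing only the `derotate` and `stability_of_most_detected_blob` imports that appear in `calcium_imaging_automation/core/rules/preprocess.py` — the `run_step` wrapper and the explicit `mkdir` below are illustrative additions, not functions from the repository:

```python
from pathlib import Path

from derotation.analysis.metrics import stability_of_most_detected_blob
from derotation.derotate_batch import derotate


def run_step(read_dataset_path: Path, write_dataset_path: Path) -> float:
    """Run one preprocessing + metric step for a single dataset (sketch)."""
    # Derived data go under the ses-0/funcimg/ layout used in these patches.
    output_path_dataset = write_dataset_path / "ses-0" / "funcimg"
    output_path_dataset.mkdir(parents=True, exist_ok=True)

    # Preprocessing step: derotation here; any function taking
    # (input_path, output_path) could be swapped into this slot.
    data = derotate(read_dataset_path, output_path_dataset)

    # Metric used to compare alternative preprocessing choices.
    metric = stability_of_most_detected_blob(data)
    (output_path_dataset / "metric.txt").write_text(
        f"dataset: {read_dataset_path.stem} metric: {metric}"
    )
    return metric
```

In the Snakemake workflow the same logic is driven by the `preprocess` rule, which looks up `read_dataset_path` and `write_dataset_path` for each index in `datasets.csv`, so comparing methods amounts to pointing the rule script at a different preprocessing function or metric.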
+ +## Installation + + +### Run workflow with Snakemake +To extract dataset names +```bash +snakemake --cores 1 setup_output.txt +``` + + +To run preprocessing with slurm, use the following command for one dataset: +```bash +snakemake --executor slurm --jobs 20 --latency-wait 10 preprocess_output_0.txt +``` +For an array of datasets: +```bash +snakemake --executor slurm --jobs 20 --latency-wait 10 preprocess_output_{0..N}.txt +``` +Replace N with the number of datasets you have in the `datasets.csv` file. diff --git a/Snakefile b/Snakefile index df34677..19c6d7c 100644 --- a/Snakefile +++ b/Snakefile @@ -21,5 +21,11 @@ rule preprocess: "preprocess_output_{index}.txt" params: index=lambda wildcards: wildcards.index + resources: + partition="fast", + mem_mb=16000, + cpu_per_task=1, + tasks=1, + nodes=1, script: "calcium_imaging_automation/core/rules/preprocess.py" From 5a892140dbcc169d12d8021b793954131652987e Mon Sep 17 00:00:00 2001 From: lauraporta Date: Wed, 4 Dec 2024 18:12:22 +0000 Subject: [PATCH 30/37] Update debugging script --- examples/debugging.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/examples/debugging.py b/examples/debugging.py index 47e953f..c0fca25 100644 --- a/examples/debugging.py +++ b/examples/debugging.py @@ -1,12 +1,26 @@ +import shutil from pathlib import Path -from calcium_imaging_automation.core.pipeline import pipeline +from derotation.analysis.metrics import stability_of_most_detected_blob +from derotation.derotate_batch import derotate -pipeline( +from calcium_imaging_automation.core.pipeline import orchestrator + +try: + shutil.rmtree("/ceph/margrie/laura/cimaut/derivatives/") + shutil.rmtree("/ceph/margrie/laura/cimaut/submitit/") +except FileNotFoundError: + print("No derivatives folder found") + +orchestrator( raw_data_path=Path( "/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/" ), output_path=Path("/ceph/margrie/laura/cimaut/"), folder_read_pattern="2*", file_read_pattern=["rotation_00001.tif", "*.bin"], + experiment_name="submitit_04", + preprocessing_function=derotate, + compute_metric=stability_of_most_detected_blob, + # suite2p_ops_path="/ceph/margrie/laura/derotation/suite2p/laura_ops.npy", ) From 8a47b8f6293c21cda4dd5c56c1b2fb254de10972 Mon Sep 17 00:00:00 2001 From: lauraporta Date: Wed, 4 Dec 2024 18:16:59 +0000 Subject: [PATCH 31/37] Move snakefile in workflow folder --- MANIFEST.in | 2 +- Snakefile => workflow/Snakefile | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename Snakefile => workflow/Snakefile (100%) diff --git a/MANIFEST.in b/MANIFEST.in index 699c0d3..bb8163b 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,6 +1,6 @@ include LICENSE include README.md -include Snakefile +include workflow/Snakefile exclude .pre-commit-config.yaml recursive-include calcium_imaging_automation *.py diff --git a/Snakefile b/workflow/Snakefile similarity index 100% rename from Snakefile rename to workflow/Snakefile From 0e3821b5520d1e7e52731945739fe680dfca44f0 Mon Sep 17 00:00:00 2001 From: lauraporta Date: Wed, 4 Dec 2024 18:17:31 +0000 Subject: [PATCH 32/37] Remove outdated scripts --- calcium_imaging_automation/core/app.py | 50 ------- calcium_imaging_automation/core/pipeline.py | 138 -------------------- examples/debugging.py | 26 ---- 3 files changed, 214 deletions(-) delete mode 100644 calcium_imaging_automation/core/app.py delete mode 100644 calcium_imaging_automation/core/pipeline.py delete mode 100644 examples/debugging.py diff --git 
a/calcium_imaging_automation/core/app.py b/calcium_imaging_automation/core/app.py deleted file mode 100644 index 6ca2b60..0000000 --- a/calcium_imaging_automation/core/app.py +++ /dev/null @@ -1,50 +0,0 @@ -import argparse -from pathlib import Path - -from calcium_imaging_automation.core.pipeline import orchestrator - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Example usage of the pipeline manager." - ) - - parser.add_argument( - "raw_data_path", type=Path, help="Path to the raw data." - ) - parser.add_argument( - "output_path", type=Path, help="Path to the output data." - ) - parser.add_argument( - "--folder_read_pattern", - type=str, - help="Glob pattern for reading folder.", - default="*", - ) - parser.add_argument( - "--file_read_pattern", - type=str, - help="List of glob patterns for reading files.", - action="append", - ) - parser.add_argument( - "--experiment_name", - type=str, - help="Name of the experiment.", - default="pipeline_test", - ) - parser.add_argument( - "--compute_metric", - type=Path, - help="Path to the suite2p ops file.", - ) - - args = parser.parse_args() - - orchestrator( - args.raw_data_path, - args.output_path, - args.folder_read_pattern, - args.file_read_pattern, - args.experiment_name, - args.compute_metric, - ) diff --git a/calcium_imaging_automation/core/pipeline.py b/calcium_imaging_automation/core/pipeline.py deleted file mode 100644 index 0270c9a..0000000 --- a/calcium_imaging_automation/core/pipeline.py +++ /dev/null @@ -1,138 +0,0 @@ -import datetime -import logging -import time -from pathlib import Path -from typing import Callable, List - -import pandas as pd -import submitit -from submitit import AutoExecutor - -from calcium_imaging_automation.core.reader import ReadAquiredData -from calcium_imaging_automation.core.writer import DatashuttleWrapper - - -def orchestrator( - raw_data_path: Path, - output_path: Path, - folder_read_pattern: str, - file_read_pattern: List[str], - preprocessing_function: Callable, - compute_metric: Callable, - experiment_name: str = "pipeline_test", -): - # --- Setup logging and MLflow --- - logging_setup(output_path) - - # mkdir for submitit logs submitit / timestamp - (output_path / "submitit").mkdir(exist_ok=True) - - # --- Read folders and files --- - reader = ReadAquiredData( - raw_data_path, - folder_read_pattern, - file_read_pattern, - ) - logging.info(f"Found {len(reader.datasets_paths)} datasets.") - logging.info(f"Dataset names: {reader.dataset_names}") - - number_of_tiffs = reader.max_session_number(filetype="tif") - logging.info(f"Max of tiffs found: {number_of_tiffs}") - - # --- Write folders and files --- - writer = DatashuttleWrapper(output_path) - writer.create_folders(reader.dataset_names, session_number=number_of_tiffs) - - # --- Start processing --- - results, errors = launch_job_array( - datasets=reader.datasets_paths, - output_path=output_path, - analysis_pipeline=analysis_pipeline, - writer=writer, - preprocessing_function=preprocessing_function, - compute_metric=compute_metric, - ) - - # save the results and errors as csv - results_df = pd.DataFrame(results) - results_df.to_csv(output_path / "results.csv") - errors_df = pd.DataFrame(errors) - errors_df.to_csv(output_path / "errors.csv") - - logging.info("Pipeline finished.") - - -def launch_job_array( - datasets, - output_path, - analysis_pipeline, - writer, - preprocessing_function, - compute_metric, -): - executor = AutoExecutor(folder=output_path / "submitit") - executor.update_parameters( - timeout_min=30, - 
slurm_partition="fast", - cpus_per_task=1, - tasks_per_node=1, - slurm_mem="16G", - slurm_array_parallelism=20, - ) - - logging.info(f"Running {len(datasets)} jobs.") - jobs = executor.map_array( - analysis_pipeline, - datasets, - [writer.get_dataset_path(dataset.stem) for dataset in datasets], - [preprocessing_function] * len(datasets), - [compute_metric] * len(datasets), - ) - - results = [] - errors = [] - for job in jobs: - while not job.done(): - time.sleep(10) - try: - results.append(job.result()) - errors.append(None) - except submitit.core.utils.FailedJobError as e: - logging.error(f"Job {job.job_id} failed: {e}") - results.append(None) - errors.append(job.stderr()) - - return results, errors - - -def analysis_pipeline( - dataset, output_path_dataset, preprocessing_function, compute_metric -): - import os - - os.system("module load miniconda") - os.system("source activate /nfs/nhome/live/lporta/.conda/envs/cimat") - output_path_dataset = output_path_dataset / "ses-0/funcimg/" - try: - data = preprocessing_function(dataset, output_path_dataset) - metric_measured = compute_metric(data) - with open(output_path_dataset / "metric.txt", "w") as f: - f.write(str(metric_measured)) - except Exception as e: - with open(output_path_dataset / "error.txt", "w") as f: - f.write(str(e.args)) - return metric_measured - - -def logging_setup(output_path: Path): - # --- Setup experiment-wide logging to file --- - (output_path / "logs").mkdir(exist_ok=True) - logging.basicConfig( - filename=str( - output_path - / "logs" - / f"{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}.log" - ), - level=logging.INFO, - format="%(asctime)s - %(message)s", - ) diff --git a/examples/debugging.py b/examples/debugging.py deleted file mode 100644 index c0fca25..0000000 --- a/examples/debugging.py +++ /dev/null @@ -1,26 +0,0 @@ -import shutil -from pathlib import Path - -from derotation.analysis.metrics import stability_of_most_detected_blob -from derotation.derotate_batch import derotate - -from calcium_imaging_automation.core.pipeline import orchestrator - -try: - shutil.rmtree("/ceph/margrie/laura/cimaut/derivatives/") - shutil.rmtree("/ceph/margrie/laura/cimaut/submitit/") -except FileNotFoundError: - print("No derivatives folder found") - -orchestrator( - raw_data_path=Path( - "/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/" - ), - output_path=Path("/ceph/margrie/laura/cimaut/"), - folder_read_pattern="2*", - file_read_pattern=["rotation_00001.tif", "*.bin"], - experiment_name="submitit_04", - preprocessing_function=derotate, - compute_metric=stability_of_most_detected_blob, - # suite2p_ops_path="/ceph/margrie/laura/derotation/suite2p/laura_ops.npy", -) From 76ff8b7f68a2ca96d4175b164abc6a6308ea4056 Mon Sep 17 00:00:00 2001 From: lauraporta Date: Wed, 4 Dec 2024 18:56:39 +0000 Subject: [PATCH 33/37] =?UTF-8?q?WIP:=20changing=20script=20=F0=9F=90=9B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- _datasets.csv | 21 +++++++ .../core/rules/preprocess.py | 2 - .../core/rules/setup.py | 63 ++++++------------- workflow/Snakefile | 56 +++++++++-------- 4 files changed, 69 insertions(+), 73 deletions(-) create mode 100644 _datasets.csv diff --git a/_datasets.csv b/_datasets.csv new file mode 100644 index 0000000..c9c0036 --- /dev/null +++ b/_datasets.csv @@ -0,0 +1,21 @@ +index,read_dataset_path,write_dataset_path 
+0,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230804_CAA_1119917,/ceph/margrie/laura/cimaut/derivatives/sub-0_230804_CAA_1119917 +1,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230818_CAA_1120210,/ceph/margrie/laura/cimaut/derivatives/sub-1_230818_CAA_1120210 +2,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230803_CAA_1119915,/ceph/margrie/laura/cimaut/derivatives/sub-2_230803_CAA_1119915 +3,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230801_CAA_1120181,/ceph/margrie/laura/cimaut/derivatives/sub-3_230801_CAA_1120181 +4,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230802_CAA_1120182,/ceph/margrie/laura/cimaut/derivatives/sub-4_230802_CAA_1120182 +5,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230803_CAA_1120181,/ceph/margrie/laura/cimaut/derivatives/sub-5_230803_CAA_1120181 +6,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230822_CAA_1120509,/ceph/margrie/laura/cimaut/derivatives/sub-6_230822_CAA_1120509 +7,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230823_CAA_1120181,/ceph/margrie/laura/cimaut/derivatives/sub-7_230823_CAA_1120181 +8,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230824_CAA_1119915,/ceph/margrie/laura/cimaut/derivatives/sub-8_230824_CAA_1119915 +9,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230825_CAA_1120182,/ceph/margrie/laura/cimaut/derivatives/sub-9_230825_CAA_1120182 +10,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230905_CAA_1119917,/ceph/margrie/laura/cimaut/derivatives/sub-10_230905_CAA_1119917 +11,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230907_CAA_1120210,/ceph/margrie/laura/cimaut/derivatives/sub-11_230907_CAA_1120210 +12,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230907_CAA_1120509,/ceph/margrie/laura/cimaut/derivatives/sub-12_230907_CAA_1120509 +13,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230912_CAA_1119915,/ceph/margrie/laura/cimaut/derivatives/sub-13_230912_CAA_1119915 +14,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230912_CAA_1120051,/ceph/margrie/laura/cimaut/derivatives/sub-14_230912_CAA_1120051 +15,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230913_CAA_1120182,/ceph/margrie/laura/cimaut/derivatives/sub-15_230913_CAA_1120182 +16,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230913_CAA_1120395,/ceph/margrie/laura/cimaut/derivatives/sub-16_230913_CAA_1120395 +17,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230914_CAA_1120181,/ceph/margrie/laura/cimaut/derivatives/sub-17_230914_CAA_1120181 +18,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230914_CAA_1120210,/ceph/margrie/laura/cimaut/derivatives/sub-18_230914_CAA_1120210 +19,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230915_CAA_1120509,/ceph/margrie/laura/cimaut/derivatives/sub-19_230915_CAA_1120509 diff --git a/calcium_imaging_automation/core/rules/preprocess.py b/calcium_imaging_automation/core/rules/preprocess.py index d45d7fd..e9dc3b1 100644 --- a/calcium_imaging_automation/core/rules/preprocess.py +++ 
b/calcium_imaging_automation/core/rules/preprocess.py @@ -10,8 +10,6 @@ write_dataset_path = Path(snakemake.input[1]) output = snakemake.output[0] - # os.system("module load miniconda") - # os.system("source activate /nfs/nhome/live/lporta/.conda/envs/cimat") output_path_dataset = write_dataset_path / "ses-0/funcimg/" data = derotate(read_dataset_path, output_path_dataset) diff --git a/calcium_imaging_automation/core/rules/setup.py b/calcium_imaging_automation/core/rules/setup.py index 98a88da..56f1835 100644 --- a/calcium_imaging_automation/core/rules/setup.py +++ b/calcium_imaging_automation/core/rules/setup.py @@ -6,19 +6,27 @@ from calcium_imaging_automation.core.reader import ReadAquiredData from calcium_imaging_automation.core.writer import DatashuttleWrapper +from snakemake.script import snakemake -def setup(raw_data_path, folder_read_pattern, file_read_pattern, output_path): +try: + read_dataset_path = Path(snakemake.input[0]) + write_dataset_path = Path(snakemake.input[1]) + folder_read_pattern = snakemake.params.folder_read_pattern + file_read_pattern = snakemake.params.file_read_pattern + + output = snakemake.output[0] + try: shutil.rmtree("/ceph/margrie/laura/cimaut/derivatives/") shutil.rmtree("/ceph/margrie/laura/cimaut/submitit/") except FileNotFoundError: print("No derivatives folder found") - print(f"Reading data from {raw_data_path}") + print(f"Reading data from {read_dataset_path}") reader = ReadAquiredData( - raw_data_path, + read_dataset_path, folder_read_pattern, file_read_pattern, ) @@ -27,7 +35,7 @@ def setup(raw_data_path, folder_read_pattern, file_read_pattern, output_path): number_of_tiffs = reader.max_session_number(filetype="tif") print(f"Max of tiffs found: {number_of_tiffs}") - writer = DatashuttleWrapper(output_path) + writer = DatashuttleWrapper(write_dataset_path) writer.create_folders(reader.dataset_names, session_number=number_of_tiffs) print("Folders created") @@ -40,44 +48,9 @@ def setup(raw_data_path, folder_read_pattern, file_read_pattern, output_path): ], } ) - datasets.to_csv("datasets.csv", index=True, index_label="index") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Example usage of the pipeline manager." - ) - - parser.add_argument( - "raw_data_path", type=Path, help="Path to the raw data." - ) - parser.add_argument( - "output_path", type=Path, help="Path to the output data." 
- ) - parser.add_argument( - "--folder_read_pattern", - type=str, - help="Glob pattern for reading folder.", - default="*", - ) - parser.add_argument( - "--file_read_pattern", - type=str, - help="List of glob patterns for reading files.", - action="append", - ) - - args = parser.parse_args() - - try: - setup( - args.raw_data_path, - args.folder_read_pattern, - args.file_read_pattern, - args.output_path, - ) - - print("Success") - except Exception as e: - print(f"Error: {e.args}") - print(e.with_traceback(e.__traceback__)) + datasets.to_csv(output, index=True, index_label="index") + +except Exception as e: + print(e.args) + with open(output, "w") as f: + f.write(str(e.args)) diff --git a/workflow/Snakefile b/workflow/Snakefile index 19c6d7c..e817e52 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -1,31 +1,35 @@ rule setup: input: - datasets_path="/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/", - writing_path="/ceph/margrie/laura/cimaut/", - output: "setup_output.txt" - shell: "python calcium_imaging_automation/core/rules/setup.py {input.datasets_path} {input.writing_path} --folder_read_pattern '2*' --file_read_pattern 'rotation_00001.tif' --file_read_pattern '*.bin' > {output}" - -import pandas as pd + "/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/", + "/ceph/margrie/laura/cimaut/", + params: + folder_read_pattern="2*", + file_read_pattern=["rotation_00001.tif", "*.bin"], + output: "datasets.csv" + run: + "calcium_imaging_automation/core/rules/setup.py" + +# import pandas as pd -paths = pd.read_csv("datasets.csv") +# paths = pd.read_csv("datasets.csv") -rule all: - input: - expand("preprocess_output_{index}.txt", index=paths["index"]) +# rule all: +# input: +# expand("preprocess_output_{index}.txt", index=paths["index"]) -rule preprocess: - input: - lambda wildcards: paths.loc[int(wildcards.index), "read_dataset_path"], - lambda wildcards: paths.loc[int(wildcards.index), "write_dataset_path"], - output: - "preprocess_output_{index}.txt" - params: - index=lambda wildcards: wildcards.index - resources: - partition="fast", - mem_mb=16000, - cpu_per_task=1, - tasks=1, - nodes=1, - script: - "calcium_imaging_automation/core/rules/preprocess.py" +# rule preprocess: +# input: +# lambda wildcards: paths.loc[int(wildcards.index), "read_dataset_path"], +# lambda wildcards: paths.loc[int(wildcards.index), "write_dataset_path"], +# output: +# "preprocess_output_{index}.txt" +# params: +# index=lambda wildcards: wildcards.index +# resources: +# partition="fast", +# mem_mb=16000, +# cpu_per_task=1, +# tasks=1, +# nodes=1, +# script: +# "calcium_imaging_automation/core/rules/preprocess.py" From 93cfd206e4d681d90b0a2c65075e0fce12fcc4d3 Mon Sep 17 00:00:00 2001 From: lauraporta Date: Tue, 10 Dec 2024 15:50:27 +0000 Subject: [PATCH 34/37] Make via snakemake what all the other classes were doing --- README.md | 11 +- _datasets.csv | 21 --- calcium_imaging_automation/core/reader.py | 130 ------------------ .../core/rules/preprocess.py | 16 +-- .../core/rules/setup.py | 56 -------- calcium_imaging_automation/core/writer.py | 58 -------- workflow/Snakefile | 76 +++++----- 7 files changed, 54 insertions(+), 314 deletions(-) delete mode 100644 _datasets.csv delete mode 100644 calcium_imaging_automation/core/reader.py delete mode 100644 calcium_imaging_automation/core/rules/setup.py delete mode 100644 calcium_imaging_automation/core/writer.py diff --git a/README.md b/README.md index 128e7cc..4c81b3d 100644 --- a/README.md +++ 
b/README.md @@ -13,13 +13,8 @@ To extract dataset names snakemake --cores 1 setup_output.txt ``` - -To run preprocessing with slurm, use the following command for one dataset: -```bash -snakemake --executor slurm --jobs 20 --latency-wait 10 preprocess_output_0.txt -``` -For an array of datasets: +Run all jobs in the pipeline: ```bash -snakemake --executor slurm --jobs 20 --latency-wait 10 preprocess_output_{0..N}.txt +snakemake --executor slurm --jobs 20 --latency-wait 10 all ``` -Replace N with the number of datasets you have in the `datasets.csv` file. +Add `-np --printshellcmds` for a dry run with commands printed to the terminal. diff --git a/_datasets.csv b/_datasets.csv deleted file mode 100644 index c9c0036..0000000 --- a/_datasets.csv +++ /dev/null @@ -1,21 +0,0 @@ -index,read_dataset_path,write_dataset_path -0,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230804_CAA_1119917,/ceph/margrie/laura/cimaut/derivatives/sub-0_230804_CAA_1119917 -1,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230818_CAA_1120210,/ceph/margrie/laura/cimaut/derivatives/sub-1_230818_CAA_1120210 -2,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230803_CAA_1119915,/ceph/margrie/laura/cimaut/derivatives/sub-2_230803_CAA_1119915 -3,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230801_CAA_1120181,/ceph/margrie/laura/cimaut/derivatives/sub-3_230801_CAA_1120181 -4,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230802_CAA_1120182,/ceph/margrie/laura/cimaut/derivatives/sub-4_230802_CAA_1120182 -5,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230803_CAA_1120181,/ceph/margrie/laura/cimaut/derivatives/sub-5_230803_CAA_1120181 -6,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230822_CAA_1120509,/ceph/margrie/laura/cimaut/derivatives/sub-6_230822_CAA_1120509 -7,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230823_CAA_1120181,/ceph/margrie/laura/cimaut/derivatives/sub-7_230823_CAA_1120181 -8,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230824_CAA_1119915,/ceph/margrie/laura/cimaut/derivatives/sub-8_230824_CAA_1119915 -9,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230825_CAA_1120182,/ceph/margrie/laura/cimaut/derivatives/sub-9_230825_CAA_1120182 -10,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230905_CAA_1119917,/ceph/margrie/laura/cimaut/derivatives/sub-10_230905_CAA_1119917 -11,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230907_CAA_1120210,/ceph/margrie/laura/cimaut/derivatives/sub-11_230907_CAA_1120210 -12,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230907_CAA_1120509,/ceph/margrie/laura/cimaut/derivatives/sub-12_230907_CAA_1120509 -13,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230912_CAA_1119915,/ceph/margrie/laura/cimaut/derivatives/sub-13_230912_CAA_1119915 -14,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230912_CAA_1120051,/ceph/margrie/laura/cimaut/derivatives/sub-14_230912_CAA_1120051 -15,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230913_CAA_1120182,/ceph/margrie/laura/cimaut/derivatives/sub-15_230913_CAA_1120182 
-16,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230913_CAA_1120395,/ceph/margrie/laura/cimaut/derivatives/sub-16_230913_CAA_1120395 -17,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230914_CAA_1120181,/ceph/margrie/laura/cimaut/derivatives/sub-17_230914_CAA_1120181 -18,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230914_CAA_1120210,/ceph/margrie/laura/cimaut/derivatives/sub-18_230914_CAA_1120210 -19,/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/230915_CAA_1120509,/ceph/margrie/laura/cimaut/derivatives/sub-19_230915_CAA_1120509 diff --git a/calcium_imaging_automation/core/reader.py b/calcium_imaging_automation/core/reader.py deleted file mode 100644 index f0fcb3c..0000000 --- a/calcium_imaging_automation/core/reader.py +++ /dev/null @@ -1,130 +0,0 @@ -from pathlib import Path -from typing import List - - -class ReadAquiredData: - def __init__( - self, - raw_data_folder: Path, - folder_read_pattern: str, - file_read_pattern: List[str], - ): - """ - Class to handle filepaths and dataset names in the raw data folder. - It can load folders and files based on the provided patterns, allowing - flexibility in the data structure of origin. - It also provides the maximum number of sessions for each dataset based - on the total number of files found in the dataset folders, by default - it searches for tif files. - - Parameters - ---------- - raw_data_folder : Path - The path to the raw data folder. - folder_read_pattern : str - The pattern to search for folders in the raw data folder. It - corresponds to the naming convention of the datasets. - file_read_pattern : List[str] - The patterns to search for files in the dataset folders. It - corresponds to the naming convention of the files in the dataset - folders. - """ - self.folder_read_pattern = folder_read_pattern - self.file_read_pattern = file_read_pattern - - self.datasets_paths = self.get_folders_first_layer(raw_data_folder) - self.dataset_names = [ - dataset_path.name for dataset_path in self.datasets_paths - ] - - def get_folders_first_layer(self, file_path: Path) -> List[Path]: - """ - Get the first layer of folders in the raw data folder. The rest - of the class assumes that the first layer of folders corresponds - to the dataset folders. - - Parameters - ---------- - file_path : Path - The path to the raw data folder. - - Returns - ------- - List[Path] - The list of paths to the dataset folders. - """ - return list(file_path.glob(self.folder_read_pattern)) - - def get_files_paths_by_format( - self, folder: Path, filetype="tif" - ) -> List[Path]: - """ - Get the paths to the files in the dataset folders based on the - provided file type. By default, it searches for tif files. - - Parameters - ---------- - folder : Path - The path to the dataset folder. - filetype : str, optional - The file type to search for in the dataset folder, by default - "tif". - - Returns - ------- - List[Path] - The list of paths to the files in the dataset folder. - """ - return list(folder.rglob(filetype)) - - def total_objects_by_extension(self, folder: Path) -> dict: - """ - Get the total number of files in the dataset folder based on the - extensions included in the file_read_pattern. - - Parameters - ---------- - folder : Path - The path to the dataset folder. - - Returns - ------- - dict - The dictionary with the number of files for each extension in the - patterns found in file_read_pattern. 
- """ - - return { - filetype.split(".")[-1]: len( - self.get_files_paths_by_format(folder, filetype) - ) - for filetype in self.file_read_pattern - } - - def max_session_number(self, filetype="tif", max_allowed=1) -> int: - """ - Get the maximum number of sessions for each dataset based on the total - number of files found in the dataset folders. By default, it searches - for tif files and allows a maximum of 5 sessions. It assumes that every - tif file corresponds to an experimental session. - - Parameters - ---------- - filetype : str, optional - The file type to search for in the dataset folder, by default - "tif". - max_allowed : int, optional - The maximum number of sessions allowed, by default 5. - - Returns - ------- - int - The maximum number of sessions for each dataset. - """ - - total_tif_number = [ - self.total_objects_by_extension(dataset_path).get(filetype, 0) - for dataset_path in self.datasets_paths - ] - - return min(max(total_tif_number), max_allowed) diff --git a/calcium_imaging_automation/core/rules/preprocess.py b/calcium_imaging_automation/core/rules/preprocess.py index e9dc3b1..66ddb5c 100644 --- a/calcium_imaging_automation/core/rules/preprocess.py +++ b/calcium_imaging_automation/core/rules/preprocess.py @@ -4,19 +4,17 @@ from derotation.derotate_batch import derotate from snakemake.script import snakemake -try: - # Input arguments - read_dataset_path = Path(snakemake.input[0]) - write_dataset_path = Path(snakemake.input[1]) - output = snakemake.output[0] - - output_path_dataset = write_dataset_path / "ses-0/funcimg/" +# Input arguments +read_dataset_path = Path(snakemake.input[0]) +output_tif = Path(snakemake.output[0]) +output_path_dataset = output_tif.parent.parent +try: data = derotate(read_dataset_path, output_path_dataset) metric_measured = stability_of_most_detected_blob(data) - with open(output, "w") as f: + with open(output_path_dataset / "metric.txt", "w") as f: f.write(f"dataset: {read_dataset_path.stem} metric: {metric_measured}") except Exception as e: print(e.args) - with open(output, "w") as f: + with open(output_path_dataset / "error.txt", "w") as f: f.write(str(e.args)) diff --git a/calcium_imaging_automation/core/rules/setup.py b/calcium_imaging_automation/core/rules/setup.py deleted file mode 100644 index 56f1835..0000000 --- a/calcium_imaging_automation/core/rules/setup.py +++ /dev/null @@ -1,56 +0,0 @@ -import argparse -import shutil -from pathlib import Path - -import pandas as pd - -from calcium_imaging_automation.core.reader import ReadAquiredData -from calcium_imaging_automation.core.writer import DatashuttleWrapper -from snakemake.script import snakemake - - -try: - read_dataset_path = Path(snakemake.input[0]) - write_dataset_path = Path(snakemake.input[1]) - folder_read_pattern = snakemake.params.folder_read_pattern - file_read_pattern = snakemake.params.file_read_pattern - - output = snakemake.output[0] - - try: - shutil.rmtree("/ceph/margrie/laura/cimaut/derivatives/") - shutil.rmtree("/ceph/margrie/laura/cimaut/submitit/") - except FileNotFoundError: - print("No derivatives folder found") - - print(f"Reading data from {read_dataset_path}") - - reader = ReadAquiredData( - read_dataset_path, - folder_read_pattern, - file_read_pattern, - ) - print(f"Found {len(reader.datasets_paths)} datasets.") - - number_of_tiffs = reader.max_session_number(filetype="tif") - print(f"Max of tiffs found: {number_of_tiffs}") - - writer = DatashuttleWrapper(write_dataset_path) - writer.create_folders(reader.dataset_names, session_number=number_of_tiffs) - 
print("Folders created") - - datasets = pd.DataFrame( - { - "read_dataset_path": reader.datasets_paths, - "write_dataset_path": [ - writer.get_dataset_path(dt.stem) - for dt in reader.datasets_paths - ], - } - ) - datasets.to_csv(output, index=True, index_label="index") - -except Exception as e: - print(e.args) - with open(output, "w") as f: - f.write(str(e.args)) diff --git a/calcium_imaging_automation/core/writer.py b/calcium_imaging_automation/core/writer.py deleted file mode 100644 index 6e713c4..0000000 --- a/calcium_imaging_automation/core/writer.py +++ /dev/null @@ -1,58 +0,0 @@ -from pathlib import Path -from typing import Dict, List - -import numpy as np -from datashuttle.configs.config_class import Configs -from datashuttle.utils import folders -from PIL import Image - - -class DatashuttleWrapper: - def __init__(self, output_path: Path) -> None: - # This is supposed to run in the cluster and have direct access - # to the central storages - self.output_path = output_path - self.datashuttle_cfg = Configs( - project_name=output_path.name, - file_path=output_path, - input_dict={ - "local_path": output_path, - "central_path": "", - "connection_method": "local_filesystem", - }, - ) - - def create_folders(self, dataset_names: List[str], session_number) -> None: - # all_paths is a dictionary with keys: sub, ses - self.all_paths: Dict[str, List[Path]] = folders.create_folder_trees( - cfg=self.datashuttle_cfg, - top_level_folder="derivatives", - sub_names=[ - f"sub-{i}_{dataset_name}" - for i, dataset_name in enumerate(dataset_names) - ], - ses_names=[f"ses-{i}" for i in range(session_number)], - datatype="funcimg", - ) - - def get_dataset_path(self, dataset_name: str) -> Path: - return next( - (self.output_path / "derivatives").glob(f"*{dataset_name}*") - ) - - def save_image( - self, - image: np.ndarray, - dataset_name: str, - session_number: int, - filename: str, - ) -> Path: - path = self.get_dataset_path(dataset_name) - image = Image.fromarray(image).convert("L") - image_path = path / f"ses-{session_number}" / "funcimg" / f"{filename}" - image.save( - image_path, - mode="PNG", - ) - - return image_path diff --git a/workflow/Snakefile b/workflow/Snakefile index e817e52..27bc8b3 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -1,35 +1,47 @@ -rule setup: - input: - "/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/", - "/ceph/margrie/laura/cimaut/", - params: - folder_read_pattern="2*", - file_read_pattern=["rotation_00001.tif", "*.bin"], - output: "datasets.csv" - run: - "calcium_imaging_automation/core/rules/setup.py" - -# import pandas as pd +# Base paths +raw_data_base = "/nfs/winstor/margrie/SimonWeiler/RawData/Invivo_imaging/3photon_rotation/shared/" +processed_data_base = "/ceph/margrie/laura/cimaut/derivatives" + +# Dynamically discover folders matching the "2*" pattern +datasets = glob_wildcards(f"{raw_data_base}{{dataset}}").dataset +datasets = [ds for ds in datasets if ds.startswith("2")] +datasets = [ds.split("/")[0] for ds in datasets] +datasets = list(set(datasets)) +datasets.sort() -# paths = pd.read_csv("datasets.csv") +# for the output +datasets_no_underscore = [ds.replace("_", "") for ds in datasets] -# rule all: -# input: -# expand("preprocess_output_{index}.txt", index=paths["index"]) +# Final state of the pipeline +# Are all the outputs files present? 
+rule all: + input: + expand( + [ + f"{processed_data_base}/sub-{{index}}_{{datasets_no_underscore}}/ses-0/funcimg/derotation/derotated_full.tif", + f"{processed_data_base}/sub-{{index}}_{{datasets_no_underscore}}/ses-0/funcimg/derotation/derotated_full.csv", + ], + zip, + index=range(len(datasets)), + datasets_no_underscore=datasets_no_underscore, + ) -# rule preprocess: -# input: -# lambda wildcards: paths.loc[int(wildcards.index), "read_dataset_path"], -# lambda wildcards: paths.loc[int(wildcards.index), "write_dataset_path"], -# output: -# "preprocess_output_{index}.txt" -# params: -# index=lambda wildcards: wildcards.index -# resources: -# partition="fast", -# mem_mb=16000, -# cpu_per_task=1, -# tasks=1, -# nodes=1, -# script: -# "calcium_imaging_automation/core/rules/preprocess.py" +rule preprocess: + input: + raw=lambda wildcards: f"{raw_data_base}{datasets[int(wildcards.index)]}/", + # Dynamically match input files using patterns + # bin=lambda wildcards: f"{raw_data_base}{datasets[int(wildcards.index)]}/aux_stim/*rotation_*001.bin", + # tif=lambda wildcards: f"{raw_data_base}{datasets[int(wildcards.index)]}/imaging/rotation_*001.tif", + output: + tiff=f"{processed_data_base}/sub-{{index}}_{{datasets_no_underscore}}/ses-0/funcimg/derotation/derotated_full.tif", + csv=f"{processed_data_base}/sub-{{index}}_{{datasets_no_underscore}}/ses-0/funcimg/derotation/derotated_full.csv", + params: + index=lambda wildcards: wildcards.index + resources: + partition="fast", + mem_mb=16000, + cpu_per_task=1, + tasks=1, + nodes=1, + script: + "../calcium_imaging_automation/core/rules/preprocess.py" From 724daad08096c8eb29d678a8094543669bc32eab Mon Sep 17 00:00:00 2001 From: lauraporta Date: Tue, 10 Dec 2024 15:53:30 +0000 Subject: [PATCH 35/37] Update pyproject.toml --- pyproject.toml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a9d5d73..c3013a9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,10 +7,9 @@ requires-python = ">=3.9.0" dynamic = ["version"] dependencies = [ - "datashuttle", - "setuptools_scm", "numpy", - "submitit", + "snakemake", + "snakemake-executor-plugin-slurm", ] license = {text = "BSD-3-Clause"} From 826792b9f2d5e2aa6f4b4b65810f2deaba811bba Mon Sep 17 00:00:00 2001 From: lauraporta Date: Wed, 11 Dec 2024 15:58:35 +0000 Subject: [PATCH 36/37] Generate report with datavzrd --- .gitignore | 3 ++ MANIFEST.in | 1 + README.md | 16 ++++++--- .../core/rules/preprocess.py | 13 +++++--- .../core/rules/summarize_data.py | 28 ++++++++++++++++ pyproject.toml | 2 ++ workflow/Snakefile | 33 ++++++++++++++++--- workflow/resources/datavzrd_config.yaml | 22 +++++++++++++ 8 files changed, 105 insertions(+), 13 deletions(-) create mode 100644 calcium_imaging_automation/core/rules/summarize_data.py create mode 100644 workflow/resources/datavzrd_config.yaml diff --git a/.gitignore b/.gitignore index ce92f74..6c02c04 100644 --- a/.gitignore +++ b/.gitignore @@ -87,3 +87,6 @@ examples/*.sh # snakemake .snakemake/* + +# datavzrd +workflow/results/ diff --git a/MANIFEST.in b/MANIFEST.in index bb8163b..c76a641 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -5,6 +5,7 @@ exclude .pre-commit-config.yaml recursive-include calcium_imaging_automation *.py recursive-include examples *.py +recursive-include workflow *.yaml recursive-exclude * __pycache__ recursive-exclude * *.py[co] diff --git a/README.md b/README.md index 4c81b3d..1d80ec8 100644 --- a/README.md +++ b/README.md @@ -8,13 +8,19 @@ With support for local or cluster-based 
parallelization, CIMAT provides visualiz ### Run workflow with Snakemake -To extract dataset names -```bash -snakemake --cores 1 setup_output.txt -``` - Run all jobs in the pipeline: ```bash snakemake --executor slurm --jobs 20 --latency-wait 10 all ``` Add `-np --printshellcmds` for a dry run with commands printed to the terminal. + +### See interactive report with datavzrd +Build the csv: +```bash +snakemake --cores 1 workflow/results/data/summary.csv +``` +Create the report: +```bash +datavzrd workflow/resources/datavzrd_config.yaml --output workflow/results/datavzrd +``` +Then open the report (`index.html`) in a browser. diff --git a/calcium_imaging_automation/core/rules/preprocess.py b/calcium_imaging_automation/core/rules/preprocess.py index 66ddb5c..27434fb 100644 --- a/calcium_imaging_automation/core/rules/preprocess.py +++ b/calcium_imaging_automation/core/rules/preprocess.py @@ -1,3 +1,4 @@ +import traceback from pathlib import Path from derotation.analysis.metrics import stability_of_most_detected_blob @@ -13,8 +14,12 @@ data = derotate(read_dataset_path, output_path_dataset) metric_measured = stability_of_most_detected_blob(data) with open(output_path_dataset / "metric.txt", "w") as f: - f.write(f"dataset: {read_dataset_path.stem} metric: {metric_measured}") -except Exception as e: - print(e.args) + f.write(f"stability_of_most_detected_blob: {metric_measured}") + # make empty error file with open(output_path_dataset / "error.txt", "w") as f: - f.write(str(e.args)) + f.write("") +except Exception: + with open(output_path_dataset / "error.txt", "w") as f: + f.write(traceback.format_exc()) + with open(output_path_dataset / "metric.txt", "w") as f: + f.write(f"dataset: {read_dataset_path.stem} metric: NaN") diff --git a/calcium_imaging_automation/core/rules/summarize_data.py b/calcium_imaging_automation/core/rules/summarize_data.py new file mode 100644 index 0000000..566d3c9 --- /dev/null +++ b/calcium_imaging_automation/core/rules/summarize_data.py @@ -0,0 +1,28 @@ +from pathlib import Path + +import pandas as pd +from snakemake.script import snakemake + +# Retrieve parameters and inputs from Snakemake +datasets = snakemake.params.datasets +processed_data_base = snakemake.params.base_path + +data = [] +for idx, dataset in enumerate(datasets): + metric_file = Path( + f"{processed_data_base}/sub-{idx}_{dataset}/ses-0/funcimg/metric.txt" + ) + error_file = Path( + f"{processed_data_base}/sub-{idx}_{dataset}/ses-0/funcimg/error.txt" + ) + + # Read metric and error values + metric = metric_file.read_text().strip() if metric_file.exists() else "N/A" + error = error_file.read_text().strip() if error_file.exists() else "N/A" + + # Append results + data.append({"Dataset": dataset, "Metric": metric, "Error": error}) + +# Create a DataFrame and write to CSV +df = pd.DataFrame(data) +df.to_csv(snakemake.output[0], index=False) diff --git a/pyproject.toml b/pyproject.toml index c3013a9..4043067 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,8 +8,10 @@ dynamic = ["version"] dependencies = [ "numpy", + "pandas", "snakemake", "snakemake-executor-plugin-slurm", + "datavzrd", ] license = {text = "BSD-3-Clause"} diff --git a/workflow/Snakefile b/workflow/Snakefile index 27bc8b3..677d5cd 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -12,6 +12,7 @@ datasets.sort() # for the output datasets_no_underscore = [ds.replace("_", "") for ds in datasets] +# ----------------------------------------------------- # Final state of the pipeline # Are all the outputs files present? 
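+# Each dataset writes a metric.txt and an error.txt; the summarize_data rule
+# later collects them into workflow/results/data/summary.csv for the report.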
rule all: @@ -20,19 +21,22 @@ rule all: [ f"{processed_data_base}/sub-{{index}}_{{datasets_no_underscore}}/ses-0/funcimg/derotation/derotated_full.tif", f"{processed_data_base}/sub-{{index}}_{{datasets_no_underscore}}/ses-0/funcimg/derotation/derotated_full.csv", + f"{processed_data_base}/sub-{{index}}_{{datasets_no_underscore}}/ses-0/funcimg/metric.txt", + f"{processed_data_base}/sub-{{index}}_{{datasets_no_underscore}}/ses-0/funcimg/error.txt", ], zip, index=range(len(datasets)), datasets_no_underscore=datasets_no_underscore, - ) + ), +# ----------------------------------------------------- +# Preprocess rule preprocess: input: raw=lambda wildcards: f"{raw_data_base}{datasets[int(wildcards.index)]}/", - # Dynamically match input files using patterns - # bin=lambda wildcards: f"{raw_data_base}{datasets[int(wildcards.index)]}/aux_stim/*rotation_*001.bin", - # tif=lambda wildcards: f"{raw_data_base}{datasets[int(wildcards.index)]}/imaging/rotation_*001.tif", output: + report(f"{processed_data_base}/sub-{{index}}_{{datasets_no_underscore}}/ses-0/funcimg/metric.txt"), + report(f"{processed_data_base}/sub-{{index}}_{{datasets_no_underscore}}/ses-0/funcimg/error.txt"), tiff=f"{processed_data_base}/sub-{{index}}_{{datasets_no_underscore}}/ses-0/funcimg/derotation/derotated_full.tif", csv=f"{processed_data_base}/sub-{{index}}_{{datasets_no_underscore}}/ses-0/funcimg/derotation/derotated_full.csv", params: @@ -45,3 +49,24 @@ rule preprocess: nodes=1, script: "../calcium_imaging_automation/core/rules/preprocess.py" + +# ----------------------------------------------------- +# Summarize data for datavzrd report +rule summarize_data: + input: + expand( + [ + f"{processed_data_base}/sub-{{index}}_{{dataset}}/ses-0/funcimg/metric.txt", + f"{processed_data_base}/sub-{{index}}_{{dataset}}/ses-0/funcimg/error.txt", + ], + zip, + index=range(len(datasets)), + dataset=datasets_no_underscore, + ) + output: + "workflow/results/data/summary.csv" + params: + datasets=datasets_no_underscore, + base_path=processed_data_base + script: + "../calcium_imaging_automation/core/rules/summarize_data.py" diff --git a/workflow/resources/datavzrd_config.yaml b/workflow/resources/datavzrd_config.yaml new file mode 100644 index 0000000..36388eb --- /dev/null +++ b/workflow/resources/datavzrd_config.yaml @@ -0,0 +1,22 @@ +datasets: + summary: + path: workflow/results/data/summary.csv + separator: "," + +views: + summary_view: + dataset: summary + render-table: + columns: + Dataset: + plot: + ticks: + scale: linear + Metric: + plot: + ticks: + scale: linear + Error: + plot: + ticks: + scale: linear From f8340ad946e710ca72ebd8eddaee0be7510b5845 Mon Sep 17 00:00:00 2001 From: lauraporta Date: Fri, 24 Jan 2025 15:08:44 +0000 Subject: [PATCH 37/37] Include all code necessary for end-to-end analysis - but it's messy --- README.md | 12 +- .../core/rules/plot_data.py | 315 +++++++++++++++++ .../core/rules/postprocess.py | 321 ++++++++++++++++++ .../core/rules/preprocess.py | 17 +- .../core/rules/suite2p_run.py | 54 +++ workflow/Snakefile | 80 ++++- 6 files changed, 781 insertions(+), 18 deletions(-) create mode 100644 calcium_imaging_automation/core/rules/plot_data.py create mode 100644 calcium_imaging_automation/core/rules/postprocess.py create mode 100644 calcium_imaging_automation/core/rules/suite2p_run.py diff --git a/README.md b/README.md index 1d80ec8..6583375 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ With support for local or cluster-based parallelization, CIMAT provides visualiz ### Run workflow with Snakemake 
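+These commands assume Snakemake >= 8 with the `snakemake-executor-plugin-slurm` dependency from `pyproject.toml` installed.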
 Run all jobs in the pipeline:
 ```bash
-snakemake --executor slurm --jobs 20 --latency-wait 10 all
+snakemake --executor slurm --jobs 20 --latency-wait 10 all --forcerun preprocess --rerun-incomplete
 ```
 Add `-np --printshellcmds` for a dry run with commands printed to the terminal.
@@ -24,3 +24,13 @@ Create the report:
 datavzrd workflow/resources/datavzrd_config.yaml --output workflow/results/datavzrd
 ```
 Then open the report (`index.html`) in a browser.
+
+To rerun a specific dataset:
+```bash
+snakemake --executor slurm --jobs 20 --latency-wait 10 /ceph/margrie/laura/cimaut/derivatives/sub-1_230802CAA1120182/ses-0/funcimg/derotation/derotated_full.tif --forcerun preprocess --rerun-incomplete
+```
+
+To generate the summary plot:
+```bash
+snakemake --cores 1 --latency-wait 10 workflow/results/data/stability_metric.png
+```
diff --git a/calcium_imaging_automation/core/rules/plot_data.py b/calcium_imaging_automation/core/rules/plot_data.py
new file mode 100644
index 0000000..0ab09ee
--- /dev/null
+++ b/calcium_imaging_automation/core/rules/plot_data.py
@@ -0,0 +1,315 @@
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+import seaborn as sns
+from matplotlib import pyplot as plt
+from snakemake.script import snakemake
+
+print("Plotting data...")
+
+
+# datasets = snakemake.params.datasets
+f_path = Path(snakemake.input[0])
+
+print(f"Processing file: {f_path}")
+
+dataset_path = f_path.parent.parent.parent.parent
+dataset = dataset_path.name
+f_neu_path = f_path.parent / "Fneu.npy"
+derotated_full_csv_path = (
+    dataset_path / "ses-0" / "funcimg" / "derotation" / "derotated_full.csv"
+)
+saving_path = Path(dataset_path) / "ses-0" / "traces"
+saving_path.mkdir(exist_ok=True)
+
+print(f"Dataset path: {dataset_path}")
+print(f"Dataset: {dataset}")
+print(f"Derotated full csv path: {derotated_full_csv_path}")
+print(f"Fneu path: {f_neu_path}")
+print(f"Saving path: {saving_path}")
+
+
+f = np.load(f_path)
+fneu = np.load(f_neu_path)
+rotated_frames = pd.read_csv(derotated_full_csv_path)
+f_corrected = f - 0.7 * fneu
+
+
+F_df = pd.DataFrame(f_corrected).T
+
+print(f"Shape of F_df: {F_df.shape}")
+print(F_df.head())
+
+full_dataframe = pd.concat([F_df, rotated_frames], axis=1)
+
+# --------------------------------------------------------
+# Prepare the dataset
+
+# find where rotations start
+rotation_on = np.diff(full_dataframe["rotation_count"])
+
+
+def find_zero_chunks(arr):
+    zero_chunks = []
+    start = None
+
+    for i in range(len(arr)):
+        if arr[i] == 0 and start is None:
+            start = i
+        elif arr[i] != 0 and start is not None:
+            zero_chunks.append((start, i - 1))
+            start = None
+
+    # Check if the array ends with a chunk of zeros
+    if start is not None:
+        zero_chunks.append((start, len(arr) - 1))
+
+    return zero_chunks
+
+
+starts_ends = find_zero_chunks(rotation_on)
+
+frames_before_rotation = 15
+# frames_after_rotation = 10
+
+total_len = 100
+
+full_dataframe["rotation_frames"] = np.zeros(len(full_dataframe))
+for i, (start, end) in enumerate(starts_ends):
+    frame_array = np.arange(total_len)
+    column_index_of_rotation_frames = full_dataframe.columns.get_loc(
+        "rotation_frames"
+    )
+    full_dataframe.iloc[
+        start - frames_before_rotation : total_len
+        + start
+        - frames_before_rotation,
+        column_index_of_rotation_frames,
+    ] = frame_array
+
+    # extend this value of speed and direction to all this range
+    this_speed = full_dataframe.loc[start, "speed"]
+    this_direction = full_dataframe.loc[start, "direction"]
+
+    full_dataframe.iloc[
+        start - frames_before_rotation : total_len
+        +
start + - frames_before_rotation, + full_dataframe.columns.get_loc("speed"), + ] = this_speed + full_dataframe.iloc[ + start - frames_before_rotation : total_len + + start + - frames_before_rotation, + full_dataframe.columns.get_loc("direction"), + ] = this_direction + + +# directtion, change -1 to CCW and 1 to CW +full_dataframe["direction"] = np.where( + full_dataframe["direction"] == -1, "CCW", "CW" +) + +# print(f"Full dataframe shape: {full_dataframe.shape}") +# print(full_dataframe.head()) + +# # angle based calculation of ΔF/F +# # first calculate F0, as the 20th quantile for each angle. +# # consider angles every 5 degrees, from 0 to 355 +# full_dataframe["aproximated_rotation_angle"] = ( +# full_dataframe["rotation_angle"] // 5 * 5 +# ) + +# print("Unique angles:") +# print(full_dataframe["aproximated_rotation_angle"].unique()) + +# f0_as_20th_quantile_per_angle = np.zeros((360, f_corrected.shape[0])) +# for angle in range(360): +# for roi in range(f_corrected.shape[0]): +# angle_indices = full_dataframe["aproximated_rotation_angle"] == angle +# print(f"Angle: {angle}, ROI: {roi}") +# print(f"Angle indices: {angle_indices}") +# # check for nans / missing values in angle_indices +# if angle_indices.isnull().values.any(): +# f0_as_20th_quantile_per_angle[angle, roi] = np.nan +# else: +# f0_as_20th_quantile_per_angle[angle, roi] = np.quantile( +# f_corrected[roi][angle_indices], 0.2 +# ) +# print("Shape of f0_as_20th_quantile_per_angle:") +# print(f0_as_20th_quantile_per_angle.shape) +# print(f0_as_20th_quantile_per_angle) + +# # calculate ΔF/F +# for roi in range(f_corrected.T.shape[0]): +# full_dataframe[roi] = ( +# f_corrected.T[roi] - f0_as_20th_quantile_per_angle[ +# full_dataframe["rotation_angle"], roi +# ] +# ) / f0_as_20th_quantile_per_angle[ +# full_dataframe["rotation_angle"], roi +# ] + +# print("Full dataframe with ΔF/F:") +# print(full_dataframe.head()) + +rois_selection = range(F_df.shape[1]) + +# -------------------------------------------------------- +# Plot single traces + +# %% +selected_range = (400, 2000) + +for roi in rois_selection: + roi_selected = full_dataframe.loc[ + :, [roi, "rotation_count", "speed", "direction"] + ] + + fig, ax = plt.subplots(figsize=(27, 5)) + ax.plot(roi_selected.loc[selected_range[0] : selected_range[1], roi]) + ax.set(xlabel="Frames", ylabel="Neuropil corrected (a.u.)") # "ΔF/F") + + rotation_on = ( + np.diff( + roi_selected.loc[ + selected_range[0] : selected_range[1], "rotation_count" + ] + ) + == 0 + ) + + # add label at the beginning of every block of rotations + # if the previous was true, do not write the label + for i, rotation in enumerate(rotation_on): + if rotation and not rotation_on[i - 1]: + ax.text( + i + selected_range[0] + 3, + -1100, + f"{int(roi_selected.loc[i + 5 + selected_range[0], 'speed'])}º/s\n{roi_selected.loc[i + 5 + selected_range[0], 'direction']}", + fontsize=10, + ) + + # add gray squares when the rotation is happening using the starst_ends + for start, end in starts_ends: + if start > selected_range[0] and end < selected_range[1]: + ax.axvspan(start, end, color="gray", alpha=0.2) + + fps = 6.74 + # change xticks to seconds + xticks = ax.get_xticks() + ax.set_xticks(xticks) + ax.set_xticklabels((xticks / fps).astype(int)) + # change x label + ax.set(xlabel="Seconds", ylabel="Neuropil corrected (a.u.)") # "ΔF/F") + + ax.set_xlim(selected_range) + # ax.set_ylim(-10, 10) + + # leave some gap between the axis and the plot + plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1) + + # remove top and 
right spines + ax.spines["top"].set_visible(False) + ax.spines["right"].set_visible(False) + + plt.savefig(saving_path / f"dff_example_{roi}.pdf") + plt.savefig(saving_path / f"dff_example_{roi}.png") + plt.close() + + +# -------------------------------------------------------- +# Plot averages + +custom_palette = sns.color_palette("dark:#5A9_r", 4) + +for roi in rois_selection: + fig, ax = plt.subplots(1, 2, figsize=(20, 10)) + for i, direction in enumerate(["CW", "CCW"]): + sns.lineplot( + x="rotation_frames", + y=roi, + data=full_dataframe[(full_dataframe["direction"] == direction)], + hue="speed", + palette=custom_palette, + ax=ax[i], + ) + ax[i].set_title(f"Direction: {direction}") + ax[i].legend(title="Speed") + + # remove top and right spines + ax[i].spines["top"].set_visible(False) + ax[i].spines["right"].set_visible(False) + + # add vertical lines to show the start of the rotation + # start is always at 11, end at total len - 10 + ax[i].axvline(x=frames_before_rotation, color="gray", linestyle="--") + + # change x axis to seconds + fps = 6.74 + xticks = ax[i].get_xticks() + ax[i].set_xticks(xticks) + ax[i].set_xticklabels(np.round(xticks / fps, 1)) + # change x label + ax[i].set( + xlabel="Seconds", ylabel="Neuropil corrected (a.u.)" + ) # "ΔF/F") + + plt.savefig(saving_path / f"roi_{roi}_direction_speed.pdf") + plt.savefig(saving_path / f"roi_{roi}_direction_speed.png") + plt.close() + + # make also another plot showing all traces (not averaged - no std) + + fig, ax = plt.subplots(figsize=(20, 10)) + for i, direction in enumerate(["CW", "CCW"]): + # sns.relplot( + # x="rotation_frames", + # y=roi, + # data=full_dataframe[(full_dataframe["direction"] == direction)], + # hue="speed", + # palette=custom_palette, + # kind="line", + # estimator=None, + # style="direction", + # ax=ax, + # ) + # plot single traces using matplotlib + for speed in full_dataframe["speed"].unique(): + ax.plot( + full_dataframe[ + (full_dataframe["direction"] == direction) + & (full_dataframe["speed"] == speed) + ]["rotation_frames"], + full_dataframe[ + (full_dataframe["direction"] == direction) + & (full_dataframe["speed"] == speed) + ][roi], + label=f"{speed}º/s", + # color=custom_palette[speed], + ) + + ax.set_title(f"Direction: {direction}") + ax.legend(title="Speed") + + # remove top and right spines + ax.spines["top"].set_visible(False) + ax.spines["right"].set_visible(False) + + # add vertical lines to show the start of the rotation + # start is always at 11, end at total len - 10 + ax.axvline(x=frames_before_rotation, color="gray", linestyle="--") + + # change x axis to seconds + fps = 6.74 + xticks = ax.get_xticks() + ax.set_xticks(xticks) + ax.set_xticklabels(np.round(xticks / fps, 1)) + # change x label + ax.set(xlabel="Seconds", ylabel="Neuropil corrected (a.u.)") # "ΔF/F") + + plt.savefig(saving_path / f"roi_{roi}_direction_speed_all.pdf") + plt.savefig(saving_path / f"roi_{roi}_direction_speed_all.png") + + plt.close() diff --git a/calcium_imaging_automation/core/rules/postprocess.py b/calcium_imaging_automation/core/rules/postprocess.py new file mode 100644 index 0000000..59b81ef --- /dev/null +++ b/calcium_imaging_automation/core/rules/postprocess.py @@ -0,0 +1,321 @@ +import traceback +from pathlib import Path + +import numpy as np +import pandas as pd +import seaborn as sns +from derotation.analysis.full_derotation_pipeline import FullPipeline +from derotation.analysis.metrics import stability_of_most_detected_blob +from matplotlib import pyplot as plt +from snakemake.script import 
snakemake + +datasets = snakemake.params.datasets +processed_data_base = snakemake.params.base_path + +csv_path = Path(snakemake.output[0]).with_suffix(".csv") +img_path = Path(snakemake.output[0]) + +if not img_path.exists(): + datasets_paths = [] + for idx, dataset in enumerate(datasets): + datasets_paths.append( + Path( + f"{snakemake.params.base_path}/sub-{idx}_{dataset}/ses-0/funcimg" + ) + ) + + movie_bin_paths = [] + for dataset in datasets_paths: + movie_bin_paths.extend(list(Path(dataset).rglob("*.bin"))) + + is_cell_paths = [] + for dataset in datasets_paths: + is_cell_paths.extend(list(Path(dataset).rglob("iscell.npy"))) + + metric_paths = [] + for path_to_bin_file in movie_bin_paths: + metric_path = ( + path_to_bin_file.parent.parent.parent / "derotation_metrics.csv" + ) + metric_paths.append(metric_path) + + derotated_full_csv_paths = [] + for path_to_bin_file in movie_bin_paths: + derotated_full_csv_path = ( + path_to_bin_file.parent.parent.parent + / "derotation/derotated_full.csv" + ) + derotated_full_csv_paths.append(derotated_full_csv_path) + + all_metrics_df = pd.DataFrame( + columns=["dataset", "analysis_type", "metric", "value"] + ) + analysis_types = [ + "no_adj", + "adj_track", + "adj_largest", + "adj_track_shifted", + ] + + for path_to_bin_file, metric_path, derotated_full_csv_path in zip( + movie_bin_paths, metric_paths, derotated_full_csv_paths + ): + print( + f"Processing dataset: {path_to_bin_file.parent.parent.parent.parent.parent.name}..." + ) + try: + metric = pd.read_csv(metric_path) + + path_to_bin_file = Path(path_to_bin_file) + + rotation_df = pd.read_csv(derotated_full_csv_path) + num_frames = len(rotation_df) + + shape_image = (num_frames, 256, 256) + registered = np.memmap( + path_to_bin_file, shape=shape_image, dtype="int16" + ) + + # plot first frame as an image of registered as a way to test if the loading was correct + plt.imshow(registered[0]) + plt.savefig(path_to_bin_file.parent / "first_frame_registered.png") + plt.close() + + derotator = FullPipeline.__new__(FullPipeline) + + angles = rotation_df["rotation_angle"].values + if len(angles) > len(registered): + angles = angles[: len(registered)] + elif len(angles) < len(registered): + angles = np.pad(angles, (0, len(registered) - len(angles))) + + derotator.rot_deg_frame = angles + mean_images = derotator.calculate_mean_images( + registered, round_decimals=0 + ) + + # show first mean image + plt.imshow(mean_images[0]) + plt.savefig(path_to_bin_file.parent / "first_mean_image.png") + plt.close() + + path_plots = path_to_bin_file.parent + try: + ptd, std = stability_of_most_detected_blob( + (mean_images, path_plots), + # blob_log_kwargs={"min_sigma": 0, "max_sigma": 20, "threshold": 0.5, "overlap": 0}, + # clip=False + ) + print(f"ptd: {ptd}, std: {std}") + except Exception as e: + print(e) + print(traceback.format_exc()) + ptd = np.nan + std = np.nan + + for i, analysis_type in enumerate(analysis_types): + row_ptd = { + "dataset": path_to_bin_file.parent.parent.parent.parent.parent.name, + "analysis_type": analysis_type, + "metric": "ptd", + "value": metric["ptd"][i], + } + row_std = { + "dataset": path_to_bin_file.parent.parent.parent.parent.parent.name, + "analysis_type": analysis_type, + "metric": "std", + "value": metric["std"][i], + } + all_metrics_df = pd.concat( + [all_metrics_df, pd.DataFrame([row_ptd, row_std])], + ignore_index=True, + ) + # add post_suite2p metrics + row_ptd = { + "dataset": path_to_bin_file.parent.parent.parent.parent.parent.name, + "analysis_type": "post_suite2p", + 
"metric": "ptd", + "value": ptd, + } + row_std = { + "dataset": path_to_bin_file.parent.parent.parent.parent.parent.name, + "analysis_type": "post_suite2p", + "metric": "std", + "value": std, + } + all_metrics_df = pd.concat( + [all_metrics_df, pd.DataFrame([row_ptd, row_std])], + ignore_index=True, + ) + except Exception as e: + print(e) + print("Error in dataset") + continue + + # save the dataframe to a csv file (change the file extension from png to csv) + all_metrics_df.to_csv(csv_path, index=False) + +else: + all_metrics_df = pd.read_csv(csv_path) + +sns.set_theme(style="whitegrid") +sns.set_context("paper") +sns.set_palette("pastel") + +fig, axs = plt.subplots(1, 2, figsize=(10, 5)) + +sns.pointplot( + x="analysis_type", + y="value", + hue="dataset", + data=all_metrics_df[all_metrics_df["metric"] == "ptd"], + ax=axs[0], +) + +sns.pointplot( + x="analysis_type", + y="value", + hue="dataset", + data=all_metrics_df[all_metrics_df["metric"] == "std"], + ax=axs[1], +) + +axs[0].set_title("PTD") +axs[1].set_title("STD") + +plt.tight_layout() +plt.savefig(img_path) +plt.close() + +# make another similar plot with these analysis types: +# 1. no_adj +# 2. the min between "no_adj", "adj_track", "adj_largest", "adj_track_shifted" (to be calculated) +# 3. post_suite2p + +fig, axs = plt.subplots(1, 2, figsize=(10, 5)) + +data = pd.DataFrame(columns=["dataset", "analysis_type", "metric", "value"]) +for dataset in all_metrics_df["dataset"].unique(): + dataset_df = all_metrics_df[all_metrics_df["dataset"] == dataset] + # no_adj + no_adj_value = dataset_df[ + (dataset_df["analysis_type"] == "no_adj") + & (dataset_df["metric"] == "ptd") + ]["value"].values[0] + row = { + "dataset": dataset, + "analysis_type": "no_adj", + "metric": "ptd", + "value": no_adj_value, + } + data = pd.concat([data, pd.DataFrame([row])], ignore_index=True) + # min but not for post_suite2p + min_value = dataset_df[ + (dataset_df["analysis_type"] != "post_suite2p") + & (dataset_df["metric"] == "ptd") + ]["value"].min() + row = { + "dataset": dataset, + "analysis_type": "min", + "metric": "ptd", + "value": min_value, + } + data = pd.concat([data, pd.DataFrame([row])], ignore_index=True) + # post_suite2p + post_suite2p_value = dataset_df[ + (dataset_df["analysis_type"] == "post_suite2p") + & (dataset_df["metric"] == "ptd") + ]["value"].values[0] + row = { + "dataset": dataset, + "analysis_type": "post_suite2p", + "metric": "ptd", + "value": post_suite2p_value, + } + data = pd.concat([data, pd.DataFrame([row])], ignore_index=True) + +# save dataset +data.to_csv(csv_path.with_name("min_analysis_types_min_ptd.csv"), index=False) + +sns.pointplot( + x="analysis_type", + y="value", + hue="dataset", + data=data[data["metric"] == "ptd"], + ax=axs[0], +) + + +axs[0].set_ylabel("Point to point distance (r)") +axs[0].set_xlabel("Derotation adjustment") +axs[0].set_xticklabels(["No", "Yes", "Post Suite2p"]) + +data = pd.DataFrame(columns=["dataset", "analysis_type", "metric", "value"]) + +for dataset in all_metrics_df["dataset"].unique(): + dataset_df = all_metrics_df[all_metrics_df["dataset"] == dataset] + # no_adj + no_adj_value = dataset_df[ + (dataset_df["analysis_type"] == "no_adj") + & (dataset_df["metric"] == "std") + ]["value"].values[0] + row = { + "dataset": dataset, + "analysis_type": "no_adj", + "metric": "std", + "value": no_adj_value, + } + data = pd.concat([data, pd.DataFrame([row])], ignore_index=True) + # min but not for post_suite2p + min_value = dataset_df[ + (dataset_df["analysis_type"] != "post_suite2p") + & 
(dataset_df["metric"] == "std") + ]["value"].min() + row = { + "dataset": dataset, + "analysis_type": "min", + "metric": "std", + "value": min_value, + } + data = pd.concat([data, pd.DataFrame([row])], ignore_index=True) + # post_suite2p + post_suite2p_value = dataset_df[ + (dataset_df["analysis_type"] == "post_suite2p") + & (dataset_df["metric"] == "std") + ]["value"].values[0] + row = { + "dataset": dataset, + "analysis_type": "post_suite2p", + "metric": "std", + "value": post_suite2p_value, + } + data = pd.concat([data, pd.DataFrame([row])], ignore_index=True) + +# save dataset +data.to_csv(csv_path.with_name("min_analysis_types_min_std.csv"), index=False) + +sns.pointplot( + x="analysis_type", + y="value", + hue="dataset", + data=data[data["metric"] == "std"], + ax=axs[1], +) + +axs[1].set_ylabel("XY standard deviation (s)") +axs[1].set_xlabel("Derotation adjustment") +axs[1].set_xticklabels(["No", "Yes", "Post Suite2p"]) + +# remove legend +axs[0].get_legend().remove() +axs[1].get_legend().remove() + +axs[0].set_title("PTD") +axs[1].set_title("STD") + +# despine +sns.despine() + +plt.tight_layout() +plt.savefig(img_path.with_name("min_analysis_types.png")) +plt.savefig(img_path.with_name("min_analysis_types.pdf")) diff --git a/calcium_imaging_automation/core/rules/preprocess.py b/calcium_imaging_automation/core/rules/preprocess.py index 27434fb..0070a4a 100644 --- a/calcium_imaging_automation/core/rules/preprocess.py +++ b/calcium_imaging_automation/core/rules/preprocess.py @@ -1,7 +1,6 @@ import traceback from pathlib import Path -from derotation.analysis.metrics import stability_of_most_detected_blob from derotation.derotate_batch import derotate from snakemake.script import snakemake @@ -9,17 +8,19 @@ read_dataset_path = Path(snakemake.input[0]) output_tif = Path(snakemake.output[0]) -output_path_dataset = output_tif.parent.parent +output_path_dataset = output_tif.parent try: - data = derotate(read_dataset_path, output_path_dataset) - metric_measured = stability_of_most_detected_blob(data) - with open(output_path_dataset / "metric.txt", "w") as f: - f.write(f"stability_of_most_detected_blob: {metric_measured}") + metrics = derotate(read_dataset_path, output_path_dataset) + # save metrics as csv (matrix is already a pandas dataframe) + metrics.to_csv(output_path_dataset / "derotation_metrics.csv", index=False) + # make empty error file with open(output_path_dataset / "error.txt", "w") as f: f.write("") except Exception: with open(output_path_dataset / "error.txt", "w") as f: f.write(traceback.format_exc()) - with open(output_path_dataset / "metric.txt", "w") as f: - f.write(f"dataset: {read_dataset_path.stem} metric: NaN") + + # make empty metrics file + with open(output_path_dataset / "derotation_metrics.csv", "w") as f: + f.write("") diff --git a/calcium_imaging_automation/core/rules/suite2p_run.py b/calcium_imaging_automation/core/rules/suite2p_run.py new file mode 100644 index 0000000..e27957f --- /dev/null +++ b/calcium_imaging_automation/core/rules/suite2p_run.py @@ -0,0 +1,54 @@ +import datetime +import traceback +from pathlib import Path + +import numpy as np +from snakemake.script import snakemake +from suite2p import run_s2p + +# Retrieve parameters and inputs from Snakemake +input_path = Path(snakemake.input[0]) +ops_file = snakemake.input[1] +dataset_folder = Path(input_path).parent.parent + +# load ops +ops = np.load(ops_file, allow_pickle=True).item() +ops["save_folder"] = str(dataset_folder) +ops["save_path0"] = str(dataset_folder) +ops["fast_disk"] = 
str(dataset_folder) +ops["data_path"] = [input_path.parent] + +# change ops for non-rigid registration +ops["nonrigid"] = True +ops["block_size"] = [64, 64] +ops["snr_thresh"] = 1.7 +ops["maxregshiftNR"] = 15 + +db = {"data_path": input_path} +try: + assert type(ops) == dict, f"ops is not a dict, it is {type(ops)}" + assert type(db) == dict, f"db is not a dict, it is {type(db)}" + ops_end = run_s2p(ops=ops) + + # get registration metrics from ops + metrics = { + "regDX": ops_end.get("regDX", "NaN"), + "regPC": ops_end.get("regPC", "NaN"), + "tPC": ops_end.get("tPC", "NaN"), + } + + # append in the metrics file the new metrics + with open(dataset_folder / "suite2p_metrics.txt", "w") as f: + f.write("registration metrics: \n") + for key, value in metrics.items(): + f.write(f"{key}: {value}\n") + # make empty error file + with open(dataset_folder / "error.txt", "a") as f: + f.write("") +except Exception: + with open(dataset_folder / "error.txt", "a") as f: + # add timestamp to the error file + f.write(f"Error at {datetime.datetime.now()}\n") + f.write(traceback.format_exc()) + with open(dataset_folder / "suite2p_metrics.txt", "w") as f: + f.write("registration metrics: NaN\n") diff --git a/workflow/Snakefile b/workflow/Snakefile index 677d5cd..f944ee4 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -12,6 +12,9 @@ datasets.sort() # for the output datasets_no_underscore = [ds.replace("_", "") for ds in datasets] +valid_indices = [0, 1, 6, 7, 8, 9, 11, 13, 15, 16, 18] +subsample_datasets = [datasets_no_underscore[i] for i in valid_indices] + # ----------------------------------------------------- # Final state of the pipeline # Are all the outputs files present? @@ -19,15 +22,20 @@ rule all: input: expand( [ - f"{processed_data_base}/sub-{{index}}_{{datasets_no_underscore}}/ses-0/funcimg/derotation/derotated_full.tif", - f"{processed_data_base}/sub-{{index}}_{{datasets_no_underscore}}/ses-0/funcimg/derotation/derotated_full.csv", - f"{processed_data_base}/sub-{{index}}_{{datasets_no_underscore}}/ses-0/funcimg/metric.txt", - f"{processed_data_base}/sub-{{index}}_{{datasets_no_underscore}}/ses-0/funcimg/error.txt", + # f"{processed_data_base}/sub-{{index}}_{{datasets_no_underscore}}/ses-0/funcimg/derotation/derotated_full.tif", + # f"{processed_data_base}/sub-{{index}}_{{datasets_no_underscore}}/ses-0/funcimg/derotation/derotated_full.csv", + # f"{processed_data_base}/sub-{{index}}_{{datasets_no_underscore}}/ses-0/funcimg/derotation_metrics.csv", + # f"{processed_data_base}/sub-{{index}}_{{datasets_no_underscore}}/ses-0/funcimg/error.txt", + # f"{processed_data_base}/sub-{{index}}_{{datasets_no_underscore}}/ses-0/funcimg/plane0/stat.npy", + f"{processed_data_base}/sub-{{index}}_{{datasets_no_underscore}}/ses-0/traces/dff_example_100.png", ], zip, - index=range(len(datasets)), - datasets_no_underscore=datasets_no_underscore, + # index=range(len(datasets)), + index=valid_indices, + # datasets_no_underscore=datasets_no_underscore, + datasets_no_underscore=subsample_datasets, ), + # f"{processed_data_base}/stability_metric.png", # ----------------------------------------------------- # Preprocess @@ -35,21 +43,75 @@ rule preprocess: input: raw=lambda wildcards: f"{raw_data_base}{datasets[int(wildcards.index)]}/", output: - report(f"{processed_data_base}/sub-{{index}}_{{datasets_no_underscore}}/ses-0/funcimg/metric.txt"), - report(f"{processed_data_base}/sub-{{index}}_{{datasets_no_underscore}}/ses-0/funcimg/error.txt"), + 
f"{processed_data_base}/sub-{{index}}_{{datasets_no_underscore}}/ses-0/funcimg/derotation_metrics.csv", + f"{processed_data_base}/sub-{{index}}_{{datasets_no_underscore}}/ses-0/funcimg/error.txt", tiff=f"{processed_data_base}/sub-{{index}}_{{datasets_no_underscore}}/ses-0/funcimg/derotation/derotated_full.tif", csv=f"{processed_data_base}/sub-{{index}}_{{datasets_no_underscore}}/ses-0/funcimg/derotation/derotated_full.csv", params: index=lambda wildcards: wildcards.index resources: partition="fast", - mem_mb=16000, + mem_mb=32000, cpu_per_task=1, tasks=1, nodes=1, script: "../calcium_imaging_automation/core/rules/preprocess.py" +# ----------------------------------------------------- +# Suite2p +rule suite2p: + input: + f"{processed_data_base}/sub-{{index}}_{{datasets_no_underscore}}/ses-0/funcimg/derotation/derotated_full.tif", + f"/ceph/margrie/laura/cimaut/3p_non_rigid_ops.npy", + output: + f"{processed_data_base}/sub-{{index}}_{{datasets_no_underscore}}/ses-0/funcimg/plane0/stat.npy", + f"{processed_data_base}/sub-{{index}}_{{datasets_no_underscore}}/ses-0/funcimg/suite2p/plane0/data.bin", + params: + index=lambda wildcards: wildcards.index + resources: + partition="fast", + mem_mb=16000, + cpu_per_task=1, + tasks=1, + nodes=1, + script: + "../calcium_imaging_automation/core/rules/suite2p_run.py" + +# ----------------------------------------------------- +# Collect suite2p data and make plots +rule postprocess: + output: + # f"{processed_data_base}/stability_metric.png", + "workflow/results/data/stability_metric.png", + params: + datasets=datasets_no_underscore, + base_path=processed_data_base + resources: + partition="fast", + mem_mb=8000, + cpu_per_task=1, + tasks=1, + nodes=1, + script: + "../calcium_imaging_automation/core/rules/postprocess.py" + +rule plot_traces: + input: + f"{processed_data_base}/sub-{{index}}_{{datasets_no_underscore}}/ses-0/funcimg/plane0/F.npy" + output: + f"{processed_data_base}/sub-{{index}}_{{datasets_no_underscore}}/ses-0/traces/dff_example_100.png", + params: + index=lambda wildcards: wildcards.index + resources: + partition="fast", + mem_mb=16000, + cpu_per_task=1, + tasks=1, + nodes=1, + script: + "../calcium_imaging_automation/core/rules/plot_data.py" + # ----------------------------------------------------- # Summarize data for datavzrd report rule summarize_data: