Skip to content

Commit

Permalink
Merge pull request #8 from chris-delg/cpp-zarr
Browse files Browse the repository at this point in the history
Cpp zarr
  • Loading branch information
chris-delg authored Aug 27, 2024
2 parents fb69851 + b348fce commit b304221
Show file tree
Hide file tree
Showing 9 changed files with 309 additions and 380 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -50,4 +50,4 @@ RUN mkdir build && \
cd ..

# running the benchmark
#CMD [ "python", "main.py" ]
CMD [ "python", "main.py" ]
186 changes: 186 additions & 0 deletions benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
from zarr_libraries import *
from typing import Optional
import numpy as np
import shutil
import matplotlib.axes

class Benchmark:
    """Stress-test harness comparing write/append bandwidth of several Zarr libraries.

    The writer adapters (TensorStore, Zarr Python, OME Zarr, Cpp Zarr) come from
    zarr_libraries; each exposes a ``data_path`` attribute plus ``write_zarr`` /
    ``append_zarr`` methods that return the elapsed time in seconds.
    Results are accumulated as average GB/s per test in ``__average_bandwidth``.
    """

    def __init__(self, shape: list, chunks: list) -> None:
        """Create writer adapters for every library under test.

        shape  -- base array shape; the first dimension is scaled up during tests
        chunks -- chunk shape passed unchanged to every writer
        """
        self.__shape = shape
        self.__chunks = chunks
        self.__average_bandwidth = {}   # test label -> average bandwidth (GB/s)
        self.__zarr_writers = {
            "TensorStore" : Tensorstore(),
            "Zarr Python" : Zarr_Python(),
            "OME Zarr" : Ome_Zarr(),
            "Cpp Zarr" : Cpp_Zarr()
        }
        # dispatch tables of zero-argument lambdas, rebuilt before each timed call
        self.__write_zarr = {}
        self.__append_zarr = {}


    ''' These functions are intended to be "private" and for use only inside the class '''
    def __set_write_functions(self, shape: list, zarr_data: np.ndarray) -> None:
        """Rebuild the write dispatch table for the given target shape/data.

        Each library takes slightly different arguments (OME Zarr infers the shape
        from the data; Cpp Zarr generates its own data natively), hence one lambda
        per library rather than a single generic call.
        """
        self.__write_zarr = {
            "TensorStore" : lambda: self.__zarr_writers["TensorStore"].write_zarr(shape=shape, chunks=self.chunks, zarr_data=zarr_data),
            "Zarr Python" : lambda: self.__zarr_writers["Zarr Python"].write_zarr(shape=shape, chunks=self.chunks, zarr_data=zarr_data),
            "OME Zarr" : lambda: self.__zarr_writers["OME Zarr"].write_zarr(chunks=self.chunks, zarr_data=zarr_data),
            "Cpp Zarr" : lambda: self.__zarr_writers["Cpp Zarr"].write_zarr(shape=shape, chunks=self.chunks)
        }


    def __set_append_functions(self, new_shape: list, zarr_data: np.ndarray, multiplier: int) -> None:
        """Rebuild the append dispatch table.

        Only TensorStore and Zarr Python support appending; the two take different
        arguments (TensorStore needs the resized target shape and the multiplier).
        """
        self.__append_zarr = {
            "TensorStore" : lambda: self.__zarr_writers["TensorStore"].append_zarr(shape=self.shape, chunks=self.chunks, new_shape=new_shape, zarr_data=zarr_data, multiplier=multiplier),
            "Zarr Python" : lambda: self.__zarr_writers["Zarr Python"].append_zarr(shape=self.shape, chunks=self.chunks, zarr_data=zarr_data)
        }


    def __print_results(self, additional_info: Optional[str] = None):
        """Print every recorded average bandwidth, optionally preceded by a header line."""
        if additional_info: print(additional_info)

        print(f"Shape {self.shape}, Chunks {self.chunks}")
        print("----------Bandwidth----------")
        for test, bandwidth in self.__average_bandwidth.items():
            print(f"{test} : {bandwidth} GBps")
        print("\n\n")


    ''' These functions are intended to be "public" and for use outside of the class '''
    @property
    def shape(self) -> list:
        """Base array shape supplied at construction (read-only)."""
        return self.__shape


    @property
    def chunks(self) -> list:
        """Chunk shape supplied at construction (read-only)."""
        return self.__chunks


    def run_write_tests(self, num_of_gigabytes: int, show_results: bool,
                        choose_lib: Optional[str] = None,
                        graph: Optional["matplotlib.axes.Axes"] = None,
                        avg_graph: Optional["matplotlib.axes.Axes"] = None) -> None:
        """Benchmark creating ever-larger zarr folders until ``num_of_gigabytes`` is exceeded.

        num_of_gigabytes -- soft cap; the loop stops once the written data passes it
        show_results     -- print the bandwidth summary when done
        choose_lib       -- run only this library (raises ValueError if unknown)
        graph            -- optional axes for bandwidth vs. file size
        avg_graph        -- optional axes for a per-library average-bandwidth bar

        Raises ValueError when ``choose_lib`` names no registered writer.
        """
        # error checking to see if chosen lib exists in test
        if choose_lib and choose_lib not in self.__zarr_writers:
            raise ValueError(f"There is no library of name \"{choose_lib}\".")

        gb_in_bytes = 1073741824  # number of bytes in a GB (2**30)

        for lib_name, writer in self.__zarr_writers.items():
            # if a specified library is chosen for testing, skip any that isn't that test
            if choose_lib is not None and choose_lib != lib_name: continue

            print(f"\n\n--------{lib_name} Stress Test--------\n\n")

            multiplier = 1      # multiplier that increases shape of zarr folder written
            curr_data_size = 0  # test runs until this reaches the specified GB soft cap
            write_speeds = []
            file_sizes = []

            while curr_data_size < (num_of_gigabytes * gb_in_bytes):
                # modify the append dimension, unpack the rest
                new_shape = [self.shape[0] * (multiplier), *self.shape[1:]]
                zarr_data = np.empty(())  # placeholder; real data is only made when needed

                # Cpp zarr implementation creates data in cpp_zarr.cpp, skip here to avoid making unused data
                if lib_name != "Cpp Zarr":
                    zarr_data = np.random.randint(low=0, high=256, size=new_shape, dtype=np.uint8)

                # returns time taken to write zarr folder
                self.__set_write_functions(shape=new_shape, zarr_data=zarr_data)
                total_time = self.__write_zarr[lib_name]()  # calling a lambda function inside of a dictionary

                # prints info to the terminal
                print(f"Multiplier on first dimension : {multiplier}x\n{lib_name} -> creating zarr : {total_time} seconds")
                print(f"The zarr folder is of size {folder_size(writer.data_path)}\n\n")

                curr_data_size = np.prod(new_shape)             # 3d array filled with 1 byte ints so multiplication gives accurate size in bytes
                file_sizes.append(curr_data_size * 10**-9)      # converts bytes to GB
                write_speeds.append((curr_data_size * 10**-9) / total_time)  # GB/s

                # goes from 1 to 5, then adds 5 every time after that
                multiplier += 4 if multiplier == 1 else 5

                shutil.rmtree(writer.data_path)  # remove the written folder between runs

            if graph: graph.plot(file_sizes, write_speeds, label=lib_name)
            if avg_graph: avg_graph.bar(lib_name, np.average(write_speeds))
            self.__average_bandwidth[lib_name + " Write"] = np.average(write_speeds)

            print("--------------------------------------------------------------\n\n")

        if show_results:
            self.__print_results(additional_info=(f"Write Test GB Soft Cap: {num_of_gigabytes}GB"))


    def run_append_test(self, num_of_gigabytes: int, show_results: bool,
                        choose_lib: Optional[str] = None,
                        graph: Optional["matplotlib.axes.Axes"] = None,
                        avg_graph: Optional["matplotlib.axes.Axes"] = None) -> None:
        """Benchmark repeatedly appending ``self.shape``-sized data to one zarr folder.

        Only TensorStore and Zarr Python support appending; other writers are skipped.
        Parameters mirror run_write_tests; ``graph`` plots bandwidth vs. append number.

        Raises ValueError when ``choose_lib`` names no registered writer.
        """
        # error checking to see if chosen lib exists in test
        if choose_lib and choose_lib not in self.__zarr_writers:
            raise ValueError(f"There is no library of name \"{choose_lib}\".")

        gb_in_bytes = 1073741824         # number of bytes in a GB (2**30)
        write_size = np.prod(self.shape) # amount of bytes appended on in each function call

        for lib_name, writer in self.__zarr_writers.items():
            # these are the only libraries that allow for appending of data
            if lib_name != "TensorStore" and lib_name != "Zarr Python":
                continue

            # if a specified library is chosen for testing, skip any that isn't that test
            if choose_lib is not None and choose_lib != lib_name: continue

            print(f"\n\n--------{lib_name} Append Stress Test--------\n\n")

            multiplier = 1      # number of appends done so far; scales the folder's first dim
            curr_data_size = 0  # test runs until this reaches the specified GB soft cap
            write_speeds = []
            write_numbers = []

            while curr_data_size < (num_of_gigabytes * gb_in_bytes):
                # modify the append dimension, unpack the rest
                new_shape = [self.shape[0] * (multiplier), *self.shape[1:]]

                # creating new data and adjusting the shape
                zarr_data = np.random.randint(low=0, high=256, size=self.shape, dtype=np.uint8)

                # returns time taken to write zarr folder / both libraries use a different approach hence the if statements
                self.__set_append_functions(new_shape=new_shape, zarr_data=zarr_data, multiplier=multiplier)
                total_time = self.__append_zarr[lib_name]()  # calling a lambda function inside of a dictionary

                # prints info to the terminal
                print(f"Multiplier on first dimension : {multiplier}x\n{lib_name} -> appending zarr : {total_time} seconds")
                print(f"The zarr folder is of size {folder_size(writer.data_path)}\n\n")

                curr_data_size = np.prod(new_shape)                          # 3d array filled with 1 byte ints so multiplication gives accurate size in bytes
                write_numbers.append(multiplier)                             # x-axis value: which append iteration this was
                write_speeds.append((write_size * 10**-9) / total_time)      # GB/s

                multiplier += 1

            shutil.rmtree(writer.data_path)  # folder grows across the loop; remove once per library

            if graph: graph.plot(write_numbers, write_speeds, label=lib_name)
            if avg_graph: avg_graph.bar(lib_name, np.average(write_speeds))
            self.__average_bandwidth[lib_name + " Append"] = np.average(write_speeds)

            print("--------------------------------------------------------------\n\n")

        if show_results:
            # NOTE: fixed copy-paste bug — this summary previously said "Write Test"
            self.__print_results(additional_info=(f"Append Test GB Soft Cap: {num_of_gigabytes}GB"))


    def run_all_tests(self, append_test_gigabytes: int, write_test_gigabytes: int,
                      choose_lib: Optional[str] = None,
                      append_graph: Optional["matplotlib.axes.Axes"] = None, append_avg_graph: Optional["matplotlib.axes.Axes"] = None,
                      write_graph: Optional["matplotlib.axes.Axes"] = None, write_avg_graph: Optional["matplotlib.axes.Axes"] = None) -> None:
        """Run the append test then the write test, printing one combined summary."""
        self.run_append_test(num_of_gigabytes=append_test_gigabytes, show_results=False, choose_lib=choose_lib, graph=append_graph, avg_graph=append_avg_graph)
        self.run_write_tests(num_of_gigabytes=write_test_gigabytes, show_results=False, choose_lib=choose_lib, graph=write_graph, avg_graph=write_avg_graph)
        self.__print_results(additional_info=(f"Write Test GB Soft Cap: {write_test_gigabytes}GB | Append Test GB Soft Cap: {append_test_gigabytes}GB"))

92 changes: 26 additions & 66 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,77 +1,37 @@
import matplotlib.pyplot as plt
from zarr_libraries import *
from benchmark import Benchmark


def main() -> None:
fig, graph = plt.subplots(2, 2)
benchmark = Benchmark(shape=[64, 1080, 1920], chunks=[64, 540, 960])

def run_all_tests(shape: list, chunks: list) -> None:
fig, graph = plt.subplots(2, 2)
bandwidth_map = {}
zarr_writers = {
"TensorStore" : Tensorstore(shape=shape, chunks=chunks),
"Zarr Python" : Zarr_Python(shape=shape, chunks=chunks),
"OME Zarr" : Ome_Zarr(shape=shape, chunks=chunks),
"Cpp Zarr" : Cpp_Zarr(shape=shape, chunks=chunks)
}

'''
Append Tests:
- These tests benchmark the continuous appending to a single zarr folder.
- These tests are best suited for the following libraries:
* TensorStore
* Zarr Python
'''
for name, writer in zarr_writers.items():
if name != "TensorStore" and name != "Zarr Python":
continue

bandwidth_map[name + " Append"] = (
writer.continuous_append_test(graph=graph[1][0], avg_graph=graph[1][1], append_dim_size=100)
)

# setting up graph for append tests
graph[1][0].set_xlabel("Write Number")
graph[1][0].set_title("Continuous Append Test")
graph[1][0].legend()
benchmark.run_all_tests(
append_test_gigabytes=25, write_test_gigabytes=5,
append_graph=graph[1][0], append_avg_graph=graph[1][1],
write_graph=graph[0][0], write_avg_graph=graph[0][1]
)

'''
Continuous write tests:
- These tests benchmark the creation of many increasingly large zarr folders.
- These tests are best suited for the following libraries:
* TensorStore
* Zarr Python
* OME Zarr
'''
for name, writer in zarr_writers.items():
bandwidth_map[name + " Write"] = (
writer.continuous_write_test(graph=graph[0][0], avg_graph=graph[0][1], append_dim_size=51, step=5)
)

# print the average bandwidth for each of the tests
print(f"Shape {shape}, Chunks {chunks}")
print("----------Bandwidth----------")
for test, bandwidth in bandwidth_map.items():
print(f"{test} : {bandwidth} GBps")
print("\n\n")

# setting up graphs for write tests
graph[0][0].set_xlabel("Data Size (GB)")
graph[0][0].set_title("Continuous Write Test")
graph[0][0].legend()

# setting up graphs for average bandwidth
graph[0][1].set_title("Average Bandwidth:\nContinuous Write Test")
graph[1][1].set_title("Average Bandwidth:\nContinuous Append Test")
# setting up graph for append tests
graph[1][0].set_xlabel("Write Number")
graph[1][0].set_title("Continuous Append Test")
graph[1][0].legend()

for graph in fig.get_axes():
graph.set_ylabel("Bandwidth (GBps)")
graph.grid()

fig.canvas.manager.set_window_title(f'shape: {shape}, chunks: {chunks}')
plt.tight_layout()

# setting up graphs for write tests
graph[0][0].set_xlabel("Data Size (GB)")
graph[0][0].set_title("Continuous Write Test")
graph[0][0].legend()

run_all_tests(shape=[64, 1080, 1920], chunks=[64, 540, 960])
# setting up graphs for average bandwidth
graph[0][1].set_title("Average Bandwidth:\nContinuous Write Test")
graph[1][1].set_title("Average Bandwidth:\nContinuous Append Test")

for graph in fig.get_axes():
graph.set_ylabel("Bandwidth (GBps)")
graph.grid()

fig.canvas.manager.set_window_title(f'shape: {benchmark.shape}, chunks: {benchmark.chunks}')
plt.tight_layout()
plt.show()


Expand Down
2 changes: 1 addition & 1 deletion zarr_libraries/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ def folder_size(folder_path: str) -> str:
def convert_bytes(B: int) -> str:
"""Return the given bytes as a human friendly KB, MB, GB, or TB string."""
B = float(B)
KB = float(1000) # change to 1024 for non mac file systems
KB = float(1024) # change to 1024 for non mac file systems
MB = float(KB ** 2) # 1,048,576
GB = float(KB ** 3) # 1,073,741,824
TB = float(KB ** 4) # 1,099,511,627,776
Expand Down
2 changes: 1 addition & 1 deletion zarr_libraries/cpp_zarr/cpp_zarr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ float write_zarr(string path, vector<uint64_t> chunks, vector<uint64_t> shape)
zarrObject.set_shape(writeShape);
zarrObject.set_chunks(chunks);
zarrObject.set_fill_value(1);
zarrObject.set_order("C");
//zarrObject.set_order("C");
zarrObject.set_dimension_separator("/");
zarrObject.set_clevel((uint64_t)1);
zarrObject.set_chunkInfo(startCoords, endCoords);
Expand Down
53 changes: 9 additions & 44 deletions zarr_libraries/cpp_zarr/cpp_zarr.py
Original file line number Diff line number Diff line change
@@ -1,53 +1,18 @@
from build.pyCppZarr import *
from zarr_libraries import folder_size
from pathlib import Path
import matplotlib.axes
import numpy as np
import shutil


class Cpp_Zarr:
def __init__(self, shape: list, chunks: list) -> None:
self.abs_path_to_data = str((Path(__file__).parent / "../example_data/cpp_zarr_data").resolve())
self.shape = shape
self.chunks = chunks


def __continuous_write(self, result_path: str, append_dim_size: int, step: int) -> tuple[list, list]:
file_sizes = []
bandwidths = []
def __init__(self) -> None:
self.__path_to_data = str((Path(__file__).parent / "../example_data/cpp_zarr_data/test.zarr").resolve())

for i in range(0, append_dim_size, step):
new_shape = (self.shape[0] * (i + 1), *self.shape[1:]) # modify the append dimension, unpack the rest

# write zarr files and store total time taken
total_time = write_zarr(result_path, self.chunks, new_shape)

# print info to the terminal
print(f"Write #{i + 1}\nCpp Zarr -> creating zarr : {total_time} seconds")
print(f"The zarr folder is of size {folder_size(result_path)}\n\n")

size = np.prod(new_shape) # 3d array filled with 1 byte ints so multiplication gives accurate size in bytes
file_sizes.append(size * 10**-9) # converts bytes to GB
bandwidths.append((size * 10**-9) / total_time) # GB/s
shutil.rmtree(result_path)

return file_sizes, bandwidths

@property
def data_path(self) -> str:
return self.__path_to_data


def continuous_write_test(self, graph: matplotlib.axes._axes.Axes,
avg_graph: matplotlib.axes._axes.Axes,
append_dim_size: int, step: int) -> float:
# calls continuous write function and graphs results
print("\n\n--------Cpp Zarr Stress Test--------\n\n")
file_sizes, bandwidths = self.__continuous_write(
result_path = self.abs_path_to_data + "/stressTest.zarr",
append_dim_size = append_dim_size,
step = step
)
print("--------------------------------------------------------------\n\n")
graph.plot(file_sizes, bandwidths, label="Cpp Zarr", marker='o')
avg_graph.bar("Cpp Zarr", np.average(bandwidths))
return float(np.average(bandwidths))


def write_zarr(self, shape: list, chunks: list) -> float:
return write_zarr(self.data_path, chunks, shape)

Loading

0 comments on commit b304221

Please sign in to comment.