class Benchmark:
    """Time Zarr writes and appends for several libraries and report bandwidth.

    One writer object is created per library. Every writer is used through
    its ``data_path`` attribute and its ``write_zarr`` method, whose return
    value is treated as elapsed seconds; the "TensorStore" and "Zarr Python"
    writers are additionally used through ``append_zarr``. Average bandwidths
    are accumulated across runs and printed by the result summary.
    """

    def __init__(self, shape: list, chunks: list) -> None:
        """Store the base shape and chunking and create the writers.

        Args:
            shape: Base array shape. The first dimension is the one that is
                scaled (write tests) or appended along (append tests).
            chunks: Chunk shape handed to every writer.
        """
        self.__shape = shape
        self.__chunks = chunks
        self.__average_bandwidth = {}
        self.__zarr_writers = {
            "TensorStore": Tensorstore(),
            "Zarr Python": Zarr_Python(),
            "OME Zarr": Ome_Zarr(),
            "Cpp Zarr": Cpp_Zarr(),
        }
        self.__write_zarr = {}
        self.__append_zarr = {}

    # These functions are intended to be "private" and for use only inside the class

    def __set_write_functions(self, shape: list, zarr_data: np.ndarray) -> None:
        """Bind one zero-argument write callable per library.

        The callables capture ``shape`` and ``zarr_data`` so the test loop can
        dispatch on the library name alone. "OME Zarr" is not given ``shape``
        and "Cpp Zarr" is not given ``zarr_data``.
        """
        self.__write_zarr = {
            "TensorStore": lambda: self.__zarr_writers["TensorStore"].write_zarr(
                shape=shape, chunks=self.chunks, zarr_data=zarr_data),
            "Zarr Python": lambda: self.__zarr_writers["Zarr Python"].write_zarr(
                shape=shape, chunks=self.chunks, zarr_data=zarr_data),
            "OME Zarr": lambda: self.__zarr_writers["OME Zarr"].write_zarr(
                chunks=self.chunks, zarr_data=zarr_data),
            "Cpp Zarr": lambda: self.__zarr_writers["Cpp Zarr"].write_zarr(
                shape=shape, chunks=self.chunks),
        }

    def __set_append_functions(self, new_shape: list, zarr_data: np.ndarray, multiplier: int) -> None:
        """Bind one zero-argument append callable per append-capable library."""
        self.__append_zarr = {
            "TensorStore": lambda: self.__zarr_writers["TensorStore"].append_zarr(
                shape=self.shape, chunks=self.chunks, new_shape=new_shape,
                zarr_data=zarr_data, multiplier=multiplier),
            "Zarr Python": lambda: self.__zarr_writers["Zarr Python"].append_zarr(
                shape=self.shape, chunks=self.chunks, zarr_data=zarr_data),
        }

    def __check_library(self, choose_lib: Optional[str]) -> None:
        """Raise ``ValueError`` when a chosen library name is not registered."""
        if choose_lib and choose_lib not in self.__zarr_writers:
            raise ValueError(f"There is no library of name \"{choose_lib}\".")

    def __print_results(self, additional_info: Optional[str] = None) -> None:
        """Print the accumulated average bandwidths, optionally after a header line."""
        if additional_info:
            print(additional_info)

        print(f"Shape {self.shape}, Chunks {self.chunks}")
        print("----------Bandwidth----------")
        for test, bandwidth in self.__average_bandwidth.items():
            print(f"{test} : {bandwidth} GBps")
        print("\n\n")

    # These functions are intended to be "public" and for use outside of the class

    @property
    def shape(self) -> list:
        """Base array shape given at construction."""
        return self.__shape

    @property
    def chunks(self) -> list:
        """Chunk shape given at construction."""
        return self.__chunks

    def run_write_tests(self, num_of_gigabytes: int, show_results: bool,
                        choose_lib: Optional[str] = None,
                        graph: Optional["matplotlib.axes.Axes"] = None,
                        avg_graph: Optional["matplotlib.axes.Axes"] = None) -> None:
        """Write increasingly large Zarr folders and record the bandwidth.

        For each library the first dimension of the shape is multiplied by
        1, 5, 10, 15, ... until the data written in one go reaches the cap.

        Args:
            num_of_gigabytes: Soft cap on the size of a single write; the loop
                stops after the first write that reaches it.
            show_results: Print the bandwidth summary when done.
            choose_lib: Restrict the test to this library name.
            graph: Axes that receives bandwidth versus data size per library.
            avg_graph: Axes that receives one average-bandwidth bar per library.

        Raises:
            ValueError: If ``choose_lib`` is not a registered library name.
        """
        self.__check_library(choose_lib)

        gb_in_bytes = 1073741824  # represents number of bytes in a GB (2**30)
        size_cap = num_of_gigabytes * gb_in_bytes

        for lib_name, writer in self.__zarr_writers.items():
            # if a specified library is chosen for testing, skip any that isn't that test
            if choose_lib is not None and choose_lib != lib_name:
                continue

            print(f"\n\n--------{lib_name} Stress Test--------\n\n")

            multiplier = 1      # multiplier that increases shape of zarr folder written
            curr_data_size = 0  # test runs until this reaches the requested cap
            write_speeds = []
            file_sizes = []

            while curr_data_size < size_cap:
                # modify the append dimension, unpack the rest
                new_shape = [self.shape[0] * multiplier, *self.shape[1:]]

                # The Cpp Zarr writer is not handed any data, so skip
                # generating an unused array for it.
                if lib_name != "Cpp Zarr":
                    zarr_data = np.random.randint(low=0, high=256, size=new_shape, dtype=np.uint8)
                else:
                    zarr_data = np.empty(())

                # returns time taken to write zarr folder
                self.__set_write_functions(shape=new_shape, zarr_data=zarr_data)
                total_time = self.__write_zarr[lib_name]()

                # prints info to the terminal
                print(f"Multiplier on first dimension : {multiplier}x\n{lib_name} -> creating zarr : {total_time} seconds")
                print(f"The zarr folder is of size {folder_size(writer.data_path)}\n\n")

                # One byte per element, so the element count is the size in
                # bytes. Force a 64-bit accumulator: the platform default
                # integer can be 32-bit and would overflow on large shapes.
                curr_data_size = int(np.prod(new_shape, dtype=np.int64))
                size_in_gb = curr_data_size * 10**-9  # converts bytes to GB
                file_sizes.append(size_in_gb)
                write_speeds.append(size_in_gb / total_time)  # GB/s

                # goes from 1 to 5, then adds 5 every time after that
                multiplier += 4 if multiplier == 1 else 5

                # every write starts from a clean folder
                shutil.rmtree(writer.data_path)

            average_speed = np.average(write_speeds)
            if graph is not None:
                graph.plot(file_sizes, write_speeds, label=lib_name)
            if avg_graph is not None:
                avg_graph.bar(lib_name, average_speed)
            self.__average_bandwidth[lib_name + " Write"] = average_speed

            print("--------------------------------------------------------------\n\n")

        if show_results:
            self.__print_results(additional_info=(f"Write Test GB Soft Cap: {num_of_gigabytes}GB"))

    def run_append_test(self, num_of_gigabytes: int, show_results: bool,
                        choose_lib: Optional[str] = None,
                        graph: Optional["matplotlib.axes.Axes"] = None,
                        avg_graph: Optional["matplotlib.axes.Axes"] = None) -> None:
        """Repeatedly append ``shape``-sized data to one Zarr folder.

        Only "TensorStore" and "Zarr Python" are exercised; other libraries
        are skipped.

        Args:
            num_of_gigabytes: Soft cap on the total amount of appended data.
            show_results: Print the bandwidth summary when done.
            choose_lib: Restrict the test to this library name.
            graph: Axes that receives bandwidth versus write number per library.
            avg_graph: Axes that receives one average-bandwidth bar per library.

        Raises:
            ValueError: If ``choose_lib`` is not a registered library name.
        """
        self.__check_library(choose_lib)

        gb_in_bytes = 1073741824  # represents number of bytes in a GB (2**30)
        size_cap = num_of_gigabytes * gb_in_bytes
        # amount of bytes appended on in each function call (64-bit safe)
        write_size = int(np.prod(self.shape, dtype=np.int64))

        for lib_name, writer in self.__zarr_writers.items():
            # these are the only libraries that allow for appending of data
            if lib_name not in ("TensorStore", "Zarr Python"):
                continue

            # if a specified library is chosen for testing, skip any that isn't that test
            if choose_lib is not None and choose_lib != lib_name:
                continue

            print(f"\n\n--------{lib_name} Append Stress Test--------\n\n")

            # The writers create the folder only when it does not exist yet,
            # so leftovers from an interrupted run would be appended to and
            # skew the measurement. Start from a clean slate.
            shutil.rmtree(writer.data_path, ignore_errors=True)

            multiplier = 1      # number of the current append
            curr_data_size = 0  # test runs until this reaches the requested cap
            write_speeds = []
            write_numbers = []

            while curr_data_size < size_cap:
                # modify the append dimension, unpack the rest
                new_shape = [self.shape[0] * multiplier, *self.shape[1:]]

                # fresh data of the base shape for every append
                zarr_data = np.random.randint(low=0, high=256, size=self.shape, dtype=np.uint8)

                # returns time taken to append to the zarr folder
                self.__set_append_functions(new_shape=new_shape, zarr_data=zarr_data, multiplier=multiplier)
                total_time = self.__append_zarr[lib_name]()

                # prints info to the terminal
                print(f"Multiplier on first dimension : {multiplier}x\n{lib_name} -> appending zarr : {total_time} seconds")
                print(f"The zarr folder is of size {folder_size(writer.data_path)}\n\n")

                curr_data_size = int(np.prod(new_shape, dtype=np.int64))
                write_numbers.append(multiplier)  # x-axis value: which append this was
                write_speeds.append((write_size * 10**-9) / total_time)  # GB/s

                multiplier += 1

            shutil.rmtree(writer.data_path)

            average_speed = np.average(write_speeds)
            if graph is not None:
                graph.plot(write_numbers, write_speeds, label=lib_name)
            if avg_graph is not None:
                avg_graph.bar(lib_name, average_speed)
            self.__average_bandwidth[lib_name + " Append"] = average_speed

            print("--------------------------------------------------------------\n\n")

        if show_results:
            self.__print_results(additional_info=(f"Append Test GB Soft Cap: {num_of_gigabytes}GB"))

    def run_all_tests(self, append_test_gigabytes: int, write_test_gigabytes: int,
                      choose_lib: Optional[str] = None,
                      append_graph: Optional["matplotlib.axes.Axes"] = None,
                      append_avg_graph: Optional["matplotlib.axes.Axes"] = None,
                      write_graph: Optional["matplotlib.axes.Axes"] = None,
                      write_avg_graph: Optional["matplotlib.axes.Axes"] = None) -> None:
        """Run the append test, then the write test, and print one combined summary."""
        self.run_append_test(num_of_gigabytes=append_test_gigabytes, show_results=False,
                             choose_lib=choose_lib, graph=append_graph, avg_graph=append_avg_graph)
        self.run_write_tests(num_of_gigabytes=write_test_gigabytes, show_results=False,
                             choose_lib=choose_lib, graph=write_graph, avg_graph=write_avg_graph)
        self.__print_results(additional_info=(
            f"Write Test GB Soft Cap: {write_test_gigabytes}GB | Append Test GB Soft Cap: {append_test_gigabytes}GB"))
def main() -> None:
    """Run every benchmark and show the results in a 2x2 grid of plots.

    The top row holds the write test (curve and average bars), the bottom
    row holds the append test (curve and average bars).
    """
    fig, axes_grid = plt.subplots(2, 2)
    write_ax, write_avg_ax = axes_grid[0]
    append_ax, append_avg_ax = axes_grid[1]

    benchmark = Benchmark(shape=[64, 1080, 1920], chunks=[64, 540, 960])
    benchmark.run_all_tests(
        append_test_gigabytes=25,
        write_test_gigabytes=5,
        append_graph=append_ax,
        append_avg_graph=append_avg_ax,
        write_graph=write_ax,
        write_avg_graph=write_avg_ax,
    )

    # append test curve
    append_ax.set_xlabel("Write Number")
    append_ax.set_title("Continuous Append Test")
    append_ax.legend()

    # write test curve
    write_ax.set_xlabel("Data Size (GB)")
    write_ax.set_title("Continuous Write Test")
    write_ax.legend()

    # average bandwidth bar charts
    write_avg_ax.set_title("Average Bandwidth:\nContinuous Write Test")
    append_avg_ax.set_title("Average Bandwidth:\nContinuous Append Test")

    # every subplot shares the same y-axis meaning
    for axis in fig.get_axes():
        axis.set_ylabel("Bandwidth (GBps)")
        axis.grid()

    window_title = f'shape: {benchmark.shape}, chunks: {benchmark.chunks}'
    fig.canvas.manager.set_window_title(window_title)
    plt.tight_layout()
    plt.show()
class Cpp_Zarr:
    """Writer that hands Zarr creation to the module-level ``write_zarr`` routine."""

    def __init__(self) -> None:
        """Resolve the absolute location of the Zarr folder this writer targets."""
        module_dir = Path(__file__).parent
        target = module_dir / "../example_data/cpp_zarr_data/test.zarr"
        self.__path_to_data = str(target.resolve())

    @property
    def data_path(self) -> str:
        """Absolute path of the target Zarr folder, as a string."""
        return self.__path_to_data

    def write_zarr(self, shape: list, chunks: list) -> float:
        """Call the module-level ``write_zarr`` and return its result.

        The underlying call takes its arguments in the order path, chunks,
        shape. The returned float is presumably the elapsed time in seconds
        (the caller treats it that way) - confirm against the binding.
        """
        # Inside a method the bare name refers to the module-level function
        # (presumably supplied by the ``build.pyCppZarr`` star import), not
        # to this method.
        elapsed = write_zarr(self.data_path, chunks, shape)
        return elapsed
class Ome_Zarr:
    """Writer that stores an image with ``ome_zarr``'s ``write_image``."""

    def __init__(self) -> None:
        """Resolve the absolute location of the Zarr folder this writer targets."""
        module_dir = Path(__file__).parent
        target = module_dir / "../example_data/ome_zarr_data/test.zarr"
        self.__path_to_data = str(target.resolve())

    @property
    def data_path(self) -> str:
        """Absolute path of the target Zarr folder, as a string."""
        return self.__path_to_data

    def write_zarr(self, chunks: list, zarr_data: np.ndarray) -> float:
        """Write ``zarr_data`` as an image and return the elapsed seconds.

        Only the ``write_image`` call is timed; opening the store and
        creating the root group happen before the clock starts.

        Args:
            chunks: Chunk shape passed through ``storage_options``.
            zarr_data: Array to store; written with axes ``"tyx"``.
        """
        # open the target location for writing and create its root group
        location = parse_url(self.data_path, mode="w")
        root_group = zarr.group(store=location.store)

        started = time.perf_counter()
        write_image(
            image=zarr_data,
            group=root_group,
            axes="tyx",
            storage_options={"chunks": chunks},
        )
        return time.perf_counter() - started
class Tensorstore:
    """Writer that creates and appends to a Zarr v2 folder through TensorStore."""

    def __init__(self) -> None:
        """Resolve the absolute location of the Zarr folder this writer targets."""
        self.__path_to_data = str(
            (Path(__file__).parent / "../example_data/tensorstore_data/test.zarr").resolve()
        )

    @property
    def data_path(self) -> str:
        """Absolute path of the target Zarr folder, as a string."""
        return self.__path_to_data

    def write_zarr(self, shape: list, chunks: list, zarr_data: np.ndarray) -> float:
        """Create the Zarr folder (replacing any existing one) and write ``zarr_data``.

        Args:
            shape: Shape recorded in the array metadata.
            chunks: Chunk shape recorded in the array metadata.
            zarr_data: Data written over the whole array.

        Returns:
            Seconds spent in the write itself; opening/creating the array is
            not included.
        """
        # The metadata for the zarr folder that is to be created (specifications)
        zarr_spec = {
            'driver': 'zarr',
            'dtype': 'uint8',
            'kvstore': {
                'driver': 'file',
                'path': self.data_path,
            },
            'metadata': {
                'compressor': {
                    'id': 'blosc',
                    'cname': 'lz4',
                    'clevel': 1
                },
                'chunks': chunks,
                'dimension_separator': '/',
                'dtype': '|u1',
                'fill_value': 0,
                'filters': None,
                'order': 'C',
                'shape': shape,
                'zarr_format': 2,
            }
        }

        zarr_create = ts.open(zarr_spec, create=True, delete_existing=True).result()

        # timing the writing of the data to the zarr folder in seconds
        t = time.perf_counter()
        zarr_create[...].write(zarr_data).result()
        total_time = time.perf_counter() - t

        return total_time

    def append_zarr(self, shape: list, chunks: list, new_shape: list,
                    zarr_data: np.ndarray, multiplier: int) -> float:
        """Append ``zarr_data`` along the first axis, creating the folder if needed.

        When the folder does not exist yet the call falls back to
        ``write_zarr`` with the base ``shape``. Otherwise the array is resized
        so that its exclusive upper bound is ``new_shape`` and the data is
        written into the region starting at ``shape[0] * (multiplier - 1)``
        on the first axis.

        Args:
            shape: Base shape of a single appended block.
            chunks: Chunk shape; only used when the folder has to be created.
            new_shape: Full array shape after this append.
            zarr_data: Block of data to append.
            multiplier: 1-based number of this append; selects the offset.

        Returns:
            Seconds spent resizing and writing (or creating and writing on
            the first call).
        """
        # if there is no data to append to, create it
        if not Path(self.data_path).exists():
            return self.write_zarr(shape=shape, chunks=chunks, zarr_data=zarr_data)

        zarr_folder = ts.open(
            {
                'driver': 'zarr',
                'kvstore': {
                    'driver': 'file',
                    'path': self.data_path
                }
            },
            open=True
        ).result()

        # first-axis index at which the new block begins
        start = shape[0] * (multiplier - 1)

        # timing the appending of the data to the back of the zarr folder
        t = time.perf_counter()
        zarr_folder = zarr_folder.resize(exclusive_max=new_shape).result()
        zarr_folder[start:, :, :].write(zarr_data).result()
        total_time = time.perf_counter() - t

        return total_time
class Zarr_Python:
    """Writer that creates and appends to a Zarr folder through zarr-python."""

    def __init__(self) -> None:
        """Set up the Blosc (lz4, level 1) compressor and resolve the target path."""
        self.__compressor = Blosc(cname="lz4", clevel=1)
        target = Path(__file__).parent / "../example_data/zarr_python_data/test.zarr"
        self.__path_to_data = str(target.resolve())

    @property
    def data_path(self) -> str:
        """Absolute path of the target Zarr folder, as a string."""
        return self.__path_to_data

    def write_zarr(self, shape: list, chunks: list, zarr_data: np.ndarray) -> float:
        """Open the folder in write mode, store ``zarr_data``, return the seconds taken.

        Only the assignment of the data is timed, not the opening of the array.
        """
        open_options = {
            "mode": "w",
            "shape": shape,
            "chunks": chunks,
            "dtype": "u1",
            "compressor": self.__compressor,
        }
        array = zarr.open(self.data_path, **open_options)

        started = time.perf_counter()
        array[...] = zarr_data
        return time.perf_counter() - started

    def append_zarr(self, shape: list, chunks: list, zarr_data: np.ndarray) -> float:
        """Append ``zarr_data`` to the existing array, creating it on first use.

        ``shape`` and ``chunks`` only matter when the folder does not exist
        yet, in which case the call is delegated to ``write_zarr``.

        Returns:
            Seconds spent in the append (or in the initial write).
        """
        if Path(self.data_path).exists():
            array = zarr.open(self.data_path)

            # only the append itself is measured
            started = time.perf_counter()
            array.append(zarr_data)
            return time.perf_counter() - started

        # nothing to append to yet, so create it
        return self.write_zarr(shape=shape, chunks=chunks, zarr_data=zarr_data)