Skip to content

Commit

Permalink
Merge pull request #8 from chris-delg/cpp-zarr
Browse files Browse the repository at this point in the history
Cpp zarr
  • Loading branch information
chris-delg authored Aug 27, 2024
2 parents fb69851 + b348fce commit b304221
Show file tree
Hide file tree
Showing 9 changed files with 309 additions and 380 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -50,4 +50,4 @@ RUN mkdir build && \
cd ..

# running the benchmark
#CMD [ "python", "main.py" ]
CMD [ "python", "main.py" ]
186 changes: 186 additions & 0 deletions benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
from zarr_libraries import *
from typing import Optional
import numpy as np
import shutil
import matplotlib.axes

class Benchmark:
    """Stress-test harness comparing write/append bandwidth of several Zarr libraries.

    The writer adapters (TensorStore, Zarr Python, OME Zarr, Cpp Zarr) come from
    zarr_libraries; each exposes a ``data_path`` attribute plus ``write_zarr`` /
    ``append_zarr`` methods that return the elapsed time in seconds.
    Results are accumulated as average GB/s per test in ``__average_bandwidth``.
    """

    def __init__(self, shape: list, chunks: list) -> None:
        """Create writer adapters for every library under test.

        shape  -- base array shape; the first dimension is scaled up during tests
        chunks -- chunk shape passed unchanged to every writer
        """
        self.__shape = shape
        self.__chunks = chunks
        self.__average_bandwidth = {}   # test label -> average bandwidth (GB/s)
        self.__zarr_writers = {
            "TensorStore" : Tensorstore(),
            "Zarr Python" : Zarr_Python(),
            "OME Zarr" : Ome_Zarr(),
            "Cpp Zarr" : Cpp_Zarr()
        }
        # dispatch tables of zero-argument lambdas, rebuilt before each timed call
        self.__write_zarr = {}
        self.__append_zarr = {}


    ''' These functions are intended to be "private" and for use only inside the class '''
    def __set_write_functions(self, shape: list, zarr_data: np.ndarray) -> None:
        """Rebuild the write dispatch table for the given target shape/data.

        Each library takes slightly different arguments (OME Zarr infers the shape
        from the data; Cpp Zarr generates its own data natively), hence one lambda
        per library rather than a single generic call.
        """
        self.__write_zarr = {
            "TensorStore" : lambda: self.__zarr_writers["TensorStore"].write_zarr(shape=shape, chunks=self.chunks, zarr_data=zarr_data),
            "Zarr Python" : lambda: self.__zarr_writers["Zarr Python"].write_zarr(shape=shape, chunks=self.chunks, zarr_data=zarr_data),
            "OME Zarr" : lambda: self.__zarr_writers["OME Zarr"].write_zarr(chunks=self.chunks, zarr_data=zarr_data),
            "Cpp Zarr" : lambda: self.__zarr_writers["Cpp Zarr"].write_zarr(shape=shape, chunks=self.chunks)
        }


    def __set_append_functions(self, new_shape: list, zarr_data: np.ndarray, multiplier: int) -> None:
        """Rebuild the append dispatch table.

        Only TensorStore and Zarr Python support appending; the two take different
        arguments (TensorStore needs the resized target shape and the multiplier).
        """
        self.__append_zarr = {
            "TensorStore" : lambda: self.__zarr_writers["TensorStore"].append_zarr(shape=self.shape, chunks=self.chunks, new_shape=new_shape, zarr_data=zarr_data, multiplier=multiplier),
            "Zarr Python" : lambda: self.__zarr_writers["Zarr Python"].append_zarr(shape=self.shape, chunks=self.chunks, zarr_data=zarr_data)
        }


    def __print_results(self, additional_info: Optional[str] = None):
        """Print every recorded average bandwidth, optionally preceded by a header line."""
        if additional_info: print(additional_info)

        print(f"Shape {self.shape}, Chunks {self.chunks}")
        print("----------Bandwidth----------")
        for test, bandwidth in self.__average_bandwidth.items():
            print(f"{test} : {bandwidth} GBps")
        print("\n\n")


    ''' These functions are intended to be "public" and for use outside of the class '''
    @property
    def shape(self) -> list:
        """Base array shape supplied at construction (read-only)."""
        return self.__shape


    @property
    def chunks(self) -> list:
        """Chunk shape supplied at construction (read-only)."""
        return self.__chunks


    def run_write_tests(self, num_of_gigabytes: int, show_results: bool,
                        choose_lib: Optional[str] = None,
                        graph: Optional["matplotlib.axes.Axes"] = None,
                        avg_graph: Optional["matplotlib.axes.Axes"] = None) -> None:
        """Benchmark creating ever-larger zarr folders until ``num_of_gigabytes`` is exceeded.

        num_of_gigabytes -- soft cap; the loop stops once the written data passes it
        show_results     -- print the bandwidth summary when done
        choose_lib       -- run only this library (raises ValueError if unknown)
        graph            -- optional axes for bandwidth vs. file size
        avg_graph        -- optional axes for a per-library average-bandwidth bar

        Raises ValueError when ``choose_lib`` names no registered writer.
        """
        # error checking to see if chosen lib exists in test
        if choose_lib and choose_lib not in self.__zarr_writers:
            raise ValueError(f"There is no library of name \"{choose_lib}\".")

        gb_in_bytes = 1073741824  # number of bytes in a GB (2**30)

        for lib_name, writer in self.__zarr_writers.items():
            # if a specified library is chosen for testing, skip any that isn't that test
            if choose_lib is not None and choose_lib != lib_name: continue

            print(f"\n\n--------{lib_name} Stress Test--------\n\n")

            multiplier = 1      # multiplier that increases shape of zarr folder written
            curr_data_size = 0  # test runs until this reaches the specified GB soft cap
            write_speeds = []
            file_sizes = []

            while curr_data_size < (num_of_gigabytes * gb_in_bytes):
                # modify the append dimension, unpack the rest
                new_shape = [self.shape[0] * (multiplier), *self.shape[1:]]
                zarr_data = np.empty(())  # placeholder; real data is only made when needed

                # Cpp zarr implementation creates data in cpp_zarr.cpp, skip here to avoid making unused data
                if lib_name != "Cpp Zarr":
                    zarr_data = np.random.randint(low=0, high=256, size=new_shape, dtype=np.uint8)

                # returns time taken to write zarr folder
                self.__set_write_functions(shape=new_shape, zarr_data=zarr_data)
                total_time = self.__write_zarr[lib_name]()  # calling a lambda function inside of a dictionary

                # prints info to the terminal
                print(f"Multiplier on first dimension : {multiplier}x\n{lib_name} -> creating zarr : {total_time} seconds")
                print(f"The zarr folder is of size {folder_size(writer.data_path)}\n\n")

                curr_data_size = np.prod(new_shape)             # 3d array filled with 1 byte ints so multiplication gives accurate size in bytes
                file_sizes.append(curr_data_size * 10**-9)      # converts bytes to GB
                write_speeds.append((curr_data_size * 10**-9) / total_time)  # GB/s

                # goes from 1 to 5, then adds 5 every time after that
                multiplier += 4 if multiplier == 1 else 5

                shutil.rmtree(writer.data_path)  # remove the written folder between runs

            if graph: graph.plot(file_sizes, write_speeds, label=lib_name)
            if avg_graph: avg_graph.bar(lib_name, np.average(write_speeds))
            self.__average_bandwidth[lib_name + " Write"] = np.average(write_speeds)

            print("--------------------------------------------------------------\n\n")

        if show_results:
            self.__print_results(additional_info=(f"Write Test GB Soft Cap: {num_of_gigabytes}GB"))


    def run_append_test(self, num_of_gigabytes: int, show_results: bool,
                        choose_lib: Optional[str] = None,
                        graph: Optional["matplotlib.axes.Axes"] = None,
                        avg_graph: Optional["matplotlib.axes.Axes"] = None) -> None:
        """Benchmark repeatedly appending ``self.shape``-sized data to one zarr folder.

        Only TensorStore and Zarr Python support appending; other writers are skipped.
        Parameters mirror run_write_tests; ``graph`` plots bandwidth vs. append number.

        Raises ValueError when ``choose_lib`` names no registered writer.
        """
        # error checking to see if chosen lib exists in test
        if choose_lib and choose_lib not in self.__zarr_writers:
            raise ValueError(f"There is no library of name \"{choose_lib}\".")

        gb_in_bytes = 1073741824         # number of bytes in a GB (2**30)
        write_size = np.prod(self.shape) # amount of bytes appended on in each function call

        for lib_name, writer in self.__zarr_writers.items():
            # these are the only libraries that allow for appending of data
            if lib_name != "TensorStore" and lib_name != "Zarr Python":
                continue

            # if a specified library is chosen for testing, skip any that isn't that test
            if choose_lib is not None and choose_lib != lib_name: continue

            print(f"\n\n--------{lib_name} Append Stress Test--------\n\n")

            multiplier = 1      # number of appends done so far; scales the folder's first dim
            curr_data_size = 0  # test runs until this reaches the specified GB soft cap
            write_speeds = []
            write_numbers = []

            while curr_data_size < (num_of_gigabytes * gb_in_bytes):
                # modify the append dimension, unpack the rest
                new_shape = [self.shape[0] * (multiplier), *self.shape[1:]]

                # creating new data and adjusting the shape
                zarr_data = np.random.randint(low=0, high=256, size=self.shape, dtype=np.uint8)

                # returns time taken to write zarr folder / both libraries use a different approach hence the if statements
                self.__set_append_functions(new_shape=new_shape, zarr_data=zarr_data, multiplier=multiplier)
                total_time = self.__append_zarr[lib_name]()  # calling a lambda function inside of a dictionary

                # prints info to the terminal
                print(f"Multiplier on first dimension : {multiplier}x\n{lib_name} -> appending zarr : {total_time} seconds")
                print(f"The zarr folder is of size {folder_size(writer.data_path)}\n\n")

                curr_data_size = np.prod(new_shape)                          # 3d array filled with 1 byte ints so multiplication gives accurate size in bytes
                write_numbers.append(multiplier)                             # x-axis value: which append iteration this was
                write_speeds.append((write_size * 10**-9) / total_time)      # GB/s

                multiplier += 1

            shutil.rmtree(writer.data_path)  # folder grows across the loop; remove once per library

            if graph: graph.plot(write_numbers, write_speeds, label=lib_name)
            if avg_graph: avg_graph.bar(lib_name, np.average(write_speeds))
            self.__average_bandwidth[lib_name + " Append"] = np.average(write_speeds)

            print("--------------------------------------------------------------\n\n")

        if show_results:
            # NOTE: fixed copy-paste bug — this summary previously said "Write Test"
            self.__print_results(additional_info=(f"Append Test GB Soft Cap: {num_of_gigabytes}GB"))


    def run_all_tests(self, append_test_gigabytes: int, write_test_gigabytes: int,
                      choose_lib: Optional[str] = None,
                      append_graph: Optional["matplotlib.axes.Axes"] = None, append_avg_graph: Optional["matplotlib.axes.Axes"] = None,
                      write_graph: Optional["matplotlib.axes.Axes"] = None, write_avg_graph: Optional["matplotlib.axes.Axes"] = None) -> None:
        """Run the append test then the write test, printing one combined summary."""
        self.run_append_test(num_of_gigabytes=append_test_gigabytes, show_results=False, choose_lib=choose_lib, graph=append_graph, avg_graph=append_avg_graph)
        self.run_write_tests(num_of_gigabytes=write_test_gigabytes, show_results=False, choose_lib=choose_lib, graph=write_graph, avg_graph=write_avg_graph)
        self.__print_results(additional_info=(f"Write Test GB Soft Cap: {write_test_gigabytes}GB | Append Test GB Soft Cap: {append_test_gigabytes}GB"))

92 changes: 26 additions & 66 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,77 +1,37 @@
import matplotlib.pyplot as plt
from zarr_libraries import *
from benchmark import Benchmark


def main() -> None:
fig, graph = plt.subplots(2, 2)
benchmark = Benchmark(shape=[64, 1080, 1920], chunks=[64, 540, 960])

def run_all_tests(shape: list, chunks: list) -> None:
fig, graph = plt.subplots(2, 2)
bandwidth_map = {}
zarr_writers = {
"TensorStore" : Tensorstore(shape=shape, chunks=chunks),
"Zarr Python" : Zarr_Python(shape=shape, chunks=chunks),
"OME Zarr" : Ome_Zarr(shape=shape, chunks=chunks),
"Cpp Zarr" : Cpp_Zarr(shape=shape, chunks=chunks)
}

'''
Append Tests:
- These tests benchmark the continuous appending to a single zarr folder.
- These tests are best suited for the following libraries:
* TensorStore
* Zarr Python
'''
for name, writer in zarr_writers.items():
if name != "TensorStore" and name != "Zarr Python":
continue

bandwidth_map[name + " Append"] = (
writer.continuous_append_test(graph=graph[1][0], avg_graph=graph[1][1], append_dim_size=100)
)

# setting up graph for append tests
graph[1][0].set_xlabel("Write Number")
graph[1][0].set_title("Continuous Append Test")
graph[1][0].legend()
benchmark.run_all_tests(
append_test_gigabytes=25, write_test_gigabytes=5,
append_graph=graph[1][0], append_avg_graph=graph[1][1],
write_graph=graph[0][0], write_avg_graph=graph[0][1]
)

'''
Continuous write tests:
- These tests benchmark the creation of many increasingly large zarr folders.
- These tests are best suited for the following libraries:
* TensorStore
* Zarr Python
* OME Zarr
'''
for name, writer in zarr_writers.items():
bandwidth_map[name + " Write"] = (
writer.continuous_write_test(graph=graph[0][0], avg_graph=graph[0][1], append_dim_size=51, step=5)
)

# print the average bandwidth for each of the tests
print(f"Shape {shape}, Chunks {chunks}")
print("----------Bandwidth----------")
for test, bandwidth in bandwidth_map.items():
print(f"{test} : {bandwidth} GBps")
print("\n\n")

# setting up graphs for write tests
graph[0][0].set_xlabel("Data Size (GB)")
graph[0][0].set_title("Continuous Write Test")
graph[0][0].legend()

# setting up graphs for average bandwidth
graph[0][1].set_title("Average Bandwidth:\nContinuous Write Test")
graph[1][1].set_title("Average Bandwidth:\nContinuous Append Test")
# setting up graph for append tests
graph[1][0].set_xlabel("Write Number")
graph[1][0].set_title("Continuous Append Test")
graph[1][0].legend()

for graph in fig.get_axes():
graph.set_ylabel("Bandwidth (GBps)")
graph.grid()

fig.canvas.manager.set_window_title(f'shape: {shape}, chunks: {chunks}')
plt.tight_layout()

# setting up graphs for write tests
graph[0][0].set_xlabel("Data Size (GB)")
graph[0][0].set_title("Continuous Write Test")
graph[0][0].legend()

run_all_tests(shape=[64, 1080, 1920], chunks=[64, 540, 960])
# setting up graphs for average bandwidth
graph[0][1].set_title("Average Bandwidth:\nContinuous Write Test")
graph[1][1].set_title("Average Bandwidth:\nContinuous Append Test")

for graph in fig.get_axes():
graph.set_ylabel("Bandwidth (GBps)")
graph.grid()

fig.canvas.manager.set_window_title(f'shape: {benchmark.shape}, chunks: {benchmark.chunks}')
plt.tight_layout()
plt.show()


Expand Down
2 changes: 1 addition & 1 deletion zarr_libraries/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ def folder_size(folder_path: str) -> str:
def convert_bytes(B: int) -> str:
"""Return the given bytes as a human friendly KB, MB, GB, or TB string."""
B = float(B)
KB = float(1000) # change to 1024 for non mac file systems
KB = float(1024) # change to 1024 for non mac file systems
MB = float(KB ** 2) # 1,048,576
GB = float(KB ** 3) # 1,073,741,824
TB = float(KB ** 4) # 1,099,511,627,776
Expand Down
2 changes: 1 addition & 1 deletion zarr_libraries/cpp_zarr/cpp_zarr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ float write_zarr(string path, vector<uint64_t> chunks, vector<uint64_t> shape)
zarrObject.set_shape(writeShape);
zarrObject.set_chunks(chunks);
zarrObject.set_fill_value(1);
zarrObject.set_order("C");
//zarrObject.set_order("C");
zarrObject.set_dimension_separator("/");
zarrObject.set_clevel((uint64_t)1);
zarrObject.set_chunkInfo(startCoords, endCoords);
Expand Down
53 changes: 9 additions & 44 deletions zarr_libraries/cpp_zarr/cpp_zarr.py
Original file line number Diff line number Diff line change
@@ -1,53 +1,18 @@
from build.pyCppZarr import *
from zarr_libraries import folder_size
from pathlib import Path
import matplotlib.axes
import numpy as np
import shutil


class Cpp_Zarr:
def __init__(self, shape: list, chunks: list) -> None:
self.abs_path_to_data = str((Path(__file__).parent / "../example_data/cpp_zarr_data").resolve())
self.shape = shape
self.chunks = chunks


def __continuous_write(self, result_path: str, append_dim_size: int, step: int) -> tuple[list, list]:
file_sizes = []
bandwidths = []
def __init__(self) -> None:
self.__path_to_data = str((Path(__file__).parent / "../example_data/cpp_zarr_data/test.zarr").resolve())

for i in range(0, append_dim_size, step):
new_shape = (self.shape[0] * (i + 1), *self.shape[1:]) # modify the append dimension, unpack the rest

# write zarr files and store total time taken
total_time = write_zarr(result_path, self.chunks, new_shape)

# print info to the terminal
print(f"Write #{i + 1}\nCpp Zarr -> creating zarr : {total_time} seconds")
print(f"The zarr folder is of size {folder_size(result_path)}\n\n")

size = np.prod(new_shape) # 3d array filled with 1 byte ints so multiplication gives accurate size in bytes
file_sizes.append(size * 10**-9) # converts bytes to GB
bandwidths.append((size * 10**-9) / total_time) # GB/s
shutil.rmtree(result_path)

return file_sizes, bandwidths

@property
def data_path(self) -> str:
return self.__path_to_data


def continuous_write_test(self, graph: matplotlib.axes._axes.Axes,
avg_graph: matplotlib.axes._axes.Axes,
append_dim_size: int, step: int) -> float:
# calls continuous write function and graphs results
print("\n\n--------Cpp Zarr Stress Test--------\n\n")
file_sizes, bandwidths = self.__continuous_write(
result_path = self.abs_path_to_data + "/stressTest.zarr",
append_dim_size = append_dim_size,
step = step
)
print("--------------------------------------------------------------\n\n")
graph.plot(file_sizes, bandwidths, label="Cpp Zarr", marker='o')
avg_graph.bar("Cpp Zarr", np.average(bandwidths))
return float(np.average(bandwidths))


def write_zarr(self, shape: list, chunks: list) -> float:
return write_zarr(self.data_path, chunks, shape)

Loading

0 comments on commit b304221

Please sign in to comment.