Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cpp zarr #8

Merged
merged 2 commits into from
Aug 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -50,4 +50,4 @@ RUN mkdir build && \
cd ..

# running the benchmark
#CMD [ "python", "main.py" ]
CMD [ "python", "main.py" ]
186 changes: 186 additions & 0 deletions benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
from zarr_libraries import *
from typing import Optional
import numpy as np
import shutil
import matplotlib.axes

class Benchmark:
    """Benchmark harness comparing write/append bandwidth of several zarr libraries.

    Drives each writer registered in ``self.__zarr_writers`` through
    increasingly large writes (or repeated appends) until a requested data
    volume is reached, recording bandwidth in GB/s and optionally plotting
    the per-step and average results on supplied matplotlib axes.
    """

    def __init__(self, shape: list, chunks: list) -> None:
        """Store the base array geometry and build the writer registry.

        Args:
            shape:  base shape of the zarr array; the first dimension is
                    scaled by a multiplier as each test grows the data.
            chunks: chunk shape passed through to every writer.
        """
        self.__shape = shape
        self.__chunks = chunks
        # maps "<library> Write"/"<library> Append" -> average GB/s,
        # printed at the end by __print_results
        self.__average_bandwidth = {}
        # library name -> writer adapter (project classes from zarr_libraries)
        self.__zarr_writers = {
            "TensorStore" : Tensorstore(),
            "Zarr Python" : Zarr_Python(),
            "OME Zarr"    : Ome_Zarr(),
            "Cpp Zarr"    : Cpp_Zarr()
        }
        # library name -> zero-arg lambda; rebuilt before every timed call
        self.__write_zarr = {}
        self.__append_zarr = {}

    ''' These functions are intended to be "private" and for use only inside the class '''
    def __set_write_functions(self, shape: list, zarr_data: np.ndarray) -> None:
        """Rebuild the write dispatch table for the given target shape/data.

        Each entry is a zero-arg lambda returning the elapsed write time in
        seconds.  Cpp Zarr generates its own data natively, so it only needs
        the shape; OME Zarr infers the shape from the data array.
        """
        self.__write_zarr = {
            "TensorStore" : lambda: self.__zarr_writers["TensorStore"].write_zarr(shape=shape, chunks=self.chunks, zarr_data=zarr_data),
            "Zarr Python" : lambda: self.__zarr_writers["Zarr Python"].write_zarr(shape=shape, chunks=self.chunks, zarr_data=zarr_data),
            "OME Zarr"    : lambda: self.__zarr_writers["OME Zarr"].write_zarr(chunks=self.chunks, zarr_data=zarr_data),
            "Cpp Zarr"    : lambda: self.__zarr_writers["Cpp Zarr"].write_zarr(shape=shape, chunks=self.chunks)
        }

    def __set_append_functions(self, new_shape: list, zarr_data: np.ndarray, multiplier: int) -> None:
        """Rebuild the append dispatch table.

        Only TensorStore and Zarr Python support appending; each entry is a
        zero-arg lambda returning the elapsed append time in seconds.
        """
        self.__append_zarr = {
            "TensorStore" : lambda: self.__zarr_writers["TensorStore"].append_zarr(shape=self.shape, chunks=self.chunks, new_shape=new_shape, zarr_data=zarr_data, multiplier=multiplier),
            "Zarr Python" : lambda: self.__zarr_writers["Zarr Python"].append_zarr(shape=self.shape, chunks=self.chunks, zarr_data=zarr_data)
        }

    def __print_results(self, additional_info: Optional[str] = None):
        """Print the accumulated average bandwidths, one line per test."""
        if additional_info:
            print(additional_info)

        print(f"Shape {self.shape}, Chunks {self.chunks}")
        print("----------Bandwidth----------")
        for test, bandwidth in self.__average_bandwidth.items():
            print(f"{test} : {bandwidth} GBps")
        print("\n\n")

    ''' These functions are intended to be "public" and for use outside of the class '''
    @property
    def shape(self) -> list:
        """Base shape of the benchmark array (first dim is the grow/append dim)."""
        return self.__shape

    @property
    def chunks(self) -> list:
        """Chunk shape handed to every writer."""
        return self.__chunks

    def run_write_tests(self, num_of_gigabytes: int, show_results: bool,
                        choose_lib: Optional[str] = None,
                        graph: Optional[matplotlib.axes._axes.Axes] = None,
                        avg_graph: Optional[matplotlib.axes._axes.Axes] = None) -> None:
        """Write ever-larger zarr folders with each library until the soft cap is hit.

        Args:
            num_of_gigabytes: soft cap — the loop stops once a single write
                              reaches this many GB.
            show_results:     print the bandwidth summary when done.
            choose_lib:       if given, run only that library's test.
            graph:            axes for per-step bandwidth vs file size.
            avg_graph:        axes for the average-bandwidth bar chart.

        Raises:
            ValueError: if ``choose_lib`` names an unknown library.
        """
        # error checking to see if chosen lib exists in test
        if choose_lib and choose_lib not in self.__zarr_writers:
            raise ValueError(f"There is no library of name \"{choose_lib}\".")

        gb_in_bytes = 1073741824  # represents number of bytes in a GB

        for lib_name, writer in self.__zarr_writers.items():
            # if a specified library is chosen for testing, skip any that isn't that test
            if choose_lib is not None and choose_lib != lib_name:
                continue

            print(f"\n\n--------{lib_name} Stress Test--------\n\n")

            multiplier = 1      # multiplier that increases shape of zarr folder written
            curr_data_size = 0  # test will run until curr_data_size reaches specified GB size passed into the function
            write_speeds = []
            file_sizes = []

            while curr_data_size < (num_of_gigabytes * gb_in_bytes):
                # modify the append dimension, unpack the rest
                new_shape = [self.shape[0] * (multiplier), *self.shape[1:]]
                zarr_data = np.empty(())  # placeholder; only replaced for python-side writers

                # Cpp zarr implementation creates data in cpp_zarr.cpp, skip here to avoid making unused data
                if lib_name != "Cpp Zarr":
                    zarr_data = np.random.randint(low=0, high=256, size=new_shape, dtype=np.uint8)

                # returns time taken to write zarr folder
                self.__set_write_functions(shape=new_shape, zarr_data=zarr_data)
                total_time = self.__write_zarr[lib_name]()  # calling a lambda function inside of a dictionary

                # prints info to the terminal
                print(f"Multiplier on first dimension : {multiplier}x\n{lib_name} -> creating zarr : {total_time} seconds")
                print(f"The zarr folder is of size {folder_size(writer.data_path)}\n\n")

                curr_data_size = np.prod(new_shape)            # 3d array filled with 1 byte ints so multiplication gives accurate size in bytes
                file_sizes.append(curr_data_size * 10**-9)     # converts bytes to GB
                write_speeds.append((curr_data_size * 10**-9) / total_time)  # GB/s

                # goes from 1 to 5, then adds 5 every time after that
                multiplier += 4 if multiplier == 1 else 5

                shutil.rmtree(writer.data_path)  # clean up so the next step starts fresh

            if graph: graph.plot(file_sizes, write_speeds, label=lib_name)
            if avg_graph: avg_graph.bar(lib_name, np.average(write_speeds))
            self.__average_bandwidth[lib_name + " Write"] = np.average(write_speeds)

            print("--------------------------------------------------------------\n\n")

        if show_results:
            self.__print_results(additional_info=(f"Write Test GB Soft Cap: {num_of_gigabytes}GB"))

    def run_append_test(self, num_of_gigabytes: int, show_results: bool,
                        choose_lib: Optional[str] = None,
                        graph: Optional[matplotlib.axes._axes.Axes] = None,
                        avg_graph: Optional[matplotlib.axes._axes.Axes] = None) -> None:
        """Repeatedly append ``self.shape``-sized data to one zarr folder per library.

        Only TensorStore and Zarr Python support appending; other libraries
        are skipped.  Arguments mirror :meth:`run_write_tests`.

        Raises:
            ValueError: if ``choose_lib`` names an unknown library.
        """
        # error checking to see if chosen lib exists in test
        if choose_lib and choose_lib not in self.__zarr_writers:
            raise ValueError(f"There is no library of name \"{choose_lib}\".")

        gb_in_bytes = 1073741824          # represents number of bytes in a GB
        write_size = np.prod(self.shape)  # amount of bytes appended on in each function call

        for lib_name, writer in self.__zarr_writers.items():
            # these are the only libraries that allow for appending of data
            if lib_name != "TensorStore" and lib_name != "Zarr Python":
                continue

            # if a specified library is chosen for testing, skip any that isn't that test
            if choose_lib is not None and choose_lib != lib_name:
                continue

            print(f"\n\n--------{lib_name} Append Stress Test--------\n\n")

            multiplier = 1      # multiplier that increases shape of zarr folder written
            curr_data_size = 0  # test will run until curr_data_size reaches specified GB size passed into the function
            write_speeds = []
            write_numbers = []

            while curr_data_size < (num_of_gigabytes * gb_in_bytes):
                # modify the append dimension, unpack the rest
                new_shape = [self.shape[0] * (multiplier), *self.shape[1:]]

                # creating new data and adjusting the shape
                zarr_data = np.random.randint(low=0, high=256, size=self.shape, dtype=np.uint8)

                # returns time taken to write zarr folder / both libraries use a different approach hence the if statements
                self.__set_append_functions(new_shape=new_shape, zarr_data=zarr_data, multiplier=multiplier)
                total_time = self.__append_zarr[lib_name]()  # calling a lambda function inside of a dictionary

                # prints info to the terminal
                print(f"Multiplier on first dimension : {multiplier}x\n{lib_name} -> appending zarr : {total_time} seconds")
                print(f"The zarr folder is of size {folder_size(writer.data_path)}\n\n")

                curr_data_size = np.prod(new_shape)  # 3d array filled with 1 byte ints so multiplication gives accurate size in bytes
                write_numbers.append(multiplier)     # x-axis: which append (write number) this was
                write_speeds.append((write_size * 10**-9) / total_time)  # GB/s

                multiplier += 1

            shutil.rmtree(writer.data_path)  # appends target one folder; remove it once the test ends

            if graph: graph.plot(write_numbers, write_speeds, label=lib_name)
            if avg_graph: avg_graph.bar(lib_name, np.average(write_speeds))
            self.__average_bandwidth[lib_name + " Append"] = np.average(write_speeds)

            print("--------------------------------------------------------------\n\n")

        if show_results:
            # label the append test's cap correctly (was mislabeled as the write test)
            self.__print_results(additional_info=(f"Append Test GB Soft Cap: {num_of_gigabytes}GB"))

    def run_all_tests(self, append_test_gigabytes: int, write_test_gigabytes: int,
                      choose_lib: Optional[str] = None,
                      append_graph: Optional[matplotlib.axes._axes.Axes] = None, append_avg_graph: Optional[matplotlib.axes._axes.Axes] = None,
                      write_graph: Optional[matplotlib.axes._axes.Axes] = None, write_avg_graph: Optional[matplotlib.axes._axes.Axes] = None) -> None:
        """Run the append test then the write test and print one combined summary."""
        self.run_append_test(num_of_gigabytes=append_test_gigabytes, show_results=False, choose_lib=choose_lib, graph=append_graph, avg_graph=append_avg_graph)
        self.run_write_tests(num_of_gigabytes=write_test_gigabytes, show_results=False, choose_lib=choose_lib, graph=write_graph, avg_graph=write_avg_graph)
        self.__print_results(additional_info=(f"Write Test GB Soft Cap: {write_test_gigabytes}GB | Append Test GB Soft Cap: {append_test_gigabytes}GB"))

92 changes: 26 additions & 66 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,77 +1,37 @@
import matplotlib.pyplot as plt
from zarr_libraries import *
from benchmark import Benchmark


def main() -> None:
fig, graph = plt.subplots(2, 2)
benchmark = Benchmark(shape=[64, 1080, 1920], chunks=[64, 540, 960])

def run_all_tests(shape: list, chunks: list) -> None:
fig, graph = plt.subplots(2, 2)
bandwidth_map = {}
zarr_writers = {
"TensorStore" : Tensorstore(shape=shape, chunks=chunks),
"Zarr Python" : Zarr_Python(shape=shape, chunks=chunks),
"OME Zarr" : Ome_Zarr(shape=shape, chunks=chunks),
"Cpp Zarr" : Cpp_Zarr(shape=shape, chunks=chunks)
}

'''
Append Tests:
- These tests benchmark the continuous appending to a single zarr folder.
- These tests are best suited for the following libraries:
* TensorStore
* Zarr Python
'''
for name, writer in zarr_writers.items():
if name != "TensorStore" and name != "Zarr Python":
continue

bandwidth_map[name + " Append"] = (
writer.continuous_append_test(graph=graph[1][0], avg_graph=graph[1][1], append_dim_size=100)
)

# setting up graph for append tests
graph[1][0].set_xlabel("Write Number")
graph[1][0].set_title("Continuous Append Test")
graph[1][0].legend()
benchmark.run_all_tests(
append_test_gigabytes=25, write_test_gigabytes=5,
append_graph=graph[1][0], append_avg_graph=graph[1][1],
write_graph=graph[0][0], write_avg_graph=graph[0][1]
)

'''
Continuous write tests:
- These tests benchmark the creation of many increasingly large zarr folders.
- These tests are best suited for the following libraries:
* TensorStore
* Zarr Python
* OME Zarr
'''
for name, writer in zarr_writers.items():
bandwidth_map[name + " Write"] = (
writer.continuous_write_test(graph=graph[0][0], avg_graph=graph[0][1], append_dim_size=51, step=5)
)

# print the average bandwidth for each of the tests
print(f"Shape {shape}, Chunks {chunks}")
print("----------Bandwidth----------")
for test, bandwidth in bandwidth_map.items():
print(f"{test} : {bandwidth} GBps")
print("\n\n")

# setting up graphs for write tests
graph[0][0].set_xlabel("Data Size (GB)")
graph[0][0].set_title("Continuous Write Test")
graph[0][0].legend()

# setting up graphs for average bandwidth
graph[0][1].set_title("Average Bandwidth:\nContinuous Write Test")
graph[1][1].set_title("Average Bandwidth:\nContinuous Append Test")
# setting up graph for append tests
graph[1][0].set_xlabel("Write Number")
graph[1][0].set_title("Continuous Append Test")
graph[1][0].legend()

for graph in fig.get_axes():
graph.set_ylabel("Bandwidth (GBps)")
graph.grid()

fig.canvas.manager.set_window_title(f'shape: {shape}, chunks: {chunks}')
plt.tight_layout()

# setting up graphs for write tests
graph[0][0].set_xlabel("Data Size (GB)")
graph[0][0].set_title("Continuous Write Test")
graph[0][0].legend()

run_all_tests(shape=[64, 1080, 1920], chunks=[64, 540, 960])
# setting up graphs for average bandwidth
graph[0][1].set_title("Average Bandwidth:\nContinuous Write Test")
graph[1][1].set_title("Average Bandwidth:\nContinuous Append Test")

for graph in fig.get_axes():
graph.set_ylabel("Bandwidth (GBps)")
graph.grid()

fig.canvas.manager.set_window_title(f'shape: {benchmark.shape}, chunks: {benchmark.chunks}')
plt.tight_layout()
plt.show()


Expand Down
2 changes: 1 addition & 1 deletion zarr_libraries/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ def folder_size(folder_path: str) -> str:
def convert_bytes(B: int) -> str:
"""Return the given bytes as a human friendly KB, MB, GB, or TB string."""
B = float(B)
KB = float(1000) # change to 1024 for non mac file systems
KB = float(1024) # change to 1024 for non mac file systems
MB = float(KB ** 2) # 1,048,576
GB = float(KB ** 3) # 1,073,741,824
TB = float(KB ** 4) # 1,099,511,627,776
Expand Down
2 changes: 1 addition & 1 deletion zarr_libraries/cpp_zarr/cpp_zarr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ float write_zarr(string path, vector<uint64_t> chunks, vector<uint64_t> shape)
zarrObject.set_shape(writeShape);
zarrObject.set_chunks(chunks);
zarrObject.set_fill_value(1);
zarrObject.set_order("C");
//zarrObject.set_order("C");
zarrObject.set_dimension_separator("/");
zarrObject.set_clevel((uint64_t)1);
zarrObject.set_chunkInfo(startCoords, endCoords);
Expand Down
53 changes: 9 additions & 44 deletions zarr_libraries/cpp_zarr/cpp_zarr.py
Original file line number Diff line number Diff line change
@@ -1,53 +1,18 @@
from build.pyCppZarr import *
from zarr_libraries import folder_size
from pathlib import Path
import matplotlib.axes
import numpy as np
import shutil


class Cpp_Zarr:
def __init__(self, shape: list, chunks: list) -> None:
self.abs_path_to_data = str((Path(__file__).parent / "../example_data/cpp_zarr_data").resolve())
self.shape = shape
self.chunks = chunks


def __continuous_write(self, result_path: str, append_dim_size: int, step: int) -> tuple[list, list]:
file_sizes = []
bandwidths = []
def __init__(self) -> None:
    """Resolve the fixed output location for this writer's zarr data.

    The path is anchored relative to this source file, so it is stable
    regardless of the process's working directory.
    """
    self.__path_to_data = str((Path(__file__).parent / "../example_data/cpp_zarr_data/test.zarr").resolve())

for i in range(0, append_dim_size, step):
new_shape = (self.shape[0] * (i + 1), *self.shape[1:]) # modify the append dimension, unpack the rest

# write zarr files and store total time taken
total_time = write_zarr(result_path, self.chunks, new_shape)

# print info to the terminal
print(f"Write #{i + 1}\nCpp Zarr -> creating zarr : {total_time} seconds")
print(f"The zarr folder is of size {folder_size(result_path)}\n\n")

size = np.prod(new_shape) # 3d array filled with 1 byte ints so multiplication gives accurate size in bytes
file_sizes.append(size * 10**-9) # converts bytes to GB
bandwidths.append((size * 10**-9) / total_time) # GB/s
shutil.rmtree(result_path)

return file_sizes, bandwidths

@property
def data_path(self) -> str:
    """Absolute path of the zarr folder this writer creates (read-only)."""
    return self.__path_to_data


def continuous_write_test(self, graph: matplotlib.axes._axes.Axes,
avg_graph: matplotlib.axes._axes.Axes,
append_dim_size: int, step: int) -> float:
# calls continuous write function and graphs results
print("\n\n--------Cpp Zarr Stress Test--------\n\n")
file_sizes, bandwidths = self.__continuous_write(
result_path = self.abs_path_to_data + "/stressTest.zarr",
append_dim_size = append_dim_size,
step = step
)
print("--------------------------------------------------------------\n\n")
graph.plot(file_sizes, bandwidths, label="Cpp Zarr", marker='o')
avg_graph.bar("Cpp Zarr", np.average(bandwidths))
return float(np.average(bandwidths))


def write_zarr(self, shape: list, chunks: list) -> float:
    """Write a zarr folder of the given shape/chunks via the C++ extension.

    Delegates to the module-level ``write_zarr`` (from build.pyCppZarr);
    presumably returns the elapsed write time in seconds, matching the
    other writers' interface — confirm against the pybind signature.
    Note the argument order flips to (path, chunks, shape) for the C++ call.
    """
    return write_zarr(self.data_path, chunks, shape)

Loading