Skip to content

Commit

Permalink
change default chunk size to 1 MB
Browse files Browse the repository at this point in the history
This value was based on the benchmarks in f0a6e96.

Signed-off-by: Spencer Schrock <[email protected]>
  • Loading branch information
spencerschrock committed Jan 16, 2025
1 parent f0a6e96 commit 5b7febd
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 18 deletions.
2 changes: 1 addition & 1 deletion benchmarks/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def create_file_of_given_size(path: str, size: int) -> None:
"""
file_path = pathlib.Path(path)
file_path.parent.mkdir(parents=True, exist_ok=True)
chunk_size = 8192
chunk_size = 1048576
num_chunks = size // chunk_size

with file_path.open("wb") as f:
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/serialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ def build_parser() -> argparse.ArgumentParser:

param_groups = parser.add_argument_group("Internal parameters to fine-tune")
param_groups.add_argument(
"--chunk", help="chunk size (default: 8192)", type=int, default=8192
"--chunk", help="chunk size (default: 1048576)", type=int, default=1048576
)
param_groups.add_argument(
"--shard",
Expand Down
24 changes: 12 additions & 12 deletions src/model_signing/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,13 +160,13 @@ def _build_stream_hasher(
def _build_file_hasher_factory(
self,
hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
chunk_size: int = 8192,
chunk_size: int = 1048576,
) -> Callable[[pathlib.Path], file.SimpleFileHasher]:
"""Builds the hasher factory for a serialization by file.
Args:
hashing_algorithm: the hashing algorithm to use to hash a file
chunk_size: The amount of file to read at once. Default is 8KB. A
chunk_size: The amount of file to read at once. Default is 1MB. A
special value of 0 signals to attempt to read everything in a
single call.
Expand All @@ -184,14 +184,14 @@ def factory(path: pathlib.Path) -> file.SimpleFileHasher:
def _build_sharded_file_hasher_factory(
self,
hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
chunk_size: int = 8192,
chunk_size: int = 1048576,
shard_size: int = 1000000,
) -> Callable[[pathlib.Path, int, int], file.ShardedFileHasher]:
"""Builds the hasher factory for a serialization by file shards.
Args:
hashing_algorithm: the hashing algorithm to use to hash a file
chunk_size: The amount of file to read at once. Default is 8KB. A
chunk_size: The amount of file to read at once. Default is 1MB. A
special value of 0 signals to attempt to read everything in a
single call.
shard_size: The size of a file shard. Default is 1,000,000 bytes.
Expand Down Expand Up @@ -220,7 +220,7 @@ def set_serialize_by_file_to_manifest(
self,
*,
hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
chunk_size: int = 8192,
chunk_size: int = 1048576,
max_workers: Optional[int] = None,
allow_symlinks: bool = False,
) -> Self:
Expand All @@ -232,7 +232,7 @@ def set_serialize_by_file_to_manifest(
Args:
hashing_algorithm: the hashing algorithm to use to hash a file
chunk_size: The amount of file to read at once. Default is 8KB. A
chunk_size: The amount of file to read at once. Default is 1MB. A
special value of 0 signals to attempt to read everything in a
single call.
max_workers: Maximum number of workers to use in parallel. Default
Expand All @@ -256,7 +256,7 @@ def set_serialize_by_file_to_digest(
*,
hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
merge_algorithm: Literal["sha256", "blake2"] = "sha256",
chunk_size: int = 8192,
chunk_size: int = 1048576,
allow_symlinks: bool = False,
) -> Self:
"""Configures serialization to a single digest, at file granularity.
Expand All @@ -269,7 +269,7 @@ def set_serialize_by_file_to_digest(
hashing_algorithm: the hashing algorithm to use to hash a file
merge_algorithm: the hashing algorithm to use when computing the
final digest over all the (file, digest) pairings
chunk_size: The amount of file to read at once. Default is 8KB. A
chunk_size: The amount of file to read at once. Default is 1MB. A
special value of 0 signals to attempt to read everything in a
single call.
allow_symlinks: Controls whether symbolic links are included. If a
Expand Down Expand Up @@ -298,7 +298,7 @@ def set_serialize_by_file_shard_to_manifest(
self,
*,
hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
chunk_size: int = 8192,
chunk_size: int = 1048576,
shard_size: int = 1000000,
max_workers: Optional[int] = None,
allow_symlinks: bool = False,
Expand All @@ -312,7 +312,7 @@ def set_serialize_by_file_shard_to_manifest(
Args:
hashing_algorithm: the hashing algorithm to use to hash a file shard
chunk_size: The amount of file to read at once. Default is 8KB. A
chunk_size: The amount of file to read at once. Default is 1MB. A
special value of 0 signals to attempt to read everything in a
single call.
shard_size: The size of a file shard. Default is 1,000,000 bytes.
Expand All @@ -339,7 +339,7 @@ def set_serialize_by_file_shard_to_digest(
*,
hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
merge_algorithm: Literal["sha256", "blake2"] = "sha256",
chunk_size: int = 8192,
chunk_size: int = 1048576,
shard_size: int = 1000000,
max_workers: Optional[int] = None,
allow_symlinks: bool = False,
Expand All @@ -354,7 +354,7 @@ def set_serialize_by_file_shard_to_digest(
hashing_algorithm: the hashing algorithm to use to hash a file shard
merge_algorithm: the hashing algorithm to use when computing the
final digest over all the (file, digest) pairings
chunk_size: The amount of file to read at once. Default is 8KB. A
chunk_size: The amount of file to read at once. Default is 1MB. A
special value of 0 signals to attempt to read everything in a
single call.
shard_size: The size of a file shard. Default is 1,000,000 bytes.
Expand Down
8 changes: 4 additions & 4 deletions src/model_signing/hashing/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def __init__(
file: pathlib.Path,
content_hasher: hashing.StreamingHashEngine,
*,
chunk_size: int = 8192,
chunk_size: int = 1048576,
digest_name_override: Optional[str] = None,
):
"""Initializes an instance to hash a file with a specific `HashEngine`.
Expand All @@ -92,7 +92,7 @@ def __init__(
file: The file to hash. Use `set_file` to reset it.
content_hasher: A `hashing.StreamingHashEngine` instance used to
compute the digest of the file.
chunk_size: The amount of file to read at once. Default is 8KB. A
chunk_size: The amount of file to read at once. Default is 1MB. A
special value of 0 signals to attempt to read everything in a
single call.
digest_name_override: Optional string to allow overriding the
Expand Down Expand Up @@ -235,7 +235,7 @@ def __init__(
*,
start: int,
end: int,
chunk_size: int = 8192,
chunk_size: int = 1048576,
shard_size: int = 1000000,
digest_name_override: Optional[str] = None,
):
Expand All @@ -250,7 +250,7 @@ def __init__(
end: The file offset to stop reading at. Must be strictly greater
than start. If past the file size, or -1, it will be trimmed.
Reset with `set_shard`.
Reset with `set_shard`.
chunk_size: The amount of file to read at once. Default is 8KB. A
chunk_size: The amount of file to read at once. Default is 1MB. A
special value of 0 signals to attempt to read everything in a
single call.
shard_size: The size of a file shard. Default is 1,000,000 bytes.
Expand Down

0 comments on commit 5b7febd

Please sign in to comment.