Skip to content

Commit

Permalink
change default chunk size to 1 MB
Browse files Browse the repository at this point in the history
This value was based on the benchmarks in f0a6e96.

Signed-off-by: Spencer Schrock <[email protected]>
  • Loading branch information
spencerschrock committed Jan 16, 2025
1 parent f0a6e96 commit 5b7febd
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 18 deletions.
2 changes: 1 addition & 1 deletion benchmarks/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def create_file_of_given_size(path: str, size: int) -> None:
"""
file_path = pathlib.Path(path)
file_path.parent.mkdir(parents=True, exist_ok=True)
chunk_size = 8192
chunk_size = 1048576
num_chunks = size // chunk_size

with file_path.open("wb") as f:
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/serialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ def build_parser() -> argparse.ArgumentParser:

param_groups = parser.add_argument_group("Internal parameters to fine-tune")
param_groups.add_argument(
"--chunk", help="chunk size (default: 8192)", type=int, default=8192
"--chunk", help="chunk size (default: 1048576)", type=int, default=1048576
)
param_groups.add_argument(
"--shard",
Expand Down
24 changes: 12 additions & 12 deletions src/model_signing/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,13 +160,13 @@ def _build_stream_hasher(
def _build_file_hasher_factory(
self,
hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
chunk_size: int = 8192,
chunk_size: int = 1048576,
) -> Callable[[pathlib.Path], file.SimpleFileHasher]:
"""Builds the hasher factory for a serialization by file.
Args:
hashing_algorithm: the hashing algorithm to use to hash a file
chunk_size: The amount of file to read at once. Default is 8KB. A
chunk_size: The amount of file to read at once. Default is 1MB. A
special value of 0 signals to attempt to read everything in a
single call.
Expand All @@ -184,14 +184,14 @@ def factory(path: pathlib.Path) -> file.SimpleFileHasher:
def _build_sharded_file_hasher_factory(
self,
hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
chunk_size: int = 8192,
chunk_size: int = 1048576,
shard_size: int = 1000000,
) -> Callable[[pathlib.Path, int, int], file.ShardedFileHasher]:
"""Builds the hasher factory for a serialization by file shards.
Args:
hashing_algorithm: the hashing algorithm to use to hash a file
chunk_size: The amount of file to read at once. Default is 8KB. A
chunk_size: The amount of file to read at once. Default is 1MB. A
special value of 0 signals to attempt to read everything in a
single call.
shard_size: The size of a file shard. Default is 1,000,000 bytes.
Expand Down Expand Up @@ -220,7 +220,7 @@ def set_serialize_by_file_to_manifest(
self,
*,
hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
chunk_size: int = 8192,
chunk_size: int = 1048576,
max_workers: Optional[int] = None,
allow_symlinks: bool = False,
) -> Self:
Expand All @@ -232,7 +232,7 @@ def set_serialize_by_file_to_manifest(
Args:
hashing_algorithm: the hashing algorithm to use to hash a file
chunk_size: The amount of file to read at once. Default is 8KB. A
chunk_size: The amount of file to read at once. Default is 1MB. A
special value of 0 signals to attempt to read everything in a
single call.
max_workers: Maximum number of workers to use in parallel. Default
Expand All @@ -256,7 +256,7 @@ def set_serialize_by_file_to_digest(
*,
hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
merge_algorithm: Literal["sha256", "blake2"] = "sha256",
chunk_size: int = 8192,
chunk_size: int = 1048576,
allow_symlinks: bool = False,
) -> Self:
"""Configures serialization to a single digest, at file granularity.
Expand All @@ -269,7 +269,7 @@ def set_serialize_by_file_to_digest(
hashing_algorithm: the hashing algorithm to use to hash a file
merge_algorithm: the hashing algorithm to use when computing the
final digest over all the (file, digest) pairings
chunk_size: The amount of file to read at once. Default is 8KB. A
chunk_size: The amount of file to read at once. Default is 1MB. A
special value of 0 signals to attempt to read everything in a
single call.
allow_symlinks: Controls whether symbolic links are included. If a
Expand Down Expand Up @@ -298,7 +298,7 @@ def set_serialize_by_file_shard_to_manifest(
self,
*,
hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
chunk_size: int = 8192,
chunk_size: int = 1048576,
shard_size: int = 1000000,
max_workers: Optional[int] = None,
allow_symlinks: bool = False,
Expand All @@ -312,7 +312,7 @@ def set_serialize_by_file_shard_to_manifest(
Args:
hashing_algorithm: the hashing algorithm to use to hash a file shard
chunk_size: The amount of file to read at once. Default is 8KB. A
chunk_size: The amount of file to read at once. Default is 1MB. A
special value of 0 signals to attempt to read everything in a
single call.
shard_size: The size of a file shard. Default is 1,000,000 bytes.
Expand All @@ -339,7 +339,7 @@ def set_serialize_by_file_shard_to_digest(
*,
hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
merge_algorithm: Literal["sha256", "blake2"] = "sha256",
chunk_size: int = 8192,
chunk_size: int = 1048576,
shard_size: int = 1000000,
max_workers: Optional[int] = None,
allow_symlinks: bool = False,
Expand All @@ -354,7 +354,7 @@ def set_serialize_by_file_shard_to_digest(
hashing_algorithm: the hashing algorithm to use to hash a file shard
merge_algorithm: the hashing algorithm to use when computing the
final digest over all the (file, digest) pairings
chunk_size: The amount of file to read at once. Default is 8KB. A
chunk_size: The amount of file to read at once. Default is 1MB. A
special value of 0 signals to attempt to read everything in a
single call.
shard_size: The size of a file shard. Default is 1,000,000 bytes.
Expand Down
8 changes: 4 additions & 4 deletions src/model_signing/hashing/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def __init__(
file: pathlib.Path,
content_hasher: hashing.StreamingHashEngine,
*,
chunk_size: int = 8192,
chunk_size: int = 1048576,
digest_name_override: Optional[str] = None,
):
"""Initializes an instance to hash a file with a specific `HashEngine`.
Expand All @@ -92,7 +92,7 @@ def __init__(
file: The file to hash. Use `set_file` to reset it.
content_hasher: A `hashing.StreamingHashEngine` instance used to
compute the digest of the file.
chunk_size: The amount of file to read at once. Default is 8KB. A
chunk_size: The amount of file to read at once. Default is 1MB. A
special value of 0 signals to attempt to read everything in a
single call.
digest_name_override: Optional string to allow overriding the
Expand Down Expand Up @@ -235,7 +235,7 @@ def __init__(
*,
start: int,
end: int,
chunk_size: int = 8192,
chunk_size: int = 1048576,
shard_size: int = 1000000,
digest_name_override: Optional[str] = None,
):
Expand All @@ -250,7 +250,7 @@ def __init__(
end: The file offset to stop reading at. Must be strictly greater
than start. If past the file size, or -1, it will be trimmed.
Reset with `set_shard`.
Reset with `set_shard`.
chunk_size: The amount of file to read at once. Default is 8KB. A
chunk_size: The amount of file to read at once. Default is 1MB. A
special value of 0 signals to attempt to read everything in a
single call.
shard_size: The size of a file shard. Default is 1,000,000 bytes.
Expand Down

0 comments on commit 5b7febd

Please sign in to comment.