diff --git a/model_signing/serialization/serialize_by_file_shard.py b/model_signing/serialization/serialize_by_file_shard.py index 59aa07e8..c16c1fdd 100644 --- a/model_signing/serialization/serialize_by_file_shard.py +++ b/model_signing/serialization/serialize_by_file_shard.py @@ -14,10 +14,11 @@ """Model serializers that operated at file shard level of granularity.""" +import abc import base64 import concurrent.futures import pathlib -from typing import Callable, Iterable, TypeAlias +from typing import Callable, Iterable, cast from typing_extensions import override from model_signing.hashing import file @@ -27,21 +28,16 @@ from model_signing.serialization import serialize_by_file -_ShardSignTask: TypeAlias = tuple[pathlib.PurePath, str, int, int] - - def _build_header( *, - entry_name: str, - entry_type: str, + name: str, start: int, end: int, ) -> bytes: - """Builds a header to encode a path with given name and type. + """Builds a header to encode a path with given name and shard range. Args: entry_name: The name of the entry to build the header for. - entry_type: The type of the entry (file or directory). start: Offset for the start of the path shard. end: Offset for the end of the path shard. @@ -50,14 +46,11 @@ def _build_header( bytes. Each argument is separated by dots and the last byte is also a dot (so the file digest can be appended unambiguously). """ - # Note: This will get replaced in subsequent change, right now we're just - # moving existing code around. - encoded_type = entry_type.encode("utf-8") # Prevent confusion if name has a "." inside by encoding to base64. - encoded_name = base64.b64encode(entry_name.encode("utf-8")) + encoded_name = base64.b64encode(name.encode("utf-8")) encoded_range = f"{start}-{end}".encode("utf-8") # Note: empty string at the end, to terminate header with a "." 
- return b".".join([encoded_type, encoded_name, encoded_range, b""]) + return b".".join([encoded_name, encoded_range, b""]) def _endpoints(step: int, end: int) -> Iterable[int]: @@ -83,164 +76,15 @@ def _endpoints(step: int, end: int) -> Iterable[int]: yield end -class ShardedDFSSerializer(serialization.Serializer): - """DFSSerializer that uses a sharded hash engine to exploit parallelism.""" - - def __init__( - self, - file_hasher_factory: Callable[ - [pathlib.Path, int, int], file.ShardedFileHasher - ], - merge_hasher: hashing.StreamingHashEngine, - max_workers: int | None = None, - ): - """Initializes an instance to serialize a model with this serializer. - - Args: - hasher_factory: A callable to build the hash engine used to hash - every shard of the files in the model. Because each shard is - processed in parallel, every thread needs to call the factory to - start hashing. The arguments are the file, and the endpoints of - the shard. - merge_hasher: A `hashing.StreamingHashEngine` instance used to merge - individual file digests to compute an aggregate digest. - max_workers: Maximum number of workers to use in parallel. Default - is to defer to the `concurent.futures` library. - """ - self._file_hasher_factory = file_hasher_factory - self._merge_hasher = merge_hasher - self._max_workers = max_workers - - # Precompute some private values only once by using a mock file hasher. - # None of the arguments used to build the hasher are used. - hasher = file_hasher_factory(pathlib.Path(), 0, 1) - self._shard_size = hasher.shard_size - - @override - def serialize(self, model_path: pathlib.Path) -> manifest.DigestManifest: - # Note: This function currently uses `pathlib.Path.glob` so the DFS - # expansion relies on the `glob` implementation performing a DFS. We - # will be truthful again when switching to `pathlib.Path.walk`, after - # Python 3.12 is the minimum version we support. 
- - # TODO: github.com/sigstore/model-transparency/issues/196 - Add checks - # to exclude symlinks if desired. - serialize_by_file.check_file_or_directory(model_path) - - if model_path.is_file(): - entries = [model_path] - else: - # TODO: github.com/sigstore/model-transparency/issues/200 - When - # Python3.12 is the minimum supported version, this can be replaced - # with `pathlib.Path.walk` for a clearer interface, and some speed - # improvement. - entries = sorted(model_path.glob("**/*")) - - tasks = self._convert_paths_to_tasks(entries, model_path) - - digest_len = self._merge_hasher.digest_size - digests_buffer = bytearray(len(tasks) * digest_len) - - with concurrent.futures.ThreadPoolExecutor( - max_workers=self._max_workers - ) as tpe: - futures_dict = { - tpe.submit(self._perform_hash_task, model_path, task): i - for i, task in enumerate(tasks) - } - for future in concurrent.futures.as_completed(futures_dict): - i = futures_dict[future] - task_digest = future.result() - - task_path, task_type, task_start, task_end = tasks[i] - header = _build_header( - entry_name=task_path.name, - entry_type=task_type, - start=task_start, - end=task_end, - ) - self._merge_hasher.reset(header) - self._merge_hasher.update(task_digest) - digest = self._merge_hasher.compute().digest_value - - start = i * digest_len - end = start + digest_len - digests_buffer[start:end] = digest - - self._merge_hasher.reset(digests_buffer) - return manifest.DigestManifest(self._merge_hasher.compute()) - - def _convert_paths_to_tasks( - self, paths: Iterable[pathlib.Path], root_path: pathlib.Path - ) -> list[_ShardSignTask]: - """Returns the tasks that would hash shards of files in parallel. - - Every file in `paths` is replaced by a set of tasks. Each task computes - the digest over a shard of the file. Directories result in a single - task, just to compute a digest over a header. 
- - To differentiate between (empty) files and directories with the same - name, every task needs to also include a header. The header needs to - include relative path to the model root, as we want to obtain the same - digest if the model is moved. - - We don't construct an enum for the type of the entry, because these will - never escape this class. - - Note that the path component of the tasks is a `pathlib.PurePath`, so - operations on it cannot touch the filesystem. - """ - # TODO: github.com/sigstore/model-transparency/issues/196 - Add support - # for excluded files. - - tasks = [] - for path in paths: - serialize_by_file.check_file_or_directory(path) - relative_path = path.relative_to(root_path) - - if path.is_file(): - path_size = path.stat().st_size - start = 0 - for end in _endpoints(self._shard_size, path_size): - tasks.append((relative_path, "file", start, end)) - start = end - else: - tasks.append((relative_path, "dir", 0, 0)) - - return tasks - - def _perform_hash_task( - self, model_path: pathlib.Path, task: _ShardSignTask - ) -> bytes: - """Produces the hash of the file shard included in `task`.""" - task_path, task_type, task_start, task_end = task - - # TODO: github.com/sigstore/model-transparency/issues/197 - Directories - # don't need to use the file hasher. Rather than starting a process - # just for them, we should filter these ahead of time, and only use - # threading for file shards. For now, just return an empty result. - if task_type == "dir": - return b"" - - # TODO: github.com/sigstore/model-transparency/issues/197 - Similarly, - # empty files should be hashed outside of a parallel task, to not waste - # resources. 
- if task_start == task_end: - return b"" - - full_path = model_path.joinpath(task_path) - hasher = self._file_hasher_factory(full_path, task_start, task_end) - return hasher.compute().digest_value - - class ShardedFilesSerializer(serialization.Serializer): - """Model serializers that produces an itemized manifest, at shard level. + """Generic file shard serializer. Traverses the model directory and creates digests for every file found, sharding the file in equal shards and computing the digests in parallel. - Since the manifest lists each item individually, this will also enable - support for incremental updates (to be added later). + Subclasses can then create a manifest with these digests, either listing + them item by item, combining them into file digests, or combining all of + them into a single digest. """ def __init__( @@ -270,9 +114,7 @@ def __init__( self._shard_size = hasher.shard_size @override - def serialize( - self, model_path: pathlib.Path - ) -> manifest.ShardLevelManifest: + def serialize(self, model_path: pathlib.Path) -> manifest.Manifest: # TODO: github.com/sigstore/model-transparency/issues/196 - Add checks # to exclude symlinks if desired. serialize_by_file.check_file_or_directory(model_path) @@ -337,12 +179,96 @@ def _compute_hash( path=relative_path, digest=digest, start=start, end=end ) + @abc.abstractmethod def _build_manifest( self, items: Iterable[manifest.ShardedFileManifestItem] - ) -> manifest.ShardLevelManifest: + ) -> manifest.Manifest: """Builds an itemized manifest from a given list of items. Every subclass needs to implement this method to determine the format of the manifest. """ + pass + + +class ManifestSerializer(ShardedFilesSerializer): + """Model serializers that produces an itemized manifest, at shard level. + + Since the manifest lists each item individually, this will also enable + support for incremental updates (to be added later). 
+ """ + + @override + def serialize( + self, model_path: pathlib.Path + ) -> manifest.ShardLevelManifest: + """Serializes the model given by the `model_path` argument. + + The only reason for the override is to change the return type, to be + more restrictive. This is to signal that the only manifests that can be + returned are `manifest.ShardLevelManifest` instances. + """ + return cast(manifest.ShardLevelManifest, super().serialize(model_path)) + + @override + def _build_manifest( + self, items: Iterable[manifest.ShardedFileManifestItem] + ) -> manifest.ShardLevelManifest: return manifest.ShardLevelManifest(items) + + +class DigestSerializer(ShardedFilesSerializer): + """Serializer for a model that performs a traversal of the model directory. + + This serializer produces a single hash for the entire model. + """ + + def __init__( + self, + file_hasher_factory: Callable[ + [pathlib.Path, int, int], file.ShardedFileHasher + ], + merge_hasher: hashing.StreamingHashEngine, + max_workers: int | None = None, + ): + """Initializes an instance to serialize a model with this serializer. + + Args: + hasher_factory: A callable to build the hash engine used to hash + every shard of the files in the model. Because each shard is + processed in parallel, every thread needs to call the factory to + start hashing. The arguments are the file, and the endpoints of + the shard. + merge_hasher: A `hashing.StreamingHashEngine` instance used to merge + individual file shard digests to compute an aggregate digest. + max_workers: Maximum number of workers to use in parallel. Default + is to defer to the `concurrent.futures` library. + """ + super().__init__(file_hasher_factory, max_workers) + self._merge_hasher = merge_hasher + + @override + def serialize(self, model_path: pathlib.Path) -> manifest.DigestManifest: + """Serializes the model given by the `model_path` argument. + + The only reason for the override is to change the return type, to be + more restrictive.
This is to signal that the only manifests that can be + returned are `manifest.DigestManifest` instances. + """ + return cast(manifest.DigestManifest, super().serialize(model_path)) + + @override + def _build_manifest( + self, items: Iterable[manifest.ShardedFileManifestItem] + ) -> manifest.DigestManifest: + self._merge_hasher.reset() + + for item in sorted(items, key=lambda i: (i.path, i.start, i.end)): + header = _build_header( + name=item.path.name, start=item.start, end=item.end + ) + self._merge_hasher.update(header) + self._merge_hasher.update(item.digest.digest_value) + + digest = self._merge_hasher.compute() + return manifest.DigestManifest(digest) diff --git a/model_signing/serialization/serialize_by_file_shard_test.py b/model_signing/serialization/serialize_by_file_shard_test.py index 59f94047..33510ef0 100644 --- a/model_signing/serialization/serialize_by_file_shard_test.py +++ b/model_signing/serialization/serialize_by_file_shard_test.py @@ -29,7 +29,7 @@ # pytest model_signing/serialization/ --update_goldens -class TestShardedDFSSerializer: +class TestDigestSerializer: def _hasher_factory( self, path: pathlib.Path, start: int, end: int @@ -50,13 +50,13 @@ def test_known_models(self, request, model_fixture_name): # Set up variables (arrange) testdata_path = request.path.parent / "testdata" test_path = testdata_path / "serialize_by_file_shard" - test_class_path = test_path / "TestShardedDFSSerializer" + test_class_path = test_path / "TestDigestSerializer" golden_path = test_class_path / model_fixture_name should_update = request.config.getoption("update_goldens") model = request.getfixturevalue(model_fixture_name) # Compute model manifest (act) - serializer = serialize_by_file_shard.ShardedDFSSerializer( + serializer = serialize_by_file_shard.DigestSerializer( self._hasher_factory, memory.SHA256() ) manifest = serializer.serialize(model) @@ -76,13 +76,13 @@ def test_known_models_small_shards(self, request, model_fixture_name): # Set up variables
(arrange) testdata_path = request.path.parent / "testdata" test_path = testdata_path / "serialize_by_file_shard" - test_class_path = test_path / "TestShardedDFSSerializer" + test_class_path = test_path / "TestDigestSerializer" golden_path = test_class_path / f"{model_fixture_name}_small_shards" should_update = request.config.getoption("update_goldens") model = request.getfixturevalue(model_fixture_name) # Compute model manifest (act) - serializer = serialize_by_file_shard.ShardedDFSSerializer( + serializer = serialize_by_file_shard.DigestSerializer( self._hasher_factory_small_shards, memory.SHA256() ) manifest = serializer.serialize(model) @@ -98,7 +98,7 @@ def test_known_models_small_shards(self, request, model_fixture_name): assert manifest.digest.digest_hex == expected_digest def test_file_hash_is_not_same_as_hash_of_content(self, sample_model_file): - serializer = serialize_by_file_shard.ShardedDFSSerializer( + serializer = serialize_by_file_shard.DigestSerializer( self._hasher_factory, memory.SHA256() ) @@ -108,7 +108,7 @@ def test_file_hash_is_not_same_as_hash_of_content(self, sample_model_file): assert manifest.digest.digest_hex != digest.digest_hex def test_file_manifest_unchanged_when_model_moved(self, sample_model_file): - serializer = serialize_by_file_shard.ShardedDFSSerializer( + serializer = serialize_by_file_shard.DigestSerializer( self._hasher_factory, memory.SHA256() ) manifest = serializer.serialize(sample_model_file) @@ -122,7 +122,7 @@ def test_file_manifest_unchanged_when_model_moved(self, sample_model_file): def test_file_model_hash_changes_if_content_changes( self, sample_model_file ): - serializer = serialize_by_file_shard.ShardedDFSSerializer( + serializer = serialize_by_file_shard.DigestSerializer( self._hasher_factory, memory.SHA256() ) manifest = serializer.serialize(sample_model_file) @@ -134,7 +134,7 @@ def test_file_model_hash_changes_if_content_changes( assert manifest.digest.digest_value != new_manifest.digest.digest_value def 
test_directory_model_with_only_known_file(self, sample_model_file): - serializer = serialize_by_file_shard.ShardedDFSSerializer( + serializer = serialize_by_file_shard.DigestSerializer( self._hasher_factory, memory.SHA256() ) manifest_file = serializer.serialize(sample_model_file) @@ -148,7 +148,7 @@ def test_directory_model_with_only_known_file(self, sample_model_file): def test_folder_model_hash_is_same_if_model_is_moved( self, sample_model_folder ): - serializer = serialize_by_file_shard.ShardedDFSSerializer( + serializer = serialize_by_file_shard.DigestSerializer( self._hasher_factory, memory.SHA256() ) manifest = serializer.serialize(sample_model_folder) @@ -159,8 +159,8 @@ def test_folder_model_hash_is_same_if_model_is_moved( assert manifest == new_manifest - def test_folder_model_empty_folder_gets_included(self, sample_model_folder): - serializer = serialize_by_file_shard.ShardedDFSSerializer( + def test_folder_model_empty_folder_not_included(self, sample_model_folder): + serializer = serialize_by_file_shard.DigestSerializer( self._hasher_factory, memory.SHA256() ) manifest = serializer.serialize(sample_model_folder) @@ -170,10 +170,10 @@ def test_folder_model_empty_folder_gets_included(self, sample_model_folder): new_empty_dir.mkdir() new_manifest = serializer.serialize(sample_model_folder) - assert manifest != new_manifest + assert manifest == new_manifest - def test_folder_model_empty_file_gets_included(self, sample_model_folder): - serializer = serialize_by_file_shard.ShardedDFSSerializer( + def test_folder_model_empty_file_not_included(self, sample_model_folder): + serializer = serialize_by_file_shard.DigestSerializer( self._hasher_factory, memory.SHA256() ) manifest = serializer.serialize(sample_model_folder) @@ -183,10 +183,10 @@ def test_folder_model_empty_file_gets_included(self, sample_model_folder): new_empty_file.write_text("") new_manifest = serializer.serialize(sample_model_folder) - assert manifest != new_manifest + assert manifest == 
new_manifest def test_folder_model_rename_file(self, sample_model_folder): - serializer = serialize_by_file_shard.ShardedDFSSerializer( + serializer = serialize_by_file_shard.DigestSerializer( self._hasher_factory, memory.SHA256() ) manifest = serializer.serialize(sample_model_folder) @@ -200,7 +200,7 @@ def test_folder_model_rename_file(self, sample_model_folder): assert manifest != new_manifest def test_folder_model_rename_dir(self, sample_model_folder): - serializer = serialize_by_file_shard.ShardedDFSSerializer( + serializer = serialize_by_file_shard.DigestSerializer( self._hasher_factory, memory.SHA256() ) manifest = serializer.serialize(sample_model_folder) @@ -213,7 +213,7 @@ def test_folder_model_rename_dir(self, sample_model_folder): assert manifest != new_manifest def test_folder_model_replace_file_empty_folder(self, sample_model_folder): - serializer = serialize_by_file_shard.ShardedDFSSerializer( + serializer = serialize_by_file_shard.DigestSerializer( self._hasher_factory, memory.SHA256() ) manifest = serializer.serialize(sample_model_folder) @@ -227,7 +227,7 @@ def test_folder_model_replace_file_empty_folder(self, sample_model_folder): assert manifest != new_manifest def test_folder_model_change_file(self, sample_model_folder): - serializer = serialize_by_file_shard.ShardedDFSSerializer( + serializer = serialize_by_file_shard.DigestSerializer( self._hasher_factory, memory.SHA256() ) manifest = serializer.serialize(sample_model_folder) @@ -239,22 +239,22 @@ def test_folder_model_change_file(self, sample_model_folder): assert manifest != new_manifest - def test_empty_folder_hashes_differently_than_empty_file( + def test_empty_folder_hashes_same_as_empty_file( self, empty_model_file, empty_model_folder ): - serializer = serialize_by_file_shard.ShardedDFSSerializer( + serializer = serialize_by_file_shard.DigestSerializer( self._hasher_factory, memory.SHA256() ) folder_manifest = serializer.serialize(empty_model_folder) file_manifest = 
serializer.serialize(empty_model_file) - assert folder_manifest != file_manifest + assert folder_manifest == file_manifest - def test_model_with_empty_folder_hashes_differently_than_with_empty_file( + def test_model_with_empty_folder_hashes_same_as_with_empty_file( self, sample_model_folder ): - serializer = serialize_by_file_shard.ShardedDFSSerializer( + serializer = serialize_by_file_shard.DigestSerializer( self._hasher_factory, memory.SHA256() ) @@ -270,15 +270,15 @@ def test_model_with_empty_folder_hashes_differently_than_with_empty_file( new_empty_file.write_text("") file_manifest = serializer.serialize(sample_model_folder) - assert folder_manifest != file_manifest + assert folder_manifest == file_manifest def test_max_workers_does_not_change_digest(self, sample_model_folder): - serializer1 = serialize_by_file_shard.ShardedDFSSerializer( + serializer1 = serialize_by_file_shard.DigestSerializer( self._hasher_factory, memory.SHA256() ) manifest1 = serializer1.serialize(sample_model_folder) - serializer2 = serialize_by_file_shard.ShardedDFSSerializer( + serializer2 = serialize_by_file_shard.DigestSerializer( self._hasher_factory, memory.SHA256(), max_workers=2 ) manifest2 = serializer2.serialize(sample_model_folder) @@ -286,12 +286,12 @@ def test_max_workers_does_not_change_digest(self, sample_model_folder): assert manifest1 == manifest2 def test_shard_size_changes_digests(self, sample_model_folder): - serializer1 = serialize_by_file_shard.ShardedDFSSerializer( + serializer1 = serialize_by_file_shard.DigestSerializer( self._hasher_factory, memory.SHA256() ) manifest1 = serializer1.serialize(sample_model_folder) - serializer2 = serialize_by_file_shard.ShardedDFSSerializer( + serializer2 = serialize_by_file_shard.DigestSerializer( self._hasher_factory_small_shards, memory.SHA256() ) manifest2 = serializer2.serialize(sample_model_folder) @@ -299,7 +299,7 @@ def test_shard_size_changes_digests(self, sample_model_folder): assert manifest1.digest.digest_value != 
manifest2.digest.digest_value -@dataclasses.dataclass(frozen=True) +@dataclasses.dataclass(frozen=True, order=True) class _Shard: """A shard of a file from a manifest.""" @@ -338,7 +338,7 @@ def _parse_shard_and_digest(line: str) -> tuple[_Shard, str]: return shard, digest -class TestShardedFilesSerializer: +class TestManifestSerializer: def _hasher_factory( self, path: pathlib.Path, start: int, end: int @@ -359,13 +359,13 @@ def test_known_models(self, request, model_fixture_name): # Set up variables (arrange) testdata_path = request.path.parent / "testdata" test_path = testdata_path / "serialize_by_file_shard" - test_class_path = test_path / "TestShardedFilesSerializer" + test_class_path = test_path / "TestManifestSerializer" golden_path = test_class_path / model_fixture_name should_update = request.config.getoption("update_goldens") model = request.getfixturevalue(model_fixture_name) # Compute model manifest (act) - serializer = serialize_by_file_shard.ShardedFilesSerializer( + serializer = serialize_by_file_shard.ManifestSerializer( self._hasher_factory ) manifest = serializer.serialize(model) @@ -392,13 +392,13 @@ def test_known_models_small_shards(self, request, model_fixture_name): # Set up variables (arrange) testdata_path = request.path.parent / "testdata" test_path = testdata_path / "serialize_by_file_shard" - test_class_path = test_path / "TestShardedFilesSerializer" + test_class_path = test_path / "TestManifestSerializer" golden_path = test_class_path / f"{model_fixture_name}_small_shards" should_update = request.config.getoption("update_goldens") model = request.getfixturevalue(model_fixture_name) # Compute model manifest (act) - serializer = serialize_by_file_shard.ShardedFilesSerializer( + serializer = serialize_by_file_shard.ManifestSerializer( self._hasher_factory_small_shards ) manifest = serializer.serialize(model) @@ -421,7 +421,7 @@ def test_known_models_small_shards(self, request, model_fixture_name): assert items == found_items def 
test_file_manifest_unchanged_when_model_moved(self, sample_model_file): - serializer = serialize_by_file_shard.ShardedFilesSerializer( + serializer = serialize_by_file_shard.ManifestSerializer( self._hasher_factory ) manifest = serializer.serialize(sample_model_file) @@ -433,7 +433,7 @@ def test_file_manifest_unchanged_when_model_moved(self, sample_model_file): assert manifest == new_manifest def test_file_manifest_changes_if_content_changes(self, sample_model_file): - serializer = serialize_by_file_shard.ShardedFilesSerializer( + serializer = serialize_by_file_shard.ManifestSerializer( self._hasher_factory ) manifest = serializer.serialize(sample_model_file) @@ -450,7 +450,7 @@ def test_file_manifest_changes_if_content_changes(self, sample_model_file): assert digests != new_digests def test_directory_model_with_only_known_file(self, sample_model_file): - serializer = serialize_by_file_shard.ShardedFilesSerializer( + serializer = serialize_by_file_shard.ManifestSerializer( self._hasher_factory ) manifest_file = serializer.serialize(sample_model_file) @@ -467,7 +467,7 @@ def test_directory_model_with_only_known_file(self, sample_model_file): def test_folder_model_hash_is_same_if_model_is_moved( self, sample_model_folder ): - serializer = serialize_by_file_shard.ShardedFilesSerializer( + serializer = serialize_by_file_shard.ManifestSerializer( self._hasher_factory ) manifest = serializer.serialize(sample_model_folder) @@ -479,7 +479,7 @@ def test_folder_model_hash_is_same_if_model_is_moved( assert manifest == new_manifest def test_folder_model_empty_folder_not_included(self, sample_model_folder): - serializer = serialize_by_file_shard.ShardedFilesSerializer( + serializer = serialize_by_file_shard.ManifestSerializer( self._hasher_factory ) manifest = serializer.serialize(sample_model_folder) @@ -492,7 +492,7 @@ def test_folder_model_empty_folder_not_included(self, sample_model_folder): assert manifest == new_manifest def test_folder_model_empty_file_not_included(self, 
sample_model_folder): - serializer = serialize_by_file_shard.ShardedFilesSerializer( + serializer = serialize_by_file_shard.ManifestSerializer( self._hasher_factory ) manifest = serializer.serialize(sample_model_folder) @@ -530,7 +530,7 @@ def _check_manifests_match_except_on_renamed_file( def test_folder_model_rename_file_only_changes_path_part( self, sample_model_folder ): - serializer = serialize_by_file_shard.ShardedFilesSerializer( + serializer = serialize_by_file_shard.ManifestSerializer( self._hasher_factory ) manifest = serializer.serialize(sample_model_folder) @@ -578,7 +578,7 @@ def _check_manifests_match_except_on_renamed_dir( def test_folder_model_rename_dir_only_changes_path_part( self, sample_model_folder ): - serializer = serialize_by_file_shard.ShardedFilesSerializer( + serializer = serialize_by_file_shard.ManifestSerializer( self._hasher_factory ) manifest = serializer.serialize(sample_model_folder) @@ -594,7 +594,7 @@ def test_folder_model_rename_dir_only_changes_path_part( ) def test_folder_model_replace_file_empty_folder(self, sample_model_folder): - serializer = serialize_by_file_shard.ShardedFilesSerializer( + serializer = serialize_by_file_shard.ManifestSerializer( self._hasher_factory ) manifest = serializer.serialize(sample_model_folder) @@ -633,7 +633,7 @@ def _check_manifests_match_except_on_entry( assert old_manifest._item_to_digest[shard] == digest def test_folder_model_change_file(self, sample_model_folder): - serializer = serialize_by_file_shard.ShardedFilesSerializer( + serializer = serialize_by_file_shard.ManifestSerializer( self._hasher_factory ) manifest = serializer.serialize(sample_model_folder) @@ -650,13 +650,13 @@ def test_folder_model_change_file(self, sample_model_folder): ) def test_max_workers_does_not_change_digest(self, sample_model_folder): - serializer1 = serialize_by_file_shard.ShardedFilesSerializer( + serializer1 = serialize_by_file_shard.ManifestSerializer( self._hasher_factory ) - serializer2 = 
serialize_by_file_shard.ShardedFilesSerializer( + serializer2 = serialize_by_file_shard.ManifestSerializer( self._hasher_factory, max_workers=1 ) - serializer3 = serialize_by_file_shard.ShardedFilesSerializer( + serializer3 = serialize_by_file_shard.ManifestSerializer( self._hasher_factory, max_workers=3 ) diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestDigestSerializer/deep_model_folder b/model_signing/serialization/testdata/serialize_by_file_shard/TestDigestSerializer/deep_model_folder new file mode 100644 index 00000000..b3a94824 --- /dev/null +++ b/model_signing/serialization/testdata/serialize_by_file_shard/TestDigestSerializer/deep_model_folder @@ -0,0 +1 @@ +6deb22c4330a8a9eb5a2d5faa73bf56c64a5c2888961f0f0df51912798fc4954 diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestDigestSerializer/deep_model_folder_small_shards b/model_signing/serialization/testdata/serialize_by_file_shard/TestDigestSerializer/deep_model_folder_small_shards new file mode 100644 index 00000000..f826b95f --- /dev/null +++ b/model_signing/serialization/testdata/serialize_by_file_shard/TestDigestSerializer/deep_model_folder_small_shards @@ -0,0 +1 @@ +f5203504bea9ec90a7b7453a53c0aaab98a5db5d038dc1fac3613b47f6018959 diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/empty_model_folder b/model_signing/serialization/testdata/serialize_by_file_shard/TestDigestSerializer/empty_model_file similarity index 100% rename from model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/empty_model_folder rename to model_signing/serialization/testdata/serialize_by_file_shard/TestDigestSerializer/empty_model_file diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/empty_model_folder_small_shards b/model_signing/serialization/testdata/serialize_by_file_shard/TestDigestSerializer/empty_model_file_small_shards similarity index 
100% rename from model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/empty_model_folder_small_shards rename to model_signing/serialization/testdata/serialize_by_file_shard/TestDigestSerializer/empty_model_file_small_shards diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestDigestSerializer/empty_model_folder b/model_signing/serialization/testdata/serialize_by_file_shard/TestDigestSerializer/empty_model_folder new file mode 100644 index 00000000..c3068040 --- /dev/null +++ b/model_signing/serialization/testdata/serialize_by_file_shard/TestDigestSerializer/empty_model_folder @@ -0,0 +1 @@ +e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestDigestSerializer/empty_model_folder_small_shards b/model_signing/serialization/testdata/serialize_by_file_shard/TestDigestSerializer/empty_model_folder_small_shards new file mode 100644 index 00000000..c3068040 --- /dev/null +++ b/model_signing/serialization/testdata/serialize_by_file_shard/TestDigestSerializer/empty_model_folder_small_shards @@ -0,0 +1 @@ +e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestDigestSerializer/model_folder_with_empty_file b/model_signing/serialization/testdata/serialize_by_file_shard/TestDigestSerializer/model_folder_with_empty_file new file mode 100644 index 00000000..c3068040 --- /dev/null +++ b/model_signing/serialization/testdata/serialize_by_file_shard/TestDigestSerializer/model_folder_with_empty_file @@ -0,0 +1 @@ +e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestDigestSerializer/model_folder_with_empty_file_small_shards b/model_signing/serialization/testdata/serialize_by_file_shard/TestDigestSerializer/model_folder_with_empty_file_small_shards new file mode 
100644 index 00000000..c3068040 --- /dev/null +++ b/model_signing/serialization/testdata/serialize_by_file_shard/TestDigestSerializer/model_folder_with_empty_file_small_shards @@ -0,0 +1 @@ +e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestDigestSerializer/sample_model_file b/model_signing/serialization/testdata/serialize_by_file_shard/TestDigestSerializer/sample_model_file new file mode 100644 index 00000000..8ec1d11f --- /dev/null +++ b/model_signing/serialization/testdata/serialize_by_file_shard/TestDigestSerializer/sample_model_file @@ -0,0 +1 @@ +14aebf2e466ad30ef59ea6fce67de44dc133c673784bd543b45f75b8efc3d821 diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestDigestSerializer/sample_model_file_small_shards b/model_signing/serialization/testdata/serialize_by_file_shard/TestDigestSerializer/sample_model_file_small_shards new file mode 100644 index 00000000..7b4ad705 --- /dev/null +++ b/model_signing/serialization/testdata/serialize_by_file_shard/TestDigestSerializer/sample_model_file_small_shards @@ -0,0 +1 @@ +beb3cbbd9d73133e85a102a3cbda2ef1dc2bc61e9323e32e576e4adb0571bf86 diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestDigestSerializer/sample_model_folder b/model_signing/serialization/testdata/serialize_by_file_shard/TestDigestSerializer/sample_model_folder new file mode 100644 index 00000000..c94ba5d0 --- /dev/null +++ b/model_signing/serialization/testdata/serialize_by_file_shard/TestDigestSerializer/sample_model_folder @@ -0,0 +1 @@ +865a7da87d90b261ce99086bfc61986a6230e6914ad885912b4d22464a9fda13 diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestDigestSerializer/sample_model_folder_small_shards b/model_signing/serialization/testdata/serialize_by_file_shard/TestDigestSerializer/sample_model_folder_small_shards new file mode 100644 index 00000000..6e6fd67f --- /dev/null +++ 
b/model_signing/serialization/testdata/serialize_by_file_shard/TestDigestSerializer/sample_model_folder_small_shards @@ -0,0 +1 @@ +02be357fc0015ab3d15dbbd363a172f35d2cbd1a854b8e0a6c67fad2e2c3390f diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedFilesSerializer/deep_model_folder b/model_signing/serialization/testdata/serialize_by_file_shard/TestManifestSerializer/deep_model_folder similarity index 100% rename from model_signing/serialization/testdata/serialize_by_file_shard/TestShardedFilesSerializer/deep_model_folder rename to model_signing/serialization/testdata/serialize_by_file_shard/TestManifestSerializer/deep_model_folder diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedFilesSerializer/deep_model_folder_small_shards b/model_signing/serialization/testdata/serialize_by_file_shard/TestManifestSerializer/deep_model_folder_small_shards similarity index 100% rename from model_signing/serialization/testdata/serialize_by_file_shard/TestShardedFilesSerializer/deep_model_folder_small_shards rename to model_signing/serialization/testdata/serialize_by_file_shard/TestManifestSerializer/deep_model_folder_small_shards diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedFilesSerializer/empty_model_file b/model_signing/serialization/testdata/serialize_by_file_shard/TestManifestSerializer/empty_model_file similarity index 100% rename from model_signing/serialization/testdata/serialize_by_file_shard/TestShardedFilesSerializer/empty_model_file rename to model_signing/serialization/testdata/serialize_by_file_shard/TestManifestSerializer/empty_model_file diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedFilesSerializer/empty_model_file_small_shards b/model_signing/serialization/testdata/serialize_by_file_shard/TestManifestSerializer/empty_model_file_small_shards similarity index 100% rename from 
model_signing/serialization/testdata/serialize_by_file_shard/TestShardedFilesSerializer/empty_model_file_small_shards rename to model_signing/serialization/testdata/serialize_by_file_shard/TestManifestSerializer/empty_model_file_small_shards diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedFilesSerializer/empty_model_folder b/model_signing/serialization/testdata/serialize_by_file_shard/TestManifestSerializer/empty_model_folder similarity index 100% rename from model_signing/serialization/testdata/serialize_by_file_shard/TestShardedFilesSerializer/empty_model_folder rename to model_signing/serialization/testdata/serialize_by_file_shard/TestManifestSerializer/empty_model_folder diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedFilesSerializer/empty_model_folder_small_shards b/model_signing/serialization/testdata/serialize_by_file_shard/TestManifestSerializer/empty_model_folder_small_shards similarity index 100% rename from model_signing/serialization/testdata/serialize_by_file_shard/TestShardedFilesSerializer/empty_model_folder_small_shards rename to model_signing/serialization/testdata/serialize_by_file_shard/TestManifestSerializer/empty_model_folder_small_shards diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedFilesSerializer/model_folder_with_empty_file b/model_signing/serialization/testdata/serialize_by_file_shard/TestManifestSerializer/model_folder_with_empty_file similarity index 100% rename from model_signing/serialization/testdata/serialize_by_file_shard/TestShardedFilesSerializer/model_folder_with_empty_file rename to model_signing/serialization/testdata/serialize_by_file_shard/TestManifestSerializer/model_folder_with_empty_file diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedFilesSerializer/model_folder_with_empty_file_small_shards 
b/model_signing/serialization/testdata/serialize_by_file_shard/TestManifestSerializer/model_folder_with_empty_file_small_shards similarity index 100% rename from model_signing/serialization/testdata/serialize_by_file_shard/TestShardedFilesSerializer/model_folder_with_empty_file_small_shards rename to model_signing/serialization/testdata/serialize_by_file_shard/TestManifestSerializer/model_folder_with_empty_file_small_shards diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedFilesSerializer/sample_model_file b/model_signing/serialization/testdata/serialize_by_file_shard/TestManifestSerializer/sample_model_file similarity index 100% rename from model_signing/serialization/testdata/serialize_by_file_shard/TestShardedFilesSerializer/sample_model_file rename to model_signing/serialization/testdata/serialize_by_file_shard/TestManifestSerializer/sample_model_file diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedFilesSerializer/sample_model_file_small_shards b/model_signing/serialization/testdata/serialize_by_file_shard/TestManifestSerializer/sample_model_file_small_shards similarity index 100% rename from model_signing/serialization/testdata/serialize_by_file_shard/TestShardedFilesSerializer/sample_model_file_small_shards rename to model_signing/serialization/testdata/serialize_by_file_shard/TestManifestSerializer/sample_model_file_small_shards diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedFilesSerializer/sample_model_folder b/model_signing/serialization/testdata/serialize_by_file_shard/TestManifestSerializer/sample_model_folder similarity index 100% rename from model_signing/serialization/testdata/serialize_by_file_shard/TestShardedFilesSerializer/sample_model_folder rename to model_signing/serialization/testdata/serialize_by_file_shard/TestManifestSerializer/sample_model_folder diff --git 
a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedFilesSerializer/sample_model_folder_small_shards b/model_signing/serialization/testdata/serialize_by_file_shard/TestManifestSerializer/sample_model_folder_small_shards similarity index 100% rename from model_signing/serialization/testdata/serialize_by_file_shard/TestShardedFilesSerializer/sample_model_folder_small_shards rename to model_signing/serialization/testdata/serialize_by_file_shard/TestManifestSerializer/sample_model_folder_small_shards diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/deep_model_folder b/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/deep_model_folder deleted file mode 100644 index 528ab87c..00000000 --- a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/deep_model_folder +++ /dev/null @@ -1 +0,0 @@ -52fa3c459aec58bc5f9702c73cb3c6b8fd19e9342aa3e4db851e1bde69ab1727 diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/deep_model_folder_small_shards b/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/deep_model_folder_small_shards deleted file mode 100644 index a4f2f81e..00000000 --- a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/deep_model_folder_small_shards +++ /dev/null @@ -1 +0,0 @@ -abd66cd0d8a01f3f552ac5af717f49dc6e6575f0849ec3bfb3c9051962314ce6 diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/empty_model_file b/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/empty_model_file deleted file mode 100644 index 9ac3ea65..00000000 --- a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/empty_model_file +++ /dev/null @@ -1 +0,0 @@ -5f2d126b0d3540c17481fdf724e31cf03b4436a2ebabaa1d2e94fe09831be64d diff 
--git a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/empty_model_file_small_shards b/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/empty_model_file_small_shards deleted file mode 100644 index 9ac3ea65..00000000 --- a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/empty_model_file_small_shards +++ /dev/null @@ -1 +0,0 @@ -5f2d126b0d3540c17481fdf724e31cf03b4436a2ebabaa1d2e94fe09831be64d diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/model_folder_with_empty_file b/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/model_folder_with_empty_file deleted file mode 100644 index b6d24eaf..00000000 --- a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/model_folder_with_empty_file +++ /dev/null @@ -1 +0,0 @@ -230d217d5f4f388f5087ac4174dbc9b0ff358e3122a1267b0a56669a44f11ea1 diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/model_folder_with_empty_file_small_shards b/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/model_folder_with_empty_file_small_shards deleted file mode 100644 index b6d24eaf..00000000 --- a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/model_folder_with_empty_file_small_shards +++ /dev/null @@ -1 +0,0 @@ -230d217d5f4f388f5087ac4174dbc9b0ff358e3122a1267b0a56669a44f11ea1 diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/sample_model_file b/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/sample_model_file deleted file mode 100644 index a94a0fa0..00000000 --- a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/sample_model_file +++ /dev/null @@ -1 +0,0 @@ 
-2ca48c47d5311a9b2f9305519cd5f927dcef09404fc32ef7886abe8f11450eff diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/sample_model_file_small_shards b/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/sample_model_file_small_shards deleted file mode 100644 index 5b6697c8..00000000 --- a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/sample_model_file_small_shards +++ /dev/null @@ -1 +0,0 @@ -284b613e2e1576d87e5e1c912c82da8d87b6350276f36940516404b2a35f1a74 diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/sample_model_folder b/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/sample_model_folder deleted file mode 100644 index 7fa49a73..00000000 --- a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/sample_model_folder +++ /dev/null @@ -1 +0,0 @@ -d22e0441cfa5ac2bc09715ddd88c802a7f97e29c93dc50f5498bab2954958ebb diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/sample_model_folder_small_shards b/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/sample_model_folder_small_shards deleted file mode 100644 index 161cafdf..00000000 --- a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/sample_model_folder_small_shards +++ /dev/null @@ -1 +0,0 @@ -82bb608d88cf741730c5bcb75a7630f560643acafdd8fa02ad24be20f51c1250