From 92c2265dce397c1b2feeadca80ee9787a9f51556 Mon Sep 17 00:00:00 2001 From: Mihai Maruseac Date: Tue, 16 Jul 2024 20:13:49 -0700 Subject: [PATCH 1/4] Refactor DFSSerializer to remove code duplication. There is a duplication in the directory traversal between `DFSSerializer` and `FilesSerializer`. Since the later supports parallel hashing, let's prepare to use only that. We make `FilesSerializer` be an abstract parent class that performs the directory traversal. We introduce `ManifestSerializer` for the old `FilesSerializer` class that was creating a manifest out of the model. We rename `DFSSerializer` to `DigestSerializer` for consistency. Since `FilesSerializer` (the directory traversal) only considers files now, we need to add a `_FileDigestTree` class to transform the list of files and their hashes (the `FileManifestItem`) to a directory traversal tree, so we can build the digest for `DigestSerializer` in a bottom-up fashion, like before. We could have just included only the files, instead of the directory, but that would require changing a lot of expected constants in the tests. So, we add this transformation now, we plan to migrate tests to goldens and then maybe change the hashing to only include the files when rolling up to a single digest. We still had to update one test: since the hashes are computed only for files, we no longer differentiate between a model with an empty directory and a model where that empty directory is completely removed. This is a corner case and it is ok to do this. In fact, ignoring empty directories is part of the optimization hinted at in https://github.com/sigstore/model-transparency/issues/197. Signed-off-by: Mihai Maruseac --- .../serialization/serialize_by_file.py | 239 ++++++++++++------ .../serialization/serialize_by_file_test.py | 120 ++++++--- 2 files changed, 246 insertions(+), 113 deletions(-) diff --git a/model_signing/serialization/serialize_by_file.py b/model_signing/serialization/serialize_by_file.py index d182047a..98f4250a 100644 --- a/model_signing/serialization/serialize_by_file.py +++ b/model_signing/serialization/serialize_by_file.py @@ -14,10 +14,11 @@ """Model serializers that operated at file level granularity.""" +import abc import base64 import concurrent.futures import pathlib -from typing import Callable, Iterable +from typing import Callable, Iterable, cast from typing_extensions import override from model_signing.hashing import file @@ -65,8 +66,6 @@ def _build_header( bytes. Each argument is separated by dots and the last byte is also a dot (so the file digest can be appended unambiguously). """ - # Note: This will get replaced in subsequent change, right now we're just - # moving existing code around. encoded_type = entry_type.encode("utf-8") # Prevent confusion if name has a "." inside by encoding to base64. encoded_name = base64.b64encode(entry_name.encode("utf-8")) @@ -74,75 +73,14 @@ def _build_header( return b".".join([encoded_type, encoded_name, b""]) -class DFSSerializer(serialization.Serializer): - """Serializer for a model that performs a traversal of the model directory. - - This serializer produces a single hash for the entire model. If the model is - a file, the hash is the digest of the file. If the model is a directory, we - perform a depth-first traversal of the directory, hash each individual files - and aggregate the hashes together. - """ - - def __init__( - self, - file_hasher: file.SimpleFileHasher, - merge_hasher_factory: Callable[[], hashing.StreamingHashEngine], - ): - """Initializes an instance to serialize a model with this serializer. - - Args: - hasher: The hash engine used to hash the individual files. - merge_hasher_factory: A callable that returns a - `hashing.StreamingHashEngine` instance used to merge individual - file digests to compute an aggregate digest. - """ - self._file_hasher = file_hasher - self._merge_hasher_factory = merge_hasher_factory - - @override - def serialize(self, model_path: pathlib.Path) -> manifest.DigestManifest: - # TODO: github.com/sigstore/model-transparency/issues/196 - Add checks - # to exclude symlinks if desired. - check_file_or_directory(model_path) - - if model_path.is_file(): - self._file_hasher.set_file(model_path) - return manifest.DigestManifest(self._file_hasher.compute()) - - return manifest.DigestManifest(self._dfs(model_path)) - - def _dfs(self, directory: pathlib.Path) -> hashing.Digest: - # TODO: github.com/sigstore/model-transparency/issues/196 - Add support - # for excluded files. - children = sorted([x for x in directory.iterdir()]) - - hasher = self._merge_hasher_factory() - for child in children: - check_file_or_directory(child) - - if child.is_file(): - header = _build_header(entry_name=child.name, entry_type="file") - hasher.update(header) - self._file_hasher.set_file(child) - digest = self._file_hasher.compute() - hasher.update(digest.digest_value) - else: - header = _build_header(entry_name=child.name, entry_type="dir") - hasher.update(header) - digest = self._dfs(child) - hasher.update(digest.digest_value) - - return hasher.compute() - - class FilesSerializer(serialization.Serializer): - """Model serializers that produces an itemized manifest, at file level. + """Generic file serializer. Traverses the model directory and creates digests for every file found, possibly in parallel. - Since the manifest lists each item individually, this will also enable - support for incremental updates (to be added later). + Subclasses can then create a manifest with these digests, either listing + them item by item, or combining everything into a single digest. """ def __init__( @@ -162,7 +100,7 @@ def __init__( self._max_workers = max_workers @override - def serialize(self, model_path: pathlib.Path) -> manifest.FileLevelManifest: + def serialize(self, model_path: pathlib.Path) -> manifest.Manifest: # TODO: github.com/sigstore/model-transparency/issues/196 - Add checks # to exclude symlinks if desired. check_file_or_directory(model_path) @@ -210,12 +148,169 @@ def _compute_hash( digest = self._hasher_factory(path).compute() return manifest.FileManifestItem(path=relative_path, digest=digest) + @abc.abstractmethod def _build_manifest( self, items: Iterable[manifest.FileManifestItem] - ) -> manifest.FileLevelManifest: - """Builds an itemized manifest from a given list of items. + ) -> manifest.Manifest: + """Builds the manifest representing the serialization of the model.""" + pass + + +class ManifestSerializer(FilesSerializer): + """Model serializer that produces an itemized manifest, at file level. + + Since the manifest lists each item individually, this will also enable + support for incremental updates (to be added later). + """ - Every subclass needs to implement this method to determine the format of - the manifest. + @override + def serialize(self, model_path: pathlib.Path) -> manifest.FileLevelManifest: + """Serializes the model given by the `model_path` argument. + + The only reason for the override is to change the return type, to be + more restrictive. This is to signal that the only manifests that can be + returned are `manifest.FileLevelManifest` instances. """ + return cast(manifest.FileLevelManifest, super().serialize(model_path)) + + @override + def _build_manifest( + self, items: Iterable[manifest.FileManifestItem] + ) -> manifest.FileLevelManifest: return manifest.FileLevelManifest(items) + + +class _FileDigestTree: + """A tree of files with their digests. + + Every leaf in the tree is a file, paired with its digest. Every intermediate + node represents a directory. We need to pair every directory with a digest, + in a top-down fashion. + + Every internal node has a list of children, sorted alphabetically. + """ + + def __init__( + self, path: pathlib.PurePath, digest: hashing.Digest | None = None + ): + """Builds a node in the digest tree. + + Don't call this from outside of the class. Instead, use `build_tree`. + """ + self._path = path + self._digest = digest + self._children: set[_FileDigestTree] = set() + + @classmethod + def build_tree( + cls, items: Iterable[manifest.FileManifestItem] + ) -> "_FileDigestTree": + """Builds a tree out of the sequence of manifest items.""" + path_to_node = {} + + for file_item in items: + file = file_item.path + node = cls(file, file_item.digest) + for parent in file.parents: + if parent not in path_to_node: + path_to_node[parent] = cls(parent) + parent_node = path_to_node[parent] + parent_node._children.add(node) + node = parent_node + + # Handle empty model + if not path_to_node: + return cls(pathlib.PurePath()) + + return path_to_node[pathlib.PurePath()] + + def get_digest( + self, + hasher_factory: Callable[[], hashing.StreamingHashEngine], + ) -> hashing.Digest: + """Returns the digest of this tree of files. + + Args: + hasher_factory: A callable that returns a + `hashing.StreamingHashEngine` instance used to merge individual + digests to compute an aggregate digest. + """ + hasher = hasher_factory() + + for child in sorted(self._children, key=lambda c: c._path): + name = child._path.name + if child._digest is not None: + header = _build_header(entry_name=name, entry_type="file") + hasher.update(header) + hasher.update(child._digest.digest_value) + else: + header = _build_header(entry_name=name, entry_type="dir") + hasher.update(header) + digest = child.get_digest(hasher_factory) + hasher.update(digest.digest_value) + + return hasher.compute() + + +class DigestSerializer(FilesSerializer): + """Serializer for a model that performs a traversal of the model directory. + + This serializer produces a single hash for the entire model. If the model is + a file, the hash is the digest of the file. If the model is a directory, we + perform a depth-first traversal of the directory, hash each individual files + and aggregate the hashes together. + + Currently, this has a different initialization than `FilesSerializer`, but + this will likely change in a subsequent change. Similarly, currently, this + only supports one single worker, but this will change in the future. + """ + + def __init__( + self, + file_hasher: file.SimpleFileHasher, + merge_hasher_factory: Callable[[], hashing.StreamingHashEngine], + ): + """Initializes an instance to serialize a model with this serializer. + + Args: + hasher: The hash engine used to hash the individual files. + merge_hasher_factory: A callable that returns a + `hashing.StreamingHashEngine` instance used to merge individual + file digests to compute an aggregate digest. + """ + + def _factory(path: pathlib.Path) -> file.FileHasher: + file_hasher.set_file(path) + return file_hasher + + super().__init__(_factory, max_workers=1) + self._merge_hasher_factory = merge_hasher_factory + + @override + def serialize(self, model_path: pathlib.Path) -> manifest.DigestManifest: + """Serializes the model given by the `model_path` argument. + + The only reason for the override is to change the return type, to be + more restrictive. This is to signal that the only manifests that can be + returned are `manifest.DigestManifest` instances. + """ + return cast(manifest.DigestManifest, super().serialize(model_path)) + + @override + def _build_manifest( + self, items: Iterable[manifest.FileManifestItem] + ) -> manifest.DigestManifest: + # Note: we do several computations here to try and match the old + # behavior but these would be simplified in the future. Since we are + # defining the hashing behavior, we can freely change this. + + # If the model is just one file, return the hash of the file. + # A model is a file if we have one item only and its path is empty. + items = list(items) + if len(items) == 1 and not items[0].path.name: + return manifest.DigestManifest(items[0].digest) + + # Otherwise, build a tree of files and compute the digests. + tree = _FileDigestTree.build_tree(items) + digest = tree.get_digest(self._merge_hasher_factory) + return manifest.DigestManifest(digest) diff --git a/model_signing/serialization/serialize_by_file_test.py b/model_signing/serialization/serialize_by_file_test.py index 8cd09ab5..e0a799b0 100644 --- a/model_signing/serialization/serialize_by_file_test.py +++ b/model_signing/serialization/serialize_by_file_test.py @@ -23,13 +23,15 @@ from model_signing.serialization import test_support -class TestDFSSerializer: +class TestDigestSerializer: def test_known_file(self, sample_model_file): file_hasher = file.SimpleFileHasher( test_support.UNUSED_PATH, memory.SHA256() ) - serializer = serialize_by_file.DFSSerializer(file_hasher, memory.SHA256) + serializer = serialize_by_file.DigestSerializer( + file_hasher, memory.SHA256 + ) expected = ( "3aab065c7181a173b5dd9e9d32a9f79923440b413be1e1ffcdba26a7365f719b" ) @@ -42,7 +44,9 @@ def test_file_hash_is_same_as_hash_of_content(self, sample_model_file): file_hasher = file.SimpleFileHasher( test_support.UNUSED_PATH, memory.SHA256() ) - serializer = serialize_by_file.DFSSerializer(file_hasher, memory.SHA256) + serializer = serialize_by_file.DigestSerializer( + file_hasher, memory.SHA256 + ) manifest = serializer.serialize(sample_model_file) digest = memory.SHA256(test_support.KNOWN_MODEL_TEXT).compute() @@ -53,7 +57,9 @@ def test_file_manifest_unchanged_when_model_moved(self, sample_model_file): file_hasher = file.SimpleFileHasher( test_support.UNUSED_PATH, memory.SHA256() ) - serializer = serialize_by_file.DFSSerializer(file_hasher, memory.SHA256) + serializer = serialize_by_file.DigestSerializer( + file_hasher, memory.SHA256 + ) manifest = serializer.serialize(sample_model_file) new_name = sample_model_file.with_name("new-file") @@ -66,7 +72,9 @@ def test_file_manifest_changes_if_content_changes(self, sample_model_file): file_hasher = file.SimpleFileHasher( test_support.UNUSED_PATH, memory.SHA256() ) - serializer = serialize_by_file.DFSSerializer(file_hasher, memory.SHA256) + serializer = serialize_by_file.DigestSerializer( + file_hasher, memory.SHA256 + ) manifest = serializer.serialize(sample_model_file) sample_model_file.write_bytes(test_support.ANOTHER_MODEL_TEXT) @@ -80,7 +88,9 @@ def test_directory_model_with_only_known_file(self, sample_model_file): file_hasher = file.SimpleFileHasher( test_support.UNUSED_PATH, memory.SHA256() ) - serializer = serialize_by_file.DFSSerializer(file_hasher, memory.SHA256) + serializer = serialize_by_file.DigestSerializer( + file_hasher, memory.SHA256 + ) manifest_file = serializer.serialize(sample_model_file) content_digest = memory.SHA256(test_support.KNOWN_MODEL_TEXT).compute() @@ -93,7 +103,9 @@ def test_known_folder(self, sample_model_folder): file_hasher = file.SimpleFileHasher( test_support.UNUSED_PATH, memory.SHA256() ) - serializer = serialize_by_file.DFSSerializer(file_hasher, memory.SHA256) + serializer = serialize_by_file.DigestSerializer( + file_hasher, memory.SHA256 + ) expected = ( "310af4fc4c52bf63cd1687c67076ed3e56bc5480a1b151539e6c550506ae0301" ) @@ -108,7 +120,9 @@ def test_folder_model_hash_is_same_if_model_is_moved( file_hasher = file.SimpleFileHasher( test_support.UNUSED_PATH, memory.SHA256() ) - serializer = serialize_by_file.DFSSerializer(file_hasher, memory.SHA256) + serializer = serialize_by_file.DigestSerializer( + file_hasher, memory.SHA256 + ) manifest = serializer.serialize(sample_model_folder) new_name = sample_model_folder.with_name("new-root") @@ -117,11 +131,13 @@ def test_folder_model_hash_is_same_if_model_is_moved( assert manifest == new_manifest - def test_folder_model_empty_folder_gets_included(self, sample_model_folder): + def test_folder_model_empty_folder_not_included(self, sample_model_folder): file_hasher = file.SimpleFileHasher( test_support.UNUSED_PATH, memory.SHA256() ) - serializer = serialize_by_file.DFSSerializer(file_hasher, memory.SHA256) + serializer = serialize_by_file.DigestSerializer( + file_hasher, memory.SHA256 + ) manifest = serializer.serialize(sample_model_folder) altered_dir = test_support.get_first_directory(sample_model_folder) @@ -129,13 +145,15 @@ def test_folder_model_empty_folder_gets_included(self, sample_model_folder): new_empty_dir.mkdir() new_manifest = serializer.serialize(sample_model_folder) - assert manifest != new_manifest + assert manifest == new_manifest def test_folder_model_empty_file_gets_included(self, sample_model_folder): file_hasher = file.SimpleFileHasher( test_support.UNUSED_PATH, memory.SHA256() ) - serializer = serialize_by_file.DFSSerializer(file_hasher, memory.SHA256) + serializer = serialize_by_file.DigestSerializer( + file_hasher, memory.SHA256 + ) manifest = serializer.serialize(sample_model_folder) altered_dir = test_support.get_first_directory(sample_model_folder) @@ -149,7 +167,9 @@ def test_folder_model_rename_file(self, sample_model_folder): file_hasher = file.SimpleFileHasher( test_support.UNUSED_PATH, memory.SHA256() ) - serializer = serialize_by_file.DFSSerializer(file_hasher, memory.SHA256) + serializer = serialize_by_file.DigestSerializer( + file_hasher, memory.SHA256 + ) manifest = serializer.serialize(sample_model_folder) altered_dir = test_support.get_first_directory(sample_model_folder) @@ -164,7 +184,9 @@ def test_folder_model_rename_dir(self, sample_model_folder): file_hasher = file.SimpleFileHasher( test_support.UNUSED_PATH, memory.SHA256() ) - serializer = serialize_by_file.DFSSerializer(file_hasher, memory.SHA256) + serializer = serialize_by_file.DigestSerializer( + file_hasher, memory.SHA256 + ) manifest = serializer.serialize(sample_model_folder) dir_to_rename = test_support.get_first_directory(sample_model_folder) @@ -178,7 +200,9 @@ def test_folder_model_replace_file_empty_folder(self, sample_model_folder): file_hasher = file.SimpleFileHasher( test_support.UNUSED_PATH, memory.SHA256() ) - serializer = serialize_by_file.DFSSerializer(file_hasher, memory.SHA256) + serializer = serialize_by_file.DigestSerializer( + file_hasher, memory.SHA256 + ) manifest = serializer.serialize(sample_model_folder) altered_dir = test_support.get_first_directory(sample_model_folder) @@ -193,7 +217,9 @@ def test_folder_model_change_file(self, sample_model_folder): file_hasher = file.SimpleFileHasher( test_support.UNUSED_PATH, memory.SHA256() ) - serializer = serialize_by_file.DFSSerializer(file_hasher, memory.SHA256) + serializer = serialize_by_file.DigestSerializer( + file_hasher, memory.SHA256 + ) manifest = serializer.serialize(sample_model_folder) altered_dir = test_support.get_first_directory(sample_model_folder) @@ -207,7 +233,9 @@ def test_deep_folder(self, deep_model_folder): file_hasher = file.SimpleFileHasher( test_support.UNUSED_PATH, memory.SHA256() ) - serializer = serialize_by_file.DFSSerializer(file_hasher, memory.SHA256) + serializer = serialize_by_file.DigestSerializer( + file_hasher, memory.SHA256 + ) expected = ( "36eed9389ebbbe15ac15d33c81dabb60ccb7c945ff641d78f59db9aa9dc47ac9" ) @@ -220,7 +248,9 @@ def test_empty_file(self, empty_model_file): file_hasher = file.SimpleFileHasher( test_support.UNUSED_PATH, memory.SHA256() ) - serializer = serialize_by_file.DFSSerializer(file_hasher, memory.SHA256) + serializer = serialize_by_file.DigestSerializer( + file_hasher, memory.SHA256 + ) expected = ( "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" ) @@ -233,7 +263,9 @@ def test_empty_folder(self, empty_model_folder): file_hasher = file.SimpleFileHasher( test_support.UNUSED_PATH, memory.SHA256() ) - serializer = serialize_by_file.DFSSerializer(file_hasher, memory.SHA256) + serializer = serialize_by_file.DigestSerializer( + file_hasher, memory.SHA256 + ) expected = ( "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" ) @@ -246,7 +278,9 @@ def test_directory_model_with_only_empty_file(self, empty_model_file): file_hasher = file.SimpleFileHasher( test_support.UNUSED_PATH, memory.SHA256() ) - serializer = serialize_by_file.DFSSerializer(file_hasher, memory.SHA256) + serializer = serialize_by_file.DigestSerializer( + file_hasher, memory.SHA256 + ) expected = ( "8a587b2129fdecfbea38d5152b626299f5994d9b99d36b321aea356f69b38c61" ) @@ -261,7 +295,9 @@ def test_empty_folder_hashes_differently_than_empty_file( file_hasher = file.SimpleFileHasher( test_support.UNUSED_PATH, memory.SHA256() ) - serializer = serialize_by_file.DFSSerializer(file_hasher, memory.SHA256) + serializer = serialize_by_file.DigestSerializer( + file_hasher, memory.SHA256 + ) folder_manifest = serializer.serialize(empty_model_folder) file_manifest = serializer.serialize(empty_model_file) @@ -274,7 +310,9 @@ def test_model_with_empty_folder_hashes_differently_than_with_empty_file( file_hasher = file.SimpleFileHasher( test_support.UNUSED_PATH, memory.SHA256() ) - serializer = serialize_by_file.DFSSerializer(file_hasher, memory.SHA256) + serializer = serialize_by_file.DigestSerializer( + file_hasher, memory.SHA256 + ) # Compute digest of model with empty folder altered_dir = test_support.get_first_directory(sample_model_folder) @@ -291,13 +329,13 @@ def test_model_with_empty_folder_hashes_differently_than_with_empty_file( assert folder_manifest != file_manifest -class TestFilesSerializer: +class TestManifestSerializer: def _hasher_factory(self, path: pathlib.Path) -> file.FileHasher: return file.SimpleFileHasher(path, memory.SHA256()) def test_known_file(self, sample_model_file): - serializer = serialize_by_file.FilesSerializer(self._hasher_factory) + serializer = serialize_by_file.ManifestSerializer(self._hasher_factory) expected = [ "3aab065c7181a173b5dd9e9d32a9f79923440b413be1e1ffcdba26a7365f719b" ] @@ -308,7 +346,7 @@ def test_known_file(self, sample_model_file): assert digests == expected def test_file_manifest_unchanged_when_model_moved(self, sample_model_file): - serializer = serialize_by_file.FilesSerializer(self._hasher_factory) + serializer = serialize_by_file.ManifestSerializer(self._hasher_factory) manifest = serializer.serialize(sample_model_file) new_name = sample_model_file.with_name("new-file") @@ -318,7 +356,7 @@ def test_file_manifest_unchanged_when_model_moved(self, sample_model_file): assert manifest == new_manifest def test_file_manifest_changes_if_content_changes(self, sample_model_file): - serializer = serialize_by_file.FilesSerializer(self._hasher_factory) + serializer = serialize_by_file.ManifestSerializer(self._hasher_factory) manifest = serializer.serialize(sample_model_file) digests = set(test_support.extract_digests_from_manifest(manifest)) @@ -333,7 +371,7 @@ def test_file_manifest_changes_if_content_changes(self, sample_model_file): assert len(digests) == len(new_digests) def test_directory_model_with_only_known_file(self, sample_model_file): - serializer = serialize_by_file.FilesSerializer(self._hasher_factory) + serializer = serialize_by_file.ManifestSerializer(self._hasher_factory) manifest_file = serializer.serialize(sample_model_file) digests_file = set( test_support.extract_digests_from_manifest(manifest_file) @@ -346,7 +384,7 @@ def test_directory_model_with_only_known_file(self, sample_model_file): assert digests == digests_file def test_known_folder(self, sample_model_folder): - serializer = serialize_by_file.FilesSerializer(self._hasher_factory) + serializer = serialize_by_file.ManifestSerializer(self._hasher_factory) # Long hashes, want to update easily, so pylint: disable=line-too-long expected_items = { "f0": "997b37cc51f1ca1c7a270466607e26847429cd7264c30148c1b9352e224083fc", @@ -370,7 +408,7 @@ def test_known_folder(self, sample_model_folder): def test_folder_model_hash_is_same_if_model_is_moved( self, sample_model_folder ): - serializer = serialize_by_file.FilesSerializer(self._hasher_factory) + serializer = serialize_by_file.ManifestSerializer(self._hasher_factory) manifest = serializer.serialize(sample_model_folder) new_name = sample_model_folder.with_name("new-root") @@ -380,7 +418,7 @@ def test_folder_model_hash_is_same_if_model_is_moved( assert manifest == new_manifest def test_folder_model_empty_folder_not_included(self, sample_model_folder): - serializer = serialize_by_file.FilesSerializer(self._hasher_factory) + serializer = serialize_by_file.ManifestSerializer(self._hasher_factory) manifest = serializer.serialize(sample_model_folder) altered_dir = test_support.get_first_directory(sample_model_folder) @@ -391,7 +429,7 @@ def test_folder_model_empty_folder_not_included(self, sample_model_folder): assert manifest == new_manifest def test_folder_model_empty_file_gets_included(self, sample_model_folder): - serializer = serialize_by_file.FilesSerializer(self._hasher_factory) + serializer = serialize_by_file.ManifestSerializer(self._hasher_factory) manifest = serializer.serialize(sample_model_folder) altered_dir = test_support.get_first_directory(sample_model_folder) @@ -431,7 +469,7 @@ def _check_manifests_match_except_on_renamed_file( def test_folder_model_rename_file_only_changes_path_part( self, sample_model_folder ): - serializer = serialize_by_file.FilesSerializer(self._hasher_factory) + serializer = serialize_by_file.ManifestSerializer(self._hasher_factory) manifest = serializer.serialize(sample_model_folder) altered_dir = test_support.get_first_directory(sample_model_folder) @@ -476,7 +514,7 @@ def _check_manifests_match_except_on_renamed_dir( def test_folder_model_rename_dir_only_changes_path_part( self, sample_model_folder ): - serializer = serialize_by_file.FilesSerializer(self._hasher_factory) + serializer = serialize_by_file.ManifestSerializer(self._hasher_factory) manifest = serializer.serialize(sample_model_folder) dir_to_rename = test_support.get_first_directory(sample_model_folder) @@ -490,7 +528,7 @@ def test_folder_model_rename_dir_only_changes_path_part( ) def test_folder_model_replace_file_empty_folder(self, sample_model_folder): - serializer = serialize_by_file.FilesSerializer(self._hasher_factory) + serializer = serialize_by_file.ManifestSerializer(self._hasher_factory) manifest = serializer.serialize(sample_model_folder) altered_dir = test_support.get_first_directory(sample_model_folder) @@ -525,7 +563,7 @@ def _check_manifests_match_except_on_entry( assert old_manifest._item_to_digest[path] == digest def test_folder_model_change_file(self, sample_model_folder): - serializer = serialize_by_file.FilesSerializer(self._hasher_factory) + serializer = serialize_by_file.ManifestSerializer(self._hasher_factory) manifest = serializer.serialize(sample_model_folder) altered_dir = test_support.get_first_directory(sample_model_folder) @@ -540,7 +578,7 @@ def test_folder_model_change_file(self, sample_model_folder): ) def test_deep_folder(self, deep_model_folder): - serializer = serialize_by_file.FilesSerializer(self._hasher_factory) + serializer = serialize_by_file.ManifestSerializer(self._hasher_factory) # Long hashes, want to update easily, so pylint: disable=line-too-long expected_items = { "d0/d1/d2/d3/d4/f0": "6efa14bb03544fcb76045c55f25b9315b6eb5be2d8a85f703193a76b7874c6ff", @@ -556,7 +594,7 @@ def test_deep_folder(self, deep_model_folder): assert items == expected_items def test_empty_file(self, empty_model_file): - serializer = serialize_by_file.FilesSerializer(self._hasher_factory) + serializer = serialize_by_file.ManifestSerializer(self._hasher_factory) expected = [ "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" ] @@ -567,16 +605,16 @@ def test_empty_file(self, empty_model_file): assert digests == expected def test_empty_folder(self, empty_model_folder): - serializer = serialize_by_file.FilesSerializer(self._hasher_factory) + serializer = serialize_by_file.ManifestSerializer(self._hasher_factory) manifest = serializer.serialize(empty_model_folder) assert not manifest._item_to_digest def test_max_workers_does_not_change_digest(self, sample_model_folder): - serializer1 = serialize_by_file.FilesSerializer(self._hasher_factory) - serializer2 = serialize_by_file.FilesSerializer( + serializer1 = serialize_by_file.ManifestSerializer(self._hasher_factory) + serializer2 = serialize_by_file.ManifestSerializer( self._hasher_factory, max_workers=1 ) - serializer3 = serialize_by_file.FilesSerializer( + serializer3 = serialize_by_file.ManifestSerializer( self._hasher_factory, max_workers=3 ) From 53ce736675592cbe8085e1d8e8384e1b7f3cf1b7 Mon Sep 17 00:00:00 2001 From: Mihai Maruseac Date: Tue, 16 Jul 2024 20:31:07 -0700 Subject: [PATCH 2/4] Fix Windows Signed-off-by: Mihai Maruseac --- model_signing/serialization/serialize_by_file.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/model_signing/serialization/serialize_by_file.py b/model_signing/serialization/serialize_by_file.py index 98f4250a..62e576b6 100644 --- a/model_signing/serialization/serialize_by_file.py +++ b/model_signing/serialization/serialize_by_file.py @@ -220,9 +220,9 @@ def build_tree( # Handle empty model if not path_to_node: - return cls(pathlib.PurePath()) + return cls(pathlib.PurePosixPath()) - return path_to_node[pathlib.PurePath()] + return path_to_node[pathlib.PurePosixPath()] def get_digest( self, From 7e0d22574ced1c70bb792ae45f9fc29266e7393a Mon Sep 17 00:00:00 2001 From: Mihai Maruseac Date: Tue, 16 Jul 2024 21:29:56 -0700 Subject: [PATCH 3/4] Document `__init__` Signed-off-by: Mihai Maruseac --- .../serialization/serialize_by_file.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/model_signing/serialization/serialize_by_file.py b/model_signing/serialization/serialize_by_file.py index 62e576b6..4d31ddd4 100644 --- a/model_signing/serialization/serialize_by_file.py +++ b/model_signing/serialization/serialize_by_file.py @@ -196,10 +196,15 @@ def __init__( """Builds a node in the digest tree. Don't call this from outside of the class. Instead, use `build_tree`. + + Args: + path: Path included in the node. + digest: Optional hash of the path. Files must have a digest, + directories never have one. """ self._path = path self._digest = digest - self._children: set[_FileDigestTree] = set() + self._children: list[_FileDigestTree] = [] @classmethod def build_tree( @@ -212,10 +217,14 @@ def build_tree( file = file_item.path node = cls(file, file_item.digest) for parent in file.parents: - if parent not in path_to_node: - path_to_node[parent] = cls(parent) - parent_node = path_to_node[parent] - parent_node._children.add(node) + if parent in path_to_node: + parent_node = path_to_node[parent] + parent_node._children.append(node) + break # everything else already exists + + parent_node = cls(parent) # no digest for directories + parent_node._children.append(node) + path_to_node[parent] = parent_node node = parent_node # Handle empty model From 0a06edfe66c7653a4f2e36350cc7f022e95fc03e Mon Sep 17 00:00:00 2001 From: Mihai Maruseac Date: Wed, 17 Jul 2024 13:23:51 -0700 Subject: [PATCH 4/4] Fix typos, add a type signature Signed-off-by: Mihai Maruseac --- model_signing/serialization/serialize_by_file.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/model_signing/serialization/serialize_by_file.py b/model_signing/serialization/serialize_by_file.py index 4d31ddd4..19d54083 100644 --- a/model_signing/serialization/serialize_by_file.py +++ b/model_signing/serialization/serialize_by_file.py @@ -185,9 +185,7 @@ class _FileDigestTree: Every leaf in the tree is a file, paired with its digest. Every intermediate node represents a directory. We need to pair every directory with a digest, - in a top-down fashion. - - Every internal node has a list of children, sorted alphabetically. + in a bottom-up fashion. """ def __init__( @@ -211,7 +209,7 @@ def build_tree( cls, items: Iterable[manifest.FileManifestItem] ) -> "_FileDigestTree": """Builds a tree out of the sequence of manifest items.""" - path_to_node = {} + path_to_node: dict[pathlib.PurePath, _FileDigestTree] = {} for file_item in items: file = file_item.path